mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-05-01 08:14:10 +08:00
merge
This commit is contained in:
commit
d90a2dac5e
@ -200,6 +200,10 @@
|
||||
#if defined __CUDACC__
|
||||
#define EIGEN_VECTORIZE_CUDA
|
||||
#include <vector_types.h>
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
||||
#define EIGEN_HAS_CUDA_FP16
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
|
||||
@ -329,7 +333,9 @@ using std::ptrdiff_t;
|
||||
|
||||
#if defined EIGEN_VECTORIZE_CUDA
|
||||
#include "src/Core/arch/CUDA/PacketMath.h"
|
||||
#include "src/Core/arch/CUDA/PacketMathHalf.h"
|
||||
#include "src/Core/arch/CUDA/MathFunctions.h"
|
||||
#include "src/Core/arch/CUDA/TypeCasting.h"
|
||||
#endif
|
||||
|
||||
#include "src/Core/arch/Default/Settings.h"
|
||||
|
@ -182,10 +182,10 @@ struct digamma_impl_maybe_poly<float> {
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE float run(const float s) {
|
||||
const float A[] = {
|
||||
-4.16666666666666666667E-3,
|
||||
3.96825396825396825397E-3,
|
||||
-8.33333333333333333333E-3,
|
||||
8.33333333333333333333E-2
|
||||
-4.16666666666666666667E-3f,
|
||||
3.96825396825396825397E-3f,
|
||||
-8.33333333333333333333E-3f,
|
||||
8.33333333333333333333E-2f
|
||||
};
|
||||
|
||||
float z;
|
||||
|
@ -267,31 +267,34 @@ pexp<Packet8f>(const Packet8f& _x) {
|
||||
|
||||
// Hyperbolic Tangent function.
|
||||
// Doesn't do anything fancy, just a 13/6-degree rational interpolant which
|
||||
// is accurate up to a couple of ulp in the range [-8, 8], outside of which the
|
||||
// is accurate up to a couple of ulp in the range [-9, 9], outside of which the
|
||||
// fl(tanh(x)) = +/-1.
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
|
||||
ptanh<Packet8f>(const Packet8f& _x) {
|
||||
// Map the range [-8, 8] to [-1, 1], we will clamp bad coefficients later.
|
||||
const Packet8f x = _mm256_mul_ps(_x, _mm256_set1_ps(0.125f));
|
||||
// Clamp the inputs to the range [-9, 9] since anything outside
|
||||
// this range is +/-1.0f in single-precision.
|
||||
_EIGEN_DECLARE_CONST_Packet8f(plus_9, 9.0f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(minus_9, -9.0f);
|
||||
const Packet8f x = pmax(p8f_minus_9, pmin(p8f_plus_9, _x));
|
||||
|
||||
// The monomial coefficients of the numerator polynomial (odd).
|
||||
_EIGEN_DECLARE_CONST_Packet8f(alpha_1, -2.47030171958948e-03f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(alpha_3, -2.06804010015822e-02f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(alpha_5, -3.13693994587418e-02f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(alpha_7, -7.19851201683627e-03f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(alpha_9, 8.31561269687160e-04f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(alpha_11, -1.37626659546502e-04f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(alpha_13, 1.39116714700458e-05f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(alpha_1, 4.89352455891786e-03f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(alpha_3, 6.37261928875436e-04f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(alpha_5, 1.48572235717979e-05f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(alpha_7, 5.12229709037114e-08f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(alpha_9, -8.60467152213735e-11f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(alpha_11, 2.00018790482477e-13f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(alpha_13, -2.76076847742355e-16f);
|
||||
|
||||
// The monomial coefficients of the denominator polynomial (even).
|
||||
_EIGEN_DECLARE_CONST_Packet8f(beta_0, -3.08787724141615e-04f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(beta_2, -9.17251911622436e-03f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(beta_4, -3.09625062090444e-02f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(beta_6, -2.05669680763032e-02f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(beta_0, 4.89352518554385e-03f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(beta_2, 2.26843463243900e-03f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(beta_4, 1.18534705686654e-04f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(beta_6, 1.19825839466702e-06f);
|
||||
|
||||
// Since the polynomials are odd/even, we need x^2.
|
||||
const Packet8f x2 = _mm256_mul_ps(x, x);
|
||||
const Packet8f x2 = pmul(x, x);
|
||||
|
||||
// Evaluate the numerator polynomial p.
|
||||
Packet8f p = pmadd(x2, p8f_alpha_13, p8f_alpha_11);
|
||||
@ -308,14 +311,7 @@ ptanh<Packet8f>(const Packet8f& _x) {
|
||||
q = pmadd(x2, q, p8f_beta_0);
|
||||
|
||||
// Divide the numerator by the denominator.
|
||||
const Packet8f res = pdiv(p, q);
|
||||
|
||||
// Mask-out values outside of [-8, 8].
|
||||
_EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
|
||||
_EIGEN_DECLARE_CONST_Packet8f(minus_one, -1.0f);
|
||||
return _mm256_blendv_ps(
|
||||
_mm256_blendv_ps(res, p8f_one, _mm256_cmp_ps(x, p8f_one, _CMP_GT_OQ)),
|
||||
p8f_minus_one, _mm256_cmp_ps(x, p8f_minus_one, _CMP_LT_OQ));
|
||||
return pdiv(p, q);
|
||||
}
|
||||
|
||||
template <>
|
||||
|
@ -21,7 +21,6 @@ namespace internal {
|
||||
template<> struct is_arithmetic<float4> { enum { value = true }; };
|
||||
template<> struct is_arithmetic<double2> { enum { value = true }; };
|
||||
|
||||
|
||||
template<> struct packet_traits<float> : default_packet_traits
|
||||
{
|
||||
typedef float4 type;
|
||||
@ -273,6 +272,35 @@ template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a)
|
||||
return a.x * a.y;
|
||||
}
|
||||
|
||||
template<size_t offset>
|
||||
struct protate_impl<offset, float4>
|
||||
{
|
||||
static float4 run(const float4& a) {
|
||||
if (offset == 0) {
|
||||
return make_float4(a.x, a.y, a.z, a.w);
|
||||
}
|
||||
if (offset == 1) {
|
||||
return make_float4(a.w, a.x, a.y, a.z);
|
||||
}
|
||||
if (offset == 2) {
|
||||
return make_float4(a.z, a.w, a.x, a.y);
|
||||
}
|
||||
return make_float4(a.y, a.z, a.w, a.x);
|
||||
}
|
||||
};
|
||||
|
||||
template<size_t offset>
|
||||
struct protate_impl<offset, double2>
|
||||
{
|
||||
static double2 run(const double2& a) {
|
||||
if (offset == 0) {
|
||||
return make_double2(a.x, a.y);
|
||||
}
|
||||
return make_double2(a.y, a.x);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
|
||||
return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
|
||||
}
|
||||
|
237
Eigen/src/Core/arch/CUDA/PacketMathHalf.h
Normal file
237
Eigen/src/Core/arch/CUDA/PacketMathHalf.h
Normal file
@ -0,0 +1,237 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
|
||||
#define EIGEN_PACKET_MATH_HALF_CUDA_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
#if defined(EIGEN_HAS_CUDA_FP16)
|
||||
|
||||
// Make sure this is only available when targeting a GPU: we don't want to
|
||||
// introduce conflicts between these packet_traits definitions and the ones
|
||||
// we'll use on the host side (SSE, AVX, ...)
|
||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
||||
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
|
||||
__device__ half operator + (const half& a, const half& b) {
|
||||
return __hadd(a, b);
|
||||
}
|
||||
__device__ half operator * (const half& a, const half& b) {
|
||||
return __hmul(a, b);
|
||||
}
|
||||
__device__ half operator - (const half& a, const half& b) {
|
||||
return __hsub(a, b);
|
||||
}
|
||||
__device__ half operator / (const half& a, const half& b) {
|
||||
float num = __half2float(a);
|
||||
float denom = __half2float(b);
|
||||
return __float2half(num / denom);
|
||||
}
|
||||
__device__ half operator - (const half& a) {
|
||||
return __hneg(a);
|
||||
}
|
||||
__device__ half operator += (half& a, const half& b) {
|
||||
a = __hadd(a, b);
|
||||
return a;
|
||||
}
|
||||
__device__ half operator *= (half& a, const half& b) {
|
||||
a = __hmul(a, b);
|
||||
return a;
|
||||
}
|
||||
__device__ half operator -= (half& a, const half& b) {
|
||||
a = __hsub(a, b);
|
||||
return a;
|
||||
}
|
||||
__device__ half operator /= (half& a, const half& b) {
|
||||
a = a / b;
|
||||
return a;
|
||||
}
|
||||
|
||||
|
||||
template<> struct is_arithmetic<half2> { enum { value = true }; };
|
||||
|
||||
template<> struct packet_traits<half> : default_packet_traits
|
||||
{
|
||||
typedef half2 type;
|
||||
typedef half2 half;
|
||||
enum {
|
||||
Vectorizable = 1,
|
||||
AlignedOnScalar = 1,
|
||||
size=2,
|
||||
HasHalfPacket = 0,
|
||||
|
||||
HasDiv = 1,
|
||||
HasLog = 1,
|
||||
HasExp = 1,
|
||||
HasSqrt = 1,
|
||||
HasRsqrt = 1,
|
||||
HasLGamma = 1,
|
||||
HasDiGamma = 1,
|
||||
HasErf = 1,
|
||||
HasErfc = 1,
|
||||
|
||||
HasBlend = 0,
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
template<> struct unpacket_traits<half2> { typedef half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const half& from) {
|
||||
return __half2half2(from);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const half& a) {
|
||||
return __halves2half2(a, __hadd(a, __float2half(1)));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
|
||||
return __hadd2(a, b);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
|
||||
return __hsub2(a, b);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
|
||||
return __hneg2(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
|
||||
return __hmul2(a, b);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
|
||||
return __hfma2(a, b, c);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float b1 = __low2float(b);
|
||||
float b2 = __high2float(b);
|
||||
float r1 = a1 / b1;
|
||||
float r2 = a2 / b2;
|
||||
return __floats2half2_rn(r1, r2);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float b1 = __low2float(b);
|
||||
float b2 = __high2float(b);
|
||||
half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
|
||||
half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
|
||||
return __halves2half2(r1, r2);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float b1 = __low2float(b);
|
||||
float b2 = __high2float(b);
|
||||
half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
|
||||
half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
|
||||
return __halves2half2(r1, r2);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const half* from) {
|
||||
return *reinterpret_cast<const half2*>(from);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const half* from) {
|
||||
return __halves2half2(from[0], from[1]);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const half* from) {
|
||||
return __halves2half2(from[0], from[0]);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<half>(half* to, const half2& from) {
|
||||
*reinterpret_cast<half2*>(to) = from;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, const half2& from) {
|
||||
to[0] = __low2half(from);
|
||||
to[1] = __high2half(from);
|
||||
}
|
||||
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
|
||||
return __ldg((const half2*)from);
|
||||
}
|
||||
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) {
|
||||
return __halves2half2(__ldg(from+0), __ldg(from+1));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline half2 pgather<half, half2>(const half* from, Index stride) {
|
||||
return __halves2half2(from[0*stride], from[1*stride]);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<half, half2>(half* to, const half2& from, Index stride) {
|
||||
to[stride*0] = __low2half(from);
|
||||
to[stride*1] = __high2half(from);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline half pfirst<half2>(const half2& a) {
|
||||
return __low2half(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline half predux<half2>(const half2& a) {
|
||||
return __hadd(__low2half(a), __high2half(a));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline half predux_max<half2>(const half2& a) {
|
||||
half first = __low2half(a);
|
||||
half second = __high2half(a);
|
||||
return __hgt(first, second) ? first : second;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) {
|
||||
half first = __low2half(a);
|
||||
half second = __high2half(a);
|
||||
return __hlt(first, second) ? first : second;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) {
|
||||
return __hmul(__low2half(a), __high2half(a));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline half2 pabs<half2>(const half2& a) {
|
||||
assert(false && "tbd");
|
||||
return half2();
|
||||
}
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void
|
||||
ptranspose(PacketBlock<half2,2>& kernel) {
|
||||
assert(false && "tbd");
|
||||
// half tmp = kernel.packet[0].y;
|
||||
// kernel.packet[0].y = kernel.packet[1].x;
|
||||
// kernel.packet[1].x = tmp;
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
|
||||
#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
|
123
Eigen/src/Core/arch/CUDA/TypeCasting.h
Normal file
123
Eigen/src/Core/arch/CUDA/TypeCasting.h
Normal file
@ -0,0 +1,123 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_TYPE_CASTING_CUDA_H
|
||||
#define EIGEN_TYPE_CASTING_CUDA_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
#if defined(EIGEN_HAS_CUDA_FP16)
|
||||
|
||||
template<>
|
||||
struct scalar_cast_op<float, half> {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
||||
typedef half result_type;
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const float& a) const {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
return __float2half(a);
|
||||
#else
|
||||
assert(false && "tbd");
|
||||
return half();
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct functor_traits<scalar_cast_op<float, half> >
|
||||
{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
|
||||
|
||||
|
||||
template<>
|
||||
struct scalar_cast_op<int, half> {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
||||
typedef half result_type;
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half operator() (const int& a) const {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
return __float2half(static_cast<float>(a));
|
||||
#else
|
||||
assert(false && "tbd");
|
||||
return half();
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct functor_traits<scalar_cast_op<int, half> >
|
||||
{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
|
||||
|
||||
|
||||
template<>
|
||||
struct scalar_cast_op<half, float> {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
||||
typedef float result_type;
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const half& a) const {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
return __half2float(a);
|
||||
#else
|
||||
assert(false && "tbd");
|
||||
return 0.0f;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct functor_traits<scalar_cast_op<half, float> >
|
||||
{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
|
||||
|
||||
|
||||
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<half, float> {
|
||||
enum {
|
||||
VectorizedCast = 1,
|
||||
SrcCoeffRatio = 2,
|
||||
TgtCoeffRatio = 1
|
||||
};
|
||||
};
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
float2 r1 = __half22float2(a);
|
||||
float2 r2 = __half22float2(b);
|
||||
return make_float4(r1.x, r1.y, r2.x, r2.y);
|
||||
#else
|
||||
assert(false && "tbd");
|
||||
return float4();
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<float, half> {
|
||||
enum {
|
||||
VectorizedCast = 1,
|
||||
SrcCoeffRatio = 1,
|
||||
TgtCoeffRatio = 2
|
||||
};
|
||||
};
|
||||
|
||||
template<> EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
|
||||
// Simply discard the second half of the input
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
return __float22half2_rn(make_float2(a.x, a.y));
|
||||
#else
|
||||
assert(false && "tbd");
|
||||
return half2();
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_TYPE_CASTING_CUDA_H
|
@ -518,30 +518,31 @@ Packet2d prsqrt<Packet2d>(const Packet2d& x) {
|
||||
|
||||
// Hyperbolic Tangent function.
|
||||
// Doesn't do anything fancy, just a 13/6-degree rational interpolant which
|
||||
// is accurate up to a couple of ulp in the range [-8, 8], outside of which the
|
||||
// is accurate up to a couple of ulp in the range [-9, 9], outside of which the
|
||||
// fl(tanh(x)) = +/-1.
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
|
||||
ptanh<Packet4f>(const Packet4f& _x) {
|
||||
// Map the range [-8, 8] to [-1, 1], we will clamp bad coefficients later.
|
||||
const Packet4f x =
|
||||
pmax(pset1<Packet4f>(-1.0f),
|
||||
pmin(pset1<Packet4f>(1.0f), pmul(_x, pset1<Packet4f>(0.125f))));
|
||||
// Clamp the inputs to the range [-9, 9] since anything outside
|
||||
// this range is +/-1.0f in single-precision.
|
||||
_EIGEN_DECLARE_CONST_Packet4f(plus_9, 9.0f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(minus_9, -9.0f);
|
||||
const Packet4f x = pmax(p4f_minus_9, pmin(p4f_plus_9, _x));
|
||||
|
||||
// The monomial coefficients of the numerator polynomial (odd).
|
||||
_EIGEN_DECLARE_CONST_Packet4f(alpha_1, -2.47030171958948e-03f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(alpha_3, -2.06804010015822e-02f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(alpha_5, -3.13693994587418e-02f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(alpha_7, -7.19851201683627e-03f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(alpha_9, 8.31561269687160e-04f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(alpha_11, -1.37626659546502e-04f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(alpha_13, 1.39116714700458e-05f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-03f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-04f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-05f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-08f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f);
|
||||
|
||||
// The monomial coefficients of the denominator polynomial (even).
|
||||
_EIGEN_DECLARE_CONST_Packet4f(beta_0, -3.08787724141615e-04f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(beta_2, -9.17251911622436e-03f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(beta_4, -3.09625062090444e-02f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(beta_6, -2.05669680763032e-02f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-03f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-03f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-04f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-06f);
|
||||
|
||||
// Since the polynomials are odd/even, we need x^2.
|
||||
const Packet4f x2 = pmul(x, x);
|
||||
|
@ -42,7 +42,7 @@ template <typename T, size_t n> class array {
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE array() { }
|
||||
explicit EIGEN_DEVICE_FUNC
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE array(const T& v) {
|
||||
EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
values[0] = v;
|
||||
|
@ -99,23 +99,23 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
|
||||
|
||||
#define prefetchIntoRegisters(base_k) \
|
||||
{ \
|
||||
lhs_pf0 = Scalar(0); \
|
||||
lhs_pf1 = Scalar(0); \
|
||||
lhs_pf2 = Scalar(0); \
|
||||
lhs_pf3 = Scalar(0); \
|
||||
lhs_pf4 = Scalar(0); \
|
||||
lhs_pf5 = Scalar(0); \
|
||||
lhs_pf6 = Scalar(0); \
|
||||
lhs_pf7 = Scalar(0); \
|
||||
lhs_pf0 = conv(0); \
|
||||
lhs_pf1 = conv(0); \
|
||||
lhs_pf2 = conv(0); \
|
||||
lhs_pf3 = conv(0); \
|
||||
lhs_pf4 = conv(0); \
|
||||
lhs_pf5 = conv(0); \
|
||||
lhs_pf6 = conv(0); \
|
||||
lhs_pf7 = conv(0); \
|
||||
\
|
||||
rhs_pf0 = Scalar(0); \
|
||||
rhs_pf1 = Scalar(0); \
|
||||
rhs_pf2 = Scalar(0); \
|
||||
rhs_pf3 = Scalar(0); \
|
||||
rhs_pf4 = Scalar(0); \
|
||||
rhs_pf5 = Scalar(0); \
|
||||
rhs_pf6 = Scalar(0); \
|
||||
rhs_pf7 = Scalar(0); \
|
||||
rhs_pf0 = conv(0); \
|
||||
rhs_pf1 = conv(0); \
|
||||
rhs_pf2 = conv(0); \
|
||||
rhs_pf3 = conv(0); \
|
||||
rhs_pf4 = conv(0); \
|
||||
rhs_pf5 = conv(0); \
|
||||
rhs_pf6 = conv(0); \
|
||||
rhs_pf7 = conv(0); \
|
||||
\
|
||||
if (!needs_edge_check || lhs_vert < m_size) { \
|
||||
const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \
|
||||
@ -261,15 +261,16 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
|
||||
// declare and initialize result array
|
||||
#define res(i, j) _res_##i##j
|
||||
#define initResultRow(i) \
|
||||
Scalar res(i, 0) = Scalar(0); \
|
||||
Scalar res(i, 1) = Scalar(0); \
|
||||
Scalar res(i, 2) = Scalar(0); \
|
||||
Scalar res(i, 3) = Scalar(0); \
|
||||
Scalar res(i, 4) = Scalar(0); \
|
||||
Scalar res(i, 5) = Scalar(0); \
|
||||
Scalar res(i, 6) = Scalar(0); \
|
||||
Scalar res(i, 7) = Scalar(0); \
|
||||
Scalar res(i, 0) = conv(0); \
|
||||
Scalar res(i, 1) = conv(0); \
|
||||
Scalar res(i, 2) = conv(0); \
|
||||
Scalar res(i, 3) = conv(0); \
|
||||
Scalar res(i, 4) = conv(0); \
|
||||
Scalar res(i, 5) = conv(0); \
|
||||
Scalar res(i, 6) = conv(0); \
|
||||
Scalar res(i, 7) = conv(0); \
|
||||
|
||||
internal::scalar_cast_op<int, Scalar> conv;
|
||||
initResultRow(0);
|
||||
initResultRow(1);
|
||||
initResultRow(2);
|
||||
@ -1313,6 +1314,34 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
||||
}
|
||||
}
|
||||
|
||||
template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
|
||||
static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
|
||||
const Index m_blocks = (m + 63) / 64;
|
||||
const Index n_blocks = (n + 63) / 64;
|
||||
const dim3 num_blocks(m_blocks, n_blocks, 1);
|
||||
const dim3 block_size(8, 8, 8);
|
||||
LAUNCH_CUDA_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
|
||||
static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
|
||||
if (m < 768 || n < 768) {
|
||||
const Index m_blocks = (m + 63) / 64;
|
||||
const Index n_blocks = (n + 63) / 64;
|
||||
const dim3 num_blocks(m_blocks, n_blocks, 1);
|
||||
const dim3 block_size(16, 16, 1);
|
||||
LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
|
||||
} else {
|
||||
const Index m_blocks = (m + 127) / 128;
|
||||
const Index n_blocks = (n + 63) / 64;
|
||||
const dim3 num_blocks(m_blocks, n_blocks, 1);
|
||||
const dim3 block_size(8, 32, 1);
|
||||
LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
|
||||
void evalTyped(Scalar* buffer) const {
|
||||
// columns in left side, rows in right side
|
||||
@ -1353,28 +1382,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
|
||||
OutputMapper output(buffer, m);
|
||||
|
||||
setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte);
|
||||
if (internal::is_same<LhsScalar, float>::value &&
|
||||
internal::is_same<RhsScalar, float>::value) {
|
||||
if (m < 768 || n < 768) {
|
||||
const Index m_blocks = (m + 63) / 64;
|
||||
const Index n_blocks = (n + 63) / 64;
|
||||
const dim3 num_blocks(m_blocks, n_blocks, 1);
|
||||
const dim3 block_size(16, 16, 1);
|
||||
LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k);
|
||||
} else {
|
||||
const Index m_blocks = (m + 127) / 128;
|
||||
const Index n_blocks = (n + 63) / 64;
|
||||
const dim3 num_blocks(m_blocks, n_blocks, 1);
|
||||
const dim3 block_size(8, 32, 1);
|
||||
LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k);
|
||||
}
|
||||
} else {
|
||||
const Index m_blocks = (m + 63) / 64;
|
||||
const Index n_blocks = (n + 63) / 64;
|
||||
const dim3 num_blocks(m_blocks, n_blocks, 1);
|
||||
const dim3 block_size(8, 8, 8);
|
||||
LAUNCH_CUDA_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k);
|
||||
}
|
||||
LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k, this->m_device);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -124,9 +124,12 @@ struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 2> {
|
||||
return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<Unaligned>(index));
|
||||
} else {
|
||||
const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
|
||||
typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
|
||||
typedef typename internal::unpacket_traits<TgtPacket>::type TgtType;
|
||||
internal::scalar_cast_op<SrcType, TgtType> converter;
|
||||
EIGEN_ALIGN_MAX typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize];
|
||||
for (int i = 0; i < TgtPacketSize; ++i) {
|
||||
values[i] = m_impl.coeff(index+i);
|
||||
values[i] = converter(m_impl.coeff(index+i));
|
||||
}
|
||||
TgtPacket rslt = internal::pload<TgtPacket>(values);
|
||||
return rslt;
|
||||
|
@ -34,12 +34,23 @@ static void initializeDeviceProp() {
|
||||
if (!m_devicePropInitialized) {
|
||||
int num_devices;
|
||||
cudaError_t status = cudaGetDeviceCount(&num_devices);
|
||||
EIGEN_UNUSED_VARIABLE(status)
|
||||
assert(status == cudaSuccess);
|
||||
if (status != cudaSuccess) {
|
||||
std::cerr << "Failed to get the number of CUDA devices: "
|
||||
<< cudaGetErrorString(status)
|
||||
<< std::endl;
|
||||
assert(status == cudaSuccess);
|
||||
}
|
||||
m_deviceProperties = new cudaDeviceProp[num_devices];
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
|
||||
assert(status == cudaSuccess);
|
||||
if (status != cudaSuccess) {
|
||||
std::cerr << "Failed to initialize CUDA device #"
|
||||
<< i
|
||||
<< ": "
|
||||
<< cudaGetErrorString(status)
|
||||
<< std::endl;
|
||||
assert(status == cudaSuccess);
|
||||
}
|
||||
}
|
||||
m_devicePropInitialized = true;
|
||||
}
|
||||
@ -247,6 +258,14 @@ struct GpuDevice {
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const {
|
||||
#ifndef __CUDA_ARCH__
|
||||
return stream_->deviceProperties().minor;
|
||||
#else
|
||||
eigen_assert(false && "The default device should be used instead to generate kernel code");
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
|
||||
return max_blocks_;
|
||||
|
@ -72,11 +72,12 @@ template <typename T> struct SumReducer
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
|
||||
return static_cast<T>(0);
|
||||
internal::scalar_cast_op<int, T> conv;
|
||||
return conv(0);
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
|
||||
return pset1<Packet>(0);
|
||||
return pset1<Packet>(initialize());
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
|
||||
return accum;
|
||||
@ -110,11 +111,12 @@ template <typename T> struct MeanReducer
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
|
||||
return static_cast<T>(0);
|
||||
internal::scalar_cast_op<int, T> conv;
|
||||
return conv(0);
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
|
||||
return pset1<Packet>(0);
|
||||
return pset1<Packet>(initialize());
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
|
||||
return accum / scalarCount_;
|
||||
@ -214,11 +216,12 @@ template <typename T> struct ProdReducer
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
|
||||
return static_cast<T>(1);
|
||||
internal::scalar_cast_op<int, T> conv;
|
||||
return conv(1);
|
||||
}
|
||||
template <typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
|
||||
return pset1<Packet>(1);
|
||||
return pset1<Packet>(initialize());
|
||||
}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
|
||||
return accum;
|
||||
|
@ -173,5 +173,9 @@ if(CUDA_FOUND)
|
||||
ei_add_test(cxx11_tensor_random_cuda)
|
||||
ei_add_test(cxx11_tensor_argmax_cuda)
|
||||
|
||||
set(CUDA_NVCC_FLAGS "-std=c++11 --relaxed-constexpr -arch compute_53 -Xcudafe \"--display_error_number\"")
|
||||
ei_add_test(cxx11_tensor_of_float16_cuda)
|
||||
|
||||
|
||||
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||
endif()
|
||||
|
208
unsupported/test/cxx11_tensor_of_float16_cuda.cu
Normal file
208
unsupported/test/cxx11_tensor_of_float16_cuda.cu
Normal file
@ -0,0 +1,208 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
using Eigen::Tensor;
|
||||
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
|
||||
void test_cuda_conversion() {
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
half* d_half = (half*)gpu_device.allocate(num_elem * sizeof(half));
|
||||
float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
|
||||
d_float, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<half, 1>, Eigen::Aligned> gpu_half(
|
||||
d_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
|
||||
d_conv, num_elem);
|
||||
|
||||
gpu_float.device(gpu_device) = gpu_float.random();
|
||||
gpu_half.device(gpu_device) = gpu_float.cast<half>();
|
||||
gpu_conv.device(gpu_device) = gpu_half.cast<float>();
|
||||
|
||||
Tensor<float, 1> initial(num_elem);
|
||||
Tensor<float, 1> final(num_elem);
|
||||
gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
|
||||
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
VERIFY_IS_APPROX(initial(i), final(i));
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float);
|
||||
gpu_device.deallocate(d_half);
|
||||
gpu_device.deallocate(d_conv);
|
||||
}
|
||||
|
||||
void test_cuda_elementwise() {
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
|
||||
d_float1, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
|
||||
d_float2, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
|
||||
d_res_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
|
||||
d_res_float, num_elem);
|
||||
|
||||
gpu_float1.device(gpu_device) = gpu_float1.random();
|
||||
gpu_float2.device(gpu_device) = gpu_float2.random();
|
||||
gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1;
|
||||
gpu_res_half.device(gpu_device) = ((gpu_float1.cast<half>() + gpu_float2.cast<half>()) * gpu_float1.cast<half>()).cast<float>();
|
||||
|
||||
Tensor<float, 1> half_prec(num_elem);
|
||||
Tensor<float, 1> full_prec(num_elem);
|
||||
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
|
||||
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
VERIFY_IS_APPROX(full_prec(i), half_prec(i));
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float1);
|
||||
gpu_device.deallocate(d_float2);
|
||||
gpu_device.deallocate(d_res_half);
|
||||
gpu_device.deallocate(d_res_float);
|
||||
}
|
||||
/*
|
||||
void test_cuda_contractions() {
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int rows = 101;
|
||||
int cols = 101;
|
||||
int num_elem = rows*cols;
|
||||
|
||||
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
|
||||
d_float1, rows, cols);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
|
||||
d_float2, rows, cols);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_res_half(
|
||||
d_res_half, rows, cols);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_res_float(
|
||||
d_res_float, rows, cols);
|
||||
|
||||
gpu_float1.device(gpu_device) = gpu_float1.random();
|
||||
gpu_float2.device(gpu_device) = gpu_float2.random();
|
||||
|
||||
typedef Tensor<float, 2>::DimensionPair DimPair;
|
||||
Eigen::array<DimPair, 1> dims(DimPair(1, 0));
|
||||
gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims);
|
||||
gpu_res_half.device(gpu_device) = gpu_float1.cast<half>().contract(gpu_float2.cast<half>(), dims).cast<float>();
|
||||
|
||||
Tensor<float, 2> half_prec(rows, cols);
|
||||
Tensor<float, 2> full_prec(rows, cols);
|
||||
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
|
||||
|
||||
for (int i = 0; i < rows; ++i) {
|
||||
for (int j = 0; j < cols; ++j) {
|
||||
VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
|
||||
}
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float1);
|
||||
gpu_device.deallocate(d_float2);
|
||||
gpu_device.deallocate(d_res_half);
|
||||
gpu_device.deallocate(d_res_float);
|
||||
}*/
|
||||
|
||||
|
||||
void test_cuda_reductions() {
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int size = 101;
|
||||
int num_elem = size*size;
|
||||
|
||||
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_half = (float*)gpu_device.allocate(size * sizeof(float));
|
||||
float* d_res_float = (float*)gpu_device.allocate(size * sizeof(float));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
|
||||
d_float1, size, size);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
|
||||
d_float2, size, size);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
|
||||
d_res_half, size);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
|
||||
d_res_float, size);
|
||||
|
||||
gpu_float1.device(gpu_device) = gpu_float1.random();
|
||||
gpu_float2.device(gpu_device) = gpu_float2.random();
|
||||
|
||||
Eigen::array<int, 1> redux_dim = {{0}};
|
||||
gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim);
|
||||
gpu_res_half.device(gpu_device) = gpu_float1.cast<half>().sum(redux_dim).cast<float>();
|
||||
|
||||
Tensor<float, 1> half_prec(size);
|
||||
Tensor<float, 1> full_prec(size);
|
||||
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
|
||||
|
||||
for (int i = 0; i < size; ++i) {
|
||||
VERIFY_IS_APPROX(full_prec(i), half_prec(i));
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float1);
|
||||
gpu_device.deallocate(d_float2);
|
||||
gpu_device.deallocate(d_res_half);
|
||||
gpu_device.deallocate(d_res_float);
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
void test_cxx11_tensor_of_float16_cuda()
|
||||
{
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuDevice device(&stream);
|
||||
if (device.majorDeviceVersion() > 5 ||
|
||||
(device.majorDeviceVersion() == 5 && device.minorDeviceVersion() >= 3)) {
|
||||
CALL_SUBTEST_1(test_cuda_conversion());
|
||||
CALL_SUBTEST_1(test_cuda_elementwise());
|
||||
// CALL_SUBTEST_2(test_cuda_contractions());
|
||||
CALL_SUBTEST_3(test_cuda_reductions());
|
||||
}
|
||||
else {
|
||||
std::cout << "Half floats require compute capability of at least 5.3. This device only supports " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << ". Skipping the test" << std::endl;
|
||||
}
|
||||
#else
|
||||
std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl;
|
||||
#endif
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user