[SYCL-2020] Enabling half precision support for SYCL.

This commit is contained in:
Alejandro Acosta 2023-04-27 14:01:11 +01:00 committed by Alejandro Acosta
parent 92a77a596b
commit ba47341a14
23 changed files with 873 additions and 28 deletions

View File

@ -86,6 +86,8 @@ struct sycl_packet_traits : default_packet_traits {
typedef packet_type half; \ typedef packet_type half; \
}; };
SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, Eigen::half, 8)
SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, const Eigen::half, 8)
SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4) SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4)
SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4) SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4)
SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2) SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2)
@ -100,6 +102,7 @@ SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2)
struct is_arithmetic<packet_type> { \ struct is_arithmetic<packet_type> { \
enum { value = true }; \ enum { value = true }; \
}; };
SYCL_ARITHMETIC(cl::sycl::cl_half8)
SYCL_ARITHMETIC(cl::sycl::cl_float4) SYCL_ARITHMETIC(cl::sycl::cl_float4)
SYCL_ARITHMETIC(cl::sycl::cl_double2) SYCL_ARITHMETIC(cl::sycl::cl_double2)
#undef SYCL_ARITHMETIC #undef SYCL_ARITHMETIC
@ -111,6 +114,7 @@ SYCL_ARITHMETIC(cl::sycl::cl_double2)
enum { size = lengths, vectorizable = true, alignment = Aligned16 }; \ enum { size = lengths, vectorizable = true, alignment = Aligned16 }; \
typedef packet_type half; \ typedef packet_type half; \
}; };
SYCL_UNPACKET_TRAITS(cl::sycl::cl_half8, Eigen::half, 8)
SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4) SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)
SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2) SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2)

View File

@ -38,6 +38,7 @@ namespace internal {
return cl::sycl::log(a); \ return cl::sycl::log(a); \
} }
SYCL_PLOG(cl::sycl::cl_half8)
SYCL_PLOG(cl::sycl::cl_float4) SYCL_PLOG(cl::sycl::cl_float4)
SYCL_PLOG(cl::sycl::cl_double2) SYCL_PLOG(cl::sycl::cl_double2)
#undef SYCL_PLOG #undef SYCL_PLOG
@ -49,6 +50,7 @@ SYCL_PLOG(cl::sycl::cl_double2)
return cl::sycl::log1p(a); \ return cl::sycl::log1p(a); \
} }
SYCL_PLOG1P(cl::sycl::cl_half8)
SYCL_PLOG1P(cl::sycl::cl_float4) SYCL_PLOG1P(cl::sycl::cl_float4)
SYCL_PLOG1P(cl::sycl::cl_double2) SYCL_PLOG1P(cl::sycl::cl_double2)
#undef SYCL_PLOG1P #undef SYCL_PLOG1P
@ -60,6 +62,7 @@ SYCL_PLOG1P(cl::sycl::cl_double2)
return cl::sycl::log10(a); \ return cl::sycl::log10(a); \
} }
SYCL_PLOG10(cl::sycl::cl_half8)
SYCL_PLOG10(cl::sycl::cl_float4) SYCL_PLOG10(cl::sycl::cl_float4)
SYCL_PLOG10(cl::sycl::cl_double2) SYCL_PLOG10(cl::sycl::cl_double2)
#undef SYCL_PLOG10 #undef SYCL_PLOG10
@ -71,6 +74,8 @@ SYCL_PLOG10(cl::sycl::cl_double2)
return cl::sycl::exp(a); \ return cl::sycl::exp(a); \
} }
SYCL_PEXP(cl::sycl::cl_half8)
SYCL_PEXP(cl::sycl::cl_half)
SYCL_PEXP(cl::sycl::cl_float4) SYCL_PEXP(cl::sycl::cl_float4)
SYCL_PEXP(cl::sycl::cl_float) SYCL_PEXP(cl::sycl::cl_float)
SYCL_PEXP(cl::sycl::cl_double2) SYCL_PEXP(cl::sycl::cl_double2)
@ -83,6 +88,7 @@ SYCL_PEXP(cl::sycl::cl_double2)
return cl::sycl::expm1(a); \ return cl::sycl::expm1(a); \
} }
SYCL_PEXPM1(cl::sycl::cl_half8)
SYCL_PEXPM1(cl::sycl::cl_float4) SYCL_PEXPM1(cl::sycl::cl_float4)
SYCL_PEXPM1(cl::sycl::cl_double2) SYCL_PEXPM1(cl::sycl::cl_double2)
#undef SYCL_PEXPM1 #undef SYCL_PEXPM1
@ -94,6 +100,7 @@ SYCL_PEXPM1(cl::sycl::cl_double2)
return cl::sycl::sqrt(a); \ return cl::sycl::sqrt(a); \
} }
SYCL_PSQRT(cl::sycl::cl_half8)
SYCL_PSQRT(cl::sycl::cl_float4) SYCL_PSQRT(cl::sycl::cl_float4)
SYCL_PSQRT(cl::sycl::cl_double2) SYCL_PSQRT(cl::sycl::cl_double2)
#undef SYCL_PSQRT #undef SYCL_PSQRT
@ -105,6 +112,7 @@ SYCL_PSQRT(cl::sycl::cl_double2)
return cl::sycl::rsqrt(a); \ return cl::sycl::rsqrt(a); \
} }
SYCL_PRSQRT(cl::sycl::cl_half8)
SYCL_PRSQRT(cl::sycl::cl_float4) SYCL_PRSQRT(cl::sycl::cl_float4)
SYCL_PRSQRT(cl::sycl::cl_double2) SYCL_PRSQRT(cl::sycl::cl_double2)
#undef SYCL_PRSQRT #undef SYCL_PRSQRT
@ -117,6 +125,7 @@ SYCL_PRSQRT(cl::sycl::cl_double2)
return cl::sycl::sin(a); \ return cl::sycl::sin(a); \
} }
SYCL_PSIN(cl::sycl::cl_half8)
SYCL_PSIN(cl::sycl::cl_float4) SYCL_PSIN(cl::sycl::cl_float4)
SYCL_PSIN(cl::sycl::cl_double2) SYCL_PSIN(cl::sycl::cl_double2)
#undef SYCL_PSIN #undef SYCL_PSIN
@ -129,6 +138,7 @@ SYCL_PSIN(cl::sycl::cl_double2)
return cl::sycl::cos(a); \ return cl::sycl::cos(a); \
} }
SYCL_PCOS(cl::sycl::cl_half8)
SYCL_PCOS(cl::sycl::cl_float4) SYCL_PCOS(cl::sycl::cl_float4)
SYCL_PCOS(cl::sycl::cl_double2) SYCL_PCOS(cl::sycl::cl_double2)
#undef SYCL_PCOS #undef SYCL_PCOS
@ -141,6 +151,7 @@ SYCL_PCOS(cl::sycl::cl_double2)
return cl::sycl::tan(a); \ return cl::sycl::tan(a); \
} }
SYCL_PTAN(cl::sycl::cl_half8)
SYCL_PTAN(cl::sycl::cl_float4) SYCL_PTAN(cl::sycl::cl_float4)
SYCL_PTAN(cl::sycl::cl_double2) SYCL_PTAN(cl::sycl::cl_double2)
#undef SYCL_PTAN #undef SYCL_PTAN
@ -153,6 +164,7 @@ SYCL_PTAN(cl::sycl::cl_double2)
return cl::sycl::asin(a); \ return cl::sycl::asin(a); \
} }
SYCL_PASIN(cl::sycl::cl_half8)
SYCL_PASIN(cl::sycl::cl_float4) SYCL_PASIN(cl::sycl::cl_float4)
SYCL_PASIN(cl::sycl::cl_double2) SYCL_PASIN(cl::sycl::cl_double2)
#undef SYCL_PASIN #undef SYCL_PASIN
@ -165,6 +177,7 @@ SYCL_PASIN(cl::sycl::cl_double2)
return cl::sycl::acos(a); \ return cl::sycl::acos(a); \
} }
SYCL_PACOS(cl::sycl::cl_half8)
SYCL_PACOS(cl::sycl::cl_float4) SYCL_PACOS(cl::sycl::cl_float4)
SYCL_PACOS(cl::sycl::cl_double2) SYCL_PACOS(cl::sycl::cl_double2)
#undef SYCL_PACOS #undef SYCL_PACOS
@ -177,6 +190,7 @@ SYCL_PACOS(cl::sycl::cl_double2)
return cl::sycl::atan(a); \ return cl::sycl::atan(a); \
} }
SYCL_PATAN(cl::sycl::cl_half8)
SYCL_PATAN(cl::sycl::cl_float4) SYCL_PATAN(cl::sycl::cl_float4)
SYCL_PATAN(cl::sycl::cl_double2) SYCL_PATAN(cl::sycl::cl_double2)
#undef SYCL_PATAN #undef SYCL_PATAN
@ -189,6 +203,7 @@ SYCL_PATAN(cl::sycl::cl_double2)
return cl::sycl::sinh(a); \ return cl::sycl::sinh(a); \
} }
SYCL_PSINH(cl::sycl::cl_half8)
SYCL_PSINH(cl::sycl::cl_float4) SYCL_PSINH(cl::sycl::cl_float4)
SYCL_PSINH(cl::sycl::cl_double2) SYCL_PSINH(cl::sycl::cl_double2)
#undef SYCL_PSINH #undef SYCL_PSINH
@ -201,6 +216,7 @@ SYCL_PSINH(cl::sycl::cl_double2)
return cl::sycl::cosh(a); \ return cl::sycl::cosh(a); \
} }
SYCL_PCOSH(cl::sycl::cl_half8)
SYCL_PCOSH(cl::sycl::cl_float4) SYCL_PCOSH(cl::sycl::cl_float4)
SYCL_PCOSH(cl::sycl::cl_double2) SYCL_PCOSH(cl::sycl::cl_double2)
#undef SYCL_PCOSH #undef SYCL_PCOSH
@ -213,6 +229,7 @@ SYCL_PCOSH(cl::sycl::cl_double2)
return cl::sycl::tanh(a); \ return cl::sycl::tanh(a); \
} }
SYCL_PTANH(cl::sycl::cl_half8)
SYCL_PTANH(cl::sycl::cl_float4) SYCL_PTANH(cl::sycl::cl_float4)
SYCL_PTANH(cl::sycl::cl_double2) SYCL_PTANH(cl::sycl::cl_double2)
#undef SYCL_PTANH #undef SYCL_PTANH
@ -224,6 +241,7 @@ SYCL_PTANH(cl::sycl::cl_double2)
return cl::sycl::ceil(a); \ return cl::sycl::ceil(a); \
} }
SYCL_PCEIL(cl::sycl::cl_half8)
SYCL_PCEIL(cl::sycl::cl_float4) SYCL_PCEIL(cl::sycl::cl_float4)
SYCL_PCEIL(cl::sycl::cl_double2) SYCL_PCEIL(cl::sycl::cl_double2)
#undef SYCL_PCEIL #undef SYCL_PCEIL
@ -235,6 +253,7 @@ SYCL_PCEIL(cl::sycl::cl_double2)
return cl::sycl::round(a); \ return cl::sycl::round(a); \
} }
SYCL_PROUND(cl::sycl::cl_half8)
SYCL_PROUND(cl::sycl::cl_float4) SYCL_PROUND(cl::sycl::cl_float4)
SYCL_PROUND(cl::sycl::cl_double2) SYCL_PROUND(cl::sycl::cl_double2)
#undef SYCL_PROUND #undef SYCL_PROUND
@ -246,6 +265,7 @@ SYCL_PROUND(cl::sycl::cl_double2)
return cl::sycl::rint(a); \ return cl::sycl::rint(a); \
} }
SYCL_PRINT(cl::sycl::cl_half8)
SYCL_PRINT(cl::sycl::cl_float4) SYCL_PRINT(cl::sycl::cl_float4)
SYCL_PRINT(cl::sycl::cl_double2) SYCL_PRINT(cl::sycl::cl_double2)
#undef SYCL_PRINT #undef SYCL_PRINT
@ -257,6 +277,7 @@ SYCL_PRINT(cl::sycl::cl_double2)
return cl::sycl::floor(a); \ return cl::sycl::floor(a); \
} }
SYCL_FLOOR(cl::sycl::cl_half8)
SYCL_FLOOR(cl::sycl::cl_float4) SYCL_FLOOR(cl::sycl::cl_float4)
SYCL_FLOOR(cl::sycl::cl_double2) SYCL_FLOOR(cl::sycl::cl_double2)
#undef SYCL_FLOOR #undef SYCL_FLOOR
@ -268,6 +289,7 @@ SYCL_FLOOR(cl::sycl::cl_double2)
return expr; \ return expr; \
} }
SYCL_PMIN(cl::sycl::cl_half8, cl::sycl::fmin(a, b))
SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b)) SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b))
SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b)) SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
#undef SYCL_PMIN #undef SYCL_PMIN
@ -279,6 +301,7 @@ SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
return expr; \ return expr; \
} }
SYCL_PMAX(cl::sycl::cl_half8, cl::sycl::fmax(a, b))
SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b)) SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b))
SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b)) SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
#undef SYCL_PMAX #undef SYCL_PMAX
@ -292,6 +315,7 @@ SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
cl::sycl::rounding_mode::automatic>()); \ cl::sycl::rounding_mode::automatic>()); \
} }
SYCL_PLDEXP(cl::sycl::cl_half8)
SYCL_PLDEXP(cl::sycl::cl_float4) SYCL_PLDEXP(cl::sycl::cl_float4)
SYCL_PLDEXP(cl::sycl::cl_double2) SYCL_PLDEXP(cl::sycl::cl_double2)
#undef SYCL_PLDEXP #undef SYCL_PLDEXP

View File

@ -44,9 +44,34 @@ SYCL_PLOAD(cl::sycl::cl_float4, u)
SYCL_PLOAD(cl::sycl::cl_float4, ) SYCL_PLOAD(cl::sycl::cl_float4, )
SYCL_PLOAD(cl::sycl::cl_double2, u) SYCL_PLOAD(cl::sycl::cl_double2, u)
SYCL_PLOAD(cl::sycl::cl_double2, ) SYCL_PLOAD(cl::sycl::cl_double2, )
#undef SYCL_PLOAD #undef SYCL_PLOAD
// Aligned load of 8 Eigen::half values into a cl::sycl::cl_half8 packet.
// The raw pointer is cast to a generic-address-space multi_ptr (SYCL 2020
// address_space_cast) because vec::load requires a multi_ptr, then all 8
// lanes are loaded starting at element offset 0.
// Assumes Eigen::half and cl::sycl::cl_half share the same 16-bit layout,
// so the reinterpret_cast is a pure bit-level reinterpretation — TODO confirm.
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
pload<cl::sycl::cl_half8>(
const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
auto ptr = cl::sycl::address_space_cast<
cl::sycl::access::address_space::generic_space,
cl::sycl::access::decorated::no>(
reinterpret_cast<const cl::sycl::cl_half*>(from));
// Zero-initialize, then overwrite every lane from memory.
cl::sycl::cl_half8 res{};
res.load(0, ptr);
return res;
}
// Unaligned load of 8 Eigen::half values into a cl::sycl::cl_half8 packet.
// Identical in implementation to the aligned pload above: vec::load through
// a generic-space multi_ptr carries no alignment requirement, so one code
// path serves both specializations.
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
ploadu<cl::sycl::cl_half8>(
const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
auto ptr = cl::sycl::address_space_cast<
cl::sycl::access::address_space::generic_space,
cl::sycl::access::decorated::no>(
reinterpret_cast<const cl::sycl::cl_half*>(from));
cl::sycl::cl_half8 res{};
res.load(0, ptr);
return res;
}
#define SYCL_PSTORE(scalar, packet_type, alignment) \ #define SYCL_PSTORE(scalar, packet_type, alignment) \
template <> \ template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
@ -59,9 +84,28 @@ SYCL_PSTORE(float, cl::sycl::cl_float4, )
SYCL_PSTORE(float, cl::sycl::cl_float4, u) SYCL_PSTORE(float, cl::sycl::cl_float4, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, ) SYCL_PSTORE(double, cl::sycl::cl_double2, )
SYCL_PSTORE(double, cl::sycl::cl_double2, u) SYCL_PSTORE(double, cl::sycl::cl_double2, u)
#undef SYCL_PSTORE #undef SYCL_PSTORE
// Unaligned store of a cl_half8 packet to 8 consecutive Eigen::half values.
// Mirrors ploadu: destination pointer is reinterpreted as cl_half and wrapped
// in a generic-space multi_ptr so vec::store can write all 8 lanes at offset 0.
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoreu(
Eigen::half* to, const cl::sycl::cl_half8& from) {
auto ptr = cl::sycl::address_space_cast<
cl::sycl::access::address_space::generic_space,
cl::sycl::access::decorated::no>(
reinterpret_cast<cl::sycl::cl_half*>(to));
from.store(0, ptr);
}
// Aligned store of a cl_half8 packet to 8 consecutive Eigen::half values.
// Same body as pstoreu above — vec::store through a generic-space multi_ptr
// does not distinguish alignment, so both overloads share one implementation.
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore(
Eigen::half* to, const cl::sycl::cl_half8& from) {
auto ptr = cl::sycl::address_space_cast<
cl::sycl::access::address_space::generic_space,
cl::sycl::access::decorated::no>(
reinterpret_cast<cl::sycl::cl_half*>(to));
from.store(0, ptr);
}
#define SYCL_PSET1(packet_type) \ #define SYCL_PSET1(packet_type) \
template <> \ template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>( \ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>( \
@ -70,6 +114,7 @@ SYCL_PSTORE(double, cl::sycl::cl_double2, u)
} }
// global space // global space
SYCL_PSET1(cl::sycl::cl_half8)
SYCL_PSET1(cl::sycl::cl_float4) SYCL_PSET1(cl::sycl::cl_float4)
SYCL_PSET1(cl::sycl::cl_double2) SYCL_PSET1(cl::sycl::cl_double2)
@ -86,6 +131,58 @@ struct get_base_packet {
get_pgather(sycl_multi_pointer, Index) {} get_pgather(sycl_multi_pointer, Index) {}
}; };
// get_base_packet specialization for the 8-lane half packet. Provides the
// strided/duplicating load, gather, scatter and linear-sequence primitives
// that the SYCL_PLOAD_DUP/PGATHER/PSCATTER/plset wrappers below dispatch to.
template <>
struct get_base_packet<cl::sycl::cl_half8> {
// ploaddup: read 4 scalars and duplicate each into adjacent lanes,
// producing {f0,f0,f1,f1,f2,f2,f3,f3}.
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_ploaddup(
sycl_multi_pointer from) {
return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0]),
static_cast<cl::sycl::half>(from[0]),
static_cast<cl::sycl::half>(from[1]),
static_cast<cl::sycl::half>(from[1]),
static_cast<cl::sycl::half>(from[2]),
static_cast<cl::sycl::half>(from[2]),
static_cast<cl::sycl::half>(from[3]),
static_cast<cl::sycl::half>(from[3]));
}
// pgather: read 8 scalars spaced `stride` elements apart into one packet.
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_pgather(
sycl_multi_pointer from, Index stride) {
return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0 * stride]),
static_cast<cl::sycl::half>(from[1 * stride]),
static_cast<cl::sycl::half>(from[2 * stride]),
static_cast<cl::sycl::half>(from[3 * stride]),
static_cast<cl::sycl::half>(from[4 * stride]),
static_cast<cl::sycl::half>(from[5 * stride]),
static_cast<cl::sycl::half>(from[6 * stride]),
static_cast<cl::sycl::half>(from[7 * stride]));
}
// pscatter: write the 8 lanes to offsets 0, stride, 2*stride, ... 7*stride.
// `tmp` accumulates the running offset (tmp starts at stride for lane 1 and
// grows by stride per lane via the `tmp += stride` index expressions).
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(
sycl_multi_pointer to, const cl::sycl::cl_half8& from, Index stride) {
auto tmp = stride;
to[0] = Eigen::half(from.s0());
to[tmp] = Eigen::half(from.s1());
to[tmp += stride] = Eigen::half(from.s2());
to[tmp += stride] = Eigen::half(from.s3());
to[tmp += stride] = Eigen::half(from.s4());
to[tmp += stride] = Eigen::half(from.s5());
to[tmp += stride] = Eigen::half(from.s6());
to[tmp += stride] = Eigen::half(from.s7());
}
// plset: build the linear sequence {a, a+1, ..., a+7}. Note the increments
// a + k are evaluated before the conversion to cl::sycl::half.
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 set_plset(
const cl::sycl::half& a) {
return cl::sycl::cl_half8(static_cast<cl::sycl::half>(a), static_cast<cl::sycl::half>(a + 1),
static_cast<cl::sycl::half>(a + 2),
static_cast<cl::sycl::half>(a + 3),
static_cast<cl::sycl::half>(a + 4),
static_cast<cl::sycl::half>(a + 5),
static_cast<cl::sycl::half>(a + 6),
static_cast<cl::sycl::half>(a + 7));
}
};
template <> template <>
struct get_base_packet<cl::sycl::cl_float4> { struct get_base_packet<cl::sycl::cl_float4> {
template <typename sycl_multi_pointer> template <typename sycl_multi_pointer>
@ -152,6 +249,7 @@ struct get_base_packet<cl::sycl::cl_double2> {
return get_base_packet<packet_type>::get_ploaddup(from); \ return get_base_packet<packet_type>::get_ploaddup(from); \
} }
SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_half8)
SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4) SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4)
SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2) SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
@ -165,9 +263,14 @@ SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
} }
SYCL_PLSET(cl::sycl::cl_float4) SYCL_PLSET(cl::sycl::cl_float4)
SYCL_PLSET(cl::sycl::cl_double2) SYCL_PLSET(cl::sycl::cl_double2)
#undef SYCL_PLSET #undef SYCL_PLSET
// plset for cl_half8: returns {a, a+1, ..., a+7} via set_plset above.
// NOTE(review): the C-style cast reinterprets the Eigen::half reference as a
// cl::sycl::half reference rather than converting the value — this relies on
// the two types sharing an identical 16-bit representation; TODO confirm, and
// prefer a named cast so the intent is explicit.
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 plset<cl::sycl::cl_half8>(
const typename unpacket_traits<cl::sycl::cl_half8>::type& a) {
return get_base_packet<cl::sycl::cl_half8>::set_plset((const cl::sycl::half &) a);
}
#define SYCL_PGATHER_SPECILIZE(scalar, packet_type) \ #define SYCL_PGATHER_SPECILIZE(scalar, packet_type) \
template <> \ template <> \
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
@ -176,9 +279,9 @@ SYCL_PLSET(cl::sycl::cl_double2)
return get_base_packet<packet_type>::get_pgather(from, stride); \ return get_base_packet<packet_type>::get_pgather(from, stride); \
} }
SYCL_PGATHER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4) SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4)
SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2) SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
#undef SYCL_PGATHER_SPECILIZE #undef SYCL_PGATHER_SPECILIZE
#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \ #define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \
@ -189,6 +292,7 @@ SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
get_base_packet<packet_type>::set_pscatter(to, from, stride); \ get_base_packet<packet_type>::set_pscatter(to, from, stride); \
} }
SYCL_PSCATTER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4) SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4)
SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2) SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
@ -201,10 +305,16 @@ SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
return cl::sycl::mad(a, b, c); \ return cl::sycl::mad(a, b, c); \
} }
SYCL_PMAD(cl::sycl::cl_half8)
SYCL_PMAD(cl::sycl::cl_float4) SYCL_PMAD(cl::sycl::cl_float4)
SYCL_PMAD(cl::sycl::cl_double2) SYCL_PMAD(cl::sycl::cl_double2)
#undef SYCL_PMAD #undef SYCL_PMAD
// pfirst: extract lane 0 of the half packet as an Eigen::half.
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half pfirst<cl::sycl::cl_half8>(
const cl::sycl::cl_half8& a) {
return Eigen::half(a.s0());
}
template <> template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>( EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(
const cl::sycl::cl_float4& a) { const cl::sycl::cl_float4& a) {
@ -216,6 +326,13 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst<cl::sycl::cl_double2>(
return a.x(); return a.x();
} }
// predux: horizontal sum of all 8 half lanes. The additions are performed
// left-to-right in half precision before the final Eigen::half conversion.
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux<cl::sycl::cl_half8>(
const cl::sycl::cl_half8& a) {
return Eigen::half(a.s0() + a.s1() + a.s2() + a.s3() + a.s4() + a.s5()
+ a.s6() + a.s7());
}
template <> template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>( EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(
const cl::sycl::cl_float4& a) { const cl::sycl::cl_float4& a) {
@ -228,6 +345,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux<cl::sycl::cl_double2>(
return a.x() + a.y(); return a.x() + a.y();
} }
// predux_max: horizontal maximum of the 8 half lanes, computed as a balanced
// 3-level fmax reduction tree ((0,1)|(2,3)) vs ((4,5)|(6,7)).
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_max<cl::sycl::cl_half8>(
const cl::sycl::cl_half8& a) {
return Eigen::half(cl::sycl::fmax(
cl::sycl::fmax(
cl::sycl::fmax(a.s0(), a.s1()),
cl::sycl::fmax(a.s2(), a.s3())),
cl::sycl::fmax(
cl::sycl::fmax(a.s4(), a.s5()),
cl::sycl::fmax(a.s6(), a.s7()))));
}
template <> template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>( EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(
const cl::sycl::cl_float4& a) { const cl::sycl::cl_float4& a) {
@ -240,6 +368,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max<cl::sycl::cl_double2>(
return cl::sycl::fmax(a.x(), a.y()); return cl::sycl::fmax(a.x(), a.y());
} }
// predux_min: horizontal minimum of the 8 half lanes; same balanced reduction
// tree as predux_max but with fmin.
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_min<cl::sycl::cl_half8>(
const cl::sycl::cl_half8& a) {
return Eigen::half(cl::sycl::fmin(
cl::sycl::fmin(
cl::sycl::fmin(a.s0(), a.s1()),
cl::sycl::fmin(a.s2(), a.s3())),
cl::sycl::fmin(
cl::sycl::fmin(a.s4(), a.s5()),
cl::sycl::fmin(a.s6(), a.s7()))));
}
template <> template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>( EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(
const cl::sycl::cl_float4& a) { const cl::sycl::cl_float4& a) {
@ -252,6 +391,12 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min<cl::sycl::cl_double2>(
return cl::sycl::fmin(a.x(), a.y()); return cl::sycl::fmin(a.x(), a.y());
} }
// predux_mul: horizontal product of the 8 half lanes, multiplied
// left-to-right. NOTE(review): accumulating in half precision can overflow
// or lose precision quickly for 8 factors — presumably acceptable for the
// packet contract; confirm against the float4/double2 counterparts.
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_mul<cl::sycl::cl_half8>(
const cl::sycl::cl_half8& a) {
return Eigen::half(a.s0() * a.s1() * a.s2() * a.s3() * a.s4() * a.s5() *
a.s6() * a.s7());
}
template <> template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>( EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(
const cl::sycl::cl_float4& a) { const cl::sycl::cl_float4& a) {
@ -263,6 +408,14 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul<cl::sycl::cl_double2>(
return a.x() * a.y(); return a.x() * a.y();
} }
// pabs: lane-wise absolute value of the half packet, built by applying the
// scalar cl::sycl::fabs to each of the 8 lanes.
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
pabs<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
return cl::sycl::cl_half8(cl::sycl::fabs(a.s0()), cl::sycl::fabs(a.s1()),
cl::sycl::fabs(a.s2()), cl::sycl::fabs(a.s3()),
cl::sycl::fabs(a.s4()), cl::sycl::fabs(a.s5()),
cl::sycl::fabs(a.s6()), cl::sycl::fabs(a.s7()));
}
template <> template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) { pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
@ -300,6 +453,9 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_eq(const Packet &a,
return sycl_pcmp_##OP<TYPE>(a, b); \ return sycl_pcmp_##OP<TYPE>(a, b); \
} }
SYCL_PCMP(le, cl::sycl::cl_half8)
SYCL_PCMP(lt, cl::sycl::cl_half8)
SYCL_PCMP(eq, cl::sycl::cl_half8)
SYCL_PCMP(le, cl::sycl::cl_float4) SYCL_PCMP(le, cl::sycl::cl_float4)
SYCL_PCMP(lt, cl::sycl::cl_float4) SYCL_PCMP(lt, cl::sycl::cl_float4)
SYCL_PCMP(eq, cl::sycl::cl_float4) SYCL_PCMP(eq, cl::sycl::cl_float4)
@ -308,6 +464,121 @@ SYCL_PCMP(lt, cl::sycl::cl_double2)
SYCL_PCMP(eq, cl::sycl::cl_double2) SYCL_PCMP(eq, cl::sycl::cl_double2)
#undef SYCL_PCMP #undef SYCL_PCMP
// In-place transpose of an 8x8 block of half values held as 8 cl_half8
// packets: lane j of packet i is swapped with lane i of packet j, for every
// pair i < j (28 swaps). Diagonal elements are untouched. The swaps are
// grouped by row i below.
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
PacketBlock<cl::sycl::cl_half8, 8>& kernel) {
// Row 0: swap (0,j) with (j,0) for j = 1..7.
cl::sycl::cl_half tmp = kernel.packet[0].s1();
kernel.packet[0].s1() = kernel.packet[1].s0();
kernel.packet[1].s0() = tmp;
tmp = kernel.packet[0].s2();
kernel.packet[0].s2() = kernel.packet[2].s0();
kernel.packet[2].s0() = tmp;
tmp = kernel.packet[0].s3();
kernel.packet[0].s3() = kernel.packet[3].s0();
kernel.packet[3].s0() = tmp;
tmp = kernel.packet[0].s4();
kernel.packet[0].s4() = kernel.packet[4].s0();
kernel.packet[4].s0() = tmp;
tmp = kernel.packet[0].s5();
kernel.packet[0].s5() = kernel.packet[5].s0();
kernel.packet[5].s0() = tmp;
tmp = kernel.packet[0].s6();
kernel.packet[0].s6() = kernel.packet[6].s0();
kernel.packet[6].s0() = tmp;
tmp = kernel.packet[0].s7();
kernel.packet[0].s7() = kernel.packet[7].s0();
kernel.packet[7].s0() = tmp;
// Row 1: swap (1,j) with (j,1) for j = 2..7.
tmp = kernel.packet[1].s2();
kernel.packet[1].s2() = kernel.packet[2].s1();
kernel.packet[2].s1() = tmp;
tmp = kernel.packet[1].s3();
kernel.packet[1].s3() = kernel.packet[3].s1();
kernel.packet[3].s1() = tmp;
tmp = kernel.packet[1].s4();
kernel.packet[1].s4() = kernel.packet[4].s1();
kernel.packet[4].s1() = tmp;
tmp = kernel.packet[1].s5();
kernel.packet[1].s5() = kernel.packet[5].s1();
kernel.packet[5].s1() = tmp;
tmp = kernel.packet[1].s6();
kernel.packet[1].s6() = kernel.packet[6].s1();
kernel.packet[6].s1() = tmp;
tmp = kernel.packet[1].s7();
kernel.packet[1].s7() = kernel.packet[7].s1();
kernel.packet[7].s1() = tmp;
// Row 2: swap (2,j) with (j,2) for j = 3..7.
tmp = kernel.packet[2].s3();
kernel.packet[2].s3() = kernel.packet[3].s2();
kernel.packet[3].s2() = tmp;
tmp = kernel.packet[2].s4();
kernel.packet[2].s4() = kernel.packet[4].s2();
kernel.packet[4].s2() = tmp;
tmp = kernel.packet[2].s5();
kernel.packet[2].s5() = kernel.packet[5].s2();
kernel.packet[5].s2() = tmp;
tmp = kernel.packet[2].s6();
kernel.packet[2].s6() = kernel.packet[6].s2();
kernel.packet[6].s2() = tmp;
tmp = kernel.packet[2].s7();
kernel.packet[2].s7() = kernel.packet[7].s2();
kernel.packet[7].s2() = tmp;
// Row 3: swap (3,j) with (j,3) for j = 4..7.
tmp = kernel.packet[3].s4();
kernel.packet[3].s4() = kernel.packet[4].s3();
kernel.packet[4].s3() = tmp;
tmp = kernel.packet[3].s5();
kernel.packet[3].s5() = kernel.packet[5].s3();
kernel.packet[5].s3() = tmp;
tmp = kernel.packet[3].s6();
kernel.packet[3].s6() = kernel.packet[6].s3();
kernel.packet[6].s3() = tmp;
tmp = kernel.packet[3].s7();
kernel.packet[3].s7() = kernel.packet[7].s3();
kernel.packet[7].s3() = tmp;
// Row 4: swap (4,j) with (j,4) for j = 5..7.
tmp = kernel.packet[4].s5();
kernel.packet[4].s5() = kernel.packet[5].s4();
kernel.packet[5].s4() = tmp;
tmp = kernel.packet[4].s6();
kernel.packet[4].s6() = kernel.packet[6].s4();
kernel.packet[6].s4() = tmp;
tmp = kernel.packet[4].s7();
kernel.packet[4].s7() = kernel.packet[7].s4();
kernel.packet[7].s4() = tmp;
// Row 5: swap (5,j) with (j,5) for j = 6..7.
tmp = kernel.packet[5].s6();
kernel.packet[5].s6() = kernel.packet[6].s5();
kernel.packet[6].s5() = tmp;
tmp = kernel.packet[5].s7();
kernel.packet[5].s7() = kernel.packet[7].s5();
kernel.packet[7].s5() = tmp;
// Row 6: final swap (6,7) with (7,6).
tmp = kernel.packet[6].s7();
kernel.packet[6].s7() = kernel.packet[7].s6();
kernel.packet[7].s6() = tmp;
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose( EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
PacketBlock<cl::sycl::cl_float4, 4>& kernel) { PacketBlock<cl::sycl::cl_float4, 4>& kernel) {
float tmp = kernel.packet[0].y(); float tmp = kernel.packet[0].y();
@ -342,6 +613,19 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
kernel.packet[1].x() = tmp; kernel.packet[1].x() = tmp;
} }
// pblend: per-lane select between thenPacket and elsePacket driven by the
// Selector flags. cl::sycl::select(a, b, c) picks b's lane where the MSB of
// c's lane is set, a's lane otherwise; the mask is built as 0 when
// select[i] is true and -1 (all bits set) when false, so a true flag yields
// the thenPacket lane. cl_short8 is used because the mask elements must
// match the 16-bit lane width of cl_half8.
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 pblend(
const Selector<unpacket_traits<cl::sycl::cl_half8>::size>& ifPacket,
const cl::sycl::cl_half8& thenPacket,
const cl::sycl::cl_half8& elsePacket) {
cl::sycl::cl_short8 condition(
ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1,
ifPacket.select[2] ? 0 : -1, ifPacket.select[3] ? 0 : -1,
ifPacket.select[4] ? 0 : -1, ifPacket.select[5] ? 0 : -1,
ifPacket.select[6] ? 0 : -1, ifPacket.select[7] ? 0 : -1);
return cl::sycl::select(thenPacket, elsePacket, condition);
}
template <> template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pblend( EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pblend(
const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket, const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket,

View File

@ -1118,7 +1118,7 @@ struct GeneralVectorTensor {
: globalContractDimOffset + privateOffsetC) : globalContractDimOffset + privateOffsetC)
: OutScalar(0); : OutScalar(0);
outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]); outScalar[j] = ::Eigen::internal::pmadd(matScalar, vecScalar, outScalar[j]);
privateOffsetNC += Properties::LocalThreadSizeNC; privateOffsetNC += Properties::LocalThreadSizeNC;
} }
privateOffsetC += Properties::LocalThreadSizeC; privateOffsetC += Properties::LocalThreadSizeC;
@ -1263,7 +1263,7 @@ struct GeneralScalarContraction {
StorageIndex localid = itemID.get_local_id(0); StorageIndex localid = itemID.get_local_id(0);
OutScalar accumulator = OutScalar(0); OutScalar accumulator = OutScalar(0);
for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) { for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator); accumulator = Eigen::internal::pmadd(lhs(0, i), rhs(i, 0), accumulator);
} }
auto out_scratch_ptr = scratch_ptr + localid; auto out_scratch_ptr = scratch_ptr + localid;
*out_scratch_ptr = accumulator; *out_scratch_ptr = accumulator;

View File

@ -151,6 +151,7 @@ if(EIGEN_TEST_SYCL)
ei_add_test(cxx11_tensor_argmax_sycl) ei_add_test(cxx11_tensor_argmax_sycl)
ei_add_test(cxx11_tensor_custom_op_sycl) ei_add_test(cxx11_tensor_custom_op_sycl)
ei_add_test(cxx11_tensor_scan_sycl) ei_add_test(cxx11_tensor_scan_sycl)
ei_add_test(cxx11_tensor_of_float16_sycl)
set(EIGEN_SYCL OFF) set(EIGEN_SYCL OFF)
endif() endif()

View File

@ -32,9 +32,9 @@ static void test_sycl_simple_argmax(const Eigen::SyclDevice& sycl_device) {
Tensor<DenseIndex, 0, Layout, DenseIndex> out_max; Tensor<DenseIndex, 0, Layout, DenseIndex> out_max;
Tensor<DenseIndex, 0, Layout, DenseIndex> out_min; Tensor<DenseIndex, 0, Layout, DenseIndex> out_min;
in.setRandom(); in.setRandom();
in *= in.constant(100.0); in *= in.constant(static_cast<DataType>(100.0));
in(0, 0, 0) = -1000.0; in(0, 0, 0) = static_cast<DataType>(-1000.0);
in(1, 1, 1) = 1000.0; in(1, 1, 1) = static_cast<DataType>(1000.0);
std::size_t in_bytes = in.size() * sizeof(DataType); std::size_t in_bytes = in.size() * sizeof(DataType);
std::size_t out_bytes = out_max.size() * sizeof(DenseIndex); std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
@ -93,7 +93,7 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) {
ix[3] = l; ix[3] = l;
// suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l)
// = 10.0 // = 10.0
tensor(ix) = (ix[dim] != 0) ? -1.0 : 10.0; tensor(ix) = static_cast<DataType>((ix[dim] != 0) ? -1.0 : 10.0);
} }
} }
} }
@ -132,7 +132,7 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) {
ix[2] = k; ix[2] = k;
ix[3] = l; ix[3] = l;
// suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0; tensor(ix) = static_cast<DataType>((ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0);
} }
} }
} }
@ -180,7 +180,7 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) {
ix[2] = k; ix[2] = k;
ix[3] = l; ix[3] = l;
// suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0 // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
tensor(ix) = (ix[dim] != 0) ? 1.0 : -10.0; tensor(ix) = static_cast<DataType>((ix[dim] != 0) ? 1.0 : -10.0);
} }
} }
} }
@ -219,7 +219,7 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) {
ix[2] = k; ix[2] = k;
ix[3] = l; ix[3] = l;
// suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0 // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0; tensor(ix) = static_cast<DataType>((ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0);
} }
} }
} }
@ -252,6 +252,7 @@ void sycl_argmax_test_per_device(const Device_Selector& d) {
EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) { EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) {
for (const auto& device : Eigen::get_sycl_supported_devices()) { for (const auto& device : Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_argmax_test_per_device<half>(device));
CALL_SUBTEST(sycl_argmax_test_per_device<float>(device)); CALL_SUBTEST(sycl_argmax_test_per_device<float>(device));
} }
} }

View File

@ -139,6 +139,7 @@ template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl::
EIGEN_DECLARE_TEST(cxx11_tensor_broadcast_sycl) { EIGEN_DECLARE_TEST(cxx11_tensor_broadcast_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) { for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_broadcast_test_per_device<half>(device));
CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device)); CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device));
} }
} }

View File

@ -28,26 +28,135 @@ using Eigen::TensorMap;
// Functions used to compare the TensorMap implementation on the device with // Functions used to compare the TensorMap implementation on the device with
// the equivalent on the host // the equivalent on the host
namespace SYCL { namespace SYCL {
template <typename T> T abs(T x) {
template <typename T> T abs(T x) {
return cl::sycl::abs(x); return cl::sycl::abs(x);
} }
// Half specialization: route through the float-family fabs (the generic
// template uses integer cl::sycl::abs), converting via cl::sycl::half.
template <> Eigen::half abs(Eigen::half x) {
return Eigen::half(cl::sycl::fabs(static_cast<cl::sycl::half>(x)));
}
template <> float abs(float x) { template <> float abs(float x) {
return cl::sycl::fabs(x); return cl::sycl::fabs(x);
} }
template <> double abs(double x) { template <> double abs(double x) {
return cl::sycl::fabs(x); return cl::sycl::fabs(x);
} }
template <typename T> T square(T x) { return x * x; } template <typename T> T square(T x) { return x * x; }
template <typename T> T cube(T x) { return x * x * x; } template <typename T> T cube(T x) { return x * x * x; }
template <typename T> T inverse(T x) { return T(1) / x; } template <typename T> T inverse(T x) { return T(1) / x; }
template <typename T> T cwiseMax(T x, T y) { template <typename T> T cwiseMax(T x, T y) {
return cl::sycl::max(x, y); return cl::sycl::max(x, y);
} }
template <typename T> T cwiseMin(T x, T y) { template <> Eigen::half cwiseMax(Eigen::half x, Eigen::half y) {
return cl::sycl::min(x, y); return Eigen::half(cl::sycl::max(static_cast<cl::sycl::half>(x), static_cast<cl::sycl::half>(y)));
}
template <typename T> T cwiseMin(T x, T y) {
return cl::sycl::min(x, y);
}
template <> Eigen::half cwiseMin(Eigen::half x, Eigen::half y) {
return Eigen::half(cl::sycl::min(static_cast<cl::sycl::half>(x), static_cast<cl::sycl::half>(y)));
}
template <typename T> T sqrt(T x) {
return cl::sycl::sqrt(x);
}
template <> Eigen::half sqrt(Eigen::half x) {
return Eigen::half(cl::sycl::sqrt(static_cast<cl::sycl::half>(x)));
}
template <typename T> T rsqrt(T x) {
return cl::sycl::rsqrt(x);
}
template <> Eigen::half rsqrt(Eigen::half x) {
return Eigen::half(cl::sycl::rsqrt(static_cast<cl::sycl::half>(x)));
}
template <typename T> T tanh(T x) {
return cl::sycl::tanh(x);
}
template <> Eigen::half tanh(Eigen::half x) {
return Eigen::half(cl::sycl::tanh(static_cast<cl::sycl::half>(x)));
}
template <typename T> T exp(T x) {
return cl::sycl::exp(x);
}
template <> Eigen::half exp(Eigen::half x) {
return Eigen::half(cl::sycl::exp(static_cast<cl::sycl::half>(x)));
}
template <typename T> T expm1(T x) {
return cl::sycl::expm1(x);
}
template <> Eigen::half expm1(Eigen::half x) {
return Eigen::half(cl::sycl::expm1(static_cast<cl::sycl::half>(x)));
}
template <typename T> T log(T x) {
return cl::sycl::log(x);
}
template <> Eigen::half log(Eigen::half x) {
return Eigen::half(cl::sycl::log(static_cast<cl::sycl::half>(x)));
}
template <typename T> T ceil(T x) {
return cl::sycl::ceil(x);
}
template <> Eigen::half ceil(Eigen::half x) {
return Eigen::half(cl::sycl::ceil(static_cast<cl::sycl::half>(x)));
}
template <typename T> T floor(T x) {
return cl::sycl::floor(x);
}
template <> Eigen::half floor(Eigen::half x) {
return Eigen::half(cl::sycl::floor(static_cast<cl::sycl::half>(x)));
}
template <typename T> T round(T x) {
return cl::sycl::round(x);
}
template <> Eigen::half round(Eigen::half x) {
return Eigen::half(cl::sycl::round(static_cast<cl::sycl::half>(x)));
}
template <typename T> T log1p(T x) {
return cl::sycl::log1p(x);
}
template <> Eigen::half log1p(Eigen::half x) {
return Eigen::half(cl::sycl::log1p(static_cast<cl::sycl::half>(x)));
}
template <typename T> T sign(T x) {
return cl::sycl::sign(x);
}
template <> Eigen::half sign(Eigen::half x) {
return Eigen::half(cl::sycl::sign(static_cast<cl::sycl::half>(x)));
}
template <typename T> T isnan(T x) {
return cl::sycl::isnan(x);
}
template <> Eigen::half isnan(Eigen::half x) {
return Eigen::half(cl::sycl::isnan(static_cast<cl::sycl::half>(x)));
}
template <typename T> T isfinite(T x) {
return cl::sycl::isfinite(x);
}
template <> Eigen::half isfinite(Eigen::half x) {
return Eigen::half(cl::sycl::isfinite(static_cast<cl::sycl::half>(x)));
}
template <typename T> T isinf(T x) {
return cl::sycl::isinf(x);
}
template <> Eigen::half isinf(Eigen::half x) {
return Eigen::half(cl::sycl::isinf(static_cast<cl::sycl::half>(x)));
} }
} }
@ -157,15 +266,14 @@ void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
#define DECLARE_UNARY_STRUCT(FUNC) \ #define DECLARE_UNARY_STRUCT(FUNC) \
struct op_##FUNC { \ struct op_##FUNC { \
template <typename T> \ template <typename T> \
auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) { \ auto operator()(const T& x) -> decltype(SYCL::FUNC(x)) { \
return cl::sycl::FUNC(x); \ return SYCL::FUNC(x); \
} \ } \
template <typename T> \ template <typename T> \
auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) { \ auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) { \
return x.FUNC(); \ return x.FUNC(); \
} \ } \
}; };
DECLARE_UNARY_STRUCT(sqrt) DECLARE_UNARY_STRUCT(sqrt)
DECLARE_UNARY_STRUCT(rsqrt) DECLARE_UNARY_STRUCT(rsqrt)
@ -390,8 +498,10 @@ EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) { for (const auto& device :Eigen::get_sycl_supported_devices()) {
QueueInterface queueInterface(device); QueueInterface queueInterface(device);
Eigen::SyclDevice sycl_device(&queueInterface); Eigen::SyclDevice sycl_device(&queueInterface);
CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device)); CALL_SUBTEST_1(test_builtin_unary_sycl<half>(sycl_device));
CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device)); CALL_SUBTEST_2(test_floating_builtin_binary_sycl<half>(sycl_device));
CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device)); CALL_SUBTEST_3(test_builtin_unary_sycl<float>(sycl_device));
CALL_SUBTEST_4(test_floating_builtin_binary_sycl<float>(sycl_device));
CALL_SUBTEST_5(test_integer_builtin_binary_sycl<int>(sycl_device));
} }
} }

View File

@ -619,5 +619,6 @@ EIGEN_DECLARE_TEST(cxx11_tensor_chipping_sycl)
{ {
for (const auto& device :Eigen::get_sycl_supported_devices()) { for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_chipping_test_per_device<float>(device)); CALL_SUBTEST(sycl_chipping_test_per_device<float>(device));
CALL_SUBTEST(sycl_chipping_test_per_device<half>(device));
} }
} }

View File

@ -175,6 +175,7 @@ template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(
} }
EIGEN_DECLARE_TEST(cxx11_tensor_concatenation_sycl) { EIGEN_DECLARE_TEST(cxx11_tensor_concatenation_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) { for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(tensorConcat_perDevice<half>(device));
CALL_SUBTEST(tensorConcat_perDevice<float>(device)); CALL_SUBTEST(tensorConcat_perDevice<float>(device));
} }
} }

View File

@ -82,6 +82,7 @@ template<typename DataType> void sycl_device_test_per_device(const cl::sycl::dev
EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) { EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) { for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_device_test_per_device<half>(device));
CALL_SUBTEST(sycl_device_test_per_device<float>(device)); CALL_SUBTEST(sycl_device_test_per_device<float>(device));
CALL_SUBTEST(sycl_device_test_per_device<OffByOneScalar<int>>(device)); CALL_SUBTEST(sycl_device_test_per_device<OffByOneScalar<int>>(device));
} }

View File

@ -95,6 +95,7 @@ template<typename DataType, typename dev_Selector> void sycl_computing_test_per_
EIGEN_DECLARE_TEST(cxx11_tensor_image_op_sycl) { EIGEN_DECLARE_TEST(cxx11_tensor_image_op_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) { for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_computing_test_per_device<half>(device));
CALL_SUBTEST(sycl_computing_test_per_device<float>(device)); CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
#ifdef EIGEN_SYCL_DOUBLE_SUPPORT #ifdef EIGEN_SYCL_DOUBLE_SUPPORT
CALL_SUBTEST(sycl_computing_test_per_device<double>(device)); CALL_SUBTEST(sycl_computing_test_per_device<double>(device));

View File

@ -131,6 +131,7 @@ template<typename DataType, typename dev_Selector> void sycl_inflation_test_per_
EIGEN_DECLARE_TEST(cxx11_tensor_inflation_sycl) EIGEN_DECLARE_TEST(cxx11_tensor_inflation_sycl)
{ {
for (const auto& device :Eigen::get_sycl_supported_devices()) { for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_inflation_test_per_device<half>(device));
CALL_SUBTEST(sycl_inflation_test_per_device<float>(device)); CALL_SUBTEST(sycl_inflation_test_per_device<float>(device));
} }
} }

View File

@ -121,6 +121,7 @@ template<typename DataType, typename dev_Selector> void sycl_tensor_layout_swap_
EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap_sycl) EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap_sycl)
{ {
for (const auto& device :Eigen::get_sycl_supported_devices()) { for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<half>(device));
CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<float>(device)); CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<float>(device));
} }
} }

View File

@ -100,6 +100,7 @@ template<typename DataType, typename dev_Selector> void sycl_computing_test_per_
EIGEN_DECLARE_TEST(cxx11_tensor_math_sycl) { EIGEN_DECLARE_TEST(cxx11_tensor_math_sycl) {
for (const auto& device :Eigen::get_sycl_supported_devices()) { for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_computing_test_per_device<half>(device));
CALL_SUBTEST(sycl_computing_test_per_device<float>(device)); CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
} }
} }

View File

@ -381,6 +381,7 @@ template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_d
EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl) EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl)
{ {
for (const auto& device :Eigen::get_sycl_supported_devices()) { for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_morphing_test_per_device<half>(device));
CALL_SUBTEST(sycl_morphing_test_per_device<float>(device)); CALL_SUBTEST(sycl_morphing_test_per_device<float>(device));
} }
} }

View File

@ -0,0 +1,406 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2023
// Alejandro Acosta Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
// Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_SYCL
#define EIGEN_SYCL_HALF_SUPPORT
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
using Eigen::SyclDevice;
// Verify that numext-style isnan gives identical answers when evaluated in
// half precision on the device and in full (float) precision.
void test_gpu_numext(const Eigen::SyclDevice &sycl_device) {
  int num_elem = 101;
  float* d_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
  bool* d_res_half = static_cast<bool*>(sycl_device.allocate(num_elem * sizeof(bool)));
  bool* d_res_float = static_cast<bool*>(sycl_device.allocate(num_elem * sizeof(bool)));

  Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_float(d_float, num_elem);
  Eigen::TensorMap<Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(d_res_half, num_elem);
  Eigen::TensorMap<Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(d_res_float, num_elem);

  // Inputs centred around zero; none of them should be NaN.
  gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
  gpu_res_float.device(sycl_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());

  Tensor<bool, 1> half_prec(num_elem);
  Tensor<bool, 1> full_prec(num_elem);
  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem * sizeof(bool));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(bool));
  // Wait for the device->host copies before reading the results (the sibling
  // tests in this file synchronize after their copies as well).
  sycl_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking numext " << i << std::endl;
    VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
  }

  // Release device buffers so repeated subtests do not leak device memory.
  sycl_device.deallocate(d_float);
  sycl_device.deallocate(d_res_half);
  sycl_device.deallocate(d_res_float);
}
// Round-trip a random float tensor through half precision on the device and
// check that float -> half -> float is approximately the identity.
void test_gpu_conversion(const Eigen::SyclDevice &sycl_device) {
  int num_elem = 101;
  float* d_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
  Eigen::half* d_half = static_cast<Eigen::half*>(sycl_device.allocate(num_elem * sizeof(Eigen::half)));
  float* d_conv = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
      d_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
      d_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
      d_conv, num_elem);

  gpu_float.device(sycl_device) = gpu_float.random();
  gpu_half.device(sycl_device) = gpu_float.cast<Eigen::half>();
  gpu_conv.device(sycl_device) = gpu_half.cast<float>();

  Tensor<float, 1> initial(num_elem);
  Tensor<float, 1> final(num_elem);
  sycl_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
  sycl_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
  // Ensure the copies have completed before comparing on the host.
  sycl_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    VERIFY_IS_APPROX(initial(i), final(i));
  }

  // Release device buffers so repeated subtests do not leak device memory.
  sycl_device.deallocate(d_float);
  sycl_device.deallocate(d_half);
  sycl_device.deallocate(d_conv);
}
// Compare a unary op (abs) computed in half precision on the device against
// the same op computed directly in float.
void test_gpu_unary(const Eigen::SyclDevice &sycl_device) {
  int num_elem = 101;
  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_res_half = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_res_float = (float*)sycl_device.allocate(num_elem * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
      d_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
      d_res_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
      d_res_float, num_elem);

  // Inputs in [-0.5, 0.5] so abs() exercises both signs.
  gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
  gpu_res_float.device(sycl_device) = gpu_float.abs();
  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();

  Tensor<float, 1> half_prec(num_elem);
  Tensor<float, 1> full_prec(num_elem);
  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
  sycl_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking unary " << i << std::endl;
    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
  }

  // Release device buffers so repeated subtests do not leak device memory.
  sycl_device.deallocate(d_float);
  sycl_device.deallocate(d_res_half);
  sycl_device.deallocate(d_res_float);
}
// Compare an element-wise expression (a + b) * a computed in half precision
// against the float result, comparing at half precision on the host.
void test_gpu_elementwise(const Eigen::SyclDevice &sycl_device) {
  int num_elem = 101;
  float* d_float1 = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
  float* d_float2 = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
  float* d_res_half = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
  float* d_res_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));

  Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
  Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
  Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_res_half(d_res_half, num_elem);
  Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_res_float(d_res_float, num_elem);

  gpu_float1.device(sycl_device) = gpu_float1.random();
  gpu_float2.device(sycl_device) = gpu_float2.random();
  gpu_res_float.device(sycl_device) = (gpu_float1 + gpu_float2) * gpu_float1;
  gpu_res_half.device(sycl_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();

  Tensor<float, 1> half_prec(num_elem);
  Tensor<float, 1> full_prec(num_elem);
  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem * sizeof(float));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(float));
  // Ensure the copies have completed before comparing on the host.
  sycl_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
    // Compare at half precision: the float path is exact, the half path is not.
    VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
  }

  // Release device buffers so repeated subtests do not leak device memory.
  sycl_device.deallocate(d_float1);
  sycl_device.deallocate(d_float2);
  sycl_device.deallocate(d_res_half);
  sycl_device.deallocate(d_res_float);
}
// Compare transcendental ops (exp, log, log1p, expm1) evaluated in half
// precision on the device against the float result cast to half.
void test_gpu_trancendental(const Eigen::SyclDevice &sycl_device) {
  int num_elem = 101;
  float* d_float1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_float2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_float3 = (float*)sycl_device.allocate(num_elem * sizeof(float));

  Eigen::half* d_res1_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res1_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res2_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res2_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res3_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res3_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  // Distinct buffers for the expm1 results: aliasing them onto d_res3_*
  // would overwrite the log1p results before they can be verified.
  Eigen::half* d_res4_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res4_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res4_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res4_float, num_elem);

  // float1: centred around 0 for exp; float2: shifted up by 0.5f so log sees
  // positive inputs; float3: raw random() output for log1p/expm1.
  gpu_float1.device(sycl_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
  gpu_float2.device(sycl_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
  gpu_float3.device(sycl_device) = gpu_float3.random();

  // Reference results: computed in float, then cast down to half.
  gpu_res1_float.device(sycl_device) = gpu_float1.exp().cast<Eigen::half>();
  gpu_res2_float.device(sycl_device) = gpu_float2.log().cast<Eigen::half>();
  gpu_res3_float.device(sycl_device) = gpu_float3.log1p().cast<Eigen::half>();
  gpu_res4_float.device(sycl_device) = gpu_float3.expm1().cast<Eigen::half>();

  // Device results: cast to half first, then apply the op in half precision.
  gpu_res1_half.device(sycl_device) = gpu_float1.cast<Eigen::half>();
  gpu_res1_half.device(sycl_device) = gpu_res1_half.exp();

  gpu_res2_half.device(sycl_device) = gpu_float2.cast<Eigen::half>();
  gpu_res2_half.device(sycl_device) = gpu_res2_half.log();

  gpu_res3_half.device(sycl_device) = gpu_float3.cast<Eigen::half>();
  gpu_res3_half.device(sycl_device) = gpu_res3_half.log1p();

  gpu_res4_half.device(sycl_device) = gpu_float3.cast<Eigen::half>();
  gpu_res4_half.device(sycl_device) = gpu_res4_half.expm1();

  Tensor<float, 1> input1(num_elem);
  Tensor<Eigen::half, 1> half_prec1(num_elem);
  Tensor<Eigen::half, 1> full_prec1(num_elem);
  Tensor<float, 1> input2(num_elem);
  Tensor<Eigen::half, 1> half_prec2(num_elem);
  Tensor<Eigen::half, 1> full_prec2(num_elem);
  Tensor<float, 1> input3(num_elem);
  Tensor<Eigen::half, 1> half_prec3(num_elem);
  Tensor<Eigen::half, 1> full_prec3(num_elem);
  Tensor<Eigen::half, 1> half_prec4(num_elem);
  Tensor<Eigen::half, 1> full_prec4(num_elem);
  sycl_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
  sycl_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
  sycl_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
  sycl_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(half_prec4.data(), d_res4_half, num_elem*sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec4.data(), d_res4_float, num_elem*sizeof(Eigen::half));
  sycl_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
    VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
  }
  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
    if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy nearby 1
      VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
    else
      VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
  }
  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
    VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
  }
  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise expm1 " << i << " input = " << input3(i) << " full = " << full_prec4(i) << " half = " << half_prec4(i) << std::endl;
    VERIFY_IS_APPROX(full_prec4(i), half_prec4(i));
  }

  // Release device buffers so repeated subtests do not leak device memory.
  sycl_device.deallocate(d_float1);
  sycl_device.deallocate(d_float2);
  sycl_device.deallocate(d_float3);
  sycl_device.deallocate(d_res1_half);
  sycl_device.deallocate(d_res1_float);
  sycl_device.deallocate(d_res2_half);
  sycl_device.deallocate(d_res2_float);
  sycl_device.deallocate(d_res3_half);
  sycl_device.deallocate(d_res3_float);
  sycl_device.deallocate(d_res4_half);
  sycl_device.deallocate(d_res4_float);
}
// Compare a small 23x23 contraction computed entirely in half precision with
// the float contraction cast to half. Half accumulation loses precision, so
// values are only checked when they differ by more than an absolute 1e-2.
void test_gpu_contractions(const Eigen::SyclDevice &sycl_device) {
  int rows = 23;
  int cols = 23;
  int num_elem = rows*cols;

  float* d_float1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_float2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
      d_float1, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
      d_float2, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
      d_res_half, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
      d_res_float, rows, cols);

  gpu_float1.device(sycl_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
  gpu_float2.device(sycl_device) = gpu_float2.random() - gpu_float2.constant(0.5f);

  typedef typename Tensor<float, 2>::DimensionPair DimPair;
  // NOTE(review): dims is default-constructed, which contracts dimension 0
  // with dimension 0 — confirm this is the intended contraction pair.
  Eigen::array<DimPair, 1> dims;
  gpu_res_float.device(sycl_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
  gpu_res_half.device(sycl_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);

  Tensor<Eigen::half, 2> half_prec(rows, cols);
  Tensor<Eigen::half, 2> full_prec(rows, cols);
  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
  sycl_device.synchronize();

  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      // Separator added so the two printed values do not run together.
      std::cout << "Checking contract " << i << " " << j << " " << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
      if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
        VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
      }
    }
  }

  // Release device buffers so repeated subtests do not leak device memory.
  sycl_device.deallocate(d_float1);
  sycl_device.deallocate(d_float2);
  sycl_device.deallocate(d_res_half);
  sycl_device.deallocate(d_res_float);
}
// Compare a partial sum-reduction of a size1 x size2 tensor along dimension
// `redux`, computed in half precision, against the float reduction cast to
// half.
void test_gpu_reductions(const Eigen::SyclDevice &sycl_device, int size1, int size2, int redux) {
  std::cout << "Reducing " << size1 << " by " << size2
            << " tensor along dim " << redux << std::endl;

  int num_elem = size1*size2;
  // Reducing dim 1 leaves size1 results; reducing dim 0 leaves size2.
  int result_size = (redux == 1 ? size1 : size2);

  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(result_size * sizeof(Eigen::half));
  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(result_size * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
      d_float, size1, size2);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
      d_res_half, result_size);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
      d_res_float, result_size);

  gpu_float.device(sycl_device) = gpu_float.random() * 2.0f;

  Eigen::array<int, 1> redux_dim = {redux};
  gpu_res_float.device(sycl_device) = gpu_float.sum(redux_dim).cast<Eigen::half>();
  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().sum(redux_dim);

  Tensor<Eigen::half, 1> half_prec(result_size);
  Tensor<Eigen::half, 1> full_prec(result_size);
  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half));
  sycl_device.synchronize();

  for (int i = 0; i < result_size; ++i) {
    std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
  }

  // Release device buffers so repeated subtests do not leak device memory.
  sycl_device.deallocate(d_float);
  sycl_device.deallocate(d_res_half);
  sycl_device.deallocate(d_res_float);
}
// Driver: exercise the partial-reduction comparison over a few tensor
// shapes, reducing along each of the two dimensions in turn.
void test_gpu_reductions(const Eigen::SyclDevice &sycl_device) {
  const int shapes[][2] = {{13, 13}, {35, 36}, {36, 35}};
  for (const auto& shape : shapes) {
    for (int redux = 0; redux < 2; ++redux) {
      test_gpu_reductions(sycl_device, shape[0], shape[1], redux);
    }
  }
}
// Compare full reductions (sum and maximum) of a 13x13 tensor computed in
// half precision against the float reduction cast to half. Both reductions
// reuse the same scalar result buffers.
void test_gpu_full_reductions(const Eigen::SyclDevice &sycl_device) {
  int size = 13;
  int num_elem = size*size;

  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(1 * sizeof(Eigen::half));
  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(1 * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
      d_float, size, size);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
      d_res_half);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
      d_res_float);

  gpu_float.device(sycl_device) = gpu_float.random();

  // Full sum reduction.
  gpu_res_float.device(sycl_device) = gpu_float.sum().cast<Eigen::half>();
  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().sum();

  Tensor<Eigen::half, 0> half_prec;
  Tensor<Eigen::half, 0> full_prec;
  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
  sycl_device.synchronize();
  VERIFY_IS_APPROX(full_prec(), half_prec());

  // Full max reduction, reusing the same result buffers.
  gpu_res_float.device(sycl_device) = gpu_float.maximum().cast<Eigen::half>();
  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().maximum();
  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
  sycl_device.synchronize();
  VERIFY_IS_APPROX(full_prec(), half_prec());

  // Release device buffers so repeated subtests do not leak device memory.
  sycl_device.deallocate(d_float);
  sycl_device.deallocate(d_res_half);
  sycl_device.deallocate(d_res_float);
}
// Check that forced evaluation (.eval(), and .eval() after a no-op
// broadcast) of half-precision expressions matches the float result.
void test_gpu_forced_evals(const Eigen::SyclDevice &sycl_device) {
  int num_elem = 101;
  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_res_half1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_res_half2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_res_float = (float*)sycl_device.allocate(num_elem * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
      d_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
      d_res_half1, num_elem);
  // The broadcast path is intentionally written through an Unaligned map.
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
      d_res_half2, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
      d_res_float, num_elem);

  // Broadcast by a factor of 1 along the only dimension: a no-op that still
  // inserts a broadcast node into the expression tree.
  Eigen::array<int, 1> no_bcast;
  no_bcast[0] = 1;

  gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
  gpu_res_float.device(sycl_device) = gpu_float.abs();
  gpu_res_half1.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
  gpu_res_half2.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();

  Tensor<float, 1> half_prec1(num_elem);
  Tensor<float, 1> half_prec2(num_elem);
  Tensor<float, 1> full_prec(num_elem);
  sycl_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
  sycl_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
  sycl_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    // Separator added so the index and first value do not run together.
    std::cout << "Checking forced eval " << i << " " << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
    VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
    VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
  }

  // Release device buffers so repeated subtests do not leak device memory.
  sycl_device.deallocate(d_float);
  sycl_device.deallocate(d_res_half1);
  sycl_device.deallocate(d_res_half2);
  sycl_device.deallocate(d_res_float);
}
// Register the half-precision SYCL subtests for every supported device.
EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_sycl)
{
  for (const auto& device : Eigen::get_sycl_supported_devices()) {
    QueueInterface queueInterface(device);
    Eigen::SyclDevice sycl_device(&queueInterface);
    // The lightweight element-wise checks share subtest 1; the heavier
    // contraction/reduction kernels each get their own subtest number.
    CALL_SUBTEST_1(test_gpu_numext(sycl_device));
    CALL_SUBTEST_1(test_gpu_conversion(sycl_device));
    CALL_SUBTEST_1(test_gpu_unary(sycl_device));
    CALL_SUBTEST_1(test_gpu_elementwise(sycl_device));
    CALL_SUBTEST_1(test_gpu_trancendental(sycl_device));
    CALL_SUBTEST_2(test_gpu_contractions(sycl_device));
    CALL_SUBTEST_3(test_gpu_reductions(sycl_device));
    CALL_SUBTEST_4(test_gpu_full_reductions(sycl_device));
    CALL_SUBTEST_5(test_gpu_forced_evals(sycl_device));
  }
}

View File

@ -152,6 +152,7 @@ template<typename DataType, typename dev_Selector> void sycl_padding_test_per_de
EIGEN_DECLARE_TEST(cxx11_tensor_padding_sycl) EIGEN_DECLARE_TEST(cxx11_tensor_padding_sycl)
{ {
for (const auto& device :Eigen::get_sycl_supported_devices()) { for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_padding_test_per_device<half>(device));
CALL_SUBTEST(sycl_padding_test_per_device<float>(device)); CALL_SUBTEST(sycl_padding_test_per_device<float>(device));
} }
} }

View File

@ -244,6 +244,7 @@ template<typename DataType, typename dev_Selector> void sycl_tensor_patch_test_p
EIGEN_DECLARE_TEST(cxx11_tensor_patch_sycl) EIGEN_DECLARE_TEST(cxx11_tensor_patch_sycl)
{ {
for (const auto& device :Eigen::get_sycl_supported_devices()) { for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_tensor_patch_test_per_device<half>(device));
CALL_SUBTEST(sycl_tensor_patch_test_per_device<float>(device)); CALL_SUBTEST(sycl_tensor_patch_test_per_device<float>(device));
} }
} }

View File

@ -78,6 +78,7 @@ template<typename DataType, typename dev_Selector> void sycl_random_test_per_dev
EIGEN_DECLARE_TEST(cxx11_tensor_random_sycl) EIGEN_DECLARE_TEST(cxx11_tensor_random_sycl)
{ {
for (const auto& device :Eigen::get_sycl_supported_devices()) { for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_random_test_per_device<half>(device));
CALL_SUBTEST(sycl_random_test_per_device<float>(device)); CALL_SUBTEST(sycl_random_test_per_device<float>(device));
#ifdef EIGEN_SYCL_DOUBLE_SUPPORT #ifdef EIGEN_SYCL_DOUBLE_SUPPORT
CALL_SUBTEST(sycl_random_test_per_device<double>(device)); CALL_SUBTEST(sycl_random_test_per_device<double>(device));

View File

@ -248,6 +248,7 @@ EIGEN_DECLARE_TEST(cxx11_tensor_reverse_sycl) {
#ifdef EIGEN_SYCL_DOUBLE_SUPPORT #ifdef EIGEN_SYCL_DOUBLE_SUPPORT
CALL_SUBTEST_4(sycl_reverse_test_per_device<double>(device)); CALL_SUBTEST_4(sycl_reverse_test_per_device<double>(device));
#endif #endif
CALL_SUBTEST_5(sycl_reverse_test_per_device<float>(device)); CALL_SUBTEST_5(sycl_reverse_test_per_device<half>(device));
CALL_SUBTEST_6(sycl_reverse_test_per_device<float>(device));
} }
} }

View File

@ -112,6 +112,7 @@ void sycl_shuffling_test_per_device(dev_Selector s) {
} }
EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) { EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) {
for (const auto& device : Eigen::get_sycl_supported_devices()) { for (const auto& device : Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_shuffling_test_per_device<half>(device));
CALL_SUBTEST(sycl_shuffling_test_per_device<float>(device)); CALL_SUBTEST(sycl_shuffling_test_per_device<float>(device));
} }
} }

View File

@ -217,6 +217,7 @@ test_entire_volume_patch_sycl<DataType, int64_t>(sycl_device);
EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch_sycl) EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch_sycl)
{ {
for (const auto& device :Eigen::get_sycl_supported_devices()) { for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<half>(device));
CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<float>(device)); CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<float>(device));
} }
} }