Mirror of https://gitlab.com/libeigen/eigen.git
[SYCL-2020] Enabling half precision support for SYCL.
parent 92a77a596b
commit ba47341a14
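This commit wires Eigen::half into the SYCL backend by mapping it onto the cl::sycl::cl_half8 packet type, implementing the packet primitives for it, and enabling the half path in the SYCL tensor tests. As a rough user-level illustration (not part of the patch itself), the sketch below mirrors the new cxx11_tensor_of_float16_sycl.cpp test: a tensor expression is evaluated in half precision on a SYCL device and read back as float. The helper name run_half_abs_on_device, and the assumption that a SYCL device/queue is already set up, are illustrative only.

#define EIGEN_USE_SYCL
#define EIGEN_SYCL_HALF_SUPPORT
#include <unsupported/Eigen/CXX11/Tensor>

// Sketch only: evaluate |x| in half precision on the device and read it back
// as float, the same pattern the new test file uses.
void run_half_abs_on_device(const Eigen::SyclDevice& sycl_device) {
  const int num_elem = 101;
  float* d_in = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
  float* d_out = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> in(d_in, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> out(d_out, num_elem);

  in.device(sycl_device) = in.random() - in.constant(0.5f);
  // The cast<Eigen::half>() expression exercises the new cl_half8 packet path.
  out.device(sycl_device) = in.cast<Eigen::half>().abs().cast<float>();
  sycl_device.synchronize();

  sycl_device.deallocate(d_in);
  sycl_device.deallocate(d_out);
}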
@@ -86,6 +86,8 @@ struct sycl_packet_traits : default_packet_traits {
   typedef packet_type half; \
 };
 
+SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, Eigen::half, 8)
+SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, const Eigen::half, 8)
 SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4)
 SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4)
 SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2)
@@ -100,6 +102,7 @@ SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2)
   struct is_arithmetic<packet_type> { \
     enum { value = true }; \
   };
+SYCL_ARITHMETIC(cl::sycl::cl_half8)
 SYCL_ARITHMETIC(cl::sycl::cl_float4)
 SYCL_ARITHMETIC(cl::sycl::cl_double2)
 #undef SYCL_ARITHMETIC
@@ -111,6 +114,7 @@ SYCL_ARITHMETIC(cl::sycl::cl_double2)
   enum { size = lengths, vectorizable = true, alignment = Aligned16 }; \
   typedef packet_type half; \
 };
+SYCL_UNPACKET_TRAITS(cl::sycl::cl_half8, Eigen::half, 8)
 SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)
 SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2)
 
@@ -38,6 +38,7 @@ namespace internal {
     return cl::sycl::log(a); \
   }
 
+SYCL_PLOG(cl::sycl::cl_half8)
 SYCL_PLOG(cl::sycl::cl_float4)
 SYCL_PLOG(cl::sycl::cl_double2)
 #undef SYCL_PLOG
@@ -49,6 +50,7 @@ SYCL_PLOG(cl::sycl::cl_double2)
     return cl::sycl::log1p(a); \
   }
 
+SYCL_PLOG1P(cl::sycl::cl_half8)
 SYCL_PLOG1P(cl::sycl::cl_float4)
 SYCL_PLOG1P(cl::sycl::cl_double2)
 #undef SYCL_PLOG1P
@@ -60,6 +62,7 @@ SYCL_PLOG1P(cl::sycl::cl_double2)
     return cl::sycl::log10(a); \
   }
 
+SYCL_PLOG10(cl::sycl::cl_half8)
 SYCL_PLOG10(cl::sycl::cl_float4)
 SYCL_PLOG10(cl::sycl::cl_double2)
 #undef SYCL_PLOG10
@@ -71,6 +74,8 @@ SYCL_PLOG10(cl::sycl::cl_double2)
     return cl::sycl::exp(a); \
   }
 
+SYCL_PEXP(cl::sycl::cl_half8)
+SYCL_PEXP(cl::sycl::cl_half)
 SYCL_PEXP(cl::sycl::cl_float4)
 SYCL_PEXP(cl::sycl::cl_float)
 SYCL_PEXP(cl::sycl::cl_double2)
@@ -83,6 +88,7 @@ SYCL_PEXP(cl::sycl::cl_double2)
     return cl::sycl::expm1(a); \
   }
 
+SYCL_PEXPM1(cl::sycl::cl_half8)
 SYCL_PEXPM1(cl::sycl::cl_float4)
 SYCL_PEXPM1(cl::sycl::cl_double2)
 #undef SYCL_PEXPM1
@@ -94,6 +100,7 @@ SYCL_PEXPM1(cl::sycl::cl_double2)
     return cl::sycl::sqrt(a); \
   }
 
+SYCL_PSQRT(cl::sycl::cl_half8)
 SYCL_PSQRT(cl::sycl::cl_float4)
 SYCL_PSQRT(cl::sycl::cl_double2)
 #undef SYCL_PSQRT
@@ -105,6 +112,7 @@ SYCL_PSQRT(cl::sycl::cl_double2)
     return cl::sycl::rsqrt(a); \
   }
 
+SYCL_PRSQRT(cl::sycl::cl_half8)
 SYCL_PRSQRT(cl::sycl::cl_float4)
 SYCL_PRSQRT(cl::sycl::cl_double2)
 #undef SYCL_PRSQRT
@@ -117,6 +125,7 @@ SYCL_PRSQRT(cl::sycl::cl_double2)
     return cl::sycl::sin(a); \
   }
 
+SYCL_PSIN(cl::sycl::cl_half8)
 SYCL_PSIN(cl::sycl::cl_float4)
 SYCL_PSIN(cl::sycl::cl_double2)
 #undef SYCL_PSIN
@@ -129,6 +138,7 @@ SYCL_PSIN(cl::sycl::cl_double2)
     return cl::sycl::cos(a); \
   }
 
+SYCL_PCOS(cl::sycl::cl_half8)
 SYCL_PCOS(cl::sycl::cl_float4)
 SYCL_PCOS(cl::sycl::cl_double2)
 #undef SYCL_PCOS
@@ -141,6 +151,7 @@ SYCL_PCOS(cl::sycl::cl_double2)
     return cl::sycl::tan(a); \
   }
 
+SYCL_PTAN(cl::sycl::cl_half8)
 SYCL_PTAN(cl::sycl::cl_float4)
 SYCL_PTAN(cl::sycl::cl_double2)
 #undef SYCL_PTAN
@@ -153,6 +164,7 @@ SYCL_PTAN(cl::sycl::cl_double2)
     return cl::sycl::asin(a); \
   }
 
+SYCL_PASIN(cl::sycl::cl_half8)
 SYCL_PASIN(cl::sycl::cl_float4)
 SYCL_PASIN(cl::sycl::cl_double2)
 #undef SYCL_PASIN
@@ -165,6 +177,7 @@ SYCL_PASIN(cl::sycl::cl_double2)
     return cl::sycl::acos(a); \
   }
 
+SYCL_PACOS(cl::sycl::cl_half8)
 SYCL_PACOS(cl::sycl::cl_float4)
 SYCL_PACOS(cl::sycl::cl_double2)
 #undef SYCL_PACOS
@@ -177,6 +190,7 @@ SYCL_PACOS(cl::sycl::cl_double2)
     return cl::sycl::atan(a); \
   }
 
+SYCL_PATAN(cl::sycl::cl_half8)
 SYCL_PATAN(cl::sycl::cl_float4)
 SYCL_PATAN(cl::sycl::cl_double2)
 #undef SYCL_PATAN
@@ -189,6 +203,7 @@ SYCL_PATAN(cl::sycl::cl_double2)
     return cl::sycl::sinh(a); \
   }
 
+SYCL_PSINH(cl::sycl::cl_half8)
 SYCL_PSINH(cl::sycl::cl_float4)
 SYCL_PSINH(cl::sycl::cl_double2)
 #undef SYCL_PSINH
@@ -201,6 +216,7 @@ SYCL_PSINH(cl::sycl::cl_double2)
     return cl::sycl::cosh(a); \
   }
 
+SYCL_PCOSH(cl::sycl::cl_half8)
 SYCL_PCOSH(cl::sycl::cl_float4)
 SYCL_PCOSH(cl::sycl::cl_double2)
 #undef SYCL_PCOSH
@@ -213,6 +229,7 @@ SYCL_PCOSH(cl::sycl::cl_double2)
     return cl::sycl::tanh(a); \
   }
 
+SYCL_PTANH(cl::sycl::cl_half8)
 SYCL_PTANH(cl::sycl::cl_float4)
 SYCL_PTANH(cl::sycl::cl_double2)
 #undef SYCL_PTANH
@@ -224,6 +241,7 @@ SYCL_PTANH(cl::sycl::cl_double2)
     return cl::sycl::ceil(a); \
   }
 
+SYCL_PCEIL(cl::sycl::cl_half)
 SYCL_PCEIL(cl::sycl::cl_float4)
 SYCL_PCEIL(cl::sycl::cl_double2)
 #undef SYCL_PCEIL
@@ -235,6 +253,7 @@ SYCL_PCEIL(cl::sycl::cl_double2)
     return cl::sycl::round(a); \
   }
 
+SYCL_PROUND(cl::sycl::cl_half8)
 SYCL_PROUND(cl::sycl::cl_float4)
 SYCL_PROUND(cl::sycl::cl_double2)
 #undef SYCL_PROUND
@@ -246,6 +265,7 @@ SYCL_PROUND(cl::sycl::cl_double2)
     return cl::sycl::rint(a); \
   }
 
+SYCL_PRINT(cl::sycl::cl_half8)
 SYCL_PRINT(cl::sycl::cl_float4)
 SYCL_PRINT(cl::sycl::cl_double2)
 #undef SYCL_PRINT
@@ -257,6 +277,7 @@ SYCL_PRINT(cl::sycl::cl_double2)
     return cl::sycl::floor(a); \
   }
 
+SYCL_FLOOR(cl::sycl::cl_half8)
 SYCL_FLOOR(cl::sycl::cl_float4)
 SYCL_FLOOR(cl::sycl::cl_double2)
 #undef SYCL_FLOOR
@@ -268,6 +289,7 @@ SYCL_FLOOR(cl::sycl::cl_double2)
     return expr; \
   }
 
+SYCL_PMIN(cl::sycl::cl_half8, cl::sycl::fmin(a, b))
 SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b))
 SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
 #undef SYCL_PMIN
@@ -279,6 +301,7 @@ SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
     return expr; \
   }
 
+SYCL_PMAX(cl::sycl::cl_half8, cl::sycl::fmax(a, b))
 SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b))
 SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
 #undef SYCL_PMAX
@@ -292,6 +315,7 @@ SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
     cl::sycl::rounding_mode::automatic>()); \
   }
 
+SYCL_PLDEXP(cl::sycl::cl_half8)
 SYCL_PLDEXP(cl::sycl::cl_float4)
 SYCL_PLDEXP(cl::sycl::cl_double2)
 #undef SYCL_PLDEXP
@@ -44,9 +44,34 @@ SYCL_PLOAD(cl::sycl::cl_float4, u)
 SYCL_PLOAD(cl::sycl::cl_float4, )
 SYCL_PLOAD(cl::sycl::cl_double2, u)
 SYCL_PLOAD(cl::sycl::cl_double2, )
 
 #undef SYCL_PLOAD
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
+pload<cl::sycl::cl_half8>(
+    const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<const cl::sycl::cl_half*>(from));
+  cl::sycl::cl_half8 res{};
+  res.load(0, ptr);
+  return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
+ploadu<cl::sycl::cl_half8>(
+    const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<const cl::sycl::cl_half*>(from));
+  cl::sycl::cl_half8 res{};
+  res.load(0, ptr);
+  return res;
+}
+
 #define SYCL_PSTORE(scalar, packet_type, alignment) \
 template <> \
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
@@ -59,9 +84,28 @@ SYCL_PSTORE(float, cl::sycl::cl_float4, )
 SYCL_PSTORE(float, cl::sycl::cl_float4, u)
 SYCL_PSTORE(double, cl::sycl::cl_double2, )
 SYCL_PSTORE(double, cl::sycl::cl_double2, u)
 
 #undef SYCL_PSTORE
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoreu(
+    Eigen::half* to, const cl::sycl::cl_half8& from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<cl::sycl::cl_half*>(to));
+  from.store(0, ptr);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore(
+    Eigen::half* to, const cl::sycl::cl_half8& from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<cl::sycl::cl_half*>(to));
+  from.store(0, ptr);
+}
+
 #define SYCL_PSET1(packet_type) \
 template <> \
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>( \
@@ -70,6 +114,7 @@ SYCL_PSTORE(double, cl::sycl::cl_double2, u)
 }
 
 // global space
+SYCL_PSET1(cl::sycl::cl_half8)
 SYCL_PSET1(cl::sycl::cl_float4)
 SYCL_PSET1(cl::sycl::cl_double2)
 
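The loads and stores above reinterpret the Eigen::half buffer as cl::sycl::cl_half and go through sycl::vec load/store, so at the packet level Eigen::half data now moves through cl::sycl::cl_half8 the same way float data moves through cl_float4. A minimal device-side sketch of the primitives this section specializes (illustrative only; it would normally run inside a SYCL kernel, and the helper name copy8 is not part of the patch):

// Round-trip 8 halves through the new packet type using the new specializations.
using HalfPacket = cl::sycl::cl_half8;

void copy8(const Eigen::half* src, Eigen::half* dst) {
  HalfPacket p = Eigen::internal::ploadu<HalfPacket>(src);  // unaligned load of 8 halves
  Eigen::internal::pstoreu(dst, p);                         // unaligned store of 8 halves
}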
@@ -86,6 +131,58 @@ struct get_base_packet {
   get_pgather(sycl_multi_pointer, Index) {}
 };
 
+template <>
+struct get_base_packet<cl::sycl::cl_half8> {
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_ploaddup(
+      sycl_multi_pointer from) {
+    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0]),
+                              static_cast<cl::sycl::half>(from[0]),
+                              static_cast<cl::sycl::half>(from[1]),
+                              static_cast<cl::sycl::half>(from[1]),
+                              static_cast<cl::sycl::half>(from[2]),
+                              static_cast<cl::sycl::half>(from[2]),
+                              static_cast<cl::sycl::half>(from[3]),
+                              static_cast<cl::sycl::half>(from[3]));
+  }
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_pgather(
+      sycl_multi_pointer from, Index stride) {
+    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0 * stride]),
+                              static_cast<cl::sycl::half>(from[1 * stride]),
+                              static_cast<cl::sycl::half>(from[2 * stride]),
+                              static_cast<cl::sycl::half>(from[3 * stride]),
+                              static_cast<cl::sycl::half>(from[4 * stride]),
+                              static_cast<cl::sycl::half>(from[5 * stride]),
+                              static_cast<cl::sycl::half>(from[6 * stride]),
+                              static_cast<cl::sycl::half>(from[7 * stride]));
+  }
+
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(
+      sycl_multi_pointer to, const cl::sycl::cl_half8& from, Index stride) {
+    auto tmp = stride;
+    to[0] = Eigen::half(from.s0());
+    to[tmp] = Eigen::half(from.s1());
+    to[tmp += stride] = Eigen::half(from.s2());
+    to[tmp += stride] = Eigen::half(from.s3());
+    to[tmp += stride] = Eigen::half(from.s4());
+    to[tmp += stride] = Eigen::half(from.s5());
+    to[tmp += stride] = Eigen::half(from.s6());
+    to[tmp += stride] = Eigen::half(from.s7());
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 set_plset(
+      const cl::sycl::half& a) {
+    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(a), static_cast<cl::sycl::half>(a + 1),
+                              static_cast<cl::sycl::half>(a + 2),
+                              static_cast<cl::sycl::half>(a + 3),
+                              static_cast<cl::sycl::half>(a + 4),
+                              static_cast<cl::sycl::half>(a + 5),
+                              static_cast<cl::sycl::half>(a + 6),
+                              static_cast<cl::sycl::half>(a + 7));
+  }
+};
+
 template <>
 struct get_base_packet<cl::sycl::cl_float4> {
   template <typename sycl_multi_pointer>
@@ -152,6 +249,7 @@ struct get_base_packet<cl::sycl::cl_double2> {
     return get_base_packet<packet_type>::get_ploaddup(from); \
   }
 
+SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_half8)
 SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4)
 SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
 
@@ -165,9 +263,14 @@ SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
 }
 SYCL_PLSET(cl::sycl::cl_float4)
 SYCL_PLSET(cl::sycl::cl_double2)
 
 #undef SYCL_PLSET
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 plset<cl::sycl::cl_half8>(
+    const typename unpacket_traits<cl::sycl::cl_half8>::type& a) {
+  return get_base_packet<cl::sycl::cl_half8>::set_plset((const cl::sycl::half &) a);
+}
+
 #define SYCL_PGATHER_SPECILIZE(scalar, packet_type) \
 template <> \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
@@ -176,9 +279,9 @@ SYCL_PLSET(cl::sycl::cl_double2)
     return get_base_packet<packet_type>::get_pgather(from, stride); \
   }
 
+SYCL_PGATHER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
 SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4)
 SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
 
 #undef SYCL_PGATHER_SPECILIZE
 
 #define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \
@@ -189,6 +292,7 @@ SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
     get_base_packet<packet_type>::set_pscatter(to, from, stride); \
   }
 
+SYCL_PSCATTER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
 SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4)
 SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
 
@@ -201,10 +305,16 @@ SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
     return cl::sycl::mad(a, b, c); \
   }
 
+SYCL_PMAD(cl::sycl::cl_half8)
 SYCL_PMAD(cl::sycl::cl_float4)
 SYCL_PMAD(cl::sycl::cl_double2)
 #undef SYCL_PMAD
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half pfirst<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0());
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -216,6 +326,13 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst<cl::sycl::cl_double2>(
   return a.x();
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0() + a.s1() + a.s2() + a.s3() + a.s4() + a.s5()
+                     + a.s6() + a.s7());
+}
+
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -228,6 +345,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux<cl::sycl::cl_double2>(
   return a.x() + a.y();
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_max<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(cl::sycl::fmax(
+      cl::sycl::fmax(
+          cl::sycl::fmax(a.s0(), a.s1()),
+          cl::sycl::fmax(a.s2(), a.s3())),
+      cl::sycl::fmax(
+          cl::sycl::fmax(a.s4(), a.s5()),
+          cl::sycl::fmax(a.s6(), a.s7()))));
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -240,6 +368,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max<cl::sycl::cl_double2>(
   return cl::sycl::fmax(a.x(), a.y());
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_min<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(cl::sycl::fmin(
+      cl::sycl::fmin(
+          cl::sycl::fmin(a.s0(), a.s1()),
+          cl::sycl::fmin(a.s2(), a.s3())),
+      cl::sycl::fmin(
+          cl::sycl::fmin(a.s4(), a.s5()),
+          cl::sycl::fmin(a.s6(), a.s7()))));
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -252,6 +391,12 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min<cl::sycl::cl_double2>(
   return cl::sycl::fmin(a.x(), a.y());
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_mul<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0() * a.s1() * a.s2() * a.s3() * a.s4() * a.s5() *
+                     a.s6() * a.s7());
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -263,6 +408,14 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul<cl::sycl::cl_double2>(
   return a.x() * a.y();
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
+pabs<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+  return cl::sycl::cl_half8(cl::sycl::fabs(a.s0()), cl::sycl::fabs(a.s1()),
+                            cl::sycl::fabs(a.s2()), cl::sycl::fabs(a.s3()),
+                            cl::sycl::fabs(a.s4()), cl::sycl::fabs(a.s5()),
+                            cl::sycl::fabs(a.s6()), cl::sycl::fabs(a.s7()));
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
 pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
@@ -300,6 +453,9 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_eq(const Packet &a,
     return sycl_pcmp_##OP<TYPE>(a, b); \
   }
 
+SYCL_PCMP(le, cl::sycl::cl_half8)
+SYCL_PCMP(lt, cl::sycl::cl_half8)
+SYCL_PCMP(eq, cl::sycl::cl_half8)
 SYCL_PCMP(le, cl::sycl::cl_float4)
 SYCL_PCMP(lt, cl::sycl::cl_float4)
 SYCL_PCMP(eq, cl::sycl::cl_float4)
@@ -308,6 +464,121 @@ SYCL_PCMP(lt, cl::sycl::cl_double2)
 SYCL_PCMP(eq, cl::sycl::cl_double2)
 #undef SYCL_PCMP
 
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
+    PacketBlock<cl::sycl::cl_half8, 8>& kernel) {
+  cl::sycl::cl_half tmp = kernel.packet[0].s1();
+  kernel.packet[0].s1() = kernel.packet[1].s0();
+  kernel.packet[1].s0() = tmp;
+
+  tmp = kernel.packet[0].s2();
+  kernel.packet[0].s2() = kernel.packet[2].s0();
+  kernel.packet[2].s0() = tmp;
+
+  tmp = kernel.packet[0].s3();
+  kernel.packet[0].s3() = kernel.packet[3].s0();
+  kernel.packet[3].s0() = tmp;
+
+  tmp = kernel.packet[0].s4();
+  kernel.packet[0].s4() = kernel.packet[4].s0();
+  kernel.packet[4].s0() = tmp;
+
+  tmp = kernel.packet[0].s5();
+  kernel.packet[0].s5() = kernel.packet[5].s0();
+  kernel.packet[5].s0() = tmp;
+
+  tmp = kernel.packet[0].s6();
+  kernel.packet[0].s6() = kernel.packet[6].s0();
+  kernel.packet[6].s0() = tmp;
+
+  tmp = kernel.packet[0].s7();
+  kernel.packet[0].s7() = kernel.packet[7].s0();
+  kernel.packet[7].s0() = tmp;
+
+  tmp = kernel.packet[1].s2();
+  kernel.packet[1].s2() = kernel.packet[2].s1();
+  kernel.packet[2].s1() = tmp;
+
+  tmp = kernel.packet[1].s3();
+  kernel.packet[1].s3() = kernel.packet[3].s1();
+  kernel.packet[3].s1() = tmp;
+
+  tmp = kernel.packet[1].s4();
+  kernel.packet[1].s4() = kernel.packet[4].s1();
+  kernel.packet[4].s1() = tmp;
+
+  tmp = kernel.packet[1].s5();
+  kernel.packet[1].s5() = kernel.packet[5].s1();
+  kernel.packet[5].s1() = tmp;
+
+  tmp = kernel.packet[1].s6();
+  kernel.packet[1].s6() = kernel.packet[6].s1();
+  kernel.packet[6].s1() = tmp;
+
+  tmp = kernel.packet[1].s7();
+  kernel.packet[1].s7() = kernel.packet[7].s1();
+  kernel.packet[7].s1() = tmp;
+
+  tmp = kernel.packet[2].s3();
+  kernel.packet[2].s3() = kernel.packet[3].s2();
+  kernel.packet[3].s2() = tmp;
+
+  tmp = kernel.packet[2].s4();
+  kernel.packet[2].s4() = kernel.packet[4].s2();
+  kernel.packet[4].s2() = tmp;
+
+  tmp = kernel.packet[2].s5();
+  kernel.packet[2].s5() = kernel.packet[5].s2();
+  kernel.packet[5].s2() = tmp;
+
+  tmp = kernel.packet[2].s6();
+  kernel.packet[2].s6() = kernel.packet[6].s2();
+  kernel.packet[6].s2() = tmp;
+
+  tmp = kernel.packet[2].s7();
+  kernel.packet[2].s7() = kernel.packet[7].s2();
+  kernel.packet[7].s2() = tmp;
+
+  tmp = kernel.packet[3].s4();
+  kernel.packet[3].s4() = kernel.packet[4].s3();
+  kernel.packet[4].s3() = tmp;
+
+  tmp = kernel.packet[3].s5();
+  kernel.packet[3].s5() = kernel.packet[5].s3();
+  kernel.packet[5].s3() = tmp;
+
+  tmp = kernel.packet[3].s6();
+  kernel.packet[3].s6() = kernel.packet[6].s3();
+  kernel.packet[6].s3() = tmp;
+
+  tmp = kernel.packet[3].s7();
+  kernel.packet[3].s7() = kernel.packet[7].s3();
+  kernel.packet[7].s3() = tmp;
+
+  tmp = kernel.packet[4].s5();
+  kernel.packet[4].s5() = kernel.packet[5].s4();
+  kernel.packet[5].s4() = tmp;
+
+  tmp = kernel.packet[4].s6();
+  kernel.packet[4].s6() = kernel.packet[6].s4();
+  kernel.packet[6].s4() = tmp;
+
+  tmp = kernel.packet[4].s7();
+  kernel.packet[4].s7() = kernel.packet[7].s4();
+  kernel.packet[7].s4() = tmp;
+
+  tmp = kernel.packet[5].s6();
+  kernel.packet[5].s6() = kernel.packet[6].s5();
+  kernel.packet[6].s5() = tmp;
+
+  tmp = kernel.packet[5].s7();
+  kernel.packet[5].s7() = kernel.packet[7].s5();
+  kernel.packet[7].s5() = tmp;
+
+  tmp = kernel.packet[6].s7();
+  kernel.packet[6].s7() = kernel.packet[7].s6();
+  kernel.packet[7].s6() = tmp;
+}
+
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
     PacketBlock<cl::sycl::cl_float4, 4>& kernel) {
   float tmp = kernel.packet[0].y();
@@ -342,6 +613,19 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
   kernel.packet[1].x() = tmp;
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 pblend(
+    const Selector<unpacket_traits<cl::sycl::cl_half8>::size>& ifPacket,
+    const cl::sycl::cl_half8& thenPacket,
+    const cl::sycl::cl_half8& elsePacket) {
+  cl::sycl::cl_short8 condition(
+      ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1,
+      ifPacket.select[2] ? 0 : -1, ifPacket.select[3] ? 0 : -1,
+      ifPacket.select[4] ? 0 : -1, ifPacket.select[5] ? 0 : -1,
+      ifPacket.select[6] ? 0 : -1, ifPacket.select[7] ? 0 : -1);
+  return cl::sycl::select(thenPacket, elsePacket, condition);
+}
+
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pblend(
     const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket,
@@ -1118,7 +1118,7 @@ struct GeneralVectorTensor {
                 : globalContractDimOffset + privateOffsetC)
           : OutScalar(0);
 
-      outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]);
+      outScalar[j] = ::Eigen::internal::pmadd(matScalar, vecScalar, outScalar[j]);
       privateOffsetNC += Properties::LocalThreadSizeNC;
     }
     privateOffsetC += Properties::LocalThreadSizeC;
@@ -1263,7 +1263,7 @@ struct GeneralScalarContraction {
     StorageIndex localid = itemID.get_local_id(0);
     OutScalar accumulator = OutScalar(0);
     for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
-      accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator);
+      accumulator = Eigen::internal::pmadd(lhs(0, i), rhs(i, 0), accumulator);
     }
     auto out_scratch_ptr = scratch_ptr + localid;
     *out_scratch_ptr = accumulator;
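The two contraction kernels above stop calling cl::sycl::mad directly and go through Eigen::internal::pmadd instead; pmadd maps to a fused multiply-add where one exists for the scalar or packet type and otherwise falls back to multiply-then-add, and, unlike cl::sycl::mad, it is also usable when the accumulator type is Eigen::half. A rough semantic sketch (not Eigen's actual implementation):

// pmadd(a, b, c) behaves like a * b + c for any supported scalar or packet type.
template <typename T>
T pmadd_like(const T& a, const T& b, const T& c) {
  return a * b + c;
}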
@@ -151,6 +151,7 @@ if(EIGEN_TEST_SYCL)
   ei_add_test(cxx11_tensor_argmax_sycl)
   ei_add_test(cxx11_tensor_custom_op_sycl)
   ei_add_test(cxx11_tensor_scan_sycl)
+  ei_add_test(cxx11_tensor_of_float16_sycl)
   set(EIGEN_SYCL OFF)
 endif()
 
@@ -32,9 +32,9 @@ static void test_sycl_simple_argmax(const Eigen::SyclDevice& sycl_device) {
   Tensor<DenseIndex, 0, Layout, DenseIndex> out_max;
   Tensor<DenseIndex, 0, Layout, DenseIndex> out_min;
   in.setRandom();
-  in *= in.constant(100.0);
-  in(0, 0, 0) = -1000.0;
-  in(1, 1, 1) = 1000.0;
+  in *= in.constant(static_cast<DataType>(100.0));
+  in(0, 0, 0) = static_cast<DataType>(-1000.0);
+  in(1, 1, 1) = static_cast<DataType>(1000.0);
 
   std::size_t in_bytes = in.size() * sizeof(DataType);
   std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
@@ -93,7 +93,7 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) {
           ix[3] = l;
           // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l)
           // = 10.0
-          tensor(ix) = (ix[dim] != 0) ? -1.0 : 10.0;
+          tensor(ix) = static_cast<DataType>((ix[dim] != 0) ? -1.0 : 10.0);
         }
       }
     }
@@ -132,7 +132,7 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) {
           ix[2] = k;
           ix[3] = l;
           // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
-          tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0;
+          tensor(ix) = static_cast<DataType>((ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0);
         }
       }
     }
@@ -180,7 +180,7 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) {
           ix[2] = k;
           ix[3] = l;
           // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
-          tensor(ix) = (ix[dim] != 0) ? 1.0 : -10.0;
+          tensor(ix) = static_cast<DataType>((ix[dim] != 0) ? 1.0 : -10.0);
         }
       }
     }
@@ -219,7 +219,7 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) {
           ix[2] = k;
           ix[3] = l;
           // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
-          tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0;
+          tensor(ix) = static_cast<DataType>((ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0);
         }
       }
     }
@@ -252,6 +252,7 @@ void sycl_argmax_test_per_device(const Device_Selector& d) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_argmax_test_per_device<half>(device));
     CALL_SUBTEST(sycl_argmax_test_per_device<float>(device));
   }
 }
@@ -139,6 +139,7 @@ template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl::
 
 EIGEN_DECLARE_TEST(cxx11_tensor_broadcast_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_broadcast_test_per_device<half>(device));
     CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device));
   }
 }
@@ -28,26 +28,135 @@ using Eigen::TensorMap;
 // Functions used to compare the TensorMap implementation on the device with
 // the equivalent on the host
 namespace SYCL {
-template <typename T> T abs(T x) {
+
+template <typename T> T abs(T x) {
   return cl::sycl::abs(x);
 }
+template <> Eigen::half abs(Eigen::half x) {
+  return Eigen::half(cl::sycl::fabs(static_cast<cl::sycl::half>(x)));
+}
+
 template <> float abs(float x) {
   return cl::sycl::fabs(x);
 }
 
 template <> double abs(double x) {
   return cl::sycl::fabs(x);
 }
 
 template <typename T> T square(T x) { return x * x; }
 template <typename T> T cube(T x) { return x * x * x; }
 template <typename T> T inverse(T x) { return T(1) / x; }
 template <typename T> T cwiseMax(T x, T y) {
   return cl::sycl::max(x, y);
 }
-template <typename T> T cwiseMin(T x, T y) {
-  return cl::sycl::min(x, y);
+template <> Eigen::half cwiseMax(Eigen::half x, Eigen::half y) {
+  return Eigen::half(cl::sycl::max(static_cast<cl::sycl::half>(x), static_cast<cl::sycl::half>(y)));
+}
+
+template <typename T> T cwiseMin(T x, T y) {
+  return cl::sycl::min(x, y);
+}
+template <> Eigen::half cwiseMin(Eigen::half x, Eigen::half y) {
+  return Eigen::half(cl::sycl::min(static_cast<cl::sycl::half>(x), static_cast<cl::sycl::half>(y)));
+}
+
+template <typename T> T sqrt(T x) {
+  return cl::sycl::sqrt(x);
+}
+template <> Eigen::half sqrt(Eigen::half x) {
+  return Eigen::half(cl::sycl::sqrt(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T rsqrt(T x) {
+  return cl::sycl::rsqrt(x);
+}
+template <> Eigen::half rsqrt(Eigen::half x) {
+  return Eigen::half(cl::sycl::rsqrt(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T tanh(T x) {
+  return cl::sycl::tanh(x);
+}
+template <> Eigen::half tanh(Eigen::half x) {
+  return Eigen::half(cl::sycl::tanh(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T exp(T x) {
+  return cl::sycl::exp(x);
+}
+template <> Eigen::half exp(Eigen::half x) {
+  return Eigen::half(cl::sycl::exp(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T expm1(T x) {
+  return cl::sycl::expm1(x);
+}
+template <> Eigen::half expm1(Eigen::half x) {
+  return Eigen::half(cl::sycl::expm1(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T log(T x) {
+  return cl::sycl::log(x);
+}
+template <> Eigen::half log(Eigen::half x) {
+  return Eigen::half(cl::sycl::log(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T ceil(T x) {
+  return cl::sycl::ceil(x);
+}
+template <> Eigen::half ceil(Eigen::half x) {
+  return Eigen::half(cl::sycl::ceil(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T floor(T x) {
+  return cl::sycl::floor(x);
+}
+template <> Eigen::half floor(Eigen::half x) {
+  return Eigen::half(cl::sycl::floor(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T round(T x) {
+  return cl::sycl::round(x);
+}
+template <> Eigen::half round(Eigen::half x) {
+  return Eigen::half(cl::sycl::round(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T log1p(T x) {
+  return cl::sycl::log1p(x);
+}
+template <> Eigen::half log1p(Eigen::half x) {
+  return Eigen::half(cl::sycl::log1p(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T sign(T x) {
+  return cl::sycl::sign(x);
+}
+template <> Eigen::half sign(Eigen::half x) {
+  return Eigen::half(cl::sycl::sign(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T isnan(T x) {
+  return cl::sycl::isnan(x);
+}
+template <> Eigen::half isnan(Eigen::half x) {
+  return Eigen::half(cl::sycl::isnan(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T isfinite(T x) {
+  return cl::sycl::isfinite(x);
+}
+template <> Eigen::half isfinite(Eigen::half x) {
+  return Eigen::half(cl::sycl::isfinite(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T isinf(T x) {
+  return cl::sycl::isinf(x);
+}
+template <> Eigen::half isinf(Eigen::half x) {
+  return Eigen::half(cl::sycl::isinf(static_cast<cl::sycl::half>(x)));
 }
 }
 
@@ -157,15 +266,14 @@ void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
 #define DECLARE_UNARY_STRUCT(FUNC) \
   struct op_##FUNC { \
     template <typename T> \
-    auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) { \
-      return cl::sycl::FUNC(x); \
+    auto operator()(const T& x) -> decltype(SYCL::FUNC(x)) { \
+      return SYCL::FUNC(x); \
     } \
     template <typename T> \
     auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) { \
       return x.FUNC(); \
     } \
   };
 
-
 DECLARE_UNARY_STRUCT(sqrt)
 DECLARE_UNARY_STRUCT(rsqrt)
@@ -390,8 +498,10 @@ EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     QueueInterface queueInterface(device);
     Eigen::SyclDevice sycl_device(&queueInterface);
-    CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device));
-    CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device));
-    CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device));
+    CALL_SUBTEST_1(test_builtin_unary_sycl<half>(sycl_device));
+    CALL_SUBTEST_2(test_floating_builtin_binary_sycl<half>(sycl_device));
+    CALL_SUBTEST_3(test_builtin_unary_sycl<float>(sycl_device));
+    CALL_SUBTEST_4(test_floating_builtin_binary_sycl<float>(sycl_device));
+    CALL_SUBTEST_5(test_integer_builtin_binary_sycl<int>(sycl_device));
   }
 }
@@ -619,5 +619,6 @@ EIGEN_DECLARE_TEST(cxx11_tensor_chipping_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_chipping_test_per_device<float>(device));
+    CALL_SUBTEST(sycl_chipping_test_per_device<half>(device));
   }
 }
@@ -175,6 +175,7 @@ template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(
 }
 EIGEN_DECLARE_TEST(cxx11_tensor_concatenation_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(tensorConcat_perDevice<half>(device));
     CALL_SUBTEST(tensorConcat_perDevice<float>(device));
   }
 }
@@ -82,6 +82,7 @@ template<typename DataType> void sycl_device_test_per_device(const cl::sycl::dev
 
 EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_device_test_per_device<half>(device));
     CALL_SUBTEST(sycl_device_test_per_device<float>(device));
     CALL_SUBTEST(sycl_device_test_per_device<OffByOneScalar<int>>(device));
   }
@@ -95,6 +95,7 @@ template<typename DataType, typename dev_Selector> void sycl_computing_test_per_
 
 EIGEN_DECLARE_TEST(cxx11_tensor_image_op_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_computing_test_per_device<half>(device));
     CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
 #ifdef EIGEN_SYCL_DOUBLE_SUPPORT
     CALL_SUBTEST(sycl_computing_test_per_device<double>(device));
@@ -131,6 +131,7 @@ template<typename DataType, typename dev_Selector> void sycl_inflation_test_per_
 EIGEN_DECLARE_TEST(cxx11_tensor_inflation_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_inflation_test_per_device<half>(device));
     CALL_SUBTEST(sycl_inflation_test_per_device<float>(device));
   }
 }
@@ -121,6 +121,7 @@ template<typename DataType, typename dev_Selector> void sycl_tensor_layout_swap_
 EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<half>(device));
     CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<float>(device));
   }
 }
@@ -100,6 +100,7 @@ template<typename DataType, typename dev_Selector> void sycl_computing_test_per_
 
 EIGEN_DECLARE_TEST(cxx11_tensor_math_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_computing_test_per_device<half>(device));
     CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
   }
 }
@@ -381,6 +381,7 @@ template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_d
 EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_morphing_test_per_device<half>(device));
     CALL_SUBTEST(sycl_morphing_test_per_device<float>(device));
   }
 }
406
unsupported/test/cxx11_tensor_of_float16_sycl.cpp
Normal file
406
unsupported/test/cxx11_tensor_of_float16_sycl.cpp
Normal file
@ -0,0 +1,406 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2023
|
||||||
|
// Alejandro Acosta Codeplay Software Ltd.
|
||||||
|
// Contact: <eigen@codeplay.com>
|
||||||
|
// Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||||
|
#define EIGEN_TEST_NO_COMPLEX
|
||||||
|
|
||||||
|
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||||
|
#define EIGEN_USE_SYCL
|
||||||
|
#define EIGEN_SYCL_HALF_SUPPORT
|
||||||
|
|
||||||
|
#include "main.h"
|
||||||
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
|
||||||
|
using Eigen::Tensor;
|
||||||
|
using Eigen::SyclDevice;
|
||||||
|
|
||||||
|
void test_gpu_numext(const Eigen::SyclDevice &sycl_device) {
|
||||||
|
int num_elem = 101;
|
||||||
|
|
||||||
|
float* d_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
|
||||||
|
bool* d_res_half = static_cast<bool*>(sycl_device.allocate(num_elem * sizeof(bool)));
|
||||||
|
bool* d_res_float = static_cast<bool*>(sycl_device.allocate(num_elem * sizeof(bool)));
|
||||||
|
|
||||||
|
Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_float(d_float, num_elem);
|
||||||
|
Eigen::TensorMap<Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(d_res_half, num_elem);
|
||||||
|
Eigen::TensorMap<Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(d_res_float, num_elem);
|
||||||
|
|
||||||
|
gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
|
||||||
|
gpu_res_float.device(sycl_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
|
||||||
|
gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
|
||||||
|
|
||||||
|
Tensor<bool, 1> half_prec(num_elem);
|
||||||
|
Tensor<bool, 1> full_prec(num_elem);
|
||||||
|
|
||||||
|
sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half,num_elem * sizeof(bool));
|
||||||
|
sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float,num_elem * sizeof(bool));
|
||||||
|
|
||||||
|
for (int i = 0; i < num_elem; ++i) {
|
||||||
|
std::cout << "Checking numext " << i << std::endl;
|
||||||
|
VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void test_gpu_conversion(const Eigen::SyclDevice &sycl_device) {
|
||||||
|
int num_elem = 101;
|
||||||
|
|
||||||
|
float* d_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
|
||||||
|
Eigen::half* d_half = static_cast<Eigen::half*>(sycl_device.allocate(num_elem * sizeof(Eigen::half)));
|
||||||
|
float* d_conv = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
|
||||||
|
|
||||||
|
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
|
||||||
|
d_float, num_elem);
|
||||||
|
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
|
||||||
|
d_half, num_elem);
|
||||||
|
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
|
||||||
|
d_conv, num_elem);
|
||||||
|
|
||||||
|
gpu_float.device(sycl_device) = gpu_float.random();
|
||||||
|
gpu_half.device(sycl_device) = gpu_float.cast<Eigen::half>();
|
||||||
|
gpu_conv.device(sycl_device) = gpu_half.cast<float>();
|
||||||
|
|
||||||
|
Tensor<float, 1> initial(num_elem);
|
||||||
|
Tensor<float, 1> final(num_elem);
|
||||||
|
sycl_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
|
||||||
|
sycl_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
|
||||||
|
|
||||||
|
for (int i = 0; i < num_elem; ++i) {
|
||||||
|
VERIFY_IS_APPROX(initial(i), final(i));
|
||||||
|
}
|
||||||
|
}

void test_gpu_unary(const Eigen::SyclDevice &sycl_device) {
  int num_elem = 101;

  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_res_half = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_res_float = (float*)sycl_device.allocate(num_elem * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
      d_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
      d_res_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
      d_res_float, num_elem);

  gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
  gpu_res_float.device(sycl_device) = gpu_float.abs();
  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();

  Tensor<float, 1> half_prec(num_elem);
  Tensor<float, 1> full_prec(num_elem);
  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem * sizeof(float));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(float));
  sycl_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking unary " << i << std::endl;
    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
  }
}
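Since abs only clears the sign bit of a binary16 value, the float result and the half round trip above should agree to within half's input-rounding error. A tiny host-side check of that claim, assuming only <Eigen/Core>; the loop bounds are illustrative.

#include <Eigen/Core>
#include <cassert>
#include <cmath>

int main() {
  for (float x = -0.5f; x <= 0.5f; x += 0.01f) {
    const float full = std::abs(x);
    // Round to half, take abs in half, convert back to float.
    const float half_rt = static_cast<float>(Eigen::numext::abs(static_cast<Eigen::half>(x)));
    assert(std::abs(half_rt - full) <= 4.8828125e-4f * full + 1e-7f);
  }
  return 0;
}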

void test_gpu_elementwise(const Eigen::SyclDevice &sycl_device) {
  int num_elem = 101;

  float* d_float1 = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
  float* d_float2 = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
  float* d_res_half = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
  float* d_res_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));

  Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
  Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
  Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_res_half(d_res_half, num_elem);
  Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_res_float(d_res_float, num_elem);

  gpu_float1.device(sycl_device) = gpu_float1.random();
  gpu_float2.device(sycl_device) = gpu_float2.random();
  gpu_res_float.device(sycl_device) = (gpu_float1 + gpu_float2) * gpu_float1;
  gpu_res_half.device(sycl_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();

  Tensor<float, 1> half_prec(num_elem);
  Tensor<float, 1> full_prec(num_elem);

  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem * sizeof(float));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(float));

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
    VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
  }
}
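The same (a + b) * a comparison can also be run entirely on the host with the Tensor module, which can be handy when chasing precision differences without a SYCL device. This is an illustrative sketch only; the variable names are not from the commit.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  const int n = 101;
  Eigen::Tensor<float, 1> a(n), b(n);
  a.setRandom();
  b.setRandom();

  // Full-precision path vs. cast-to-half-and-back path.
  Eigen::Tensor<float, 1> full_prec = (a + b) * a;
  Eigen::Tensor<float, 1> half_prec =
      ((a.cast<Eigen::half>() + b.cast<Eigen::half>()) * a.cast<Eigen::half>()).cast<float>();

  for (int i = 0; i < n; ++i)
    std::cout << i << ": " << full_prec(i) << " vs " << half_prec(i) << "\n";
  return 0;
}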

void test_gpu_trancendental(const Eigen::SyclDevice &sycl_device) {
  int num_elem = 101;

  float* d_float1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_float2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_float3 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res1_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res1_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res2_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res2_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res3_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res3_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem);

  gpu_float1.device(sycl_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
  gpu_float2.device(sycl_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
  gpu_float3.device(sycl_device) = gpu_float3.random();
  gpu_res1_float.device(sycl_device) = gpu_float1.exp().cast<Eigen::half>();
  gpu_res2_float.device(sycl_device) = gpu_float2.log().cast<Eigen::half>();
  gpu_res3_float.device(sycl_device) = gpu_float3.log1p().cast<Eigen::half>();
  gpu_res4_float.device(sycl_device) = gpu_float3.expm1().cast<Eigen::half>();

  gpu_res1_half.device(sycl_device) = gpu_float1.cast<Eigen::half>();
  gpu_res1_half.device(sycl_device) = gpu_res1_half.exp();

  gpu_res2_half.device(sycl_device) = gpu_float2.cast<Eigen::half>();
  gpu_res2_half.device(sycl_device) = gpu_res2_half.log();

  gpu_res3_half.device(sycl_device) = gpu_float3.cast<Eigen::half>();
  gpu_res3_half.device(sycl_device) = gpu_res3_half.log1p();

  gpu_res3_half.device(sycl_device) = gpu_float3.cast<Eigen::half>();
  gpu_res3_half.device(sycl_device) = gpu_res3_half.expm1();

  Tensor<float, 1> input1(num_elem);
  Tensor<Eigen::half, 1> half_prec1(num_elem);
  Tensor<Eigen::half, 1> full_prec1(num_elem);
  Tensor<float, 1> input2(num_elem);
  Tensor<Eigen::half, 1> half_prec2(num_elem);
  Tensor<Eigen::half, 1> full_prec2(num_elem);
  Tensor<float, 1> input3(num_elem);
  Tensor<Eigen::half, 1> half_prec3(num_elem);
  Tensor<Eigen::half, 1> full_prec3(num_elem);
  sycl_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem * sizeof(float));
  sycl_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem * sizeof(float));
  sycl_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem * sizeof(float));
  sycl_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem * sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem * sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem * sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem * sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem * sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem * sizeof(Eigen::half));
  sycl_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
    VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
  }
  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
    if (std::abs(input2(i) - 1.f) < 0.05f)  // log lacks accuracy near 1
      VERIFY_IS_APPROX(full_prec2(i) + Eigen::half(0.1f), half_prec2(i) + Eigen::half(0.1f));
    else
      VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
  }
  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
    VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
  }
}
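Each transcendental check above follows the same pattern: compute in float and round the result to half, versus round the input to half and compute there. A rough host-only sketch of the exp case; std::exp on the half-rounded input merely stands in for the device-side half exp, so this is an approximation, not the commit's code path.

#include <Eigen/Core>
#include <cmath>
#include <iostream>

int main() {
  for (float x = -0.5f; x <= 0.5f; x += 0.05f) {
    // "Full precision": exp in float, rounded to half at the end.
    const Eigen::half ref = static_cast<Eigen::half>(std::exp(x));
    // "Half precision": round the input to half first, then evaluate.
    const float x_h = static_cast<float>(static_cast<Eigen::half>(x));
    const Eigen::half test = static_cast<Eigen::half>(std::exp(x_h));
    std::cout << x << ": " << static_cast<float>(ref)
              << " vs " << static_cast<float>(test) << "\n";
  }
  return 0;
}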

void test_gpu_contractions(const Eigen::SyclDevice &sycl_device) {
  int rows = 23;
  int cols = 23;
  int num_elem = rows * cols;

  float* d_float1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_float2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
      d_float1, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
      d_float2, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
      d_res_half, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
      d_res_float, rows, cols);

  gpu_float1.device(sycl_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
  gpu_float2.device(sycl_device) = gpu_float2.random() - gpu_float2.constant(0.5f);

  typedef typename Tensor<float, 2>::DimensionPair DimPair;
  Eigen::array<DimPair, 1> dims;
  gpu_res_float.device(sycl_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
  gpu_res_half.device(sycl_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);

  Tensor<Eigen::half, 2> half_prec(rows, cols);
  Tensor<Eigen::half, 2> full_prec(rows, cols);
  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem * sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(Eigen::half));
  sycl_device.synchronize();

  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      std::cout << "Checking contract " << i << " " << j << " " << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
      if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
        VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
      }
    }
  }
}
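Note that dims above is default-constructed (an index pair of zeros, so dimension 0 of each operand is contracted). A host-side analogue with an explicit index pair in the ordinary matrix-product convention might look like the sketch below; the 1e-2 threshold mirrors the slack the test allows for half accumulation. Illustration only, not part of the commit.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  const int rows = 23, cols = 23;
  Eigen::Tensor<float, 2> a(rows, cols), b(rows, cols);
  a.setRandom();
  b.setRandom();

  // Contract dimension 1 of a with dimension 0 of b (matrix product).
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};
  Eigen::Tensor<Eigen::half, 2> full_prec = a.contract(b, dims).cast<Eigen::half>();
  Eigen::Tensor<Eigen::half, 2> half_prec =
      a.cast<Eigen::half>().contract(b.cast<Eigen::half>(), dims);

  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < cols; ++j)
      if (Eigen::numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f))
        std::cout << "large deviation at (" << i << ", " << j << ")\n";
  return 0;
}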

void test_gpu_reductions(const Eigen::SyclDevice &sycl_device, int size1, int size2, int redux) {
  std::cout << "Reducing " << size1 << " by " << size2
            << " tensor along dim " << redux << std::endl;

  int num_elem = size1 * size2;
  int result_size = (redux == 1 ? size1 : size2);

  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(result_size * sizeof(Eigen::half));
  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(result_size * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
      d_float, size1, size2);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
      d_res_half, result_size);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
      d_res_float, result_size);

  gpu_float.device(sycl_device) = gpu_float.random() * 2.0f;

  Eigen::array<int, 1> redux_dim = {redux};
  gpu_res_float.device(sycl_device) = gpu_float.sum(redux_dim).cast<Eigen::half>();
  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().sum(redux_dim);

  Tensor<Eigen::half, 1> half_prec(result_size);
  Tensor<Eigen::half, 1> full_prec(result_size);
  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size * sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size * sizeof(Eigen::half));
  sycl_device.synchronize();

  for (int i = 0; i < result_size; ++i) {
    std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
  }
}
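A host-only version of the partial reduction, assuming the Tensor module: summing a size1 x size2 tensor along dimension redux leaves a 1-D result whose length matches the result_size computed above. Names below are illustrative.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  const int size1 = 13, size2 = 13, redux = 1;
  Eigen::Tensor<float, 2> t(size1, size2);
  t.setRandom();

  Eigen::array<int, 1> redux_dim = {redux};
  // Reduce in float then round, vs. round to half first and reduce in half.
  Eigen::Tensor<Eigen::half, 1> full_prec = t.sum(redux_dim).cast<Eigen::half>();
  Eigen::Tensor<Eigen::half, 1> half_prec = t.cast<Eigen::half>().sum(redux_dim);

  for (int i = 0; i < full_prec.size(); ++i)
    std::cout << "EXPECTED " << static_cast<float>(full_prec(i))
              << " GOT " << static_cast<float>(half_prec(i)) << "\n";
  return 0;
}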

void test_gpu_reductions(const Eigen::SyclDevice &sycl_device) {
  test_gpu_reductions(sycl_device, 13, 13, 0);
  test_gpu_reductions(sycl_device, 13, 13, 1);

  test_gpu_reductions(sycl_device, 35, 36, 0);
  test_gpu_reductions(sycl_device, 35, 36, 1);

  test_gpu_reductions(sycl_device, 36, 35, 0);
  test_gpu_reductions(sycl_device, 36, 35, 1);
}

void test_gpu_full_reductions(const Eigen::SyclDevice &sycl_device) {
  int size = 13;
  int num_elem = size * size;

  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(1 * sizeof(Eigen::half));
  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(1 * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
      d_float, size, size);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
      d_res_half);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
      d_res_float);

  gpu_float.device(sycl_device) = gpu_float.random();

  gpu_res_float.device(sycl_device) = gpu_float.sum().cast<Eigen::half>();
  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().sum();

  Tensor<Eigen::half, 0> half_prec;
  Tensor<Eigen::half, 0> full_prec;
  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
  sycl_device.synchronize();

  VERIFY_IS_APPROX(full_prec(), half_prec());

  gpu_res_float.device(sycl_device) = gpu_float.maximum().cast<Eigen::half>();
  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().maximum();
  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
  sycl_device.synchronize();

  VERIFY_IS_APPROX(full_prec(), half_prec());
}
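Full reductions produce a rank-0 tensor, so the host-side counterpart needs no loop at all. A minimal sketch, assuming the Tensor module; the half path accumulates in Eigen::half, which is exactly where the two results can diverge.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  const int size = 13;
  Eigen::Tensor<float, 2> t(size, size);
  t.setRandom();

  Eigen::Tensor<Eigen::half, 0> full_sum = t.sum().cast<Eigen::half>();
  Eigen::Tensor<Eigen::half, 0> half_sum = t.cast<Eigen::half>().sum();
  Eigen::Tensor<Eigen::half, 0> full_max = t.maximum().cast<Eigen::half>();
  Eigen::Tensor<Eigen::half, 0> half_max = t.cast<Eigen::half>().maximum();

  std::cout << "sum: " << static_cast<float>(full_sum()) << " vs "
            << static_cast<float>(half_sum()) << "\n";
  std::cout << "max: " << static_cast<float>(full_max()) << " vs "
            << static_cast<float>(half_max()) << "\n";
  return 0;
}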

void test_gpu_forced_evals(const Eigen::SyclDevice &sycl_device) {
  int num_elem = 101;

  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_res_half1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_res_half2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_res_float = (float*)sycl_device.allocate(num_elem * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
      d_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
      d_res_half1, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
      d_res_half2, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
      d_res_float, num_elem);

  Eigen::array<int, 1> no_bcast;
  no_bcast[0] = 1;

  gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
  gpu_res_float.device(sycl_device) = gpu_float.abs();
  gpu_res_half1.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
  gpu_res_half2.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();

  Tensor<float, 1> half_prec1(num_elem);
  Tensor<float, 1> half_prec2(num_elem);
  Tensor<float, 1> full_prec(num_elem);
  sycl_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem * sizeof(float));
  sycl_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem * sizeof(float));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(float));
  sycl_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking forced eval " << i << " " << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
    VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
    VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
  }
}
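The two half-precision results above differ only in the size-preserving broadcast inserted before .eval(); both force the cast-and-abs subexpression to be materialized before the final cast back to float. The same forcing can be reproduced on the host; a broadcast factor of 1 leaves the data unchanged and merely adds another expression node. Sketch only, not part of the commit.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  const int n = 101;
  Eigen::Tensor<float, 1> x(n);
  x.setRandom();
  x = x - x.constant(0.5f);

  Eigen::array<int, 1> no_bcast = {1};  // factor 1: same shape, extra expression node

  Eigen::Tensor<float, 1> direct  = x.abs();
  Eigen::Tensor<float, 1> forced1 = x.cast<Eigen::half>().abs().eval().cast<float>();
  Eigen::Tensor<float, 1> forced2 =
      x.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();

  for (int i = 0; i < n; ++i)
    std::cout << direct(i) << " " << forced1(i) << " " << forced2(i) << "\n";
  return 0;
}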

EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_sycl)
{
  for (const auto& s : Eigen::get_sycl_supported_devices()) {
    QueueInterface queueInterface(s);
    auto sycl_device = Eigen::SyclDevice(&queueInterface);

    CALL_SUBTEST_1(test_gpu_numext(sycl_device));
    CALL_SUBTEST_1(test_gpu_conversion(sycl_device));
    CALL_SUBTEST_1(test_gpu_unary(sycl_device));
    CALL_SUBTEST_1(test_gpu_elementwise(sycl_device));
    CALL_SUBTEST_1(test_gpu_trancendental(sycl_device));
    CALL_SUBTEST_2(test_gpu_contractions(sycl_device));
    CALL_SUBTEST_3(test_gpu_reductions(sycl_device));
    CALL_SUBTEST_4(test_gpu_full_reductions(sycl_device));
    CALL_SUBTEST_5(test_gpu_forced_evals(sycl_device));
  }
}

@ -152,6 +152,7 @@ template<typename DataType, typename dev_Selector> void sycl_padding_test_per_de
 EIGEN_DECLARE_TEST(cxx11_tensor_padding_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_padding_test_per_device<half>(device));
     CALL_SUBTEST(sycl_padding_test_per_device<float>(device));
   }
 }

@ -244,6 +244,7 @@ template<typename DataType, typename dev_Selector> void sycl_tensor_patch_test_p
 EIGEN_DECLARE_TEST(cxx11_tensor_patch_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_patch_test_per_device<half>(device));
     CALL_SUBTEST(sycl_tensor_patch_test_per_device<float>(device));
   }
 }

@ -78,6 +78,7 @@ template<typename DataType, typename dev_Selector> void sycl_random_test_per_dev
 EIGEN_DECLARE_TEST(cxx11_tensor_random_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_random_test_per_device<half>(device));
     CALL_SUBTEST(sycl_random_test_per_device<float>(device));
 #ifdef EIGEN_SYCL_DOUBLE_SUPPORT
     CALL_SUBTEST(sycl_random_test_per_device<double>(device));

@ -248,6 +248,7 @@ EIGEN_DECLARE_TEST(cxx11_tensor_reverse_sycl) {
 #ifdef EIGEN_SYCL_DOUBLE_SUPPORT
     CALL_SUBTEST_4(sycl_reverse_test_per_device<double>(device));
 #endif
-    CALL_SUBTEST_5(sycl_reverse_test_per_device<float>(device));
+    CALL_SUBTEST_5(sycl_reverse_test_per_device<half>(device));
+    CALL_SUBTEST_6(sycl_reverse_test_per_device<float>(device));
   }
 }

@ -112,6 +112,7 @@ void sycl_shuffling_test_per_device(dev_Selector s) {
 }
 EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_shuffling_test_per_device<half>(device));
     CALL_SUBTEST(sycl_shuffling_test_per_device<float>(device));
   }
 }

@ -217,6 +217,7 @@ test_entire_volume_patch_sycl<DataType, int64_t>(sycl_device);
 EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<half>(device));
     CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<float>(device));
   }
 }