From ba47341a14d3c883307548d19a095c066fbb9830 Mon Sep 17 00:00:00 2001
From: Alejandro Acosta
Date: Thu, 27 Apr 2023 14:01:11 +0100
Subject: [PATCH] [SYCL-2020] Enabling half precision support for SYCL.

---
 Eigen/src/Core/arch/SYCL/InteropHeaders.h     |   4 +
 Eigen/src/Core/arch/SYCL/MathFunctions.h      |  24 ++
 Eigen/src/Core/arch/SYCL/PacketMath.h         | 292 ++++++++++++-
 .../CXX11/src/Tensor/TensorContractionSycl.h  |   4 +-
 unsupported/test/CMakeLists.txt               |   1 +
 unsupported/test/cxx11_tensor_argmax_sycl.cpp |  15 +-
 .../test/cxx11_tensor_broadcast_sycl.cpp      |   1 +
 .../test/cxx11_tensor_builtins_sycl.cpp       | 138 +++++-
 .../test/cxx11_tensor_chipping_sycl.cpp       |   1 +
 .../test/cxx11_tensor_concatenation_sycl.cpp  |   1 +
 unsupported/test/cxx11_tensor_device_sycl.cpp |   1 +
 .../test/cxx11_tensor_image_op_sycl.cpp       |   1 +
 .../test/cxx11_tensor_inflation_sycl.cpp      |   1 +
 .../test/cxx11_tensor_layout_swap_sycl.cpp    |   1 +
 unsupported/test/cxx11_tensor_math_sycl.cpp   |   1 +
 .../test/cxx11_tensor_morphing_sycl.cpp       |   1 +
 .../test/cxx11_tensor_of_float16_sycl.cpp     | 406 ++++++++++++++++++
 .../test/cxx11_tensor_padding_sycl.cpp        |   1 +
 unsupported/test/cxx11_tensor_patch_sycl.cpp  |   1 +
 unsupported/test/cxx11_tensor_random_sycl.cpp |   1 +
 .../test/cxx11_tensor_reverse_sycl.cpp        |   3 +-
 .../test/cxx11_tensor_shuffling_sycl.cpp      |   1 +
 .../test/cxx11_tensor_volume_patch_sycl.cpp   |   1 +
 23 files changed, 873 insertions(+), 28 deletions(-)
 create mode 100644 unsupported/test/cxx11_tensor_of_float16_sycl.cpp

diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
index ed4edc11e..27d9a82d3 100644
--- a/Eigen/src/Core/arch/SYCL/InteropHeaders.h
+++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
@@ -86,6 +86,8 @@ struct sycl_packet_traits : default_packet_traits {
     typedef packet_type half; \
   };
 
+SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, Eigen::half, 8)
+SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, const Eigen::half, 8)
 SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4)
 SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4)
 SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2)
@@ -100,6 +102,7 @@ SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2)
   struct is_arithmetic<packet_type> { \
     enum { value = true };            \
   };
+SYCL_ARITHMETIC(cl::sycl::cl_half8)
 SYCL_ARITHMETIC(cl::sycl::cl_float4)
 SYCL_ARITHMETIC(cl::sycl::cl_double2)
 #undef SYCL_ARITHMETIC
@@ -111,6 +114,7 @@ SYCL_ARITHMETIC(cl::sycl::cl_double2)
     enum { size = lengths, vectorizable = true, alignment = Aligned16 }; \
     typedef packet_type half;                                            \
   };
+SYCL_UNPACKET_TRAITS(cl::sycl::cl_half8, Eigen::half, 8)
 SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)
 SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2)
 
diff --git a/Eigen/src/Core/arch/SYCL/MathFunctions.h b/Eigen/src/Core/arch/SYCL/MathFunctions.h
index bdbb21f9f..a8adc46c5 100644
--- a/Eigen/src/Core/arch/SYCL/MathFunctions.h
+++ b/Eigen/src/Core/arch/SYCL/MathFunctions.h
@@ -38,6 +38,7 @@ namespace internal {
     return cl::sycl::log(a); \
   }
 
+SYCL_PLOG(cl::sycl::cl_half8)
 SYCL_PLOG(cl::sycl::cl_float4)
 SYCL_PLOG(cl::sycl::cl_double2)
 #undef SYCL_PLOG
@@ -49,6 +50,7 @@ SYCL_PLOG(cl::sycl::cl_double2)
     return cl::sycl::log1p(a); \
   }
 
+SYCL_PLOG1P(cl::sycl::cl_half8)
 SYCL_PLOG1P(cl::sycl::cl_float4)
 SYCL_PLOG1P(cl::sycl::cl_double2)
 #undef SYCL_PLOG1P
@@ -60,6 +62,7 @@ SYCL_PLOG1P(cl::sycl::cl_double2)
     return cl::sycl::log10(a); \
   }
 
+SYCL_PLOG10(cl::sycl::cl_half8)
 SYCL_PLOG10(cl::sycl::cl_float4)
 SYCL_PLOG10(cl::sycl::cl_double2)
 #undef SYCL_PLOG10
@@ -71,6 +74,8 @@ SYCL_PLOG10(cl::sycl::cl_double2)
     return cl::sycl::exp(a); \
   }
 
+SYCL_PEXP(cl::sycl::cl_half8)
+SYCL_PEXP(cl::sycl::cl_half)
 SYCL_PEXP(cl::sycl::cl_float4)
 SYCL_PEXP(cl::sycl::cl_float)
 SYCL_PEXP(cl::sycl::cl_double2)
@@ -83,6 +88,7 @@ SYCL_PEXP(cl::sycl::cl_double2)
     return cl::sycl::expm1(a); \
   }
 
+SYCL_PEXPM1(cl::sycl::cl_half8)
 SYCL_PEXPM1(cl::sycl::cl_float4)
 SYCL_PEXPM1(cl::sycl::cl_double2)
 #undef SYCL_PEXPM1
@@ -94,6 +100,7 @@ SYCL_PEXPM1(cl::sycl::cl_double2)
     return cl::sycl::sqrt(a); \
   }
 
+SYCL_PSQRT(cl::sycl::cl_half8)
 SYCL_PSQRT(cl::sycl::cl_float4)
 SYCL_PSQRT(cl::sycl::cl_double2)
 #undef SYCL_PSQRT
@@ -105,6 +112,7 @@ SYCL_PSQRT(cl::sycl::cl_double2)
     return cl::sycl::rsqrt(a); \
   }
 
+SYCL_PRSQRT(cl::sycl::cl_half8)
 SYCL_PRSQRT(cl::sycl::cl_float4)
 SYCL_PRSQRT(cl::sycl::cl_double2)
 #undef SYCL_PRSQRT
@@ -117,6 +125,7 @@ SYCL_PRSQRT(cl::sycl::cl_double2)
     return cl::sycl::sin(a); \
   }
 
+SYCL_PSIN(cl::sycl::cl_half8)
 SYCL_PSIN(cl::sycl::cl_float4)
 SYCL_PSIN(cl::sycl::cl_double2)
 #undef SYCL_PSIN
@@ -129,6 +138,7 @@ SYCL_PSIN(cl::sycl::cl_double2)
     return cl::sycl::cos(a); \
   }
 
+SYCL_PCOS(cl::sycl::cl_half8)
 SYCL_PCOS(cl::sycl::cl_float4)
 SYCL_PCOS(cl::sycl::cl_double2)
 #undef SYCL_PCOS
@@ -141,6 +151,7 @@ SYCL_PCOS(cl::sycl::cl_double2)
     return cl::sycl::tan(a); \
   }
 
+SYCL_PTAN(cl::sycl::cl_half8)
 SYCL_PTAN(cl::sycl::cl_float4)
 SYCL_PTAN(cl::sycl::cl_double2)
 #undef SYCL_PTAN
@@ -153,6 +164,7 @@ SYCL_PTAN(cl::sycl::cl_double2)
     return cl::sycl::asin(a); \
   }
 
+SYCL_PASIN(cl::sycl::cl_half8)
 SYCL_PASIN(cl::sycl::cl_float4)
 SYCL_PASIN(cl::sycl::cl_double2)
 #undef SYCL_PASIN
@@ -165,6 +177,7 @@ SYCL_PASIN(cl::sycl::cl_double2)
     return cl::sycl::acos(a); \
   }
 
+SYCL_PACOS(cl::sycl::cl_half8)
 SYCL_PACOS(cl::sycl::cl_float4)
 SYCL_PACOS(cl::sycl::cl_double2)
 #undef SYCL_PACOS
@@ -177,6 +190,7 @@ SYCL_PACOS(cl::sycl::cl_double2)
     return cl::sycl::atan(a); \
   }
 
+SYCL_PATAN(cl::sycl::cl_half8)
 SYCL_PATAN(cl::sycl::cl_float4)
 SYCL_PATAN(cl::sycl::cl_double2)
 #undef SYCL_PATAN
@@ -189,6 +203,7 @@ SYCL_PATAN(cl::sycl::cl_double2)
     return cl::sycl::sinh(a); \
   }
 
+SYCL_PSINH(cl::sycl::cl_half8)
 SYCL_PSINH(cl::sycl::cl_float4)
 SYCL_PSINH(cl::sycl::cl_double2)
 #undef SYCL_PSINH
@@ -201,6 +216,7 @@ SYCL_PSINH(cl::sycl::cl_double2)
     return cl::sycl::cosh(a); \
   }
 
+SYCL_PCOSH(cl::sycl::cl_half8)
 SYCL_PCOSH(cl::sycl::cl_float4)
 SYCL_PCOSH(cl::sycl::cl_double2)
 #undef SYCL_PCOSH
@@ -213,6 +229,7 @@ SYCL_PCOSH(cl::sycl::cl_double2)
     return cl::sycl::tanh(a); \
   }
 
+SYCL_PTANH(cl::sycl::cl_half8)
 SYCL_PTANH(cl::sycl::cl_float4)
 SYCL_PTANH(cl::sycl::cl_double2)
 #undef SYCL_PTANH
@@ -224,6 +241,7 @@ SYCL_PTANH(cl::sycl::cl_double2)
     return cl::sycl::ceil(a); \
   }
 
+SYCL_PCEIL(cl::sycl::cl_half8)
 SYCL_PCEIL(cl::sycl::cl_float4)
 SYCL_PCEIL(cl::sycl::cl_double2)
 #undef SYCL_PCEIL
@@ -235,6 +253,7 @@ SYCL_PCEIL(cl::sycl::cl_double2)
     return cl::sycl::round(a); \
   }
 
+SYCL_PROUND(cl::sycl::cl_half8)
 SYCL_PROUND(cl::sycl::cl_float4)
 SYCL_PROUND(cl::sycl::cl_double2)
 #undef SYCL_PROUND
@@ -246,6 +265,7 @@ SYCL_PROUND(cl::sycl::cl_double2)
     return cl::sycl::rint(a); \
   }
 
+SYCL_PRINT(cl::sycl::cl_half8)
 SYCL_PRINT(cl::sycl::cl_float4)
 SYCL_PRINT(cl::sycl::cl_double2)
 #undef SYCL_PRINT
@@ -257,6 +277,7 @@ SYCL_PRINT(cl::sycl::cl_double2)
     return cl::sycl::floor(a); \
   }
 
+SYCL_FLOOR(cl::sycl::cl_half8)
 SYCL_FLOOR(cl::sycl::cl_float4)
 SYCL_FLOOR(cl::sycl::cl_double2)
 #undef SYCL_FLOOR
@@ -268,6 +289,7 @@ SYCL_FLOOR(cl::sycl::cl_double2)
     return expr; \
   }
 
+SYCL_PMIN(cl::sycl::cl_half8, cl::sycl::fmin(a, b))
 SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b))
 SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
 #undef SYCL_PMIN
@@ -279,6 +301,7 @@ SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
     return expr; \
   }
 
+SYCL_PMAX(cl::sycl::cl_half8, cl::sycl::fmax(a, b))
 SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b))
 SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
 #undef SYCL_PMAX
@@ -292,6 +315,7 @@ SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
         cl::sycl::rounding_mode::automatic>()); \
   }
 
+SYCL_PLDEXP(cl::sycl::cl_half8)
 SYCL_PLDEXP(cl::sycl::cl_float4)
 SYCL_PLDEXP(cl::sycl::cl_double2)
 #undef SYCL_PLDEXP
diff --git a/Eigen/src/Core/arch/SYCL/PacketMath.h b/Eigen/src/Core/arch/SYCL/PacketMath.h
index fbdcdc024..4b0b1c6c0 100644
--- a/Eigen/src/Core/arch/SYCL/PacketMath.h
+++ b/Eigen/src/Core/arch/SYCL/PacketMath.h
@@ -44,9 +44,34 @@ SYCL_PLOAD(cl::sycl::cl_float4, u)
 SYCL_PLOAD(cl::sycl::cl_float4, )
 SYCL_PLOAD(cl::sycl::cl_double2, u)
 SYCL_PLOAD(cl::sycl::cl_double2, )
-
 #undef SYCL_PLOAD
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
+pload<cl::sycl::cl_half8>(
+    const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<const cl::sycl::half*>(from));
+  cl::sycl::cl_half8 res{};
+  res.load(0, ptr);
+  return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
+ploadu<cl::sycl::cl_half8>(
+    const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<const cl::sycl::half*>(from));
+  cl::sycl::cl_half8 res{};
+  res.load(0, ptr);
+  return res;
+}
+
 #define SYCL_PSTORE(scalar, packet_type, alignment)               \
   template <>                                                     \
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment(   \
@@ -59,9 +84,28 @@ SYCL_PSTORE(float, cl::sycl::cl_float4, )
 SYCL_PSTORE(float, cl::sycl::cl_float4, u)
 SYCL_PSTORE(double, cl::sycl::cl_double2, )
 SYCL_PSTORE(double, cl::sycl::cl_double2, u)
-
 #undef SYCL_PSTORE
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoreu<Eigen::half>(
+    Eigen::half* to, const cl::sycl::cl_half8& from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<cl::sycl::half*>(to));
+  from.store(0, ptr);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore<Eigen::half>(
+    Eigen::half* to, const cl::sycl::cl_half8& from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<cl::sycl::half*>(to));
+  from.store(0, ptr);
+}
+
 #define SYCL_PSET1(packet_type)                                         \
   template <>                                                           \
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>( \
@@ -70,6 +114,7 @@ SYCL_PSTORE(double, cl::sycl::cl_double2, u)
   }
 
 // global space
+SYCL_PSET1(cl::sycl::cl_half8)
 SYCL_PSET1(cl::sycl::cl_float4)
 SYCL_PSET1(cl::sycl::cl_double2)
 
@@ -86,6 +131,58 @@ struct get_base_packet {
   get_pgather(sycl_multi_pointer, Index) {}
 };
 
+template <>
+struct get_base_packet<cl::sycl::cl_half8> {
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_ploaddup(
+      sycl_multi_pointer from) {
+    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0]),
+                              static_cast<cl::sycl::half>(from[0]),
+                              static_cast<cl::sycl::half>(from[1]),
+                              static_cast<cl::sycl::half>(from[1]),
+                              static_cast<cl::sycl::half>(from[2]),
+                              static_cast<cl::sycl::half>(from[2]),
+                              static_cast<cl::sycl::half>(from[3]),
+                              static_cast<cl::sycl::half>(from[3]));
+  }
+  template <typename sycl_multi_pointer, typename Index>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_pgather(
+      sycl_multi_pointer from, Index stride) {
+    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0 * stride]),
+                              static_cast<cl::sycl::half>(from[1 * stride]),
+                              static_cast<cl::sycl::half>(from[2 * stride]),
+                              static_cast<cl::sycl::half>(from[3 * stride]),
+                              static_cast<cl::sycl::half>(from[4 * stride]),
+                              static_cast<cl::sycl::half>(from[5 * stride]),
+                              static_cast<cl::sycl::half>(from[6 * stride]),
+                              static_cast<cl::sycl::half>(from[7 * stride]));
+  }
+
+  template <typename sycl_multi_pointer, typename Index>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(
+      sycl_multi_pointer to, const cl::sycl::cl_half8& from, Index stride) {
+    auto tmp = stride;
+    to[0] = Eigen::half(from.s0());
+    to[tmp] = Eigen::half(from.s1());
+    to[tmp += stride] = Eigen::half(from.s2());
+    to[tmp += stride] = Eigen::half(from.s3());
+    to[tmp += stride] = Eigen::half(from.s4());
+    to[tmp += stride] = Eigen::half(from.s5());
+    to[tmp += stride] = Eigen::half(from.s6());
+    to[tmp += stride] = Eigen::half(from.s7());
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 set_plset(
+      const cl::sycl::half& a) {
+    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(a),
+                              static_cast<cl::sycl::half>(a + 1),
+                              static_cast<cl::sycl::half>(a + 2),
+                              static_cast<cl::sycl::half>(a + 3),
+                              static_cast<cl::sycl::half>(a + 4),
+                              static_cast<cl::sycl::half>(a + 5),
+                              static_cast<cl::sycl::half>(a + 6),
+                              static_cast<cl::sycl::half>(a + 7));
+  }
+};
+
 template <>
 struct get_base_packet<cl::sycl::cl_float4> {
   template <typename sycl_multi_pointer>
@@ -152,6 +249,7 @@ struct get_base_packet<cl::sycl::cl_double2> {
     return get_base_packet<packet_type>::get_ploaddup(from); \
   }
 
+SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_half8)
 SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4)
 SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
 
@@ -165,9 +263,14 @@ SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
   }
 SYCL_PLSET(cl::sycl::cl_float4)
 SYCL_PLSET(cl::sycl::cl_double2)
-
 #undef SYCL_PLSET
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 plset<cl::sycl::cl_half8>(
+    const typename unpacket_traits<cl::sycl::cl_half8>::type& a) {
+  return get_base_packet<cl::sycl::cl_half8>::set_plset((const cl::sycl::half&)a);
+}
+
 #define SYCL_PGATHER_SPECILIZE(scalar, packet_type)             \
   template <>                                                   \
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type             \
@@ -176,9 +279,9 @@ SYCL_PLSET(cl::sycl::cl_double2)
     return get_base_packet<packet_type>::get_pgather(from, stride); \
   }
 
+SYCL_PGATHER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
 SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4)
 SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
-
 #undef SYCL_PGATHER_SPECILIZE
 
 #define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \
@@ -189,6 +292,7 @@ SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
     get_base_packet<packet_type>::set_pscatter(to, from, stride); \
   }
 
+SYCL_PSCATTER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
 SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4)
 SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
 
@@ -201,10 +305,16 @@ SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
     return cl::sycl::mad(a, b, c); \
   }
 
+SYCL_PMAD(cl::sycl::cl_half8)
 SYCL_PMAD(cl::sycl::cl_float4)
 SYCL_PMAD(cl::sycl::cl_double2)
 #undef SYCL_PMAD
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half pfirst<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0());
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -216,6 +326,13 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst<cl::sycl::cl_double2>(
   return a.x();
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0() + a.s1() + a.s2() + a.s3() + a.s4() + a.s5() +
+                     a.s6() + a.s7());
+}
+
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -228,6 +345,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux<cl::sycl::cl_double2>(
   return a.x() + a.y();
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_max<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(cl::sycl::fmax(
+      cl::sycl::fmax(
+          cl::sycl::fmax(a.s0(), a.s1()),
+          cl::sycl::fmax(a.s2(), a.s3())),
+      cl::sycl::fmax(
+          cl::sycl::fmax(a.s4(), a.s5()),
+          cl::sycl::fmax(a.s6(), a.s7()))));
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -240,6 +368,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max<cl::sycl::cl_double2>(
   return cl::sycl::fmax(a.x(), a.y());
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_min<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(cl::sycl::fmin(
+      cl::sycl::fmin(
+          cl::sycl::fmin(a.s0(), a.s1()),
+          cl::sycl::fmin(a.s2(), a.s3())),
+      cl::sycl::fmin(
+          cl::sycl::fmin(a.s4(), a.s5()),
+          cl::sycl::fmin(a.s6(), a.s7()))));
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -252,6 +391,12 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min<cl::sycl::cl_double2>(
   return cl::sycl::fmin(a.x(), a.y());
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_mul<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0() * a.s1() * a.s2() * a.s3() * a.s4() * a.s5() *
+                     a.s6() * a.s7());
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -263,6 +408,14 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul<cl::sycl::cl_double2>(
   return a.x() * a.y();
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
+pabs<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+  return cl::sycl::cl_half8(cl::sycl::fabs(a.s0()), cl::sycl::fabs(a.s1()),
+                            cl::sycl::fabs(a.s2()), cl::sycl::fabs(a.s3()),
+                            cl::sycl::fabs(a.s4()), cl::sycl::fabs(a.s5()),
+                            cl::sycl::fabs(a.s6()), cl::sycl::fabs(a.s7()));
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
 pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
@@ -300,6 +453,9 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_eq(const Packet &a,
     return sycl_pcmp_##OP<packet_type>(a, b); \
   }
 
+SYCL_PCMP(le, cl::sycl::cl_half8)
+SYCL_PCMP(lt, cl::sycl::cl_half8)
+SYCL_PCMP(eq, cl::sycl::cl_half8)
 SYCL_PCMP(le, cl::sycl::cl_float4)
 SYCL_PCMP(lt, cl::sycl::cl_float4)
 SYCL_PCMP(eq, cl::sycl::cl_float4)
@@ -308,6 +464,121 @@ SYCL_PCMP(lt, cl::sycl::cl_double2)
 SYCL_PCMP(eq, cl::sycl::cl_double2)
 #undef SYCL_PCMP
 
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
+    PacketBlock<cl::sycl::cl_half8, 8>& kernel) {
+  cl::sycl::half tmp = kernel.packet[0].s1();
+  kernel.packet[0].s1() = kernel.packet[1].s0();
+  kernel.packet[1].s0() = tmp;
+
+  tmp = kernel.packet[0].s2();
+  kernel.packet[0].s2() = kernel.packet[2].s0();
+  kernel.packet[2].s0() = tmp;
+
+  tmp = kernel.packet[0].s3();
+  kernel.packet[0].s3() = kernel.packet[3].s0();
+  kernel.packet[3].s0() = tmp;
+
+  tmp = kernel.packet[0].s4();
+  kernel.packet[0].s4() = kernel.packet[4].s0();
+  kernel.packet[4].s0() = tmp;
+
+  tmp = kernel.packet[0].s5();
+  kernel.packet[0].s5() = kernel.packet[5].s0();
+  kernel.packet[5].s0() = tmp;
+
+  tmp = kernel.packet[0].s6();
+  kernel.packet[0].s6() = kernel.packet[6].s0();
+  kernel.packet[6].s0() = tmp;
+
+  tmp = kernel.packet[0].s7();
+  kernel.packet[0].s7() = kernel.packet[7].s0();
+  kernel.packet[7].s0() = tmp;
+
+  tmp = kernel.packet[1].s2();
+  kernel.packet[1].s2() = kernel.packet[2].s1();
+  kernel.packet[2].s1() = tmp;
+
+  tmp = kernel.packet[1].s3();
+  kernel.packet[1].s3() = kernel.packet[3].s1();
+  kernel.packet[3].s1() = tmp;
+
+  tmp = kernel.packet[1].s4();
+  kernel.packet[1].s4() = kernel.packet[4].s1();
+  kernel.packet[4].s1() = tmp;
+
+  tmp = kernel.packet[1].s5();
+  kernel.packet[1].s5() = kernel.packet[5].s1();
+  kernel.packet[5].s1() = tmp;
+
+  tmp = kernel.packet[1].s6();
+  kernel.packet[1].s6() = kernel.packet[6].s1();
+  kernel.packet[6].s1() = tmp;
+
+  tmp = kernel.packet[1].s7();
+  kernel.packet[1].s7() = kernel.packet[7].s1();
+  kernel.packet[7].s1() = tmp;
+
+  tmp = kernel.packet[2].s3();
+  kernel.packet[2].s3() = kernel.packet[3].s2();
+  kernel.packet[3].s2() = tmp;
+
+  tmp = kernel.packet[2].s4();
+  kernel.packet[2].s4() = kernel.packet[4].s2();
+  kernel.packet[4].s2() = tmp;
+
+  tmp = kernel.packet[2].s5();
+  kernel.packet[2].s5() = kernel.packet[5].s2();
+  kernel.packet[5].s2() = tmp;
+
+  tmp = kernel.packet[2].s6();
+  kernel.packet[2].s6() = kernel.packet[6].s2();
+  kernel.packet[6].s2() = tmp;
+
+  tmp = kernel.packet[2].s7();
+  kernel.packet[2].s7() = kernel.packet[7].s2();
+  kernel.packet[7].s2() = tmp;
+
+  tmp = kernel.packet[3].s4();
+  kernel.packet[3].s4() = kernel.packet[4].s3();
+  kernel.packet[4].s3() = tmp;
+
+  tmp = kernel.packet[3].s5();
+  kernel.packet[3].s5() = kernel.packet[5].s3();
+  kernel.packet[5].s3() = tmp;
+
+  tmp = kernel.packet[3].s6();
+  kernel.packet[3].s6() = kernel.packet[6].s3();
+  kernel.packet[6].s3() = tmp;
+
+  tmp = kernel.packet[3].s7();
+  kernel.packet[3].s7() = kernel.packet[7].s3();
+  kernel.packet[7].s3() = tmp;
+
+  tmp = kernel.packet[4].s5();
+  kernel.packet[4].s5() = kernel.packet[5].s4();
+  kernel.packet[5].s4() = tmp;
+
+  tmp = kernel.packet[4].s6();
+  kernel.packet[4].s6() = kernel.packet[6].s4();
+  kernel.packet[6].s4() = tmp;
+
+  tmp = kernel.packet[4].s7();
+  kernel.packet[4].s7() = kernel.packet[7].s4();
+  kernel.packet[7].s4() = tmp;
+
+  tmp = kernel.packet[5].s6();
+  kernel.packet[5].s6() = kernel.packet[6].s5();
+  kernel.packet[6].s5() = tmp;
+
+  tmp = kernel.packet[5].s7();
+  kernel.packet[5].s7() = kernel.packet[7].s5();
+  kernel.packet[7].s5() = tmp;
+
+  tmp = kernel.packet[6].s7();
+  kernel.packet[6].s7() = kernel.packet[7].s6();
+  kernel.packet[7].s6() = tmp;
+}
+
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
     PacketBlock<cl::sycl::cl_float4, 4>& kernel) {
   float tmp = kernel.packet[0].y();
@@ -342,6 +613,19 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
   kernel.packet[1].x() = tmp;
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 pblend(
+    const Selector<unpacket_traits<cl::sycl::cl_half8>::size>& ifPacket,
+    const cl::sycl::cl_half8& thenPacket,
+    const cl::sycl::cl_half8& elsePacket) {
+  cl::sycl::cl_short8 condition(
+      ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1,
+      ifPacket.select[2] ? 0 : -1, ifPacket.select[3] ? 0 : -1,
+      ifPacket.select[4] ? 0 : -1, ifPacket.select[5] ? 0 : -1,
+      ifPacket.select[6] ? 0 : -1, ifPacket.select[7] ? 0 : -1);
+  return cl::sycl::select(thenPacket, elsePacket, condition);
+}
+
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pblend(
     const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
index cb74031fa..480e3ac02 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -1118,7 +1118,7 @@ struct GeneralVectorTensor {
                       : globalContractDimOffset + privateOffsetC)
                 : OutScalar(0);
-        outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]);
+        outScalar[j] = ::Eigen::internal::pmadd(matScalar, vecScalar, outScalar[j]);
         privateOffsetNC += Properties::LocalThreadSizeNC;
       }
       privateOffsetC += Properties::LocalThreadSizeC;
@@ -1263,7 +1263,7 @@ struct GeneralScalarContraction {
   StorageIndex localid = itemID.get_local_id(0);
   OutScalar accumulator = OutScalar(0);
   for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
-    accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator);
+    accumulator = Eigen::internal::pmadd(lhs(0, i), rhs(i, 0), accumulator);
   }
   auto out_scratch_ptr = scratch_ptr + localid;
   *out_scratch_ptr = accumulator;
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 1d40ae56c..d41baf2ea 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -151,6 +151,7 @@ if(EIGEN_TEST_SYCL)
     ei_add_test(cxx11_tensor_argmax_sycl)
     ei_add_test(cxx11_tensor_custom_op_sycl)
     ei_add_test(cxx11_tensor_scan_sycl)
+    ei_add_test(cxx11_tensor_of_float16_sycl)
     set(EIGEN_SYCL OFF)
 endif()
 
diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
index 41ea3cf7b..8f4e09573 100644
--- a/unsupported/test/cxx11_tensor_argmax_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
@@ -32,9 +32,9 @@ static void test_sycl_simple_argmax(const Eigen::SyclDevice& sycl_device) {
   Tensor<DenseIndex, 0, Layout, DenseIndex> out_max;
   Tensor<DenseIndex, 0, Layout, DenseIndex> out_min;
   in.setRandom();
-  in *= in.constant(100.0);
-  in(0, 0, 0) = -1000.0;
-  in(1, 1, 1) = 1000.0;
+  in *= in.constant(static_cast<DataType>(100.0));
+  in(0, 0, 0) = static_cast<DataType>(-1000.0);
+  in(1, 1, 1) = static_cast<DataType>(1000.0);
 
   std::size_t in_bytes = in.size() * sizeof(DataType);
   std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
@@ -93,7 +93,7 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) {
           ix[3] = l;
           // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l)
           // = 10.0
-          tensor(ix) = (ix[dim] != 0) ? -1.0 : 10.0;
+          tensor(ix) = static_cast<DataType>((ix[dim] != 0) ? -1.0 : 10.0);
         }
       }
     }
@@ -132,7 +132,7 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) {
           ix[2] = k;
           ix[3] = l;
           // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
-          tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0;
+          tensor(ix) = static_cast<DataType>((ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0);
         }
       }
     }
@@ -180,7 +180,7 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) {
           ix[2] = k;
           ix[3] = l;
           // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
-          tensor(ix) = (ix[dim] != 0) ? 1.0 : -10.0;
+          tensor(ix) = static_cast<DataType>((ix[dim] != 0) ? 1.0 : -10.0);
         }
       }
     }
@@ -219,7 +219,7 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) {
           ix[2] = k;
           ix[3] = l;
           // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
-          tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0;
+          tensor(ix) = static_cast<DataType>((ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0);
         }
       }
     }
@@ -252,6 +252,7 @@ void sycl_argmax_test_per_device(const Device_Selector& d) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_argmax_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_argmax_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
index 20f84b8e0..6ca5ac714 100644
--- a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
@@ -139,6 +139,7 @@ template <typename DataType> void sycl_broadcast_test_per_device(const cl::sycl::device& d) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_broadcast_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_broadcast_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
index be7afd200..c8668d847 100644
--- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
@@ -28,26 +28,135 @@ using Eigen::TensorMap;
 
 // Functions used to compare the TensorMap implementation on the device with
 // the equivalent on the host
 namespace SYCL {
-template <typename T> T abs(T x) {
+
+template <typename T> T abs(T x) {
   return cl::sycl::abs(x);
 }
+template <> Eigen::half abs(Eigen::half x) {
+  return Eigen::half(cl::sycl::fabs(static_cast<cl::sycl::half>(x)));
+}
 
-template <> float abs(float x) {
+template <> float abs(float x) {
   return cl::sycl::fabs(x);
 }
-template <> double abs(double x) {
+template <> double abs(double x) {
   return cl::sycl::fabs(x);
 }
 
 template <typename T> T square(T x) { return x * x; }
 template <typename T> T cube(T x) { return x * x * x; }
 template <typename T> T inverse(T x) { return T(1) / x; }
-template <typename T> T cwiseMax(T x, T y) {
-return cl::sycl::max(x, y);
-}
-template <typename T> T cwiseMin(T x, T y) {
-  return cl::sycl::min(x, y);
+template <typename T> T cwiseMax(T x, T y) {
+  return cl::sycl::max(x, y);
+}
+template <> Eigen::half cwiseMax(Eigen::half x, Eigen::half y) {
+  return Eigen::half(cl::sycl::max(static_cast<cl::sycl::half>(x), static_cast<cl::sycl::half>(y)));
+}
+
+template <typename T> T cwiseMin(T x, T y) {
+  return cl::sycl::min(x, y);
+}
+template <> Eigen::half cwiseMin(Eigen::half x, Eigen::half y) {
+  return Eigen::half(cl::sycl::min(static_cast<cl::sycl::half>(x), static_cast<cl::sycl::half>(y)));
+}
+
+template <typename T> T sqrt(T x) {
+  return cl::sycl::sqrt(x);
+}
+template <> Eigen::half sqrt(Eigen::half x) {
+  return Eigen::half(cl::sycl::sqrt(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T rsqrt(T x) {
+  return cl::sycl::rsqrt(x);
+}
+template <> Eigen::half rsqrt(Eigen::half x) {
+  return Eigen::half(cl::sycl::rsqrt(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T tanh(T x) {
+  return cl::sycl::tanh(x);
+}
+template <> Eigen::half tanh(Eigen::half x) {
+  return Eigen::half(cl::sycl::tanh(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T exp(T x) {
+  return cl::sycl::exp(x);
+}
+template <> Eigen::half exp(Eigen::half x) {
+  return Eigen::half(cl::sycl::exp(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T expm1(T x) {
+  return cl::sycl::expm1(x);
+}
+template <> Eigen::half expm1(Eigen::half x) {
+  return Eigen::half(cl::sycl::expm1(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T log(T x) {
+  return cl::sycl::log(x);
+}
+template <> Eigen::half log(Eigen::half x) {
+  return Eigen::half(cl::sycl::log(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T ceil(T x) {
+  return cl::sycl::ceil(x);
+}
+template <> Eigen::half ceil(Eigen::half x) {
+  return Eigen::half(cl::sycl::ceil(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T floor(T x) {
+  return cl::sycl::floor(x);
+}
+template <> Eigen::half floor(Eigen::half x) {
+  return Eigen::half(cl::sycl::floor(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T round(T x) {
+  return cl::sycl::round(x);
+}
+template <> Eigen::half round(Eigen::half x) {
+  return Eigen::half(cl::sycl::round(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T log1p(T x) {
+  return cl::sycl::log1p(x);
+}
+template <> Eigen::half log1p(Eigen::half x) {
+  return Eigen::half(cl::sycl::log1p(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T sign(T x) {
+  return cl::sycl::sign(x);
+}
+template <> Eigen::half sign(Eigen::half x) {
+  return Eigen::half(cl::sycl::sign(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T isnan(T x) {
+  return cl::sycl::isnan(x);
+}
+template <> Eigen::half isnan(Eigen::half x) {
+  return Eigen::half(cl::sycl::isnan(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T isfinite(T x) {
+  return cl::sycl::isfinite(x);
+}
+template <> Eigen::half isfinite(Eigen::half x) {
+  return Eigen::half(cl::sycl::isfinite(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T isinf(T x) {
+  return cl::sycl::isinf(x);
+}
+template <> Eigen::half isinf(Eigen::half x) {
+  return Eigen::half(cl::sycl::isinf(static_cast<cl::sycl::half>(x)));
+}
 }
 
@@ -157,15 +266,14 @@ void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
 #define DECLARE_UNARY_STRUCT(FUNC)                                   \
   struct op_##FUNC {                                                 \
     template <typename T>                                            \
-    auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) {     \
-      return cl::sycl::FUNC(x);                                      \
+    auto operator()(const T& x) -> decltype(SYCL::FUNC(x)) {         \
+      return SYCL::FUNC(x);                                          \
     }                                                                \
     template <typename T>                                            \
    auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) {    \
       return x.FUNC();                                               \
     }                                                                \
-  };
-
+};
 
 DECLARE_UNARY_STRUCT(sqrt)
 DECLARE_UNARY_STRUCT(rsqrt)
@@ -390,8 +498,10 @@ EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
     QueueInterface queueInterface(device);
     Eigen::SyclDevice sycl_device(&queueInterface);
-    CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device));
-    CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device));
-    CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device));
+    CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device));
+    CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device));
+    CALL_SUBTEST_3(test_builtin_unary_sycl<Eigen::half>(sycl_device));
+    CALL_SUBTEST_4(test_floating_builtin_binary_sycl<Eigen::half>(sycl_device));
+    CALL_SUBTEST_5(test_integer_builtin_binary_sycl<int>(sycl_device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_chipping_sycl.cpp b/unsupported/test/cxx11_tensor_chipping_sycl.cpp
index 1e7093104..b018da89a 100644
--- a/unsupported/test/cxx11_tensor_chipping_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_chipping_sycl.cpp
@@ -619,5 +619,6 @@ EIGEN_DECLARE_TEST(cxx11_tensor_chipping_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_chipping_test_per_device<float>(device));
+    CALL_SUBTEST(sycl_chipping_test_per_device<Eigen::half>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
index 765991b35..18dc95b67 100644
--- a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
@@ -175,6 +175,7 @@ template <typename DataType> void tensorConcat_perDevice(const cl::sycl::device& d) {
 }
 EIGEN_DECLARE_TEST(cxx11_tensor_concatenation_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(tensorConcat_perDevice<Eigen::half>(device));
     CALL_SUBTEST(tensorConcat_perDevice<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp
index 2f75ef896..66a13e817 100644
--- a/unsupported/test/cxx11_tensor_device_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -82,6 +82,7 @@ template <typename DataType> void sycl_device_test_per_device(const cl::sycl::device& d) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_device_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_device_test_per_device<float>(device));
     CALL_SUBTEST(sycl_device_test_per_device<OffByOneScalar<int>>(device));
   }
diff --git a/unsupported/test/cxx11_tensor_image_op_sycl.cpp b/unsupported/test/cxx11_tensor_image_op_sycl.cpp
index db1c0206e..083c004fe 100644
--- a/unsupported/test/cxx11_tensor_image_op_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_image_op_sycl.cpp
@@ -95,6 +95,7 @@ template <typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_image_op_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_computing_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
 #ifdef EIGEN_SYCL_DOUBLE_SUPPORT
     CALL_SUBTEST(sycl_computing_test_per_device<double>(device));
diff --git a/unsupported/test/cxx11_tensor_inflation_sycl.cpp b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
index 521ae0cc3..75c2c0e3e 100644
--- a/unsupported/test/cxx11_tensor_inflation_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
@@ -131,6 +131,7 @@ template <typename DataType, typename dev_Selector> void sycl_inflation_test_per_device(dev_Selector s) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_inflation_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_inflation_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_inflation_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
index 9546b911c..f556d84ea 100644
--- a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
@@ -121,6 +121,7 @@ template <typename DataType, typename dev_Selector> void sycl_tensor_layout_swap_test_per_device(dev_Selector s) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_math_sycl.cpp b/unsupported/test/cxx11_tensor_math_sycl.cpp
index 029653e27..84d638e5b 100644
--- a/unsupported/test/cxx11_tensor_math_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_math_sycl.cpp
@@ -100,6 +100,7 @@ template <typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_math_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_computing_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
index bf001b40f..a1545d49a 100644
--- a/unsupported/test/cxx11_tensor_morphing_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
@@ -381,6 +381,7 @@ template <typename DataType, typename dev_Selector> void sycl_morphing_test_per_device(dev_Selector s) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_morphing_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_morphing_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_of_float16_sycl.cpp b/unsupported/test/cxx11_tensor_of_float16_sycl.cpp
new file mode 100644
index 000000000..1624acc4f
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_of_float16_sycl.cpp
@@ -0,0 +1,406 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023
+// Alejandro Acosta  Codeplay Software Ltd.
+// Contact:
+// Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_SYCL
+#define EIGEN_SYCL_HALF_SUPPORT
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::SyclDevice;
+
+void test_gpu_numext(const Eigen::SyclDevice &sycl_device) {
+  int num_elem = 101;
+
+  float* d_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+  bool* d_res_half = static_cast<bool*>(sycl_device.allocate(num_elem * sizeof(bool)));
+  bool* d_res_float = static_cast<bool*>(sycl_device.allocate(num_elem * sizeof(bool)));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(d_res_float, num_elem);
+
+  gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(sycl_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
+  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
+
+  Tensor<bool, 1> half_prec(num_elem);
+  Tensor<bool, 1> full_prec(num_elem);
+
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem * sizeof(bool));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(bool));
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking numext " << i << std::endl;
+    VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
+  }
+}
+
+void test_gpu_conversion(const Eigen::SyclDevice &sycl_device) {
+  int num_elem = 101;
+
+  float* d_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+  Eigen::half* d_half = static_cast<Eigen::half*>(sycl_device.allocate(num_elem * sizeof(Eigen::half)));
+  float* d_conv = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
+      d_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
+      d_conv, num_elem);
+
+  gpu_float.device(sycl_device) = gpu_float.random();
+  gpu_half.device(sycl_device) = gpu_float.cast<Eigen::half>();
+  gpu_conv.device(sycl_device) = gpu_half.cast<float>();
+
+  Tensor<float, 1> initial(num_elem);
+  Tensor<float, 1> final(num_elem);
+  sycl_device.memcpyDeviceToHost(initial.data(), d_float, num_elem * sizeof(float));
+  sycl_device.memcpyDeviceToHost(final.data(), d_conv, num_elem * sizeof(float));
+
+  for (int i = 0; i < num_elem; ++i) {
+    VERIFY_IS_APPROX(initial(i), final(i));
+  }
+}
+
+void test_gpu_unary(const Eigen::SyclDevice &sycl_device) {
+  int num_elem = 101;
+
+  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_res_half = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(sycl_device) = gpu_float.abs();
+  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();
+
+  Tensor<float, 1> half_prec(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem * sizeof(float));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(float));
+  sycl_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking unary " << i << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+}
+
+void test_gpu_elementwise(const Eigen::SyclDevice &sycl_device) {
+  int num_elem = 101;
+
+  float* d_float1 = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+  float* d_float2 = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+  float* d_res_half = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+  float* d_res_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(d_res_float, num_elem);
+
+  gpu_float1.device(sycl_device) = gpu_float1.random();
+  gpu_float2.device(sycl_device) = gpu_float2.random();
+  gpu_res_float.device(sycl_device) = (gpu_float1 + gpu_float2) * gpu_float1;
+  gpu_res_half.device(sycl_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();
+
+  Tensor<float, 1> half_prec(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem * sizeof(float));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(float));
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
+    VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
+  }
+}
+
+void test_gpu_transcendental(const Eigen::SyclDevice &sycl_device) {
+  int num_elem = 101;
+
+  float* d_float1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_float3 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res1_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res1_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res2_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res2_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res3_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res3_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem);
+
+  gpu_float1.device(sycl_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+  gpu_float2.device(sycl_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
+  gpu_float3.device(sycl_device) = gpu_float3.random();
+  gpu_res1_float.device(sycl_device) = gpu_float1.exp().cast<Eigen::half>();
+  gpu_res2_float.device(sycl_device) = gpu_float2.log().cast<Eigen::half>();
+  gpu_res3_float.device(sycl_device) = gpu_float3.log1p().cast<Eigen::half>();
+  gpu_res4_float.device(sycl_device) = gpu_float3.expm1().cast<Eigen::half>();
+
+  gpu_res1_half.device(sycl_device) = gpu_float1.cast<Eigen::half>();
+  gpu_res1_half.device(sycl_device) = gpu_res1_half.exp();
+
+  gpu_res2_half.device(sycl_device) = gpu_float2.cast<Eigen::half>();
+  gpu_res2_half.device(sycl_device) = gpu_res2_half.log();
+
+  gpu_res3_half.device(sycl_device) = gpu_float3.cast<Eigen::half>();
+  gpu_res3_half.device(sycl_device) = gpu_res3_half.log1p();
+
+  gpu_res4_half.device(sycl_device) = gpu_float3.cast<Eigen::half>();
+  gpu_res4_half.device(sycl_device) = gpu_res4_half.expm1();
+
+  Tensor<float, 1> input1(num_elem);
+  Tensor<Eigen::half, 1> half_prec1(num_elem);
+  Tensor<Eigen::half, 1> full_prec1(num_elem);
+  Tensor<float, 1> input2(num_elem);
+  Tensor<Eigen::half, 1> half_prec2(num_elem);
+  Tensor<Eigen::half, 1> full_prec2(num_elem);
+  Tensor<float, 1> input3(num_elem);
+  Tensor<Eigen::half, 1> half_prec3(num_elem);
+  Tensor<Eigen::half, 1> full_prec3(num_elem);
+  sycl_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem * sizeof(float));
+  sycl_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem * sizeof(float));
+  sycl_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem * sizeof(float));
+  sycl_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem * sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem * sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem * sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem * sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem * sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem * sizeof(Eigen::half));
+  sycl_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
+  }
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
+    if (std::abs(input2(i) - 1.f) < 0.05f)  // log lacks accuracy nearby 1
+      VERIFY_IS_APPROX(full_prec2(i) + Eigen::half(0.1f), half_prec2(i) + Eigen::half(0.1f));
+    else
+      VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
+  }
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise log1p " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
+  }
+}
+
+void test_gpu_contractions(const Eigen::SyclDevice &sycl_device) {
+  int rows = 23;
+  int cols = 23;
+  int num_elem = rows * cols;
+
+  float* d_float1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+      d_float1, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+      d_float2, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
+      d_res_half, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
+      d_res_float, rows, cols);
+
+  gpu_float1.device(sycl_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+  gpu_float2.device(sycl_device) = gpu_float2.random() - gpu_float2.constant(0.5f);
+
+  typedef typename Tensor<float, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+  gpu_res_float.device(sycl_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
+  gpu_res_half.device(sycl_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);
+
+  Tensor<Eigen::half, 2> half_prec(rows, cols);
+  Tensor<Eigen::half, 2> full_prec(rows, cols);
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem * sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(Eigen::half));
+  sycl_device.synchronize();
+
+  for (int i = 0; i < rows; ++i) {
+    for (int j = 0; j < cols; ++j) {
+      std::cout << "Checking contract " << i << " " << j << " " << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
+      if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
+        VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
+      }
+    }
+  }
+}
+
+void test_gpu_reductions(const Eigen::SyclDevice &sycl_device, int size1, int size2, int redux) {
+  std::cout << "Reducing " << size1 << " by " << size2
+            << " tensor along dim " << redux << std::endl;
+
+  int num_elem = size1 * size2;
+  int result_size = (redux == 1 ? size1 : size2);
+
+  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(result_size * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(result_size * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
+      d_float, size1, size2);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, result_size);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, result_size);
+
+  gpu_float.device(sycl_device) = gpu_float.random() * 2.0f;
+
+  Eigen::array<int, 1> redux_dim = {{redux}};
+  gpu_res_float.device(sycl_device) = gpu_float.sum(redux_dim).cast<Eigen::half>();
+  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().sum(redux_dim);
+
+  Tensor<Eigen::half, 1> half_prec(result_size);
+  Tensor<Eigen::half, 1> full_prec(result_size);
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size * sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size * sizeof(Eigen::half));
+  sycl_device.synchronize();
+
+  for (int i = 0; i < result_size; ++i) {
+    std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+}
+
+void test_gpu_reductions(const Eigen::SyclDevice &sycl_device) {
+  test_gpu_reductions(sycl_device, 13, 13, 0);
+  test_gpu_reductions(sycl_device, 13, 13, 1);
+
+  test_gpu_reductions(sycl_device, 35, 36, 0);
+  test_gpu_reductions(sycl_device, 35, 36, 1);
+
+  test_gpu_reductions(sycl_device, 36, 35, 0);
+  test_gpu_reductions(sycl_device, 36, 35, 1);
+}
+
+void test_gpu_full_reductions(const Eigen::SyclDevice &sycl_device) {
+  int size = 13;
+  int num_elem = size * size;
+
+  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(1 * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(1 * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
+      d_float, size, size);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
+      d_res_half);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
+      d_res_float);
+
+  gpu_float.device(sycl_device) = gpu_float.random();
+
+  gpu_res_float.device(sycl_device) = gpu_float.sum().cast<Eigen::half>();
+  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().sum();
+
+  Tensor<Eigen::half, 0> half_prec;
+  Tensor<Eigen::half, 0> full_prec;
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+  sycl_device.synchronize();
+
+  VERIFY_IS_APPROX(full_prec(), half_prec());
+
+  gpu_res_float.device(sycl_device) = gpu_float.maximum().cast<Eigen::half>();
+  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().maximum();
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+  sycl_device.synchronize();
+
+  VERIFY_IS_APPROX(full_prec(), half_prec());
+}
+
+void test_gpu_forced_evals(const Eigen::SyclDevice &sycl_device) {
+  int num_elem = 101;
+
+  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_res_half1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_res_half2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
+      d_res_half1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
+      d_res_half2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  Eigen::array<int, 1> no_bcast;
+  no_bcast[0] = 1;
+
+  gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(sycl_device) = gpu_float.abs();
+  gpu_res_half1.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
+  gpu_res_half2.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();
+
+  Tensor<float, 1> half_prec1(num_elem);
+  Tensor<float, 1> half_prec2(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  sycl_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem * sizeof(float));
+  sycl_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem * sizeof(float));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(float));
+  sycl_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking forced eval " << i << " " << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
+    VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_sycl)
+{
+  for (const auto& s : Eigen::get_sycl_supported_devices()) {
+    QueueInterface queueInterface(s);
+    auto sycl_device = Eigen::SyclDevice(&queueInterface);
+
+    CALL_SUBTEST_1(test_gpu_numext(sycl_device));
+    CALL_SUBTEST_1(test_gpu_conversion(sycl_device));
+    CALL_SUBTEST_1(test_gpu_unary(sycl_device));
+    CALL_SUBTEST_1(test_gpu_elementwise(sycl_device));
+    CALL_SUBTEST_1(test_gpu_transcendental(sycl_device));
+    CALL_SUBTEST_2(test_gpu_contractions(sycl_device));
+    CALL_SUBTEST_3(test_gpu_reductions(sycl_device));
+    CALL_SUBTEST_4(test_gpu_full_reductions(sycl_device));
+    CALL_SUBTEST_5(test_gpu_forced_evals(sycl_device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_padding_sycl.cpp b/unsupported/test/cxx11_tensor_padding_sycl.cpp
index 727a9ffd7..9b16bb150 100644
--- a/unsupported/test/cxx11_tensor_padding_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_padding_sycl.cpp
@@ -152,6 +152,7 @@ template <typename DataType, typename dev_Selector> void sycl_padding_test_per_device(dev_Selector s) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_padding_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_padding_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_padding_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_patch_sycl.cpp b/unsupported/test/cxx11_tensor_patch_sycl.cpp
index 7f92bec78..1216ad26a 100644
--- a/unsupported/test/cxx11_tensor_patch_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_patch_sycl.cpp
@@ -244,6 +244,7 @@ template <typename DataType, typename dev_Selector> void sycl_tensor_patch_test_per_device(dev_Selector s) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_patch_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_patch_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_tensor_patch_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_random_sycl.cpp b/unsupported/test/cxx11_tensor_random_sycl.cpp
index 14a7c489c..dcdfdd3c0 100644
--- a/unsupported/test/cxx11_tensor_random_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_random_sycl.cpp
@@ -78,6 +78,7 @@ template <typename DataType, typename dev_Selector> void sycl_random_test_per_device(dev_Selector s) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_random_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_random_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_random_test_per_device<float>(device));
 #ifdef EIGEN_SYCL_DOUBLE_SUPPORT
     CALL_SUBTEST(sycl_random_test_per_device<double>(device));
diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
index dd30c235d..5ed007f70 100644
--- a/unsupported/test/cxx11_tensor_reverse_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
@@ -248,6 +248,7 @@ EIGEN_DECLARE_TEST(cxx11_tensor_reverse_sycl) {
 #ifdef EIGEN_SYCL_DOUBLE_SUPPORT
     CALL_SUBTEST_4(sycl_reverse_test_per_device<double>(device));
 #endif
-    CALL_SUBTEST_5(sycl_reverse_test_per_device<float>(device));
+    CALL_SUBTEST_5(sycl_reverse_test_per_device<Eigen::half>(device));
+    CALL_SUBTEST_6(sycl_reverse_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
index ca4e8b5ef..70c4116a1 100644
--- a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
@@ -112,6 +112,7 @@ void sycl_shuffling_test_per_device(dev_Selector s) {
 }
 EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_shuffling_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_shuffling_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
index 8d99a48ed..dc86dfee9 100644
--- a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
@@ -217,6 +217,7 @@ test_entire_volume_patch_sycl(sycl_device);
 EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<float>(device));
   }
 }
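
For reference, a minimal host-side sketch of what this patch enables, kept outside the diff. It is a hypothetical standalone program, not part of the patch; it only reuses APIs that appear above (Eigen::get_sycl_supported_devices, QueueInterface, SyclDevice::allocate/memcpyDeviceToHost/synchronize, TensorMap, the EIGEN_SYCL_HALF_SUPPORT macro) and assumes a SYCL toolchain whose target device supports fp16:

// Hedged usage sketch: exercises the Eigen::half SYCL path added by this
// patch. Buffer names and sizes are illustrative, not taken from the patch.
#define EIGEN_USE_SYCL
#define EIGEN_SYCL_HALF_SUPPORT
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Pick the first SYCL device Eigen reports as usable.
  auto devices = Eigen::get_sycl_supported_devices();
  Eigen::QueueInterface queue_interface(devices[0]);
  Eigen::SyclDevice sycl_device(&queue_interface);

  const int n = 64;
  Eigen::half* d_in = static_cast<Eigen::half*>(sycl_device.allocate(n * sizeof(Eigen::half)));
  Eigen::half* d_out = static_cast<Eigen::half*>(sycl_device.allocate(n * sizeof(Eigen::half)));

  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>> in(d_in, n);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>> out(d_out, n);

  // Element-wise half-precision math now routes through the cl_half8
  // packet path (pload/pstore, SYCL_PSQRT, etc.) introduced above.
  in.device(sycl_device) = in.random();
  out.device(sycl_device) = (in + in.constant(Eigen::half(1.0f))).sqrt();

  // Copy the result back and wait for the queue to drain.
  Eigen::Tensor<Eigen::half, 1> host(n);
  sycl_device.memcpyDeviceToHost(host.data(), d_out, n * sizeof(Eigen::half));
  sycl_device.synchronize();

  sycl_device.deallocate(d_in);
  sycl_device.deallocate(d_out);
  return 0;
}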