Mirror of https://gitlab.com/libeigen/eigen.git
[SYCL-2020] Enabling half precision support for SYCL.
parent 92a77a596b
commit ba47341a14
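This commit wires Eigen::half into the SYCL backend by mapping it onto the cl::sycl::cl_half8 packet type, implementing the packet primitives for it, and enabling the half path in the SYCL tensor tests. As a rough user-level illustration (not part of the patch itself), the sketch below mirrors the new cxx11_tensor_of_float16_sycl.cpp test: a tensor expression is evaluated in half precision on a SYCL device and read back as float. The helper name run_half_abs_on_device, and the assumption that a SYCL device/queue is already set up, are illustrative only.

#define EIGEN_USE_SYCL
#define EIGEN_SYCL_HALF_SUPPORT
#include <unsupported/Eigen/CXX11/Tensor>

// Sketch only: evaluate |x| in half precision on the device and read it back
// as float, the same pattern the new test file uses.
void run_half_abs_on_device(const Eigen::SyclDevice& sycl_device) {
  const int num_elem = 101;
  float* d_in = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
  float* d_out = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> in(d_in, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> out(d_out, num_elem);

  in.device(sycl_device) = in.random() - in.constant(0.5f);
  // The cast<Eigen::half>() expression exercises the new cl_half8 packet path.
  out.device(sycl_device) = in.cast<Eigen::half>().abs().cast<float>();
  sycl_device.synchronize();

  sycl_device.deallocate(d_in);
  sycl_device.deallocate(d_out);
}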
@@ -86,6 +86,8 @@ struct sycl_packet_traits : default_packet_traits {
   typedef packet_type half; \
 };
 
+SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, Eigen::half, 8)
+SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, const Eigen::half, 8)
 SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4)
 SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4)
 SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2)
@@ -100,6 +102,7 @@ SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2)
   struct is_arithmetic<packet_type> { \
     enum { value = true }; \
   };
+SYCL_ARITHMETIC(cl::sycl::cl_half8)
 SYCL_ARITHMETIC(cl::sycl::cl_float4)
 SYCL_ARITHMETIC(cl::sycl::cl_double2)
 #undef SYCL_ARITHMETIC
@@ -111,6 +114,7 @@ SYCL_ARITHMETIC(cl::sycl::cl_double2)
   enum { size = lengths, vectorizable = true, alignment = Aligned16 }; \
   typedef packet_type half; \
 };
+SYCL_UNPACKET_TRAITS(cl::sycl::cl_half8, Eigen::half, 8)
 SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)
 SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2)
 
@@ -38,6 +38,7 @@ namespace internal {
     return cl::sycl::log(a); \
   }
 
+SYCL_PLOG(cl::sycl::cl_half8)
 SYCL_PLOG(cl::sycl::cl_float4)
 SYCL_PLOG(cl::sycl::cl_double2)
 #undef SYCL_PLOG
@@ -49,6 +50,7 @@ SYCL_PLOG(cl::sycl::cl_double2)
     return cl::sycl::log1p(a); \
   }
 
+SYCL_PLOG1P(cl::sycl::cl_half8)
 SYCL_PLOG1P(cl::sycl::cl_float4)
 SYCL_PLOG1P(cl::sycl::cl_double2)
 #undef SYCL_PLOG1P
@@ -60,6 +62,7 @@ SYCL_PLOG1P(cl::sycl::cl_double2)
     return cl::sycl::log10(a); \
   }
 
+SYCL_PLOG10(cl::sycl::cl_half8)
 SYCL_PLOG10(cl::sycl::cl_float4)
 SYCL_PLOG10(cl::sycl::cl_double2)
 #undef SYCL_PLOG10
@@ -71,6 +74,8 @@ SYCL_PLOG10(cl::sycl::cl_double2)
     return cl::sycl::exp(a); \
   }
 
+SYCL_PEXP(cl::sycl::cl_half8)
+SYCL_PEXP(cl::sycl::cl_half)
 SYCL_PEXP(cl::sycl::cl_float4)
 SYCL_PEXP(cl::sycl::cl_float)
 SYCL_PEXP(cl::sycl::cl_double2)
@@ -83,6 +88,7 @@ SYCL_PEXP(cl::sycl::cl_double2)
     return cl::sycl::expm1(a); \
   }
 
+SYCL_PEXPM1(cl::sycl::cl_half8)
 SYCL_PEXPM1(cl::sycl::cl_float4)
 SYCL_PEXPM1(cl::sycl::cl_double2)
 #undef SYCL_PEXPM1
@@ -94,6 +100,7 @@ SYCL_PEXPM1(cl::sycl::cl_double2)
     return cl::sycl::sqrt(a); \
   }
 
+SYCL_PSQRT(cl::sycl::cl_half8)
 SYCL_PSQRT(cl::sycl::cl_float4)
 SYCL_PSQRT(cl::sycl::cl_double2)
 #undef SYCL_PSQRT
@@ -105,6 +112,7 @@ SYCL_PSQRT(cl::sycl::cl_double2)
     return cl::sycl::rsqrt(a); \
   }
 
+SYCL_PRSQRT(cl::sycl::cl_half8)
 SYCL_PRSQRT(cl::sycl::cl_float4)
 SYCL_PRSQRT(cl::sycl::cl_double2)
 #undef SYCL_PRSQRT
@@ -117,6 +125,7 @@ SYCL_PRSQRT(cl::sycl::cl_double2)
     return cl::sycl::sin(a); \
   }
 
+SYCL_PSIN(cl::sycl::cl_half8)
 SYCL_PSIN(cl::sycl::cl_float4)
 SYCL_PSIN(cl::sycl::cl_double2)
 #undef SYCL_PSIN
@@ -129,6 +138,7 @@ SYCL_PSIN(cl::sycl::cl_double2)
     return cl::sycl::cos(a); \
   }
 
+SYCL_PCOS(cl::sycl::cl_half8)
 SYCL_PCOS(cl::sycl::cl_float4)
 SYCL_PCOS(cl::sycl::cl_double2)
 #undef SYCL_PCOS
@@ -141,6 +151,7 @@ SYCL_PCOS(cl::sycl::cl_double2)
     return cl::sycl::tan(a); \
   }
 
+SYCL_PTAN(cl::sycl::cl_half8)
 SYCL_PTAN(cl::sycl::cl_float4)
 SYCL_PTAN(cl::sycl::cl_double2)
 #undef SYCL_PTAN
@@ -153,6 +164,7 @@ SYCL_PTAN(cl::sycl::cl_double2)
     return cl::sycl::asin(a); \
   }
 
+SYCL_PASIN(cl::sycl::cl_half8)
 SYCL_PASIN(cl::sycl::cl_float4)
 SYCL_PASIN(cl::sycl::cl_double2)
 #undef SYCL_PASIN
@@ -165,6 +177,7 @@ SYCL_PASIN(cl::sycl::cl_double2)
     return cl::sycl::acos(a); \
   }
 
+SYCL_PACOS(cl::sycl::cl_half8)
 SYCL_PACOS(cl::sycl::cl_float4)
 SYCL_PACOS(cl::sycl::cl_double2)
 #undef SYCL_PACOS
@@ -177,6 +190,7 @@ SYCL_PACOS(cl::sycl::cl_double2)
     return cl::sycl::atan(a); \
   }
 
+SYCL_PATAN(cl::sycl::cl_half8)
 SYCL_PATAN(cl::sycl::cl_float4)
 SYCL_PATAN(cl::sycl::cl_double2)
 #undef SYCL_PATAN
@@ -189,6 +203,7 @@ SYCL_PATAN(cl::sycl::cl_double2)
     return cl::sycl::sinh(a); \
   }
 
+SYCL_PSINH(cl::sycl::cl_half8)
 SYCL_PSINH(cl::sycl::cl_float4)
 SYCL_PSINH(cl::sycl::cl_double2)
 #undef SYCL_PSINH
@@ -201,6 +216,7 @@ SYCL_PSINH(cl::sycl::cl_double2)
     return cl::sycl::cosh(a); \
   }
 
+SYCL_PCOSH(cl::sycl::cl_half8)
 SYCL_PCOSH(cl::sycl::cl_float4)
 SYCL_PCOSH(cl::sycl::cl_double2)
 #undef SYCL_PCOSH
@@ -213,6 +229,7 @@ SYCL_PCOSH(cl::sycl::cl_double2)
     return cl::sycl::tanh(a); \
   }
 
+SYCL_PTANH(cl::sycl::cl_half8)
 SYCL_PTANH(cl::sycl::cl_float4)
 SYCL_PTANH(cl::sycl::cl_double2)
 #undef SYCL_PTANH
@@ -224,6 +241,7 @@ SYCL_PTANH(cl::sycl::cl_double2)
     return cl::sycl::ceil(a); \
   }
 
+SYCL_PCEIL(cl::sycl::cl_half)
 SYCL_PCEIL(cl::sycl::cl_float4)
 SYCL_PCEIL(cl::sycl::cl_double2)
 #undef SYCL_PCEIL
@@ -235,6 +253,7 @@ SYCL_PCEIL(cl::sycl::cl_double2)
     return cl::sycl::round(a); \
   }
 
+SYCL_PROUND(cl::sycl::cl_half8)
 SYCL_PROUND(cl::sycl::cl_float4)
 SYCL_PROUND(cl::sycl::cl_double2)
 #undef SYCL_PROUND
@@ -246,6 +265,7 @@ SYCL_PROUND(cl::sycl::cl_double2)
     return cl::sycl::rint(a); \
   }
 
+SYCL_PRINT(cl::sycl::cl_half8)
 SYCL_PRINT(cl::sycl::cl_float4)
 SYCL_PRINT(cl::sycl::cl_double2)
 #undef SYCL_PRINT
@@ -257,6 +277,7 @@ SYCL_PRINT(cl::sycl::cl_double2)
     return cl::sycl::floor(a); \
   }
 
+SYCL_FLOOR(cl::sycl::cl_half8)
 SYCL_FLOOR(cl::sycl::cl_float4)
 SYCL_FLOOR(cl::sycl::cl_double2)
 #undef SYCL_FLOOR
@@ -268,6 +289,7 @@ SYCL_FLOOR(cl::sycl::cl_double2)
     return expr; \
   }
 
+SYCL_PMIN(cl::sycl::cl_half8, cl::sycl::fmin(a, b))
 SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b))
 SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
 #undef SYCL_PMIN
@@ -279,6 +301,7 @@ SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
     return expr; \
   }
 
+SYCL_PMAX(cl::sycl::cl_half8, cl::sycl::fmax(a, b))
 SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b))
 SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
 #undef SYCL_PMAX
@@ -292,6 +315,7 @@ SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
     cl::sycl::rounding_mode::automatic>()); \
   }
 
+SYCL_PLDEXP(cl::sycl::cl_half8)
 SYCL_PLDEXP(cl::sycl::cl_float4)
 SYCL_PLDEXP(cl::sycl::cl_double2)
 #undef SYCL_PLDEXP
@@ -44,9 +44,34 @@ SYCL_PLOAD(cl::sycl::cl_float4, u)
 SYCL_PLOAD(cl::sycl::cl_float4, )
 SYCL_PLOAD(cl::sycl::cl_double2, u)
 SYCL_PLOAD(cl::sycl::cl_double2, )
 
 #undef SYCL_PLOAD
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
+pload<cl::sycl::cl_half8>(
+    const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<const cl::sycl::cl_half*>(from));
+  cl::sycl::cl_half8 res{};
+  res.load(0, ptr);
+  return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
+ploadu<cl::sycl::cl_half8>(
+    const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<const cl::sycl::cl_half*>(from));
+  cl::sycl::cl_half8 res{};
+  res.load(0, ptr);
+  return res;
+}
+
 #define SYCL_PSTORE(scalar, packet_type, alignment) \
 template <> \
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
@@ -59,9 +84,28 @@ SYCL_PSTORE(float, cl::sycl::cl_float4, )
 SYCL_PSTORE(float, cl::sycl::cl_float4, u)
 SYCL_PSTORE(double, cl::sycl::cl_double2, )
 SYCL_PSTORE(double, cl::sycl::cl_double2, u)
 
 #undef SYCL_PSTORE
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoreu(
+    Eigen::half* to, const cl::sycl::cl_half8& from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<cl::sycl::cl_half*>(to));
+  from.store(0, ptr);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore(
+    Eigen::half* to, const cl::sycl::cl_half8& from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<cl::sycl::cl_half*>(to));
+  from.store(0, ptr);
+}
+
 #define SYCL_PSET1(packet_type) \
 template <> \
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>( \
@@ -70,6 +114,7 @@ SYCL_PSTORE(double, cl::sycl::cl_double2, u)
 }
 
 // global space
+SYCL_PSET1(cl::sycl::cl_half8)
 SYCL_PSET1(cl::sycl::cl_float4)
 SYCL_PSET1(cl::sycl::cl_double2)
 
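The loads and stores above reinterpret the Eigen::half buffer as cl::sycl::cl_half and go through sycl::vec load/store, so at the packet level Eigen::half data now moves through cl::sycl::cl_half8 the same way float data moves through cl_float4. A minimal device-side sketch of the primitives this section specializes (illustrative only; it would normally run inside a SYCL kernel, and the helper name copy8 is not part of the patch):

// Round-trip 8 halves through the new packet type using the new specializations.
using HalfPacket = cl::sycl::cl_half8;

void copy8(const Eigen::half* src, Eigen::half* dst) {
  HalfPacket p = Eigen::internal::ploadu<HalfPacket>(src);  // unaligned load of 8 halves
  Eigen::internal::pstoreu(dst, p);                         // unaligned store of 8 halves
}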
@@ -86,6 +131,58 @@ struct get_base_packet {
   get_pgather(sycl_multi_pointer, Index) {}
 };
 
+template <>
+struct get_base_packet<cl::sycl::cl_half8> {
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_ploaddup(
+      sycl_multi_pointer from) {
+    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0]),
+                              static_cast<cl::sycl::half>(from[0]),
+                              static_cast<cl::sycl::half>(from[1]),
+                              static_cast<cl::sycl::half>(from[1]),
+                              static_cast<cl::sycl::half>(from[2]),
+                              static_cast<cl::sycl::half>(from[2]),
+                              static_cast<cl::sycl::half>(from[3]),
+                              static_cast<cl::sycl::half>(from[3]));
+  }
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_pgather(
+      sycl_multi_pointer from, Index stride) {
+    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0 * stride]),
+                              static_cast<cl::sycl::half>(from[1 * stride]),
+                              static_cast<cl::sycl::half>(from[2 * stride]),
+                              static_cast<cl::sycl::half>(from[3 * stride]),
+                              static_cast<cl::sycl::half>(from[4 * stride]),
+                              static_cast<cl::sycl::half>(from[5 * stride]),
+                              static_cast<cl::sycl::half>(from[6 * stride]),
+                              static_cast<cl::sycl::half>(from[7 * stride]));
+  }
+
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(
+      sycl_multi_pointer to, const cl::sycl::cl_half8& from, Index stride) {
+    auto tmp = stride;
+    to[0] = Eigen::half(from.s0());
+    to[tmp] = Eigen::half(from.s1());
+    to[tmp += stride] = Eigen::half(from.s2());
+    to[tmp += stride] = Eigen::half(from.s3());
+    to[tmp += stride] = Eigen::half(from.s4());
+    to[tmp += stride] = Eigen::half(from.s5());
+    to[tmp += stride] = Eigen::half(from.s6());
+    to[tmp += stride] = Eigen::half(from.s7());
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 set_plset(
+      const cl::sycl::half& a) {
+    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(a), static_cast<cl::sycl::half>(a + 1),
+                              static_cast<cl::sycl::half>(a + 2),
+                              static_cast<cl::sycl::half>(a + 3),
+                              static_cast<cl::sycl::half>(a + 4),
+                              static_cast<cl::sycl::half>(a + 5),
+                              static_cast<cl::sycl::half>(a + 6),
+                              static_cast<cl::sycl::half>(a + 7));
+  }
+};
+
 template <>
 struct get_base_packet<cl::sycl::cl_float4> {
   template <typename sycl_multi_pointer>
@@ -152,6 +249,7 @@ struct get_base_packet<cl::sycl::cl_double2> {
     return get_base_packet<packet_type>::get_ploaddup(from); \
   }
 
+SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_half8)
 SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4)
 SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
 
@@ -165,9 +263,14 @@ SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
 }
 SYCL_PLSET(cl::sycl::cl_float4)
 SYCL_PLSET(cl::sycl::cl_double2)
 
 #undef SYCL_PLSET
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 plset<cl::sycl::cl_half8>(
+    const typename unpacket_traits<cl::sycl::cl_half8>::type& a) {
+  return get_base_packet<cl::sycl::cl_half8>::set_plset((const cl::sycl::half &) a);
+}
+
 #define SYCL_PGATHER_SPECILIZE(scalar, packet_type) \
 template <> \
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
@@ -176,9 +279,9 @@ SYCL_PLSET(cl::sycl::cl_double2)
     return get_base_packet<packet_type>::get_pgather(from, stride); \
   }
 
+SYCL_PGATHER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
 SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4)
 SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
 
 #undef SYCL_PGATHER_SPECILIZE
 
 #define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \
@@ -189,6 +292,7 @@ SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
     get_base_packet<packet_type>::set_pscatter(to, from, stride); \
   }
 
+SYCL_PSCATTER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
 SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4)
 SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
 
@@ -201,10 +305,16 @@ SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
     return cl::sycl::mad(a, b, c); \
   }
 
+SYCL_PMAD(cl::sycl::cl_half8)
 SYCL_PMAD(cl::sycl::cl_float4)
 SYCL_PMAD(cl::sycl::cl_double2)
 #undef SYCL_PMAD
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half pfirst<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0());
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -216,6 +326,13 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst<cl::sycl::cl_double2>(
   return a.x();
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0() + a.s1() + a.s2() + a.s3() + a.s4() + a.s5()
+                     + a.s6() + a.s7());
+}
+
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -228,6 +345,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux<cl::sycl::cl_double2>(
   return a.x() + a.y();
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_max<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(cl::sycl::fmax(
+      cl::sycl::fmax(
+          cl::sycl::fmax(a.s0(), a.s1()),
+          cl::sycl::fmax(a.s2(), a.s3())),
+      cl::sycl::fmax(
+          cl::sycl::fmax(a.s4(), a.s5()),
+          cl::sycl::fmax(a.s6(), a.s7()))));
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -240,6 +368,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max<cl::sycl::cl_double2>(
   return cl::sycl::fmax(a.x(), a.y());
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_min<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(cl::sycl::fmin(
+      cl::sycl::fmin(
+          cl::sycl::fmin(a.s0(), a.s1()),
+          cl::sycl::fmin(a.s2(), a.s3())),
+      cl::sycl::fmin(
+          cl::sycl::fmin(a.s4(), a.s5()),
+          cl::sycl::fmin(a.s6(), a.s7()))));
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -252,6 +391,12 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min<cl::sycl::cl_double2>(
   return cl::sycl::fmin(a.x(), a.y());
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_mul<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0() * a.s1() * a.s2() * a.s3() * a.s4() * a.s5() *
+                     a.s6() * a.s7());
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -263,6 +408,14 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul<cl::sycl::cl_double2>(
   return a.x() * a.y();
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
+pabs<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+  return cl::sycl::cl_half8(cl::sycl::fabs(a.s0()), cl::sycl::fabs(a.s1()),
+                            cl::sycl::fabs(a.s2()), cl::sycl::fabs(a.s3()),
+                            cl::sycl::fabs(a.s4()), cl::sycl::fabs(a.s5()),
+                            cl::sycl::fabs(a.s6()), cl::sycl::fabs(a.s7()));
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
 pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
@@ -300,6 +453,9 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_eq(const Packet &a,
     return sycl_pcmp_##OP<TYPE>(a, b); \
   }
 
+SYCL_PCMP(le, cl::sycl::cl_half8)
+SYCL_PCMP(lt, cl::sycl::cl_half8)
+SYCL_PCMP(eq, cl::sycl::cl_half8)
 SYCL_PCMP(le, cl::sycl::cl_float4)
 SYCL_PCMP(lt, cl::sycl::cl_float4)
 SYCL_PCMP(eq, cl::sycl::cl_float4)
@@ -308,6 +464,121 @@ SYCL_PCMP(lt, cl::sycl::cl_double2)
 SYCL_PCMP(eq, cl::sycl::cl_double2)
 #undef SYCL_PCMP
 
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
+    PacketBlock<cl::sycl::cl_half8, 8>& kernel) {
+  cl::sycl::cl_half tmp = kernel.packet[0].s1();
+  kernel.packet[0].s1() = kernel.packet[1].s0();
+  kernel.packet[1].s0() = tmp;
+
+  tmp = kernel.packet[0].s2();
+  kernel.packet[0].s2() = kernel.packet[2].s0();
+  kernel.packet[2].s0() = tmp;
+
+  tmp = kernel.packet[0].s3();
+  kernel.packet[0].s3() = kernel.packet[3].s0();
+  kernel.packet[3].s0() = tmp;
+
+  tmp = kernel.packet[0].s4();
+  kernel.packet[0].s4() = kernel.packet[4].s0();
+  kernel.packet[4].s0() = tmp;
+
+  tmp = kernel.packet[0].s5();
+  kernel.packet[0].s5() = kernel.packet[5].s0();
+  kernel.packet[5].s0() = tmp;
+
+  tmp = kernel.packet[0].s6();
+  kernel.packet[0].s6() = kernel.packet[6].s0();
+  kernel.packet[6].s0() = tmp;
+
+  tmp = kernel.packet[0].s7();
+  kernel.packet[0].s7() = kernel.packet[7].s0();
+  kernel.packet[7].s0() = tmp;
+
+  tmp = kernel.packet[1].s2();
+  kernel.packet[1].s2() = kernel.packet[2].s1();
+  kernel.packet[2].s1() = tmp;
+
+  tmp = kernel.packet[1].s3();
+  kernel.packet[1].s3() = kernel.packet[3].s1();
+  kernel.packet[3].s1() = tmp;
+
+  tmp = kernel.packet[1].s4();
+  kernel.packet[1].s4() = kernel.packet[4].s1();
+  kernel.packet[4].s1() = tmp;
+
+  tmp = kernel.packet[1].s5();
+  kernel.packet[1].s5() = kernel.packet[5].s1();
+  kernel.packet[5].s1() = tmp;
+
+  tmp = kernel.packet[1].s6();
+  kernel.packet[1].s6() = kernel.packet[6].s1();
+  kernel.packet[6].s1() = tmp;
+
+  tmp = kernel.packet[1].s7();
+  kernel.packet[1].s7() = kernel.packet[7].s1();
+  kernel.packet[7].s1() = tmp;
+
+  tmp = kernel.packet[2].s3();
+  kernel.packet[2].s3() = kernel.packet[3].s2();
+  kernel.packet[3].s2() = tmp;
+
+  tmp = kernel.packet[2].s4();
+  kernel.packet[2].s4() = kernel.packet[4].s2();
+  kernel.packet[4].s2() = tmp;
+
+  tmp = kernel.packet[2].s5();
+  kernel.packet[2].s5() = kernel.packet[5].s2();
+  kernel.packet[5].s2() = tmp;
+
+  tmp = kernel.packet[2].s6();
+  kernel.packet[2].s6() = kernel.packet[6].s2();
+  kernel.packet[6].s2() = tmp;
+
+  tmp = kernel.packet[2].s7();
+  kernel.packet[2].s7() = kernel.packet[7].s2();
+  kernel.packet[7].s2() = tmp;
+
+  tmp = kernel.packet[3].s4();
+  kernel.packet[3].s4() = kernel.packet[4].s3();
+  kernel.packet[4].s3() = tmp;
+
+  tmp = kernel.packet[3].s5();
+  kernel.packet[3].s5() = kernel.packet[5].s3();
+  kernel.packet[5].s3() = tmp;
+
+  tmp = kernel.packet[3].s6();
+  kernel.packet[3].s6() = kernel.packet[6].s3();
+  kernel.packet[6].s3() = tmp;
+
+  tmp = kernel.packet[3].s7();
+  kernel.packet[3].s7() = kernel.packet[7].s3();
+  kernel.packet[7].s3() = tmp;
+
+  tmp = kernel.packet[4].s5();
+  kernel.packet[4].s5() = kernel.packet[5].s4();
+  kernel.packet[5].s4() = tmp;
+
+  tmp = kernel.packet[4].s6();
+  kernel.packet[4].s6() = kernel.packet[6].s4();
+  kernel.packet[6].s4() = tmp;
+
+  tmp = kernel.packet[4].s7();
+  kernel.packet[4].s7() = kernel.packet[7].s4();
+  kernel.packet[7].s4() = tmp;
+
+  tmp = kernel.packet[5].s6();
+  kernel.packet[5].s6() = kernel.packet[6].s5();
+  kernel.packet[6].s5() = tmp;
+
+  tmp = kernel.packet[5].s7();
+  kernel.packet[5].s7() = kernel.packet[7].s5();
+  kernel.packet[7].s5() = tmp;
+
+  tmp = kernel.packet[6].s7();
+  kernel.packet[6].s7() = kernel.packet[7].s6();
+  kernel.packet[7].s6() = tmp;
+}
+
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
     PacketBlock<cl::sycl::cl_float4, 4>& kernel) {
   float tmp = kernel.packet[0].y();
@@ -342,6 +613,19 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
   kernel.packet[1].x() = tmp;
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 pblend(
+    const Selector<unpacket_traits<cl::sycl::cl_half8>::size>& ifPacket,
+    const cl::sycl::cl_half8& thenPacket,
+    const cl::sycl::cl_half8& elsePacket) {
+  cl::sycl::cl_short8 condition(
+      ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1,
+      ifPacket.select[2] ? 0 : -1, ifPacket.select[3] ? 0 : -1,
+      ifPacket.select[4] ? 0 : -1, ifPacket.select[5] ? 0 : -1,
+      ifPacket.select[6] ? 0 : -1, ifPacket.select[7] ? 0 : -1);
+  return cl::sycl::select(thenPacket, elsePacket, condition);
+}
+
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pblend(
     const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket,
@@ -1118,7 +1118,7 @@ struct GeneralVectorTensor {
                 : globalContractDimOffset + privateOffsetC)
           : OutScalar(0);
 
-      outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]);
+      outScalar[j] = ::Eigen::internal::pmadd(matScalar, vecScalar, outScalar[j]);
       privateOffsetNC += Properties::LocalThreadSizeNC;
     }
     privateOffsetC += Properties::LocalThreadSizeC;
@@ -1263,7 +1263,7 @@ struct GeneralScalarContraction {
     StorageIndex localid = itemID.get_local_id(0);
     OutScalar accumulator = OutScalar(0);
     for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
-      accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator);
+      accumulator = Eigen::internal::pmadd(lhs(0, i), rhs(i, 0), accumulator);
     }
     auto out_scratch_ptr = scratch_ptr + localid;
     *out_scratch_ptr = accumulator;
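The two contraction kernels above stop calling cl::sycl::mad directly and go through Eigen::internal::pmadd instead; pmadd maps to a fused multiply-add where one exists for the scalar or packet type and otherwise falls back to multiply-then-add, and, unlike cl::sycl::mad, it is also usable when the accumulator type is Eigen::half. A rough semantic sketch (not Eigen's actual implementation):

// pmadd(a, b, c) behaves like a * b + c for any supported scalar or packet type.
template <typename T>
T pmadd_like(const T& a, const T& b, const T& c) {
  return a * b + c;
}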
@@ -151,6 +151,7 @@ if(EIGEN_TEST_SYCL)
   ei_add_test(cxx11_tensor_argmax_sycl)
   ei_add_test(cxx11_tensor_custom_op_sycl)
   ei_add_test(cxx11_tensor_scan_sycl)
+  ei_add_test(cxx11_tensor_of_float16_sycl)
   set(EIGEN_SYCL OFF)
 endif()
 
@@ -32,9 +32,9 @@ static void test_sycl_simple_argmax(const Eigen::SyclDevice& sycl_device) {
   Tensor<DenseIndex, 0, Layout, DenseIndex> out_max;
   Tensor<DenseIndex, 0, Layout, DenseIndex> out_min;
   in.setRandom();
-  in *= in.constant(100.0);
-  in(0, 0, 0) = -1000.0;
-  in(1, 1, 1) = 1000.0;
+  in *= in.constant(static_cast<DataType>(100.0));
+  in(0, 0, 0) = static_cast<DataType>(-1000.0);
+  in(1, 1, 1) = static_cast<DataType>(1000.0);
 
   std::size_t in_bytes = in.size() * sizeof(DataType);
   std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
@@ -93,7 +93,7 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) {
           ix[3] = l;
           // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l)
           // = 10.0
-          tensor(ix) = (ix[dim] != 0) ? -1.0 : 10.0;
+          tensor(ix) = static_cast<DataType>((ix[dim] != 0) ? -1.0 : 10.0);
         }
       }
     }
@@ -132,7 +132,7 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) {
           ix[2] = k;
           ix[3] = l;
           // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
-          tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0;
+          tensor(ix) = static_cast<DataType>((ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0);
         }
       }
     }
@@ -180,7 +180,7 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) {
           ix[2] = k;
           ix[3] = l;
           // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
-          tensor(ix) = (ix[dim] != 0) ? 1.0 : -10.0;
+          tensor(ix) = static_cast<DataType>((ix[dim] != 0) ? 1.0 : -10.0);
         }
       }
     }
@@ -219,7 +219,7 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) {
           ix[2] = k;
           ix[3] = l;
           // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
-          tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0;
+          tensor(ix) = static_cast<DataType>((ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0);
         }
       }
     }
@@ -252,6 +252,7 @@ void sycl_argmax_test_per_device(const Device_Selector& d) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_argmax_test_per_device<half>(device));
     CALL_SUBTEST(sycl_argmax_test_per_device<float>(device));
   }
 }
@@ -139,6 +139,7 @@ template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl::
 
 EIGEN_DECLARE_TEST(cxx11_tensor_broadcast_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_broadcast_test_per_device<half>(device));
     CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device));
   }
 }
@@ -28,26 +28,135 @@ using Eigen::TensorMap;
 // Functions used to compare the TensorMap implementation on the device with
 // the equivalent on the host
 namespace SYCL {
-template <typename T> T abs(T x) {
+
+template <typename T> T abs(T x) {
   return cl::sycl::abs(x);
 }
+template <> Eigen::half abs(Eigen::half x) {
+  return Eigen::half(cl::sycl::fabs(static_cast<cl::sycl::half>(x)));
+}
+
 template <> float abs(float x) {
   return cl::sycl::fabs(x);
 }
 
 template <> double abs(double x) {
   return cl::sycl::fabs(x);
 }
 
 template <typename T> T square(T x) { return x * x; }
 template <typename T> T cube(T x) { return x * x * x; }
 template <typename T> T inverse(T x) { return T(1) / x; }
 template <typename T> T cwiseMax(T x, T y) {
   return cl::sycl::max(x, y);
 }
-template <typename T> T cwiseMin(T x, T y) {
-  return cl::sycl::min(x, y);
+template <> Eigen::half cwiseMax(Eigen::half x, Eigen::half y) {
+  return Eigen::half(cl::sycl::max(static_cast<cl::sycl::half>(x), static_cast<cl::sycl::half>(y)));
+}
+
+template <typename T> T cwiseMin(T x, T y) {
+  return cl::sycl::min(x, y);
+}
+template <> Eigen::half cwiseMin(Eigen::half x, Eigen::half y) {
+  return Eigen::half(cl::sycl::min(static_cast<cl::sycl::half>(x), static_cast<cl::sycl::half>(y)));
+}
+
+template <typename T> T sqrt(T x) {
+  return cl::sycl::sqrt(x);
+}
+template <> Eigen::half sqrt(Eigen::half x) {
+  return Eigen::half(cl::sycl::sqrt(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T rsqrt(T x) {
+  return cl::sycl::rsqrt(x);
+}
+template <> Eigen::half rsqrt(Eigen::half x) {
+  return Eigen::half(cl::sycl::rsqrt(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T tanh(T x) {
+  return cl::sycl::tanh(x);
+}
+template <> Eigen::half tanh(Eigen::half x) {
+  return Eigen::half(cl::sycl::tanh(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T exp(T x) {
+  return cl::sycl::exp(x);
+}
+template <> Eigen::half exp(Eigen::half x) {
+  return Eigen::half(cl::sycl::exp(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T expm1(T x) {
+  return cl::sycl::expm1(x);
+}
+template <> Eigen::half expm1(Eigen::half x) {
+  return Eigen::half(cl::sycl::expm1(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T log(T x) {
+  return cl::sycl::log(x);
+}
+template <> Eigen::half log(Eigen::half x) {
+  return Eigen::half(cl::sycl::log(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T ceil(T x) {
+  return cl::sycl::ceil(x);
+}
+template <> Eigen::half ceil(Eigen::half x) {
+  return Eigen::half(cl::sycl::ceil(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T floor(T x) {
+  return cl::sycl::floor(x);
+}
+template <> Eigen::half floor(Eigen::half x) {
+  return Eigen::half(cl::sycl::floor(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T round(T x) {
+  return cl::sycl::round(x);
+}
+template <> Eigen::half round(Eigen::half x) {
+  return Eigen::half(cl::sycl::round(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T log1p(T x) {
+  return cl::sycl::log1p(x);
+}
+template <> Eigen::half log1p(Eigen::half x) {
+  return Eigen::half(cl::sycl::log1p(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T sign(T x) {
+  return cl::sycl::sign(x);
+}
+template <> Eigen::half sign(Eigen::half x) {
+  return Eigen::half(cl::sycl::sign(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T isnan(T x) {
+  return cl::sycl::isnan(x);
+}
+template <> Eigen::half isnan(Eigen::half x) {
+  return Eigen::half(cl::sycl::isnan(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T isfinite(T x) {
+  return cl::sycl::isfinite(x);
+}
+template <> Eigen::half isfinite(Eigen::half x) {
+  return Eigen::half(cl::sycl::isfinite(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T isinf(T x) {
+  return cl::sycl::isinf(x);
+}
+template <> Eigen::half isinf(Eigen::half x) {
+  return Eigen::half(cl::sycl::isinf(static_cast<cl::sycl::half>(x)));
 }
 }
 
@@ -157,15 +266,14 @@ void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
 #define DECLARE_UNARY_STRUCT(FUNC) \
   struct op_##FUNC { \
     template <typename T> \
-    auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) { \
-      return cl::sycl::FUNC(x); \
+    auto operator()(const T& x) -> decltype(SYCL::FUNC(x)) { \
+      return SYCL::FUNC(x); \
     } \
     template <typename T> \
     auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) { \
       return x.FUNC(); \
     } \
   };
 
-
 DECLARE_UNARY_STRUCT(sqrt)
 DECLARE_UNARY_STRUCT(rsqrt)
@@ -390,8 +498,10 @@ EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     QueueInterface queueInterface(device);
     Eigen::SyclDevice sycl_device(&queueInterface);
-    CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device));
-    CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device));
-    CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device));
+    CALL_SUBTEST_1(test_builtin_unary_sycl<half>(sycl_device));
+    CALL_SUBTEST_2(test_floating_builtin_binary_sycl<half>(sycl_device));
+    CALL_SUBTEST_3(test_builtin_unary_sycl<float>(sycl_device));
+    CALL_SUBTEST_4(test_floating_builtin_binary_sycl<float>(sycl_device));
+    CALL_SUBTEST_5(test_integer_builtin_binary_sycl<int>(sycl_device));
   }
 }
@@ -619,5 +619,6 @@ EIGEN_DECLARE_TEST(cxx11_tensor_chipping_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_chipping_test_per_device<float>(device));
+    CALL_SUBTEST(sycl_chipping_test_per_device<half>(device));
   }
 }
@@ -175,6 +175,7 @@ template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(
 }
 EIGEN_DECLARE_TEST(cxx11_tensor_concatenation_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(tensorConcat_perDevice<half>(device));
     CALL_SUBTEST(tensorConcat_perDevice<float>(device));
   }
 }
@@ -82,6 +82,7 @@ template<typename DataType> void sycl_device_test_per_device(const cl::sycl::dev
 
 EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_device_test_per_device<half>(device));
     CALL_SUBTEST(sycl_device_test_per_device<float>(device));
     CALL_SUBTEST(sycl_device_test_per_device<OffByOneScalar<int>>(device));
   }
@@ -95,6 +95,7 @@ template<typename DataType, typename dev_Selector> void sycl_computing_test_per_
 
 EIGEN_DECLARE_TEST(cxx11_tensor_image_op_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_computing_test_per_device<half>(device));
     CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
 #ifdef EIGEN_SYCL_DOUBLE_SUPPORT
     CALL_SUBTEST(sycl_computing_test_per_device<double>(device));
@@ -131,6 +131,7 @@ template<typename DataType, typename dev_Selector> void sycl_inflation_test_per_
 EIGEN_DECLARE_TEST(cxx11_tensor_inflation_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_inflation_test_per_device<half>(device));
     CALL_SUBTEST(sycl_inflation_test_per_device<float>(device));
   }
 }
@@ -121,6 +121,7 @@ template<typename DataType, typename dev_Selector> void sycl_tensor_layout_swap_
 EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<half>(device));
     CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<float>(device));
   }
 }
@@ -100,6 +100,7 @@ template<typename DataType, typename dev_Selector> void sycl_computing_test_per_
 
 EIGEN_DECLARE_TEST(cxx11_tensor_math_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_computing_test_per_device<half>(device));
     CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
   }
 }
@@ -381,6 +381,7 @@ template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_d
 EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_morphing_test_per_device<half>(device));
     CALL_SUBTEST(sycl_morphing_test_per_device<float>(device));
   }
 }
406
unsupported/test/cxx11_tensor_of_float16_sycl.cpp
Normal file
406
unsupported/test/cxx11_tensor_of_float16_sycl.cpp
Normal file
@ -0,0 +1,406 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2023
|
||||||
|
// Alejandro Acosta Codeplay Software Ltd.
|
||||||
|
// Contact: <eigen@codeplay.com>
|
||||||
|
// Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||||
|
#define EIGEN_TEST_NO_COMPLEX
|
||||||
|
|
||||||
|
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||||
|
#define EIGEN_USE_SYCL
|
||||||
|
#define EIGEN_SYCL_HALF_SUPPORT
|
||||||
|
|
||||||
|
#include "main.h"
|
||||||
|
#include <unsupported/Eigen/CXX11/Tensor>
|
||||||
|
|
||||||
|
using Eigen::Tensor;
|
||||||
|
using Eigen::SyclDevice;
|
||||||
|
|
||||||
|
void test_gpu_numext(const Eigen::SyclDevice &sycl_device) {
|
||||||
|
int num_elem = 101;
|
||||||
|
|
||||||
|
float* d_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
|
||||||
|
bool* d_res_half = static_cast<bool*>(sycl_device.allocate(num_elem * sizeof(bool)));
|
||||||
|
bool* d_res_float = static_cast<bool*>(sycl_device.allocate(num_elem * sizeof(bool)));
|
||||||
|
|
||||||
|
Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_float(d_float, num_elem);
|
||||||
|
Eigen::TensorMap<Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(d_res_half, num_elem);
|
||||||
|
Eigen::TensorMap<Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(d_res_float, num_elem);
|
||||||
|
|
||||||
|
gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
|
||||||
|
gpu_res_float.device(sycl_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
|
||||||
|
gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
|
||||||
|
|
||||||
|
Tensor<bool, 1> half_prec(num_elem);
|
||||||
|
Tensor<bool, 1> full_prec(num_elem);
|
||||||
|
|
||||||
|
sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half,num_elem * sizeof(bool));
|
||||||
|
sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float,num_elem * sizeof(bool));
|
||||||
|
|
||||||
|
for (int i = 0; i < num_elem; ++i) {
|
||||||
|
std::cout << "Checking numext " << i << std::endl;
|
||||||
|
VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void test_gpu_conversion(const Eigen::SyclDevice &sycl_device) {
|
||||||
|
int num_elem = 101;
|
||||||
|
|
||||||
|
float* d_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
|
||||||
|
Eigen::half* d_half = static_cast<Eigen::half*>(sycl_device.allocate(num_elem * sizeof(Eigen::half)));
|
||||||
|
float* d_conv = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
|
||||||
|
|
||||||
|
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
|
||||||
|
d_float, num_elem);
|
||||||
|
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
|
||||||
|
d_half, num_elem);
|
||||||
|
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
|
||||||
|
d_conv, num_elem);
|
||||||
|
|
||||||
|
gpu_float.device(sycl_device) = gpu_float.random();
|
||||||
|
gpu_half.device(sycl_device) = gpu_float.cast<Eigen::half>();
|
||||||
|
gpu_conv.device(sycl_device) = gpu_half.cast<float>();
|
||||||
|
|
||||||
|
Tensor<float, 1> initial(num_elem);
|
||||||
|
Tensor<float, 1> final(num_elem);
|
||||||
|
sycl_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
|
||||||
|
sycl_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
|
||||||
|
|
||||||
|
for (int i = 0; i < num_elem; ++i) {
|
||||||
|
VERIFY_IS_APPROX(initial(i), final(i));
|
||||||
|
}
|
||||||
|
}

void test_gpu_unary(const Eigen::SyclDevice &sycl_device) {
  int num_elem = 101;

  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_res_half = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_res_float = (float*)sycl_device.allocate(num_elem * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
      d_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
      d_res_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
      d_res_float, num_elem);

  gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
  gpu_res_float.device(sycl_device) = gpu_float.abs();
  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();

  Tensor<float, 1> half_prec(num_elem);
  Tensor<float, 1> full_prec(num_elem);
  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem * sizeof(float));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(float));
  sycl_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking unary " << i << std::endl;
    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
  }
}
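Since abs only clears the sign bit of a binary16 value, the float result and the half round trip above should agree to within half's input-rounding error. A tiny host-side check of that claim, assuming only <Eigen/Core>; the loop bounds are illustrative.

#include <Eigen/Core>
#include <cassert>
#include <cmath>

int main() {
  for (float x = -0.5f; x <= 0.5f; x += 0.01f) {
    const float full = std::abs(x);
    // Round to half, take abs in half, convert back to float.
    const float half_rt = static_cast<float>(Eigen::numext::abs(static_cast<Eigen::half>(x)));
    assert(std::abs(half_rt - full) <= 4.8828125e-4f * full + 1e-7f);
  }
  return 0;
}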

void test_gpu_elementwise(const Eigen::SyclDevice &sycl_device) {
  int num_elem = 101;

  float* d_float1 = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
  float* d_float2 = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
  float* d_res_half = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
  float* d_res_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));

  Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
  Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
  Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_res_half(d_res_half, num_elem);
  Eigen::TensorMap<Tensor<float, 1>, Eigen::Aligned> gpu_res_float(d_res_float, num_elem);

  gpu_float1.device(sycl_device) = gpu_float1.random();
  gpu_float2.device(sycl_device) = gpu_float2.random();
  gpu_res_float.device(sycl_device) = (gpu_float1 + gpu_float2) * gpu_float1;
  gpu_res_half.device(sycl_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();

  Tensor<float, 1> half_prec(num_elem);
  Tensor<float, 1> full_prec(num_elem);

  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem * sizeof(float));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(float));

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
    VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
  }
}
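The same (a + b) * a comparison can also be run entirely on the host with the Tensor module, which can be handy when chasing precision differences without a SYCL device. This is an illustrative sketch only; the variable names are not from the commit.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  const int n = 101;
  Eigen::Tensor<float, 1> a(n), b(n);
  a.setRandom();
  b.setRandom();

  // Full-precision path vs. cast-to-half-and-back path.
  Eigen::Tensor<float, 1> full_prec = (a + b) * a;
  Eigen::Tensor<float, 1> half_prec =
      ((a.cast<Eigen::half>() + b.cast<Eigen::half>()) * a.cast<Eigen::half>()).cast<float>();

  for (int i = 0; i < n; ++i)
    std::cout << i << ": " << full_prec(i) << " vs " << half_prec(i) << "\n";
  return 0;
}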

void test_gpu_trancendental(const Eigen::SyclDevice &sycl_device) {
  int num_elem = 101;

  float* d_float1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_float2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_float3 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res1_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res1_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res2_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res2_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res3_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res3_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem);

  gpu_float1.device(sycl_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
  gpu_float2.device(sycl_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
  gpu_float3.device(sycl_device) = gpu_float3.random();
  gpu_res1_float.device(sycl_device) = gpu_float1.exp().cast<Eigen::half>();
  gpu_res2_float.device(sycl_device) = gpu_float2.log().cast<Eigen::half>();
  gpu_res3_float.device(sycl_device) = gpu_float3.log1p().cast<Eigen::half>();
  gpu_res4_float.device(sycl_device) = gpu_float3.expm1().cast<Eigen::half>();

  gpu_res1_half.device(sycl_device) = gpu_float1.cast<Eigen::half>();
  gpu_res1_half.device(sycl_device) = gpu_res1_half.exp();

  gpu_res2_half.device(sycl_device) = gpu_float2.cast<Eigen::half>();
  gpu_res2_half.device(sycl_device) = gpu_res2_half.log();

  gpu_res3_half.device(sycl_device) = gpu_float3.cast<Eigen::half>();
  gpu_res3_half.device(sycl_device) = gpu_res3_half.log1p();

  gpu_res3_half.device(sycl_device) = gpu_float3.cast<Eigen::half>();
  gpu_res3_half.device(sycl_device) = gpu_res3_half.expm1();

  Tensor<float, 1> input1(num_elem);
  Tensor<Eigen::half, 1> half_prec1(num_elem);
  Tensor<Eigen::half, 1> full_prec1(num_elem);
  Tensor<float, 1> input2(num_elem);
  Tensor<Eigen::half, 1> half_prec2(num_elem);
  Tensor<Eigen::half, 1> full_prec2(num_elem);
  Tensor<float, 1> input3(num_elem);
  Tensor<Eigen::half, 1> half_prec3(num_elem);
  Tensor<Eigen::half, 1> full_prec3(num_elem);
  sycl_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem * sizeof(float));
  sycl_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem * sizeof(float));
  sycl_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem * sizeof(float));
  sycl_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem * sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem * sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem * sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem * sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem * sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem * sizeof(Eigen::half));
  sycl_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
    VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
  }
  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
    if (std::abs(input2(i) - 1.f) < 0.05f)  // log lacks accuracy near 1
      VERIFY_IS_APPROX(full_prec2(i) + Eigen::half(0.1f), half_prec2(i) + Eigen::half(0.1f));
    else
      VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
  }
  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
    VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
  }
}
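Each transcendental check above follows the same pattern: compute in float and round the result to half, versus round the input to half and compute there. A rough host-only sketch of the exp case; std::exp on the half-rounded input merely stands in for the device-side half exp, so this is an approximation, not the commit's code path.

#include <Eigen/Core>
#include <cmath>
#include <iostream>

int main() {
  for (float x = -0.5f; x <= 0.5f; x += 0.05f) {
    // "Full precision": exp in float, rounded to half at the end.
    const Eigen::half ref = static_cast<Eigen::half>(std::exp(x));
    // "Half precision": round the input to half first, then evaluate.
    const float x_h = static_cast<float>(static_cast<Eigen::half>(x));
    const Eigen::half test = static_cast<Eigen::half>(std::exp(x_h));
    std::cout << x << ": " << static_cast<float>(ref)
              << " vs " << static_cast<float>(test) << "\n";
  }
  return 0;
}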

void test_gpu_contractions(const Eigen::SyclDevice &sycl_device) {
  int rows = 23;
  int cols = 23;
  int num_elem = rows * cols;

  float* d_float1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_float2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
      d_float1, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
      d_float2, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
      d_res_half, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
      d_res_float, rows, cols);

  gpu_float1.device(sycl_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
  gpu_float2.device(sycl_device) = gpu_float2.random() - gpu_float2.constant(0.5f);

  typedef typename Tensor<float, 2>::DimensionPair DimPair;
  Eigen::array<DimPair, 1> dims;
  gpu_res_float.device(sycl_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
  gpu_res_half.device(sycl_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);

  Tensor<Eigen::half, 2> half_prec(rows, cols);
  Tensor<Eigen::half, 2> full_prec(rows, cols);
  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem * sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(Eigen::half));
  sycl_device.synchronize();

  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      std::cout << "Checking contract " << i << " " << j << " " << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
      if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
        VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
      }
    }
  }
}
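Note that dims above is default-constructed (an index pair of zeros, so dimension 0 of each operand is contracted). A host-side analogue with an explicit index pair in the ordinary matrix-product convention might look like the sketch below; the 1e-2 threshold mirrors the slack the test allows for half accumulation. Illustration only, not part of the commit.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  const int rows = 23, cols = 23;
  Eigen::Tensor<float, 2> a(rows, cols), b(rows, cols);
  a.setRandom();
  b.setRandom();

  // Contract dimension 1 of a with dimension 0 of b (matrix product).
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};
  Eigen::Tensor<Eigen::half, 2> full_prec = a.contract(b, dims).cast<Eigen::half>();
  Eigen::Tensor<Eigen::half, 2> half_prec =
      a.cast<Eigen::half>().contract(b.cast<Eigen::half>(), dims);

  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < cols; ++j)
      if (Eigen::numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f))
        std::cout << "large deviation at (" << i << ", " << j << ")\n";
  return 0;
}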

void test_gpu_reductions(const Eigen::SyclDevice &sycl_device, int size1, int size2, int redux) {
  std::cout << "Reducing " << size1 << " by " << size2
            << " tensor along dim " << redux << std::endl;

  int num_elem = size1 * size2;
  int result_size = (redux == 1 ? size1 : size2);

  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(result_size * sizeof(Eigen::half));
  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(result_size * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
      d_float, size1, size2);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
      d_res_half, result_size);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
      d_res_float, result_size);

  gpu_float.device(sycl_device) = gpu_float.random() * 2.0f;

  Eigen::array<int, 1> redux_dim = {redux};
  gpu_res_float.device(sycl_device) = gpu_float.sum(redux_dim).cast<Eigen::half>();
  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().sum(redux_dim);

  Tensor<Eigen::half, 1> half_prec(result_size);
  Tensor<Eigen::half, 1> full_prec(result_size);
  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size * sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size * sizeof(Eigen::half));
  sycl_device.synchronize();

  for (int i = 0; i < result_size; ++i) {
    std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
  }
}
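A host-only version of the partial reduction, assuming the Tensor module: summing a size1 x size2 tensor along dimension redux leaves a 1-D result whose length matches the result_size computed above. Names below are illustrative.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  const int size1 = 13, size2 = 13, redux = 1;
  Eigen::Tensor<float, 2> t(size1, size2);
  t.setRandom();

  Eigen::array<int, 1> redux_dim = {redux};
  // Reduce in float then round, vs. round to half first and reduce in half.
  Eigen::Tensor<Eigen::half, 1> full_prec = t.sum(redux_dim).cast<Eigen::half>();
  Eigen::Tensor<Eigen::half, 1> half_prec = t.cast<Eigen::half>().sum(redux_dim);

  for (int i = 0; i < full_prec.size(); ++i)
    std::cout << "EXPECTED " << static_cast<float>(full_prec(i))
              << " GOT " << static_cast<float>(half_prec(i)) << "\n";
  return 0;
}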

void test_gpu_reductions(const Eigen::SyclDevice &sycl_device) {
  test_gpu_reductions(sycl_device, 13, 13, 0);
  test_gpu_reductions(sycl_device, 13, 13, 1);

  test_gpu_reductions(sycl_device, 35, 36, 0);
  test_gpu_reductions(sycl_device, 35, 36, 1);

  test_gpu_reductions(sycl_device, 36, 35, 0);
  test_gpu_reductions(sycl_device, 36, 35, 1);
}

void test_gpu_full_reductions(const Eigen::SyclDevice &sycl_device) {
  int size = 13;
  int num_elem = size * size;

  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(1 * sizeof(Eigen::half));
  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(1 * sizeof(Eigen::half));

  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
      d_float, size, size);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
      d_res_half);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
      d_res_float);

  gpu_float.device(sycl_device) = gpu_float.random();

  gpu_res_float.device(sycl_device) = gpu_float.sum().cast<Eigen::half>();
  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().sum();

  Tensor<Eigen::half, 0> half_prec;
  Tensor<Eigen::half, 0> full_prec;
  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
  sycl_device.synchronize();

  VERIFY_IS_APPROX(full_prec(), half_prec());

  gpu_res_float.device(sycl_device) = gpu_float.maximum().cast<Eigen::half>();
  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().maximum();
  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
  sycl_device.synchronize();

  VERIFY_IS_APPROX(full_prec(), half_prec());
}
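Full reductions produce a rank-0 tensor, so the host-side counterpart needs no loop at all. A minimal sketch, assuming the Tensor module; the half path accumulates in Eigen::half, which is exactly where the two results can diverge.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  const int size = 13;
  Eigen::Tensor<float, 2> t(size, size);
  t.setRandom();

  Eigen::Tensor<Eigen::half, 0> full_sum = t.sum().cast<Eigen::half>();
  Eigen::Tensor<Eigen::half, 0> half_sum = t.cast<Eigen::half>().sum();
  Eigen::Tensor<Eigen::half, 0> full_max = t.maximum().cast<Eigen::half>();
  Eigen::Tensor<Eigen::half, 0> half_max = t.cast<Eigen::half>().maximum();

  std::cout << "sum: " << static_cast<float>(full_sum()) << " vs "
            << static_cast<float>(half_sum()) << "\n";
  std::cout << "max: " << static_cast<float>(full_max()) << " vs "
            << static_cast<float>(half_max()) << "\n";
  return 0;
}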

void test_gpu_forced_evals(const Eigen::SyclDevice &sycl_device) {
  int num_elem = 101;

  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_res_half1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_res_half2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
  float* d_res_float = (float*)sycl_device.allocate(num_elem * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
      d_float, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
      d_res_half1, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
      d_res_half2, num_elem);
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
      d_res_float, num_elem);

  Eigen::array<int, 1> no_bcast;
  no_bcast[0] = 1;

  gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
  gpu_res_float.device(sycl_device) = gpu_float.abs();
  gpu_res_half1.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
  gpu_res_half2.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();

  Tensor<float, 1> half_prec1(num_elem);
  Tensor<float, 1> half_prec2(num_elem);
  Tensor<float, 1> full_prec(num_elem);
  sycl_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem * sizeof(float));
  sycl_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem * sizeof(float));
  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(float));
  sycl_device.synchronize();

  for (int i = 0; i < num_elem; ++i) {
    std::cout << "Checking forced eval " << i << " " << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
    VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
    VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
  }
}
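The two half-precision results above differ only in the size-preserving broadcast inserted before .eval(); both force the cast-and-abs subexpression to be materialized before the final cast back to float. The same forcing can be reproduced on the host; a broadcast factor of 1 leaves the data unchanged and merely adds another expression node. Sketch only, not part of the commit.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  const int n = 101;
  Eigen::Tensor<float, 1> x(n);
  x.setRandom();
  x = x - x.constant(0.5f);

  Eigen::array<int, 1> no_bcast = {1};  // factor 1: same shape, extra expression node

  Eigen::Tensor<float, 1> direct  = x.abs();
  Eigen::Tensor<float, 1> forced1 = x.cast<Eigen::half>().abs().eval().cast<float>();
  Eigen::Tensor<float, 1> forced2 =
      x.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();

  for (int i = 0; i < n; ++i)
    std::cout << direct(i) << " " << forced1(i) << " " << forced2(i) << "\n";
  return 0;
}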

EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_sycl)
{
  for (const auto& s : Eigen::get_sycl_supported_devices()) {
    QueueInterface queueInterface(s);
    auto sycl_device = Eigen::SyclDevice(&queueInterface);

    CALL_SUBTEST_1(test_gpu_numext(sycl_device));
    CALL_SUBTEST_1(test_gpu_conversion(sycl_device));
    CALL_SUBTEST_1(test_gpu_unary(sycl_device));
    CALL_SUBTEST_1(test_gpu_elementwise(sycl_device));
    CALL_SUBTEST_1(test_gpu_trancendental(sycl_device));
    CALL_SUBTEST_2(test_gpu_contractions(sycl_device));
    CALL_SUBTEST_3(test_gpu_reductions(sycl_device));
    CALL_SUBTEST_4(test_gpu_full_reductions(sycl_device));
    CALL_SUBTEST_5(test_gpu_forced_evals(sycl_device));
  }
}

@ -152,6 +152,7 @@ template<typename DataType, typename dev_Selector> void sycl_padding_test_per_de
 EIGEN_DECLARE_TEST(cxx11_tensor_padding_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_padding_test_per_device<half>(device));
     CALL_SUBTEST(sycl_padding_test_per_device<float>(device));
   }
 }

@ -244,6 +244,7 @@ template<typename DataType, typename dev_Selector> void sycl_tensor_patch_test_p
 EIGEN_DECLARE_TEST(cxx11_tensor_patch_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_patch_test_per_device<half>(device));
     CALL_SUBTEST(sycl_tensor_patch_test_per_device<float>(device));
   }
 }

@ -78,6 +78,7 @@ template<typename DataType, typename dev_Selector> void sycl_random_test_per_dev
 EIGEN_DECLARE_TEST(cxx11_tensor_random_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_random_test_per_device<half>(device));
     CALL_SUBTEST(sycl_random_test_per_device<float>(device));
 #ifdef EIGEN_SYCL_DOUBLE_SUPPORT
     CALL_SUBTEST(sycl_random_test_per_device<double>(device));

@ -248,6 +248,7 @@ EIGEN_DECLARE_TEST(cxx11_tensor_reverse_sycl) {
 #ifdef EIGEN_SYCL_DOUBLE_SUPPORT
     CALL_SUBTEST_4(sycl_reverse_test_per_device<double>(device));
 #endif
-    CALL_SUBTEST_5(sycl_reverse_test_per_device<float>(device));
+    CALL_SUBTEST_5(sycl_reverse_test_per_device<half>(device));
+    CALL_SUBTEST_6(sycl_reverse_test_per_device<float>(device));
   }
 }

@ -112,6 +112,7 @@ void sycl_shuffling_test_per_device(dev_Selector s) {
 }
 EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_shuffling_test_per_device<half>(device));
     CALL_SUBTEST(sycl_shuffling_test_per_device<float>(device));
   }
 }

@ -217,6 +217,7 @@ test_entire_volume_patch_sycl<DataType, int64_t>(sycl_device);
 EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch_sycl)
 {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<half>(device));
     CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<float>(device));
   }
 }