From ba47341a14d3c883307548d19a095c066fbb9830 Mon Sep 17 00:00:00 2001
From: Alejandro Acosta
Date: Thu, 27 Apr 2023 14:01:11 +0100
Subject: [PATCH] [SYCL-2020] Enabling half precision support for SYCL.

---
 Eigen/src/Core/arch/SYCL/InteropHeaders.h     |   4 +
 Eigen/src/Core/arch/SYCL/MathFunctions.h      |  24 ++
 Eigen/src/Core/arch/SYCL/PacketMath.h         | 292 ++++++++++++-
 .../CXX11/src/Tensor/TensorContractionSycl.h  |   4 +-
 unsupported/test/CMakeLists.txt               |   1 +
 unsupported/test/cxx11_tensor_argmax_sycl.cpp |  15 +-
 .../test/cxx11_tensor_broadcast_sycl.cpp      |   1 +
 .../test/cxx11_tensor_builtins_sycl.cpp       | 138 +++++-
 .../test/cxx11_tensor_chipping_sycl.cpp       |   1 +
 .../test/cxx11_tensor_concatenation_sycl.cpp  |   1 +
 unsupported/test/cxx11_tensor_device_sycl.cpp |   1 +
 .../test/cxx11_tensor_image_op_sycl.cpp       |   1 +
 .../test/cxx11_tensor_inflation_sycl.cpp      |   1 +
 .../test/cxx11_tensor_layout_swap_sycl.cpp    |   1 +
 unsupported/test/cxx11_tensor_math_sycl.cpp   |   1 +
 .../test/cxx11_tensor_morphing_sycl.cpp       |   1 +
 .../test/cxx11_tensor_of_float16_sycl.cpp     | 406 ++++++++++++++++++
 .../test/cxx11_tensor_padding_sycl.cpp        |   1 +
 unsupported/test/cxx11_tensor_patch_sycl.cpp  |   1 +
 unsupported/test/cxx11_tensor_random_sycl.cpp |   1 +
 .../test/cxx11_tensor_reverse_sycl.cpp        |   3 +-
 .../test/cxx11_tensor_shuffling_sycl.cpp      |   1 +
 .../test/cxx11_tensor_volume_patch_sycl.cpp   |   1 +
 23 files changed, 873 insertions(+), 28 deletions(-)
 create mode 100644 unsupported/test/cxx11_tensor_of_float16_sycl.cpp

diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
index ed4edc11e..27d9a82d3 100644
--- a/Eigen/src/Core/arch/SYCL/InteropHeaders.h
+++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
@@ -86,6 +86,8 @@ struct sycl_packet_traits : default_packet_traits {
     typedef packet_type half; \
   };
 
+SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, Eigen::half, 8)
+SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, const Eigen::half, 8)
 SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4)
 SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4)
 SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2)
@@ -100,6 +102,7 @@ SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2)
   struct is_arithmetic<packet_type> { \
     enum { value = true };            \
   };
+SYCL_ARITHMETIC(cl::sycl::cl_half8)
 SYCL_ARITHMETIC(cl::sycl::cl_float4)
 SYCL_ARITHMETIC(cl::sycl::cl_double2)
 #undef SYCL_ARITHMETIC
@@ -111,6 +114,7 @@ SYCL_ARITHMETIC(cl::sycl::cl_double2)
     enum { size = lengths, vectorizable = true, alignment = Aligned16 }; \
     typedef packet_type half;                                            \
   };
+SYCL_UNPACKET_TRAITS(cl::sycl::cl_half8, Eigen::half, 8)
 SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)
 SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2)
 
diff --git a/Eigen/src/Core/arch/SYCL/MathFunctions.h b/Eigen/src/Core/arch/SYCL/MathFunctions.h
index bdbb21f9f..a8adc46c5 100644
--- a/Eigen/src/Core/arch/SYCL/MathFunctions.h
+++ b/Eigen/src/Core/arch/SYCL/MathFunctions.h
@@ -38,6 +38,7 @@ namespace internal {
     return cl::sycl::log(a); \
   }
 
+SYCL_PLOG(cl::sycl::cl_half8)
 SYCL_PLOG(cl::sycl::cl_float4)
 SYCL_PLOG(cl::sycl::cl_double2)
 #undef SYCL_PLOG
@@ -49,6 +50,7 @@ SYCL_PLOG(cl::sycl::cl_double2)
     return cl::sycl::log1p(a); \
   }
 
+SYCL_PLOG1P(cl::sycl::cl_half8)
 SYCL_PLOG1P(cl::sycl::cl_float4)
 SYCL_PLOG1P(cl::sycl::cl_double2)
 #undef SYCL_PLOG1P
@@ -60,6 +62,7 @@ SYCL_PLOG1P(cl::sycl::cl_double2)
     return cl::sycl::log10(a); \
   }
 
+SYCL_PLOG10(cl::sycl::cl_half8)
 SYCL_PLOG10(cl::sycl::cl_float4)
 SYCL_PLOG10(cl::sycl::cl_double2)
 #undef SYCL_PLOG10
@@ -71,6 +74,8 @@ SYCL_PLOG10(cl::sycl::cl_double2)
     return cl::sycl::exp(a); \
   }
 
+SYCL_PEXP(cl::sycl::cl_half8)
+SYCL_PEXP(cl::sycl::cl_half)
 SYCL_PEXP(cl::sycl::cl_float4)
 SYCL_PEXP(cl::sycl::cl_float)
 SYCL_PEXP(cl::sycl::cl_double2)
@@ -83,6 +88,7 @@ SYCL_PEXP(cl::sycl::cl_double2)
     return cl::sycl::expm1(a); \
   }
 
+SYCL_PEXPM1(cl::sycl::cl_half8)
 SYCL_PEXPM1(cl::sycl::cl_float4)
 SYCL_PEXPM1(cl::sycl::cl_double2)
 #undef SYCL_PEXPM1
@@ -94,6 +100,7 @@ SYCL_PEXPM1(cl::sycl::cl_double2)
     return cl::sycl::sqrt(a); \
   }
 
+SYCL_PSQRT(cl::sycl::cl_half8)
 SYCL_PSQRT(cl::sycl::cl_float4)
 SYCL_PSQRT(cl::sycl::cl_double2)
 #undef SYCL_PSQRT
@@ -105,6 +112,7 @@ SYCL_PSQRT(cl::sycl::cl_double2)
     return cl::sycl::rsqrt(a); \
   }
 
+SYCL_PRSQRT(cl::sycl::cl_half8)
 SYCL_PRSQRT(cl::sycl::cl_float4)
 SYCL_PRSQRT(cl::sycl::cl_double2)
 #undef SYCL_PRSQRT
@@ -117,6 +125,7 @@ SYCL_PRSQRT(cl::sycl::cl_double2)
     return cl::sycl::sin(a); \
   }
 
+SYCL_PSIN(cl::sycl::cl_half8)
 SYCL_PSIN(cl::sycl::cl_float4)
 SYCL_PSIN(cl::sycl::cl_double2)
 #undef SYCL_PSIN
@@ -129,6 +138,7 @@ SYCL_PSIN(cl::sycl::cl_double2)
     return cl::sycl::cos(a); \
   }
 
+SYCL_PCOS(cl::sycl::cl_half8)
 SYCL_PCOS(cl::sycl::cl_float4)
 SYCL_PCOS(cl::sycl::cl_double2)
 #undef SYCL_PCOS
@@ -141,6 +151,7 @@ SYCL_PCOS(cl::sycl::cl_double2)
     return cl::sycl::tan(a); \
   }
 
+SYCL_PTAN(cl::sycl::cl_half8)
 SYCL_PTAN(cl::sycl::cl_float4)
 SYCL_PTAN(cl::sycl::cl_double2)
 #undef SYCL_PTAN
@@ -153,6 +164,7 @@ SYCL_PTAN(cl::sycl::cl_double2)
     return cl::sycl::asin(a); \
   }
 
+SYCL_PASIN(cl::sycl::cl_half8)
 SYCL_PASIN(cl::sycl::cl_float4)
 SYCL_PASIN(cl::sycl::cl_double2)
 #undef SYCL_PASIN
@@ -165,6 +177,7 @@ SYCL_PASIN(cl::sycl::cl_double2)
     return cl::sycl::acos(a); \
   }
 
+SYCL_PACOS(cl::sycl::cl_half8)
 SYCL_PACOS(cl::sycl::cl_float4)
 SYCL_PACOS(cl::sycl::cl_double2)
 #undef SYCL_PACOS
@@ -177,6 +190,7 @@ SYCL_PACOS(cl::sycl::cl_double2)
     return cl::sycl::atan(a); \
   }
 
+SYCL_PATAN(cl::sycl::cl_half8)
 SYCL_PATAN(cl::sycl::cl_float4)
 SYCL_PATAN(cl::sycl::cl_double2)
 #undef SYCL_PATAN
@@ -189,6 +203,7 @@ SYCL_PATAN(cl::sycl::cl_double2)
     return cl::sycl::sinh(a); \
   }
 
+SYCL_PSINH(cl::sycl::cl_half8)
 SYCL_PSINH(cl::sycl::cl_float4)
 SYCL_PSINH(cl::sycl::cl_double2)
 #undef SYCL_PSINH
@@ -201,6 +216,7 @@ SYCL_PSINH(cl::sycl::cl_double2)
     return cl::sycl::cosh(a); \
   }
 
+SYCL_PCOSH(cl::sycl::cl_half8)
 SYCL_PCOSH(cl::sycl::cl_float4)
 SYCL_PCOSH(cl::sycl::cl_double2)
 #undef SYCL_PCOSH
@@ -213,6 +229,7 @@ SYCL_PCOSH(cl::sycl::cl_double2)
     return cl::sycl::tanh(a); \
   }
 
+SYCL_PTANH(cl::sycl::cl_half8)
 SYCL_PTANH(cl::sycl::cl_float4)
 SYCL_PTANH(cl::sycl::cl_double2)
 #undef SYCL_PTANH
@@ -224,6 +241,7 @@ SYCL_PTANH(cl::sycl::cl_double2)
     return cl::sycl::ceil(a); \
   }
 
+SYCL_PCEIL(cl::sycl::cl_half8)
 SYCL_PCEIL(cl::sycl::cl_float4)
 SYCL_PCEIL(cl::sycl::cl_double2)
 #undef SYCL_PCEIL
@@ -235,6 +253,7 @@ SYCL_PCEIL(cl::sycl::cl_double2)
     return cl::sycl::round(a); \
   }
 
+SYCL_PROUND(cl::sycl::cl_half8)
 SYCL_PROUND(cl::sycl::cl_float4)
 SYCL_PROUND(cl::sycl::cl_double2)
 #undef SYCL_PROUND
@@ -246,6 +265,7 @@ SYCL_PROUND(cl::sycl::cl_double2)
     return cl::sycl::rint(a); \
   }
 
+SYCL_PRINT(cl::sycl::cl_half8)
 SYCL_PRINT(cl::sycl::cl_float4)
 SYCL_PRINT(cl::sycl::cl_double2)
 #undef SYCL_PRINT
@@ -257,6 +277,7 @@ SYCL_PRINT(cl::sycl::cl_double2)
     return cl::sycl::floor(a); \
   }
 
+SYCL_FLOOR(cl::sycl::cl_half8)
 SYCL_FLOOR(cl::sycl::cl_float4)
 SYCL_FLOOR(cl::sycl::cl_double2)
 #undef SYCL_FLOOR
@@ -268,6 +289,7 @@ SYCL_FLOOR(cl::sycl::cl_double2)
     return expr; \
   }
 
+SYCL_PMIN(cl::sycl::cl_half8, cl::sycl::fmin(a, b))
 SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b))
 SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
 #undef SYCL_PMIN
@@ -279,6 +301,7 @@ SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
     return expr; \
   }
 
+SYCL_PMAX(cl::sycl::cl_half8, cl::sycl::fmax(a, b))
 SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b))
 SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
 #undef SYCL_PMAX
@@ -292,6 +315,7 @@ SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
         cl::sycl::rounding_mode::automatic>()); \
   }
 
+SYCL_PLDEXP(cl::sycl::cl_half8)
 SYCL_PLDEXP(cl::sycl::cl_float4)
 SYCL_PLDEXP(cl::sycl::cl_double2)
 #undef SYCL_PLDEXP
diff --git a/Eigen/src/Core/arch/SYCL/PacketMath.h b/Eigen/src/Core/arch/SYCL/PacketMath.h
index fbdcdc024..4b0b1c6c0 100644
--- a/Eigen/src/Core/arch/SYCL/PacketMath.h
+++ b/Eigen/src/Core/arch/SYCL/PacketMath.h
@@ -44,9 +44,34 @@ SYCL_PLOAD(cl::sycl::cl_float4, u)
 SYCL_PLOAD(cl::sycl::cl_float4, )
 SYCL_PLOAD(cl::sycl::cl_double2, u)
 SYCL_PLOAD(cl::sycl::cl_double2, )
-
 #undef SYCL_PLOAD
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
+pload<cl::sycl::cl_half8>(
+    const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<const cl::sycl::half*>(from));
+  cl::sycl::cl_half8 res{};
+  res.load(0, ptr);
+  return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
+ploadu<cl::sycl::cl_half8>(
+    const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<const cl::sycl::half*>(from));
+  cl::sycl::cl_half8 res{};
+  res.load(0, ptr);
+  return res;
+}
+
 #define SYCL_PSTORE(scalar, packet_type, alignment)               \
   template <>                                                     \
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment(   \
@@ -59,9 +84,28 @@ SYCL_PSTORE(float, cl::sycl::cl_float4, )
 SYCL_PSTORE(float, cl::sycl::cl_float4, u)
 SYCL_PSTORE(double, cl::sycl::cl_double2, )
 SYCL_PSTORE(double, cl::sycl::cl_double2, u)
-
 #undef SYCL_PSTORE
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoreu<Eigen::half>(
+    Eigen::half* to, const cl::sycl::cl_half8& from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<cl::sycl::half*>(to));
+  from.store(0, ptr);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore<Eigen::half>(
+    Eigen::half* to, const cl::sycl::cl_half8& from) {
+  auto ptr = cl::sycl::address_space_cast<
+      cl::sycl::access::address_space::generic_space,
+      cl::sycl::access::decorated::no>(
+      reinterpret_cast<cl::sycl::half*>(to));
+  from.store(0, ptr);
+}
+
 #define SYCL_PSET1(packet_type)                                         \
   template <>                                                           \
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>( \
@@ -70,6 +114,7 @@ SYCL_PSTORE(double, cl::sycl::cl_double2, u)
   }
 
 // global space
+SYCL_PSET1(cl::sycl::cl_half8)
 SYCL_PSET1(cl::sycl::cl_float4)
 SYCL_PSET1(cl::sycl::cl_double2)
 
@@ -86,6 +131,58 @@ struct get_base_packet {
   get_pgather(sycl_multi_pointer, Index) {}
 };
 
+template <>
+struct get_base_packet<cl::sycl::cl_half8> {
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_ploaddup(
+      sycl_multi_pointer from) {
+    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0]),
+                              static_cast<cl::sycl::half>(from[0]),
+                              static_cast<cl::sycl::half>(from[1]),
+                              static_cast<cl::sycl::half>(from[1]),
+                              static_cast<cl::sycl::half>(from[2]),
+                              static_cast<cl::sycl::half>(from[2]),
+                              static_cast<cl::sycl::half>(from[3]),
+                              static_cast<cl::sycl::half>(from[3]));
+  }
+  template <typename sycl_multi_pointer, typename Index>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_pgather(
+      sycl_multi_pointer from, Index stride) {
+    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0 * stride]),
+                              static_cast<cl::sycl::half>(from[1 * stride]),
+                              static_cast<cl::sycl::half>(from[2 * stride]),
+                              static_cast<cl::sycl::half>(from[3 * stride]),
+                              static_cast<cl::sycl::half>(from[4 * stride]),
+                              static_cast<cl::sycl::half>(from[5 * stride]),
+                              static_cast<cl::sycl::half>(from[6 * stride]),
+                              static_cast<cl::sycl::half>(from[7 * stride]));
+  }
+
+  template <typename sycl_multi_pointer, typename Index>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(
+      sycl_multi_pointer to, const cl::sycl::cl_half8& from, Index stride) {
+    auto tmp = stride;
+    to[0] = Eigen::half(from.s0());
+    to[tmp] = Eigen::half(from.s1());
+    to[tmp += stride] = Eigen::half(from.s2());
+    to[tmp += stride] = Eigen::half(from.s3());
+    to[tmp += stride] = Eigen::half(from.s4());
+    to[tmp += stride] = Eigen::half(from.s5());
+    to[tmp += stride] = Eigen::half(from.s6());
+    to[tmp += stride] = Eigen::half(from.s7());
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 set_plset(
+      const cl::sycl::half& a) {
+    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(a),
+                              static_cast<cl::sycl::half>(a + 1),
+                              static_cast<cl::sycl::half>(a + 2),
+                              static_cast<cl::sycl::half>(a + 3),
+                              static_cast<cl::sycl::half>(a + 4),
+                              static_cast<cl::sycl::half>(a + 5),
+                              static_cast<cl::sycl::half>(a + 6),
+                              static_cast<cl::sycl::half>(a + 7));
+  }
+};
+
 template <>
 struct get_base_packet<cl::sycl::cl_float4> {
   template <typename sycl_multi_pointer>
@@ -152,6 +249,7 @@ struct get_base_packet<cl::sycl::cl_double2> {
     return get_base_packet<packet_type>::get_ploaddup(from); \
   }
 
+SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_half8)
 SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4)
 SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
 
@@ -165,9 +263,14 @@ SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
   }
 SYCL_PLSET(cl::sycl::cl_float4)
 SYCL_PLSET(cl::sycl::cl_double2)
-
 #undef SYCL_PLSET
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 plset<cl::sycl::cl_half8>(
+    const typename unpacket_traits<cl::sycl::cl_half8>::type& a) {
+  return get_base_packet<cl::sycl::cl_half8>::set_plset((const cl::sycl::half&)a);
+}
+
 #define SYCL_PGATHER_SPECILIZE(scalar, packet_type)             \
   template <>                                                   \
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type             \
@@ -176,9 +279,9 @@ SYCL_PLSET(cl::sycl::cl_double2)
     return get_base_packet<packet_type>::get_pgather(from, stride); \
   }
 
+SYCL_PGATHER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
 SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4)
 SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
-
 #undef SYCL_PGATHER_SPECILIZE
 
 #define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \
@@ -189,6 +292,7 @@ SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
     get_base_packet<packet_type>::set_pscatter(to, from, stride); \
   }
 
+SYCL_PSCATTER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
 SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4)
 SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
 
@@ -201,10 +305,16 @@ SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
     return cl::sycl::mad(a, b, c); \
   }
 
+SYCL_PMAD(cl::sycl::cl_half8)
 SYCL_PMAD(cl::sycl::cl_float4)
 SYCL_PMAD(cl::sycl::cl_double2)
 #undef SYCL_PMAD
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half pfirst<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0());
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -216,6 +326,13 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst<cl::sycl::cl_double2>(
   return a.x();
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0() + a.s1() + a.s2() + a.s3() + a.s4() + a.s5() +
+                     a.s6() + a.s7());
+}
+
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -228,6 +345,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux<cl::sycl::cl_double2>(
   return a.x() + a.y();
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_max<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(cl::sycl::fmax(
+      cl::sycl::fmax(
+          cl::sycl::fmax(a.s0(), a.s1()),
+          cl::sycl::fmax(a.s2(), a.s3())),
+      cl::sycl::fmax(
+          cl::sycl::fmax(a.s4(), a.s5()),
+          cl::sycl::fmax(a.s6(), a.s7()))));
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -240,6 +368,17 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max<cl::sycl::cl_double2>(
   return cl::sycl::fmax(a.x(), a.y());
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_min<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(cl::sycl::fmin(
+      cl::sycl::fmin(
+          cl::sycl::fmin(a.s0(), a.s1()),
+          cl::sycl::fmin(a.s2(), a.s3())),
+      cl::sycl::fmin(
+          cl::sycl::fmin(a.s4(), a.s5()),
+          cl::sycl::fmin(a.s6(), a.s7()))));
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -252,6 +391,12 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min<cl::sycl::cl_double2>(
   return cl::sycl::fmin(a.x(), a.y());
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_mul<cl::sycl::cl_half8>(
+    const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0() * a.s1() * a.s2() * a.s3() * a.s4() * a.s5() *
+                     a.s6() * a.s7());
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(
     const cl::sycl::cl_float4& a) {
@@ -263,6 +408,14 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul<cl::sycl::cl_double2>(
   return a.x() * a.y();
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
+pabs<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+  return cl::sycl::cl_half8(cl::sycl::fabs(a.s0()), cl::sycl::fabs(a.s1()),
+                            cl::sycl::fabs(a.s2()), cl::sycl::fabs(a.s3()),
+                            cl::sycl::fabs(a.s4()), cl::sycl::fabs(a.s5()),
+                            cl::sycl::fabs(a.s6()), cl::sycl::fabs(a.s7()));
+}
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
 pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
@@ -300,6 +453,9 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_eq(const Packet &a,
     return sycl_pcmp_##OP<packet_type>(a, b); \
   }
 
+SYCL_PCMP(le, cl::sycl::cl_half8)
+SYCL_PCMP(lt, cl::sycl::cl_half8)
+SYCL_PCMP(eq, cl::sycl::cl_half8)
 SYCL_PCMP(le, cl::sycl::cl_float4)
 SYCL_PCMP(lt, cl::sycl::cl_float4)
 SYCL_PCMP(eq, cl::sycl::cl_float4)
@@ -308,6 +464,121 @@ SYCL_PCMP(lt, cl::sycl::cl_double2)
 SYCL_PCMP(eq, cl::sycl::cl_double2)
 #undef SYCL_PCMP
 
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
+    PacketBlock<cl::sycl::cl_half8, 8>& kernel) {
+  cl::sycl::half tmp = kernel.packet[0].s1();
+  kernel.packet[0].s1() = kernel.packet[1].s0();
+  kernel.packet[1].s0() = tmp;
+
+  tmp = kernel.packet[0].s2();
+  kernel.packet[0].s2() = kernel.packet[2].s0();
+  kernel.packet[2].s0() = tmp;
+
+  tmp = kernel.packet[0].s3();
+  kernel.packet[0].s3() = kernel.packet[3].s0();
+  kernel.packet[3].s0() = tmp;
+
+  tmp = kernel.packet[0].s4();
+  kernel.packet[0].s4() = kernel.packet[4].s0();
+  kernel.packet[4].s0() = tmp;
+
+  tmp = kernel.packet[0].s5();
+  kernel.packet[0].s5() = kernel.packet[5].s0();
+  kernel.packet[5].s0() = tmp;
+
+  tmp = kernel.packet[0].s6();
+  kernel.packet[0].s6() = kernel.packet[6].s0();
+  kernel.packet[6].s0() = tmp;
+
+  tmp = kernel.packet[0].s7();
+  kernel.packet[0].s7() = kernel.packet[7].s0();
+  kernel.packet[7].s0() = tmp;
+
+  tmp = kernel.packet[1].s2();
+  kernel.packet[1].s2() = kernel.packet[2].s1();
+  kernel.packet[2].s1() = tmp;
+
+  tmp = kernel.packet[1].s3();
+  kernel.packet[1].s3() = kernel.packet[3].s1();
+  kernel.packet[3].s1() = tmp;
+
+  tmp = kernel.packet[1].s4();
+  kernel.packet[1].s4() = kernel.packet[4].s1();
+  kernel.packet[4].s1() = tmp;
+
+  tmp = kernel.packet[1].s5();
+  kernel.packet[1].s5() = kernel.packet[5].s1();
+  kernel.packet[5].s1() = tmp;
+
+  tmp = kernel.packet[1].s6();
+  kernel.packet[1].s6() = kernel.packet[6].s1();
+  kernel.packet[6].s1() = tmp;
+
+  tmp = kernel.packet[1].s7();
+  kernel.packet[1].s7() = kernel.packet[7].s1();
+  kernel.packet[7].s1() = tmp;
+
+  tmp = kernel.packet[2].s3();
+  kernel.packet[2].s3() = kernel.packet[3].s2();
+  kernel.packet[3].s2() = tmp;
+
+  tmp = kernel.packet[2].s4();
+  kernel.packet[2].s4() = kernel.packet[4].s2();
+  kernel.packet[4].s2() = tmp;
+
+  tmp = kernel.packet[2].s5();
+  kernel.packet[2].s5() = kernel.packet[5].s2();
+  kernel.packet[5].s2() = tmp;
+
+  tmp = kernel.packet[2].s6();
+  kernel.packet[2].s6() = kernel.packet[6].s2();
+  kernel.packet[6].s2() = tmp;
+
+  tmp = kernel.packet[2].s7();
+  kernel.packet[2].s7() = kernel.packet[7].s2();
+  kernel.packet[7].s2() = tmp;
+
+  tmp = kernel.packet[3].s4();
+  kernel.packet[3].s4() = kernel.packet[4].s3();
+  kernel.packet[4].s3() = tmp;
+
+  tmp = kernel.packet[3].s5();
+  kernel.packet[3].s5() = kernel.packet[5].s3();
+  kernel.packet[5].s3() = tmp;
+
+  tmp = kernel.packet[3].s6();
+  kernel.packet[3].s6() = kernel.packet[6].s3();
+  kernel.packet[6].s3() = tmp;
+
+  tmp = kernel.packet[3].s7();
+  kernel.packet[3].s7() = kernel.packet[7].s3();
+  kernel.packet[7].s3() = tmp;
+
+  tmp = kernel.packet[4].s5();
+  kernel.packet[4].s5() = kernel.packet[5].s4();
+  kernel.packet[5].s4() = tmp;
+
+  tmp = kernel.packet[4].s6();
+  kernel.packet[4].s6() = kernel.packet[6].s4();
+  kernel.packet[6].s4() = tmp;
+
+  tmp = kernel.packet[4].s7();
+  kernel.packet[4].s7() = kernel.packet[7].s4();
+  kernel.packet[7].s4() = tmp;
+
+  tmp = kernel.packet[5].s6();
+  kernel.packet[5].s6() = kernel.packet[6].s5();
+  kernel.packet[6].s5() = tmp;
+
+  tmp = kernel.packet[5].s7();
+  kernel.packet[5].s7() = kernel.packet[7].s5();
+  kernel.packet[7].s5() = tmp;
+
+  tmp = kernel.packet[6].s7();
+  kernel.packet[6].s7() = kernel.packet[7].s6();
+  kernel.packet[7].s6() = tmp;
+}
+
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
     PacketBlock<cl::sycl::cl_float4, 4>& kernel) {
   float tmp = kernel.packet[0].y();
@@ -342,6 +613,19 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
   kernel.packet[1].x() = tmp;
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 pblend(
+    const Selector<unpacket_traits<cl::sycl::cl_half8>::size>& ifPacket,
+    const cl::sycl::cl_half8& thenPacket,
+    const cl::sycl::cl_half8& elsePacket) {
+  cl::sycl::cl_short8 condition(
+      ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1,
+      ifPacket.select[2] ? 0 : -1, ifPacket.select[3] ? 0 : -1,
+      ifPacket.select[4] ? 0 : -1, ifPacket.select[5] ? 0 : -1,
+      ifPacket.select[6] ? 0 : -1, ifPacket.select[7] ? 0 : -1);
+  return cl::sycl::select(thenPacket, elsePacket, condition);
+}
+
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pblend(
     const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
index cb74031fa..480e3ac02 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -1118,7 +1118,7 @@ struct GeneralVectorTensor {
                       : globalContractDimOffset + privateOffsetC)
                 : OutScalar(0);
-        outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]);
+        outScalar[j] = ::Eigen::internal::pmadd(matScalar, vecScalar, outScalar[j]);
         privateOffsetNC += Properties::LocalThreadSizeNC;
       }
       privateOffsetC += Properties::LocalThreadSizeC;
@@ -1263,7 +1263,7 @@ struct GeneralScalarContraction {
   StorageIndex localid = itemID.get_local_id(0);
   OutScalar accumulator = OutScalar(0);
   for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
-    accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator);
+    accumulator = Eigen::internal::pmadd(lhs(0, i), rhs(i, 0), accumulator);
   }
   auto out_scratch_ptr = scratch_ptr + localid;
   *out_scratch_ptr = accumulator;
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 1d40ae56c..d41baf2ea 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -151,6 +151,7 @@ if(EIGEN_TEST_SYCL)
     ei_add_test(cxx11_tensor_argmax_sycl)
     ei_add_test(cxx11_tensor_custom_op_sycl)
     ei_add_test(cxx11_tensor_scan_sycl)
+    ei_add_test(cxx11_tensor_of_float16_sycl)
     set(EIGEN_SYCL OFF)
 endif()
 
diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
index 41ea3cf7b..8f4e09573 100644
--- a/unsupported/test/cxx11_tensor_argmax_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp
@@ -32,9 +32,9 @@ static void test_sycl_simple_argmax(const Eigen::SyclDevice& sycl_device) {
   Tensor<DenseIndex, 0, Layout, DenseIndex> out_max;
   Tensor<DenseIndex, 0, Layout, DenseIndex> out_min;
   in.setRandom();
-  in *= in.constant(100.0);
-  in(0, 0, 0) = -1000.0;
-  in(1, 1, 1) = 1000.0;
+  in *= in.constant(static_cast<DataType>(100.0));
+  in(0, 0, 0) = static_cast<DataType>(-1000.0);
+  in(1, 1, 1) = static_cast<DataType>(1000.0);
 
   std::size_t in_bytes = in.size() * sizeof(DataType);
   std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
@@ -93,7 +93,7 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) {
           ix[3] = l;
           // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l)
           // = 10.0
-          tensor(ix) = (ix[dim] != 0) ? -1.0 : 10.0;
+          tensor(ix) = static_cast<DataType>((ix[dim] != 0) ? -1.0 : 10.0);
         }
       }
     }
@@ -132,7 +132,7 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) {
           ix[2] = k;
           ix[3] = l;
           // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
-          tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0;
+          tensor(ix) = static_cast<DataType>((ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0);
         }
       }
     }
@@ -180,7 +180,7 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) {
           ix[2] = k;
           ix[3] = l;
           // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
-          tensor(ix) = (ix[dim] != 0) ? 1.0 : -10.0;
+          tensor(ix) = static_cast<DataType>((ix[dim] != 0) ? 1.0 : -10.0);
         }
       }
     }
@@ -219,7 +219,7 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) {
           ix[2] = k;
           ix[3] = l;
           // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
-          tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0;
+          tensor(ix) = static_cast<DataType>((ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0);
         }
       }
     }
@@ -252,6 +252,7 @@ void sycl_argmax_test_per_device(const Device_Selector& d) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_argmax_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_argmax_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
index 20f84b8e0..6ca5ac714 100644
--- a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
@@ -139,6 +139,7 @@ template <typename DataType> void sycl_broadcast_test_per_device(const cl::sycl::device& d) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_broadcast_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_broadcast_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
index be7afd200..c8668d847 100644
--- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
@@ -28,26 +28,135 @@ using Eigen::TensorMap;
 
 // Functions used to compare the TensorMap implementation on the device with
 // the equivalent on the host
 namespace SYCL {
-template <typename T> T abs(T x) {
+
+template <typename T> T abs(T x) {
   return cl::sycl::abs(x);
 }
+template <> Eigen::half abs(Eigen::half x) {
+  return Eigen::half(cl::sycl::fabs(static_cast<cl::sycl::half>(x)));
+}
 
-template <> float abs(float x) {
+template <> float abs(float x) {
   return cl::sycl::fabs(x);
 }
-template <> double abs(double x) {
+template <> double abs(double x) {
   return cl::sycl::fabs(x);
 }
 
 template <typename T> T square(T x) { return x * x; }
 template <typename T> T cube(T x) { return x * x * x; }
 template <typename T> T inverse(T x) { return T(1) / x; }
-template <typename T> T cwiseMax(T x, T y) {
-return cl::sycl::max(x, y);
-}
-template <typename T> T cwiseMin(T x, T y) {
-  return cl::sycl::min(x, y);
+template <typename T> T cwiseMax(T x, T y) {
+  return cl::sycl::max(x, y);
+}
+template <> Eigen::half cwiseMax(Eigen::half x, Eigen::half y) {
+  return Eigen::half(cl::sycl::max(static_cast<cl::sycl::half>(x), static_cast<cl::sycl::half>(y)));
+}
+
+template <typename T> T cwiseMin(T x, T y) {
+  return cl::sycl::min(x, y);
+}
+template <> Eigen::half cwiseMin(Eigen::half x, Eigen::half y) {
+  return Eigen::half(cl::sycl::min(static_cast<cl::sycl::half>(x), static_cast<cl::sycl::half>(y)));
+}
+
+template <typename T> T sqrt(T x) {
+  return cl::sycl::sqrt(x);
+}
+template <> Eigen::half sqrt(Eigen::half x) {
+  return Eigen::half(cl::sycl::sqrt(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T rsqrt(T x) {
+  return cl::sycl::rsqrt(x);
+}
+template <> Eigen::half rsqrt(Eigen::half x) {
+  return Eigen::half(cl::sycl::rsqrt(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T tanh(T x) {
+  return cl::sycl::tanh(x);
+}
+template <> Eigen::half tanh(Eigen::half x) {
+  return Eigen::half(cl::sycl::tanh(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T exp(T x) {
+  return cl::sycl::exp(x);
+}
+template <> Eigen::half exp(Eigen::half x) {
+  return Eigen::half(cl::sycl::exp(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T expm1(T x) {
+  return cl::sycl::expm1(x);
+}
+template <> Eigen::half expm1(Eigen::half x) {
+  return Eigen::half(cl::sycl::expm1(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T log(T x) {
+  return cl::sycl::log(x);
+}
+template <> Eigen::half log(Eigen::half x) {
+  return Eigen::half(cl::sycl::log(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T ceil(T x) {
+  return cl::sycl::ceil(x);
+}
+template <> Eigen::half ceil(Eigen::half x) {
+  return Eigen::half(cl::sycl::ceil(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T floor(T x) {
+  return cl::sycl::floor(x);
+}
+template <> Eigen::half floor(Eigen::half x) {
+  return Eigen::half(cl::sycl::floor(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T round(T x) {
+  return cl::sycl::round(x);
+}
+template <> Eigen::half round(Eigen::half x) {
+  return Eigen::half(cl::sycl::round(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T log1p(T x) {
+  return cl::sycl::log1p(x);
+}
+template <> Eigen::half log1p(Eigen::half x) {
+  return Eigen::half(cl::sycl::log1p(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T sign(T x) {
+  return cl::sycl::sign(x);
+}
+template <> Eigen::half sign(Eigen::half x) {
+  return Eigen::half(cl::sycl::sign(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T isnan(T x) {
+  return cl::sycl::isnan(x);
+}
+template <> Eigen::half isnan(Eigen::half x) {
+  return Eigen::half(cl::sycl::isnan(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T isfinite(T x) {
+  return cl::sycl::isfinite(x);
+}
+template <> Eigen::half isfinite(Eigen::half x) {
+  return Eigen::half(cl::sycl::isfinite(static_cast<cl::sycl::half>(x)));
+}
+
+template <typename T> T isinf(T x) {
+  return cl::sycl::isinf(x);
+}
+template <> Eigen::half isinf(Eigen::half x) {
+  return Eigen::half(cl::sycl::isinf(static_cast<cl::sycl::half>(x)));
+}
 }
 
@@ -157,15 +266,14 @@ void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
 #define DECLARE_UNARY_STRUCT(FUNC)                                   \
   struct op_##FUNC {                                                 \
     template <typename T>                                            \
-    auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) {     \
-      return cl::sycl::FUNC(x);                                      \
+    auto operator()(const T& x) -> decltype(SYCL::FUNC(x)) {         \
+      return SYCL::FUNC(x);                                          \
     }                                                                \
     template <typename T>                                            \
    auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) {    \
       return x.FUNC();                                               \
     }                                                                \
-  };
-
+};
 
 DECLARE_UNARY_STRUCT(sqrt)
 DECLARE_UNARY_STRUCT(rsqrt)
@@ -390,8 +498,10 @@ EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
     QueueInterface queueInterface(device);
     Eigen::SyclDevice sycl_device(&queueInterface);
-    CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device));
-    CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device));
-    CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device));
+    CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device));
+    CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device));
+    CALL_SUBTEST_3(test_builtin_unary_sycl<Eigen::half>(sycl_device));
+    CALL_SUBTEST_4(test_floating_builtin_binary_sycl<Eigen::half>(sycl_device));
+    CALL_SUBTEST_5(test_integer_builtin_binary_sycl<int>(sycl_device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_chipping_sycl.cpp b/unsupported/test/cxx11_tensor_chipping_sycl.cpp
index 1e7093104..b018da89a 100644
--- a/unsupported/test/cxx11_tensor_chipping_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_chipping_sycl.cpp
@@ -619,5 +619,6 @@ EIGEN_DECLARE_TEST(cxx11_tensor_chipping_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_chipping_test_per_device<float>(device));
+    CALL_SUBTEST(sycl_chipping_test_per_device<Eigen::half>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
index 765991b35..18dc95b67 100644
--- a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
@@ -175,6 +175,7 @@ template <typename DataType> void tensorConcat_perDevice(const cl::sycl::device& d) {
 }
 EIGEN_DECLARE_TEST(cxx11_tensor_concatenation_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(tensorConcat_perDevice<Eigen::half>(device));
     CALL_SUBTEST(tensorConcat_perDevice<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp
index 2f75ef896..66a13e817 100644
--- a/unsupported/test/cxx11_tensor_device_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -82,6 +82,7 @@ template <typename DataType> void sycl_device_test_per_device(const cl::sycl::device& d) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_device_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_device_test_per_device<float>(device));
     CALL_SUBTEST(sycl_device_test_per_device<OffByOneScalar<int>>(device));
   }
diff --git a/unsupported/test/cxx11_tensor_image_op_sycl.cpp b/unsupported/test/cxx11_tensor_image_op_sycl.cpp
index db1c0206e..083c004fe 100644
--- a/unsupported/test/cxx11_tensor_image_op_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_image_op_sycl.cpp
@@ -95,6 +95,7 @@ template <typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_image_op_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_computing_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
 #ifdef EIGEN_SYCL_DOUBLE_SUPPORT
     CALL_SUBTEST(sycl_computing_test_per_device<double>(device));
diff --git a/unsupported/test/cxx11_tensor_inflation_sycl.cpp b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
index 521ae0cc3..75c2c0e3e 100644
--- a/unsupported/test/cxx11_tensor_inflation_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
@@ -131,6 +131,7 @@ template <typename DataType, typename dev_Selector> void sycl_inflation_test_per_device(dev_Selector s) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_inflation_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_inflation_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_inflation_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
index 9546b911c..f556d84ea 100644
--- a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp
@@ -121,6 +121,7 @@ template <typename DataType, typename dev_Selector> void sycl_tensor_layout_swap_test_per_device(dev_Selector s) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_math_sycl.cpp b/unsupported/test/cxx11_tensor_math_sycl.cpp
index 029653e27..84d638e5b 100644
--- a/unsupported/test/cxx11_tensor_math_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_math_sycl.cpp
@@ -100,6 +100,7 @@ template <typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_math_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_computing_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
index bf001b40f..a1545d49a 100644
--- a/unsupported/test/cxx11_tensor_morphing_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
@@ -381,6 +381,7 @@ template <typename DataType, typename dev_Selector> void sycl_morphing_test_per_device(dev_Selector s) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_morphing_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_morphing_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_of_float16_sycl.cpp b/unsupported/test/cxx11_tensor_of_float16_sycl.cpp
new file mode 100644
index 000000000..1624acc4f
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_of_float16_sycl.cpp
@@ -0,0 +1,406 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023
+// Alejandro Acosta  Codeplay Software Ltd.
+// Contact:
+// Benoit Steiner
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_SYCL
+#define EIGEN_SYCL_HALF_SUPPORT
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::SyclDevice;
+
+void test_gpu_numext(const Eigen::SyclDevice &sycl_device) {
+  int num_elem = 101;
+
+  float* d_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+  bool* d_res_half = static_cast<bool*>(sycl_device.allocate(num_elem * sizeof(bool)));
+  bool* d_res_float = static_cast<bool*>(sycl_device.allocate(num_elem * sizeof(bool)));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(d_res_float, num_elem);
+
+  gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(sycl_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
+  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
+
+  Tensor<bool, 1> half_prec(num_elem);
+  Tensor<bool, 1> full_prec(num_elem);
+
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem * sizeof(bool));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(bool));
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking numext " << i << std::endl;
+    VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
+  }
+}
+
+void test_gpu_conversion(const Eigen::SyclDevice &sycl_device) {
+  int num_elem = 101;
+
+  float* d_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+  Eigen::half* d_half = static_cast<Eigen::half*>(sycl_device.allocate(num_elem * sizeof(Eigen::half)));
+  float* d_conv = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
+      d_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
+      d_conv, num_elem);
+
+  gpu_float.device(sycl_device) = gpu_float.random();
+  gpu_half.device(sycl_device) = gpu_float.cast<Eigen::half>();
+  gpu_conv.device(sycl_device) = gpu_half.cast<float>();
+
+  Tensor<float, 1> initial(num_elem);
+  Tensor<float, 1> final(num_elem);
+  sycl_device.memcpyDeviceToHost(initial.data(), d_float, num_elem * sizeof(float));
+  sycl_device.memcpyDeviceToHost(final.data(), d_conv, num_elem * sizeof(float));
+
+  for (int i = 0; i < num_elem; ++i) {
+    VERIFY_IS_APPROX(initial(i), final(i));
+  }
+}
+
+void test_gpu_unary(const Eigen::SyclDevice &sycl_device) {
+  int num_elem = 101;
+
+  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_res_half = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(sycl_device) = gpu_float.abs();
+  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();
+
+  Tensor<float, 1> half_prec(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem * sizeof(float));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(float));
+  sycl_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking unary " << i << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+}
+
+void test_gpu_elementwise(const Eigen::SyclDevice &sycl_device) {
+  int num_elem = 101;
+
+  float* d_float1 = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+  float* d_float2 = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+  float* d_res_half = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+  float* d_res_float = static_cast<float*>(sycl_device.allocate(num_elem * sizeof(float)));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(d_res_float, num_elem);
+
+  gpu_float1.device(sycl_device) = gpu_float1.random();
+  gpu_float2.device(sycl_device) = gpu_float2.random();
+  gpu_res_float.device(sycl_device) = (gpu_float1 + gpu_float2) * gpu_float1;
+  gpu_res_half.device(sycl_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();
+
+  Tensor<float, 1> half_prec(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem * sizeof(float));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(float));
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
+    VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
+  }
+}
+
+void test_gpu_transcendental(const Eigen::SyclDevice &sycl_device) {
+  int num_elem = 101;
+
+  float* d_float1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_float3 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res1_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res1_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res2_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res2_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res3_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res3_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem);
+
+  gpu_float1.device(sycl_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+  gpu_float2.device(sycl_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
+  gpu_float3.device(sycl_device) = gpu_float3.random();
+  gpu_res1_float.device(sycl_device) = gpu_float1.exp().cast<Eigen::half>();
+  gpu_res2_float.device(sycl_device) = gpu_float2.log().cast<Eigen::half>();
+  gpu_res3_float.device(sycl_device) = gpu_float3.log1p().cast<Eigen::half>();
+  gpu_res4_float.device(sycl_device) = gpu_float3.expm1().cast<Eigen::half>();
+
+  gpu_res1_half.device(sycl_device) = gpu_float1.cast<Eigen::half>();
+  gpu_res1_half.device(sycl_device) = gpu_res1_half.exp();
+
+  gpu_res2_half.device(sycl_device) = gpu_float2.cast<Eigen::half>();
+  gpu_res2_half.device(sycl_device) = gpu_res2_half.log();
+
+  gpu_res3_half.device(sycl_device) = gpu_float3.cast<Eigen::half>();
+  gpu_res3_half.device(sycl_device) = gpu_res3_half.log1p();
+
+  gpu_res4_half.device(sycl_device) = gpu_float3.cast<Eigen::half>();
+  gpu_res4_half.device(sycl_device) = gpu_res4_half.expm1();
+
+  Tensor<float, 1> input1(num_elem);
+  Tensor<Eigen::half, 1> half_prec1(num_elem);
+  Tensor<Eigen::half, 1> full_prec1(num_elem);
+  Tensor<float, 1> input2(num_elem);
+  Tensor<Eigen::half, 1> half_prec2(num_elem);
+  Tensor<Eigen::half, 1> full_prec2(num_elem);
+  Tensor<float, 1> input3(num_elem);
+  Tensor<Eigen::half, 1> half_prec3(num_elem);
+  Tensor<Eigen::half, 1> full_prec3(num_elem);
+  sycl_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem * sizeof(float));
+  sycl_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem * sizeof(float));
+  sycl_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem * sizeof(float));
+  sycl_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem * sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem * sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem * sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem * sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem * sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem * sizeof(Eigen::half));
+  sycl_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
+  }
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
+    if (std::abs(input2(i) - 1.f) < 0.05f)  // log lacks accuracy nearby 1
+      VERIFY_IS_APPROX(full_prec2(i) + Eigen::half(0.1f), half_prec2(i) + Eigen::half(0.1f));
+    else
+      VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
+  }
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise log1p " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
+  }
+}
+
+void test_gpu_contractions(const Eigen::SyclDevice &sycl_device) {
+  int rows = 23;
+  int cols = 23;
+  int num_elem = rows * cols;
+
+  float* d_float1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(num_elem * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+      d_float1, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+      d_float2, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
+      d_res_half, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
+      d_res_float, rows, cols);
+
+  gpu_float1.device(sycl_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+  gpu_float2.device(sycl_device) = gpu_float2.random() - gpu_float2.constant(0.5f);
+
+  typedef typename Tensor<float, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+  gpu_res_float.device(sycl_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
+  gpu_res_half.device(sycl_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);
+
+  Tensor<Eigen::half, 2> half_prec(rows, cols);
+  Tensor<Eigen::half, 2> full_prec(rows, cols);
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem * sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(Eigen::half));
+  sycl_device.synchronize();
+
+  for (int i = 0; i < rows; ++i) {
+    for (int j = 0; j < cols; ++j) {
+      std::cout << "Checking contract " << i << " " << j << " " << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
+      if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
+        VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
+      }
+    }
+  }
+}
+
+void test_gpu_reductions(const Eigen::SyclDevice &sycl_device, int size1, int size2, int redux) {
+  std::cout << "Reducing " << size1 << " by " << size2
+            << " tensor along dim " << redux << std::endl;
+
+  int num_elem = size1 * size2;
+  int result_size = (redux == 1 ? size1 : size2);
+
+  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(result_size * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(result_size * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
+      d_float, size1, size2);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, result_size);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, result_size);
+
+  gpu_float.device(sycl_device) = gpu_float.random() * 2.0f;
+
+  Eigen::array<int, 1> redux_dim = {{redux}};
+  gpu_res_float.device(sycl_device) = gpu_float.sum(redux_dim).cast<Eigen::half>();
+  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().sum(redux_dim);
+
+  Tensor<Eigen::half, 1> half_prec(result_size);
+  Tensor<Eigen::half, 1> full_prec(result_size);
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size * sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size * sizeof(Eigen::half));
+  sycl_device.synchronize();
+
+  for (int i = 0; i < result_size; ++i) {
+    std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+}
+
+void test_gpu_reductions(const Eigen::SyclDevice &sycl_device) {
+  test_gpu_reductions(sycl_device, 13, 13, 0);
+  test_gpu_reductions(sycl_device, 13, 13, 1);
+
+  test_gpu_reductions(sycl_device, 35, 36, 0);
+  test_gpu_reductions(sycl_device, 35, 36, 1);
+
+  test_gpu_reductions(sycl_device, 36, 35, 0);
+  test_gpu_reductions(sycl_device, 36, 35, 1);
+}
+
+void test_gpu_full_reductions(const Eigen::SyclDevice &sycl_device) {
+  int size = 13;
+  int num_elem = size * size;
+
+  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)sycl_device.allocate(1 * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)sycl_device.allocate(1 * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float(
+      d_float, size, size);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
+      d_res_half);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
+      d_res_float);
+
+  gpu_float.device(sycl_device) = gpu_float.random();
+
+  gpu_res_float.device(sycl_device) = gpu_float.sum().cast<Eigen::half>();
+  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().sum();
+
+  Tensor<Eigen::half, 0> half_prec;
+  Tensor<Eigen::half, 0> full_prec;
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+  sycl_device.synchronize();
+
+  VERIFY_IS_APPROX(full_prec(), half_prec());
+
+  gpu_res_float.device(sycl_device) = gpu_float.maximum().cast<Eigen::half>();
+  gpu_res_half.device(sycl_device) = gpu_float.cast<Eigen::half>().maximum();
+  sycl_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+  sycl_device.synchronize();
+
+  VERIFY_IS_APPROX(full_prec(), half_prec());
+}
+
+void test_gpu_forced_evals(const Eigen::SyclDevice &sycl_device) {
+  int num_elem = 101;
+
+  float* d_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_res_half1 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_res_half2 = (float*)sycl_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)sycl_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
+      d_res_half1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
+      d_res_half2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  Eigen::array<int, 1> no_bcast;
+  no_bcast[0] = 1;
+
+  gpu_float.device(sycl_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(sycl_device) = gpu_float.abs();
+  gpu_res_half1.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
+  gpu_res_half2.device(sycl_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();
+
+  Tensor<float, 1> half_prec1(num_elem);
+  Tensor<float, 1> half_prec2(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  sycl_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem * sizeof(float));
+  sycl_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem * sizeof(float));
+  sycl_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem * sizeof(float));
+  sycl_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking forced eval " << i << " " << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
+    VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_sycl)
+{
+  for (const auto& s : Eigen::get_sycl_supported_devices()) {
+    QueueInterface queueInterface(s);
+    auto sycl_device = Eigen::SyclDevice(&queueInterface);
+
+    CALL_SUBTEST_1(test_gpu_numext(sycl_device));
+    CALL_SUBTEST_1(test_gpu_conversion(sycl_device));
+    CALL_SUBTEST_1(test_gpu_unary(sycl_device));
+    CALL_SUBTEST_1(test_gpu_elementwise(sycl_device));
+    CALL_SUBTEST_1(test_gpu_transcendental(sycl_device));
+    CALL_SUBTEST_2(test_gpu_contractions(sycl_device));
+    CALL_SUBTEST_3(test_gpu_reductions(sycl_device));
+    CALL_SUBTEST_4(test_gpu_full_reductions(sycl_device));
+    CALL_SUBTEST_5(test_gpu_forced_evals(sycl_device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_padding_sycl.cpp b/unsupported/test/cxx11_tensor_padding_sycl.cpp
index 727a9ffd7..9b16bb150 100644
--- a/unsupported/test/cxx11_tensor_padding_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_padding_sycl.cpp
@@ -152,6 +152,7 @@ template <typename DataType, typename dev_Selector> void sycl_padding_test_per_device(dev_Selector s) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_padding_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_padding_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_padding_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_patch_sycl.cpp b/unsupported/test/cxx11_tensor_patch_sycl.cpp
index 7f92bec78..1216ad26a 100644
--- a/unsupported/test/cxx11_tensor_patch_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_patch_sycl.cpp
@@ -244,6 +244,7 @@ template <typename DataType, typename dev_Selector> void sycl_tensor_patch_test_per_device(dev_Selector s) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_patch_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_patch_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_tensor_patch_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_random_sycl.cpp b/unsupported/test/cxx11_tensor_random_sycl.cpp
index 14a7c489c..dcdfdd3c0 100644
--- a/unsupported/test/cxx11_tensor_random_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_random_sycl.cpp
@@ -78,6 +78,7 @@ template <typename DataType, typename dev_Selector> void sycl_random_test_per_device(dev_Selector s) {
 
 EIGEN_DECLARE_TEST(cxx11_tensor_random_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_random_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_random_test_per_device<float>(device));
 #ifdef EIGEN_SYCL_DOUBLE_SUPPORT
     CALL_SUBTEST(sycl_random_test_per_device<double>(device));
diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
index dd30c235d..5ed007f70 100644
--- a/unsupported/test/cxx11_tensor_reverse_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
@@ -248,6 +248,7 @@ EIGEN_DECLARE_TEST(cxx11_tensor_reverse_sycl) {
 #ifdef EIGEN_SYCL_DOUBLE_SUPPORT
     CALL_SUBTEST_4(sycl_reverse_test_per_device<double>(device));
 #endif
-    CALL_SUBTEST_5(sycl_reverse_test_per_device<float>(device));
+    CALL_SUBTEST_5(sycl_reverse_test_per_device<Eigen::half>(device));
+    CALL_SUBTEST_6(sycl_reverse_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
index ca4e8b5ef..70c4116a1 100644
--- a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
@@ -112,6 +112,7 @@ void sycl_shuffling_test_per_device(dev_Selector s) {
 }
 EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_shuffling_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_shuffling_test_per_device<float>(device));
   }
 }
diff --git a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
index 8d99a48ed..dc86dfee9 100644
--- a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
@@ -217,6 +217,7 @@ test_entire_volume_patch_sycl(sycl_device);
 EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch_sycl) {
   for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<Eigen::half>(device));
     CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<float>(device));
   }
 }
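
For reference, a minimal host-side sketch of what this patch enables, kept outside the diff. It is a hypothetical standalone program, not part of the patch; it only reuses APIs that appear above (Eigen::get_sycl_supported_devices, QueueInterface, SyclDevice::allocate/memcpyDeviceToHost/synchronize, TensorMap, the EIGEN_SYCL_HALF_SUPPORT macro) and assumes a SYCL toolchain whose target device supports fp16:

// Hedged usage sketch: exercises the Eigen::half SYCL path added by this
// patch. Buffer names and sizes are illustrative, not taken from the patch.
#define EIGEN_USE_SYCL
#define EIGEN_SYCL_HALF_SUPPORT
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Pick the first SYCL device Eigen reports as usable.
  auto devices = Eigen::get_sycl_supported_devices();
  Eigen::QueueInterface queue_interface(devices[0]);
  Eigen::SyclDevice sycl_device(&queue_interface);

  const int n = 64;
  Eigen::half* d_in = static_cast<Eigen::half*>(sycl_device.allocate(n * sizeof(Eigen::half)));
  Eigen::half* d_out = static_cast<Eigen::half*>(sycl_device.allocate(n * sizeof(Eigen::half)));

  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>> in(d_in, n);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>> out(d_out, n);

  // Element-wise half-precision math now routes through the cl_half8
  // packet path (pload/pstore, SYCL_PSQRT, etc.) introduced above.
  in.device(sycl_device) = in.random();
  out.device(sycl_device) = (in + in.constant(Eigen::half(1.0f))).sqrt();

  // Copy the result back and wait for the queue to drain.
  Eigen::Tensor<Eigen::half, 1> host(n);
  sycl_device.memcpyDeviceToHost(host.data(), d_out, n * sizeof(Eigen::half));
  sycl_device.synchronize();

  sycl_device.deallocate(d_in);
  sycl_device.deallocate(d_out);
  return 0;
}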