Replace memset with fill to work for non-trivial scalars.

For custom scalars, zero is not necessarily represented by
a zeroed-out memory block (e.g. GNU MPFR). We therefore
cannot rely on `memset` if we want to fill a matrix or tensor
with zeroes. Instead, we should rely on `fill`, which for trivial
types does get converted to a `memset` under the hood
(at least with gcc/clang).

Requires adding a `fill(begin, end, v)` method to `TensorDevice`.

Replaced all potentially problematic instances of `memset` with `fill`.

Fixes #2245.
Antonio Sanchez 2021-05-11 09:52:00 -07:00 committed by Rasmus Munk Larsen
parent e9c9a3130b
commit 1e6c6c1576
18 changed files with 229 additions and 61 deletions
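
The replacement pattern, as a minimal sketch (here `device` stands for any Eigen tensor device, and `buffer`, `n`, `Scalar` are placeholders rather than code from this commit):

// Before: byte-wise zeroing; only correct when Scalar is a trivial type
// whose zero value is the all-zero byte pattern.
device.memset(buffer, 0, n * sizeof(Scalar));

// After: element-wise zeroing via the new TensorDevice::fill, which works
// for any Scalar that can be constructed from 0 and assigned.
device.fill(buffer, buffer + n, Scalar(0));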


@@ -253,9 +253,10 @@ class SparseMatrix
 inline void setZero()
 {
   m_data.clear();
-  memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(StorageIndex));
-  if(m_innerNonZeros)
-    memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex));
+  std::fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0));
+  if(m_innerNonZeros) {
+    std::fill_n(m_innerNonZeros, m_outerSize, StorageIndex(0));
+  }
 }
 /** Preallocates \a reserveSize non zeros.

@@ -641,7 +642,7 @@ class SparseMatrix
   std::free(m_innerNonZeros);
   m_innerNonZeros = 0;
 }
-memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(StorageIndex));
+std::fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0));
 }
 /** \internal

@@ -1260,7 +1261,7 @@ typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Sca
 m_innerNonZeros = static_cast<StorageIndex*>(std::malloc(m_outerSize * sizeof(StorageIndex)));
 if(!m_innerNonZeros) internal::throw_std_bad_alloc();
-memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex));
+std::fill(m_innerNonZeros, m_innerNonZeros + m_outerSize, StorageIndex(0));
 // pack all inner-vectors to the end of the pre-allocated space
 // and allocate the entire free-space to the first inner-vector


@@ -564,9 +564,9 @@ for (int iter = 0; iter < 10; ++iter) {
 // Initialize the content of the memory pools to prevent asan from
 // complaining.
-device_.memset(a_, 12, m_ * k_ * sizeof(T));
-device_.memset(b_, 23, k_ * n_ * sizeof(T));
-device_.memset(c_, 31, m_ * n_ * sizeof(T));
+device_.fill(a_, a_ + m_ * k_, T(12));
+device_.fill(b_, b_ + k_ * n_, T(23));
+device_.fill(c_, c_ + m_ * n_, T(31));
 }


@@ -56,9 +56,9 @@ void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, T
 // Initialize the content of the memory pools to prevent asan from
 // complaining.
-device_.memset(a_, 12, m_ * k_ * sizeof(T));
-device_.memset(b_, 23, k_ * n_ * sizeof(T));
-device_.memset(c_, 31, m_ * n_ * sizeof(T));
+device_.fill(a_, a_ + m_ * k_, T(12));
+device_.fill(b_, b_ + k_ * n_, T(23));
+device_.fill(c_, c_ + m_ * n_, T(31));
 Eigen::array<TensorIndex, 2> sizeA;
 sizeA[0] = m_;

test/OffByOneScalar.h (new file, 28 lines)

@@ -0,0 +1,28 @@
// A Scalar with internal representation T+1 so that zero is internally
// represented by T(1). This is used to test memory fill.
//
template<typename T>
class OffByOneScalar {
public:
OffByOneScalar() : val_(1) {}
OffByOneScalar(const OffByOneScalar& other) {
*this = other;
}
OffByOneScalar& operator=(const OffByOneScalar& other) {
val_ = other.val_;
return *this;
}
OffByOneScalar(T val) : val_(val + 1) {}
OffByOneScalar& operator=(T val) {
val_ = val + 1;
return *this;
}
operator T() const {
return val_ - 1;
}
private:
T val_;
};
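
A rough illustration (not part of the commit) of how this scalar exposes the difference; `zero_two_ways` is just a hypothetical helper, and the `memset` call is deliberately the kind of misuse the commit removes:

#include <algorithm>
#include <cstring>
#include "OffByOneScalar.h"

void zero_two_ways() {
  OffByOneScalar<int> a[8];

  // Byte-wise zeroing writes val_ == 0, which this scalar decodes as -1.
  std::memset(a, 0, sizeof(a));                 // int(a[0]) == -1

  // Element-wise fill assigns OffByOneScalar<int>(0), i.e. val_ == 1,
  // which decodes to 0 as intended.
  std::fill(a, a + 8, OffByOneScalar<int>(0));  // int(a[0]) == 0
}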


@@ -762,7 +762,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator
 const Index resIncr(1);
 // zero out the result buffer (which must be of size at least rows * sizeof(Scalar)
-m_device.memset(buffer, 0, rows * sizeof(Scalar));
+m_device.fill(buffer, buffer + rows, Scalar(0));
 internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,false,RhsScalar,RhsMapper,false>::run(
     rows, cols, lhs, rhs,

@@ -869,7 +869,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator
 // If a contraction kernel does not support beta, explicitly initialize
 // output buffer with zeroes.
 if (!TensorContractionKernel::HasBeta) {
-  this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+  this->m_device.fill(buffer, buffer + m * n, Scalar(0));
 }
 for(Index i2=0; i2<m; i2+=mc)


@@ -1370,8 +1370,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
 // columns in right side
 const Index n = this->m_j_size;
-// zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
-this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+// zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
+this->m_device.fill(buffer, buffer + m * n, Scalar(0));
 typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
     LeftEvaluator, left_nocontract_t,


@@ -912,9 +912,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
 // On 10000x2x10000 mm zeroing can easily take half of time. Zero (bn
 // x m) row. Safe to do here because all kernels that will write to
 // this memory depend on completion of this task. Note: don't call
-// device_.memset() here. device_.memset() blocks on thread pool
+// device_.fill() here. device_.fill() blocks on thread pool
 // worker thread, which can lead to underutilization and deadlocks.
-memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
+std::fill_n(buffer_ + n1 * bn_ * m_, bn(n1) * m_, Scalar(0));
 }
 kernel_.packRhs(&packed_rhs(n, k, n1, use_thread_local),
                 rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));


@@ -39,6 +39,17 @@ struct DefaultDevice {
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
   ::memset(buffer, c, n);
 }
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+  // std::fill is not a device function, so resort to simple loop.
+  for (T* it = begin; it != end; ++it) {
+    *it = value;
+  }
+#else
+  std::fill(begin, end, value);
+#endif
+}
 template<typename Type>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
   return data;


@@ -281,10 +281,35 @@ struct GpuDevice {
   EIGEN_UNUSED_VARIABLE(err)
   gpu_assert(err == gpuSuccess);
 #else
+  EIGEN_UNUSED_VARIABLE(buffer)
+  EIGEN_UNUSED_VARIABLE(c)
+  EIGEN_UNUSED_VARIABLE(n)
   eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
 }
+
+template<typename T>
+EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+  const size_t count = end - begin;
+  // Split value into bytes and run memset with stride.
+  const int value_size = sizeof(value);
+  char* buffer = (char*)begin;
+  char* value_bytes = (char*)(&value);
+  gpuError_t err;
+  EIGEN_UNUSED_VARIABLE(err)
+  for (int b=0; b<value_size; ++b) {
+    err = gpuMemset2DAsync(buffer+b, value_size, value_bytes[b], 1, count, stream_->stream());
+    gpu_assert(err == gpuSuccess);
+  }
+#else
+  EIGEN_UNUSED_VARIABLE(begin)
+  EIGEN_UNUSED_VARIABLE(end)
+  EIGEN_UNUSED_VARIABLE(value)
+  eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+}
 EIGEN_STRONG_INLINE size_t numThreads() const {
   // FIXME
   return 32;
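
The GPU `fill` above replicates the byte pattern of `value` by issuing one strided 2D memset per byte position. A host-side sketch of the same idea (illustrative only, not code from this commit; it assumes the element's value is fully determined by its byte pattern, i.e. a trivially copyable type, and `fill_by_byte_planes` is a hypothetical name):

#include <cstddef>

// Byte-plane fill: for each byte offset b in [0, sizeof(T)), write byte b of
// `value` into byte b of every element in [begin, end), using stride sizeof(T).
// This mirrors what the gpuMemset2DAsync loop does asynchronously on the device.
template <typename T>
void fill_by_byte_planes(T* begin, T* end, const T& value) {
  const std::size_t count = end - begin;
  const unsigned char* value_bytes = reinterpret_cast<const unsigned char*>(&value);
  unsigned char* buffer = reinterpret_cast<unsigned char*>(begin);
  for (std::size_t b = 0; b < sizeof(T); ++b) {
    for (std::size_t i = 0; i < count; ++i) {
      buffer[i * sizeof(T) + b] = value_bytes[b];
    }
  }
}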


@@ -327,13 +327,27 @@ class QueueInterface {
   if (n == 0) {
     return;
   }
-  n /= sizeof(buffer_scalar_t);
   auto f = [&](cl::sycl::handler &cgh) {
-    auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
-    // The cast to uint8_t is here to match the behaviour of the standard
-    // memset. The cast to buffer_scalar_t is needed to match the type of the
-    // accessor (in case buffer_scalar_t is not uint8_t)
-    cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
+    // Get a typed range accessor to ensure we fill each byte, in case
+    // `buffer_scalar_t` is not (u)int8_t.
+    auto dst_acc = get_typed_range_accessor<write_mode, uint8_t>(cgh, data, n);
+    cgh.fill(dst_acc, static_cast<uint8_t>(c));
+  };
+  cl::sycl::event e;
+  EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+  async_synchronize(e);
+}
+
+template<typename T>
+EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+  static const auto write_mode = cl::sycl::access::mode::discard_write;
+  if (begin == end) {
+    return;
+  }
+  const ptrdiff_t count = end - begin;
+  auto f = [&](cl::sycl::handler &cgh) {
+    auto dst_acc = get_typed_range_accessor<write_mode, T>(cgh, begin, count);
+    cgh.fill(dst_acc, value);
   };
   cl::sycl::event e;
   EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));

@@ -359,6 +373,8 @@ class QueueInterface {
   auto original_buffer = pMapper.get_buffer(ptr);
   const ptrdiff_t offset = pMapper.get_offset(ptr);
+  eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)");
+  eigen_assert(original_buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)");
   const ptrdiff_t typed_offset = offset / sizeof(T);
   eigen_assert(typed_offset >= 0);
   const auto typed_size = original_buffer.get_size() / sizeof(T);

@@ -395,6 +411,40 @@ class QueueInterface {
       cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
 }
+
+/// Get a range accessor to the virtual pointer's device memory with a
+/// specified type and count.
+template <cl::sycl::access::mode AcMd, typename T, typename Index>
+EIGEN_STRONG_INLINE cl::sycl::accessor<
+    T, 1, AcMd, cl::sycl::access::target::global_buffer>
+get_typed_range_accessor(cl::sycl::handler &cgh, const void *ptr,
+                         const Index count) const {
+  static const auto global_access = cl::sycl::access::target::global_buffer;
+  eigen_assert(count >= 0);
+  std::lock_guard<std::mutex> lock(pmapper_mutex_);
+  auto buffer = pMapper.get_buffer(ptr);
+  const ptrdiff_t offset = pMapper.get_offset(ptr);
+  eigen_assert(offset >= 0);
+
+  // Technically we should create a subbuffer for the desired range,
+  // then reinterpret that. However, I was not able to get changes to reflect
+  // in the original buffer (only the subbuffer and reinterpreted buffer).
+  // This current implementation now has the restriction that the buffer
+  // offset and original buffer size must be a multiple of sizeof(T).
+  // Note that get_range_accessor(void*) currently has the same restriction.
+  //
+  // auto subbuffer = cl::sycl::buffer<buffer_scalar_t, 1>(buffer,
+  //     cl::sycl::id<1>(offset), cl::sycl::range<1>(n_bytes));
+  eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)");
+  eigen_assert(buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)");
+  const ptrdiff_t typed_offset = offset / sizeof(T);
+  const size_t typed_size = buffer.get_size() / sizeof(T);
+  auto reint = buffer.template reinterpret<
+      typename Eigen::internal::remove_const<T>::type>(
+      cl::sycl::range<1>(typed_size));
+  return reint.template get_access<AcMd, global_access>(
+      cgh, cl::sycl::range<1>(count), cl::sycl::id<1>(typed_offset));
+}
 /// Creation of sycl accessor for a buffer. This function first tries to find
 /// the buffer in the buffer_map. If found it gets the accessor from it, if
 /// not, the function then adds an entry by creating a sycl buffer for that

@@ -912,15 +962,6 @@ struct SyclDevice : public SyclDeviceBase {
   return queue_stream()->get(data);
 }
-/// attach existing buffer
-EIGEN_STRONG_INLINE void *attach_buffer(
-    cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
-  return queue_stream()->attach_buffer(buf);
-}
-/// detach buffer
-EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
-  queue_stream()->detach_buffer(p);
-}
 EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
   return queue_stream()->get_offset(ptr);
 }

@@ -951,6 +992,11 @@ struct SyclDevice : public SyclDeviceBase {
 EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
   queue_stream()->memset(data, c, n);
 }
+/// the fill function
+template<typename T>
+EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+  queue_stream()->fill(begin, end, value);
+}
 /// returning the sycl queue
 EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
   return queue_stream()->sycl_queue();


@@ -122,6 +122,11 @@ struct ThreadPoolDevice {
   ::memset(buffer, c, n);
 }
+
+template<typename T>
+EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+  std::fill(begin, end, value);
+}
 EIGEN_STRONG_INLINE int numThreads() const {
   return num_threads_;
 }


@@ -41,6 +41,7 @@
 #define gpuMalloc hipMalloc
 #define gpuFree hipFree
 #define gpuMemsetAsync hipMemsetAsync
+#define gpuMemset2DAsync hipMemset2DAsync
 #define gpuMemcpyAsync hipMemcpyAsync
 #define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost

@@ -71,6 +72,7 @@
 #define gpuMalloc cudaMalloc
 #define gpuFree cudaFree
 #define gpuMemsetAsync cudaMemsetAsync
+#define gpuMemset2DAsync cudaMemset2DAsync
 #define gpuMemcpyAsync cudaMemcpyAsync
 #define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
 #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost


@@ -26,6 +26,7 @@
 #undef gpuMalloc
 #undef gpuFree
 #undef gpuMemsetAsync
+#undef gpuMemset2DAsync
 #undef gpuMemcpyAsync
 #undef gpuMemcpyDeviceToDevice
 #undef gpuMemcpyDeviceToHost


@@ -375,8 +375,8 @@ public:
 /** Removes all non zeros */
 inline void setZero() {
   m_data.clear();
-  memset(m_colStartIndex, 0, (m_outerSize + 1) * sizeof (Index));
-  memset(m_rowStartIndex, 0, (m_outerSize + 1) * sizeof (Index));
+  std::fill_n(m_colStartIndex, m_outerSize + 1, Index(0));
+  std::fill_n(m_rowStartIndex, m_outerSize + 1, Index(0));
 }
 /** \returns the number of non zero coefficients */

@@ -435,7 +435,7 @@
 }
 //zeros new data
-memset(this->_upperPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar));
+std::fill_n(this->_upperPtr() + start, bandIncrement - 1, Scalar(0));
 return m_data.upper(m_colStartIndex[inner]);
 } else {

@@ -466,7 +466,7 @@
 }
 //zeros new data
-memset(this->_lowerPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar));
+std::fill_n(this->_lowerPtr() + start, bandIncrement - 1, Scalar(0));
 return m_data.lower(m_rowStartIndex[outer]);
 } else {
 return m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));

@@ -493,7 +493,7 @@
 for (Index innerIdx = inner + 1; innerIdx < outerSize() + 1; innerIdx++) {
   m_rowStartIndex[innerIdx] += bandIncrement;
 }
-memset(this->_upperPtr() + m_rowStartIndex[inner] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar));
+std::fill_n(this->_upperPtr() + m_rowStartIndex[inner] + previousProfile + 1, bandIncrement - 1, Scalar(0));
 return m_data.upper(m_rowStartIndex[inner] + m_data.upperProfile(inner));
 } else {
 return m_data.upper(m_rowStartIndex[inner] + (outer - inner));

@@ -520,7 +520,7 @@
 for (Index innerIdx = outer + 1; innerIdx < outerSize() + 1; innerIdx++) {
   m_colStartIndex[innerIdx] += bandIncrement;
 }
-memset(this->_lowerPtr() + m_colStartIndex[outer] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar));
+std::fill_n(this->_lowerPtr() + m_colStartIndex[outer] + previousProfile + 1, bandIncrement - 1, Scalar(0));
 return m_data.lower(m_colStartIndex[outer] + m_data.lowerProfile(outer));
 } else {
 return m_data.lower(m_colStartIndex[outer] + (inner - outer));

@@ -619,8 +619,8 @@
 m_data.clear();
 m_outerSize = diagSize;
-memset(m_colStartIndex, 0, (cols + 1) * sizeof (Index));
-memset(m_rowStartIndex, 0, (rows + 1) * sizeof (Index));
+std::fill_n(m_colStartIndex, cols + 1, Index(0));
+std::fill_n(m_rowStartIndex, rows + 1, Index(0));
 }
 void resizeNonZeros(Index size) {


@@ -187,11 +187,11 @@ public:
 }
 inline void reset() {
-  memset(m_diag, 0, m_diagSize * sizeof (Scalar));
-  memset(m_upper, 0, m_upperSize * sizeof (Scalar));
-  memset(m_lower, 0, m_lowerSize * sizeof (Scalar));
-  memset(m_upperProfile, 0, m_diagSize * sizeof (Index));
-  memset(m_lowerProfile, 0, m_diagSize * sizeof (Index));
+  std::fill_n(m_diag, m_diagSize, Scalar(0));
+  std::fill_n(m_upper, m_upperSize, Scalar(0));
+  std::fill_n(m_lower, m_lowerSize, Scalar(0));
+  std::fill_n(m_upperProfile, m_diagSize, Index(0));
+  std::fill_n(m_lowerProfile, m_diagSize, Index(0));
 }
 void prune(Scalar reference, RealScalar epsilon = dummy_precision<RealScalar>()) {


@@ -25,10 +25,8 @@ static void test_1d()
 vec1(4) = 23; vec2(4) = 4;
 vec1(5) = 42; vec2(5) = 5;
-int col_major[6];
-int row_major[6];
-memset(col_major, 0, 6*sizeof(int));
-memset(row_major, 0, 6*sizeof(int));
+int col_major[6] = {0};
+int row_major[6] = {0};
 TensorMap<Tensor<int, 1> > vec3(col_major, 6);
 TensorMap<Tensor<int, 1, RowMajor> > vec4(row_major, 6);

@@ -88,10 +86,8 @@ static void test_2d()
 mat2(1,1) = 4;
 mat2(1,2) = 5;
-int col_major[6];
-int row_major[6];
-memset(col_major, 0, 6*sizeof(int));
-memset(row_major, 0, 6*sizeof(int));
+int col_major[6] = {0};
+int row_major[6] = {0};
 TensorMap<Tensor<int, 2> > mat3(row_major, 2, 3);
 TensorMap<Tensor<int, 2, RowMajor> > mat4(col_major, 2, 3);

@@ -148,10 +144,8 @@ static void test_3d()
 }
 }
-int col_major[2*3*7];
-int row_major[2*3*7];
-memset(col_major, 0, 2*3*7*sizeof(int));
-memset(row_major, 0, 2*3*7*sizeof(int));
+int col_major[2*3*7] = {0};
+int row_major[2*3*7] = {0};
 TensorMap<Tensor<int, 3> > mat3(col_major, 2, 3, 7);
 TensorMap<Tensor<int, 3, RowMajor> > mat4(row_major, 2, 3, 7);


@@ -14,6 +14,7 @@
 #define EIGEN_USE_GPU

 #include "main.h"
+#include "OffByOneScalar.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 #include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>

@@ -175,6 +176,44 @@ void test_3d_convolution(Context* context)
   context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
 }
+
+// Helper method to synchronize device.
+template<typename Device>
+void synchronize(Device& device) { /*nothing*/ }
+template<>
+void synchronize(Eigen::GpuDevice& device) {
+  device.synchronize();
+}
+
+template <typename DataType, typename TensorDevice>
+void test_device_memory(const TensorDevice& device) {
+  int count = 100;
+  Eigen::array<int, 1> tensorRange = {{count}};
+  Eigen::Tensor<DataType, 1> host(tensorRange);
+  Eigen::Tensor<DataType, 1> expected(tensorRange);
+  DataType* device_data = static_cast<DataType*>(device.allocate(count * sizeof(DataType)));
+
+  // memset
+  const char byte_value = static_cast<char>(0xAB);
+  device.memset(device_data, byte_value, count * sizeof(DataType));
+  device.memcpyDeviceToHost(host.data(), device_data, count * sizeof(DataType));
+  synchronize(device);
+  memset(expected.data(), byte_value, count * sizeof(DataType));
+  for (size_t i=0; i<count; i++) {
+    VERIFY_IS_EQUAL(host(i), expected(i));
+  }
+
+  // fill
+  DataType fill_value = DataType(7);
+  std::fill_n(expected.data(), count, fill_value);
+  device.fill(device_data, device_data + count, fill_value);
+  device.memcpyDeviceToHost(host.data(), device_data, count * sizeof(DataType));
+  synchronize(device);
+  for (int i=0; i<count; i++) {
+    VERIFY_IS_EQUAL(host(i), expected(i));
+  }
+
+  device.deallocate(device_data);
+}
 void test_cpu() {
   Eigen::Tensor<float, 3> in1(40,50,70);

@@ -266,6 +305,9 @@ void test_cpu() {
 }
 }
 }
+
+  test_device_memory<float>(context.device());
+  test_device_memory<OffByOneScalar<int>>(context.device());
 }

 void test_gpu() {

@@ -386,6 +428,8 @@ void test_gpu() {
 #endif
+  test_device_memory<float>(context.device());
+  test_device_memory<OffByOneScalar<int>>(context.device());
 }


@@ -18,26 +18,36 @@
 #define EIGEN_USE_SYCL

 #include "main.h"
+#include "OffByOneScalar.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 #include <stdint.h>
 #include <iostream>

 template <typename DataType, int DataLayout, typename IndexType>
 void test_device_memory(const Eigen::SyclDevice &sycl_device) {
-  std::cout << "Running on : "
-            << sycl_device.sycl_queue().get_device(). template get_info<cl::sycl::info::device::name>()
-            <<std::endl;
   IndexType sizeDim1 = 100;
   array<IndexType, 1> tensorRange = {{sizeDim1}};
   Tensor<DataType, 1, DataLayout,IndexType> in(tensorRange);
   Tensor<DataType, 1, DataLayout,IndexType> in1(tensorRange);
-  memset(in1.data(), 1, in1.size() * sizeof(DataType));
   DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
+
+  // memset
+  memset(in1.data(), 1, in1.size() * sizeof(DataType));
   sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType));
   sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
   for (IndexType i=0; i<in.size(); i++) {
     VERIFY_IS_EQUAL(in(i), in1(i));
   }
+
+  // fill
+  DataType value = DataType(7);
+  std::fill_n(in1.data(), in1.size(), value);
+  sycl_device.fill(gpu_in_data, gpu_in_data + in.size(), value);
+  sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
+  for (IndexType i=0; i<in.size(); i++) {
+    VERIFY_IS_EQUAL(in(i), in1(i));
+  }
+
   sycl_device.deallocate(gpu_in_data);
 }

@@ -73,5 +83,6 @@ template<typename DataType> void sycl_device_test_per_device(const cl::sycl::dev
 EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_device_test_per_device<float>(device));
+    CALL_SUBTEST(sycl_device_test_per_device<OffByOneScalar<int>>(device));
   }
 }