Replace memset with fill to work for non-trivial scalars.

For custom scalars, zero is not necessarily represented by
a zeroed-out memory block (e.g. GNU MPFR). We therefore
cannot rely on `memset` if we want to fill a matrix or tensor
with zeroes. Instead, we should rely on `fill`, which for trivial
types does get converted to a `memset` under the hood
(at least with gcc/clang).

Requires adding a `fill(begin, end, v)` method to `TensorDevice`.

Replaced all potentially problematic instances of `memset` with `fill`.

Fixes #2245.
Antonio Sanchez 2021-05-11 09:52:00 -07:00 committed by Rasmus Munk Larsen
parent e9c9a3130b
commit 1e6c6c1576
18 changed files with 229 additions and 61 deletions
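
The replacement pattern, as a minimal sketch (here `device` stands for any Eigen tensor device, and `buffer`, `n`, `Scalar` are placeholders rather than code from this commit):

// Before: byte-wise zeroing; only correct when Scalar is a trivial type
// whose zero value is the all-zero byte pattern.
device.memset(buffer, 0, n * sizeof(Scalar));

// After: element-wise zeroing via the new TensorDevice::fill, which works
// for any Scalar that can be constructed from 0 and assigned.
device.fill(buffer, buffer + n, Scalar(0));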


@@ -253,9 +253,10 @@ class SparseMatrix
 inline void setZero()
 {
   m_data.clear();
-  memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(StorageIndex));
-  if(m_innerNonZeros)
-    memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex));
+  std::fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0));
+  if(m_innerNonZeros) {
+    std::fill_n(m_innerNonZeros, m_outerSize, StorageIndex(0));
+  }
 }
 /** Preallocates \a reserveSize non zeros.

@@ -641,7 +642,7 @@ class SparseMatrix
   std::free(m_innerNonZeros);
   m_innerNonZeros = 0;
 }
-memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(StorageIndex));
+std::fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0));
 }
 /** \internal

@@ -1260,7 +1261,7 @@ typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Sca
 m_innerNonZeros = static_cast<StorageIndex*>(std::malloc(m_outerSize * sizeof(StorageIndex)));
 if(!m_innerNonZeros) internal::throw_std_bad_alloc();
-memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex));
+std::fill(m_innerNonZeros, m_innerNonZeros + m_outerSize, StorageIndex(0));
 // pack all inner-vectors to the end of the pre-allocated space
 // and allocate the entire free-space to the first inner-vector


@@ -564,9 +564,9 @@ for (int iter = 0; iter < 10; ++iter) {
 // Initialize the content of the memory pools to prevent asan from
 // complaining.
-device_.memset(a_, 12, m_ * k_ * sizeof(T));
-device_.memset(b_, 23, k_ * n_ * sizeof(T));
-device_.memset(c_, 31, m_ * n_ * sizeof(T));
+device_.fill(a_, a_ + m_ * k_, T(12));
+device_.fill(b_, b_ + k_ * n_, T(23));
+device_.fill(c_, c_ + m_ * n_, T(31));
 }


@@ -56,9 +56,9 @@ void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, T
 // Initialize the content of the memory pools to prevent asan from
 // complaining.
-device_.memset(a_, 12, m_ * k_ * sizeof(T));
-device_.memset(b_, 23, k_ * n_ * sizeof(T));
-device_.memset(c_, 31, m_ * n_ * sizeof(T));
+device_.fill(a_, a_ + m_ * k_, T(12));
+device_.fill(b_, b_ + k_ * n_, T(23));
+device_.fill(c_, c_ + m_ * n_, T(31));
 Eigen::array<TensorIndex, 2> sizeA;
 sizeA[0] = m_;

test/OffByOneScalar.h (new file, 28 lines)

@@ -0,0 +1,28 @@
// A Scalar with internal representation T+1 so that zero is internally
// represented by T(1). This is used to test memory fill.
//
template<typename T>
class OffByOneScalar {
public:
OffByOneScalar() : val_(1) {}
OffByOneScalar(const OffByOneScalar& other) {
*this = other;
}
OffByOneScalar& operator=(const OffByOneScalar& other) {
val_ = other.val_;
return *this;
}
OffByOneScalar(T val) : val_(val + 1) {}
OffByOneScalar& operator=(T val) {
val_ = val + 1;
return *this;
}
operator T() const {
return val_ - 1;
}
private:
T val_;
};
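
A rough illustration (not part of the commit) of how this scalar exposes the difference; `zero_two_ways` is just a hypothetical helper, and the `memset` call is deliberately the kind of misuse the commit removes:

#include <algorithm>
#include <cstring>
#include "OffByOneScalar.h"

void zero_two_ways() {
  OffByOneScalar<int> a[8];

  // Byte-wise zeroing writes val_ == 0, which this scalar decodes as -1.
  std::memset(a, 0, sizeof(a));                 // int(a[0]) == -1

  // Element-wise fill assigns OffByOneScalar<int>(0), i.e. val_ == 1,
  // which decodes to 0 as intended.
  std::fill(a, a + 8, OffByOneScalar<int>(0));  // int(a[0]) == 0
}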


@@ -762,7 +762,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator
 const Index resIncr(1);
 // zero out the result buffer (which must be of size at least rows * sizeof(Scalar)
-m_device.memset(buffer, 0, rows * sizeof(Scalar));
+m_device.fill(buffer, buffer + rows, Scalar(0));
 internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,false,RhsScalar,RhsMapper,false>::run(
     rows, cols, lhs, rhs,

@@ -869,7 +869,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator
 // If a contraction kernel does not support beta, explicitly initialize
 // output buffer with zeroes.
 if (!TensorContractionKernel::HasBeta) {
-  this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+  this->m_device.fill(buffer, buffer + m * n, Scalar(0));
 }
 for(Index i2=0; i2<m; i2+=mc)


@@ -1370,8 +1370,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
 // columns in right side
 const Index n = this->m_j_size;
-// zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
-this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+// zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
+this->m_device.fill(buffer, buffer + m * n, Scalar(0));
 typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
     LeftEvaluator, left_nocontract_t,


@@ -912,9 +912,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
 // On 10000x2x10000 mm zeroing can easily take half of time. Zero (bn
 // x m) row. Safe to do here because all kernels that will write to
 // this memory depend on completion of this task. Note: don't call
-// device_.memset() here. device_.memset() blocks on thread pool
+// device_.fill() here. device_.fill() blocks on thread pool
 // worker thread, which can lead to underutilization and deadlocks.
-memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
+std::fill_n(buffer_ + n1 * bn_ * m_, bn(n1) * m_, Scalar(0));
 }
 kernel_.packRhs(&packed_rhs(n, k, n1, use_thread_local),
                 rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));


@@ -39,6 +39,17 @@ struct DefaultDevice {
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
   ::memset(buffer, c, n);
 }
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+  // std::fill is not a device function, so resort to simple loop.
+  for (T* it = begin; it != end; ++it) {
+    *it = value;
+  }
+#else
+  std::fill(begin, end, value);
+#endif
+}
 template<typename Type>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
   return data;


@@ -281,10 +281,35 @@ struct GpuDevice {
   EIGEN_UNUSED_VARIABLE(err)
   gpu_assert(err == gpuSuccess);
 #else
+  EIGEN_UNUSED_VARIABLE(buffer)
+  EIGEN_UNUSED_VARIABLE(c)
+  EIGEN_UNUSED_VARIABLE(n)
   eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
 }
+
+template<typename T>
+EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+  const size_t count = end - begin;
+  // Split value into bytes and run memset with stride.
+  const int value_size = sizeof(value);
+  char* buffer = (char*)begin;
+  char* value_bytes = (char*)(&value);
+  gpuError_t err;
+  EIGEN_UNUSED_VARIABLE(err)
+  for (int b=0; b<value_size; ++b) {
+    err = gpuMemset2DAsync(buffer+b, value_size, value_bytes[b], 1, count, stream_->stream());
+    gpu_assert(err == gpuSuccess);
+  }
+#else
+  EIGEN_UNUSED_VARIABLE(begin)
+  EIGEN_UNUSED_VARIABLE(end)
+  EIGEN_UNUSED_VARIABLE(value)
+  eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+}
 EIGEN_STRONG_INLINE size_t numThreads() const {
   // FIXME
   return 32;
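
The GPU `fill` above replicates the byte pattern of `value` by issuing one strided 2D memset per byte position. A host-side sketch of the same idea (illustrative only, not code from this commit; it assumes the element's value is fully determined by its byte pattern, i.e. a trivially copyable type, and `fill_by_byte_planes` is a hypothetical name):

#include <cstddef>

// Byte-plane fill: for each byte offset b in [0, sizeof(T)), write byte b of
// `value` into byte b of every element in [begin, end), using stride sizeof(T).
// This mirrors what the gpuMemset2DAsync loop does asynchronously on the device.
template <typename T>
void fill_by_byte_planes(T* begin, T* end, const T& value) {
  const std::size_t count = end - begin;
  const unsigned char* value_bytes = reinterpret_cast<const unsigned char*>(&value);
  unsigned char* buffer = reinterpret_cast<unsigned char*>(begin);
  for (std::size_t b = 0; b < sizeof(T); ++b) {
    for (std::size_t i = 0; i < count; ++i) {
      buffer[i * sizeof(T) + b] = value_bytes[b];
    }
  }
}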


@@ -327,13 +327,27 @@ class QueueInterface {
   if (n == 0) {
     return;
   }
-  n /= sizeof(buffer_scalar_t);
   auto f = [&](cl::sycl::handler &cgh) {
-    auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
-    // The cast to uint8_t is here to match the behaviour of the standard
-    // memset. The cast to buffer_scalar_t is needed to match the type of the
-    // accessor (in case buffer_scalar_t is not uint8_t)
-    cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
+    // Get a typed range accessor to ensure we fill each byte, in case
+    // `buffer_scalar_t` is not (u)int8_t.
+    auto dst_acc = get_typed_range_accessor<write_mode, uint8_t>(cgh, data, n);
+    cgh.fill(dst_acc, static_cast<uint8_t>(c));
+  };
+  cl::sycl::event e;
+  EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+  async_synchronize(e);
+}
+
+template<typename T>
+EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+  static const auto write_mode = cl::sycl::access::mode::discard_write;
+  if (begin == end) {
+    return;
+  }
+  const ptrdiff_t count = end - begin;
+  auto f = [&](cl::sycl::handler &cgh) {
+    auto dst_acc = get_typed_range_accessor<write_mode, T>(cgh, begin, count);
+    cgh.fill(dst_acc, value);
   };
   cl::sycl::event e;
   EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));

@@ -359,6 +373,8 @@ class QueueInterface {
   auto original_buffer = pMapper.get_buffer(ptr);
   const ptrdiff_t offset = pMapper.get_offset(ptr);
+  eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)");
+  eigen_assert(original_buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)");
   const ptrdiff_t typed_offset = offset / sizeof(T);
   eigen_assert(typed_offset >= 0);
   const auto typed_size = original_buffer.get_size() / sizeof(T);

@@ -395,6 +411,40 @@ class QueueInterface {
       cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
 }
+
+/// Get a range accessor to the virtual pointer's device memory with a
+/// specified type and count.
+template <cl::sycl::access::mode AcMd, typename T, typename Index>
+EIGEN_STRONG_INLINE cl::sycl::accessor<
+    T, 1, AcMd, cl::sycl::access::target::global_buffer>
+get_typed_range_accessor(cl::sycl::handler &cgh, const void *ptr,
+                         const Index count) const {
+  static const auto global_access = cl::sycl::access::target::global_buffer;
+  eigen_assert(count >= 0);
+  std::lock_guard<std::mutex> lock(pmapper_mutex_);
+  auto buffer = pMapper.get_buffer(ptr);
+  const ptrdiff_t offset = pMapper.get_offset(ptr);
+  eigen_assert(offset >= 0);
+
+  // Technically we should create a subbuffer for the desired range,
+  // then reinterpret that. However, I was not able to get changes to reflect
+  // in the original buffer (only the subbuffer and reinterpreted buffer).
+  // This current implementation now has the restriction that the buffer
+  // offset and original buffer size must be a multiple of sizeof(T).
+  // Note that get_range_accessor(void*) currently has the same restriction.
+  //
+  // auto subbuffer = cl::sycl::buffer<buffer_scalar_t, 1>(buffer,
+  //     cl::sycl::id<1>(offset), cl::sycl::range<1>(n_bytes));
+  eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)");
+  eigen_assert(buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)");
+  const ptrdiff_t typed_offset = offset / sizeof(T);
+  const size_t typed_size = buffer.get_size() / sizeof(T);
+  auto reint = buffer.template reinterpret<
+      typename Eigen::internal::remove_const<T>::type>(
+      cl::sycl::range<1>(typed_size));
+  return reint.template get_access<AcMd, global_access>(
+      cgh, cl::sycl::range<1>(count), cl::sycl::id<1>(typed_offset));
+}
 /// Creation of sycl accessor for a buffer. This function first tries to find
 /// the buffer in the buffer_map. If found it gets the accessor from it, if
 /// not, the function then adds an entry by creating a sycl buffer for that

@@ -912,15 +962,6 @@ struct SyclDevice : public SyclDeviceBase {
   return queue_stream()->get(data);
 }
-/// attach existing buffer
-EIGEN_STRONG_INLINE void *attach_buffer(
-    cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
-  return queue_stream()->attach_buffer(buf);
-}
-/// detach buffer
-EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
-  queue_stream()->detach_buffer(p);
-}
 EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
   return queue_stream()->get_offset(ptr);
 }

@@ -951,6 +992,11 @@ struct SyclDevice : public SyclDeviceBase {
 EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
   queue_stream()->memset(data, c, n);
 }
+/// the fill function
+template<typename T>
+EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+  queue_stream()->fill(begin, end, value);
+}
 /// returning the sycl queue
 EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
   return queue_stream()->sycl_queue();


@@ -122,6 +122,11 @@ struct ThreadPoolDevice {
   ::memset(buffer, c, n);
 }
+
+template<typename T>
+EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+  std::fill(begin, end, value);
+}
 EIGEN_STRONG_INLINE int numThreads() const {
   return num_threads_;
 }


@@ -41,6 +41,7 @@
 #define gpuMalloc hipMalloc
 #define gpuFree hipFree
 #define gpuMemsetAsync hipMemsetAsync
+#define gpuMemset2DAsync hipMemset2DAsync
 #define gpuMemcpyAsync hipMemcpyAsync
 #define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost

@@ -71,6 +72,7 @@
 #define gpuMalloc cudaMalloc
 #define gpuFree cudaFree
 #define gpuMemsetAsync cudaMemsetAsync
+#define gpuMemset2DAsync cudaMemset2DAsync
 #define gpuMemcpyAsync cudaMemcpyAsync
 #define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
 #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost


@@ -26,6 +26,7 @@
 #undef gpuMalloc
 #undef gpuFree
 #undef gpuMemsetAsync
+#undef gpuMemset2DAsync
 #undef gpuMemcpyAsync
 #undef gpuMemcpyDeviceToDevice
 #undef gpuMemcpyDeviceToHost


@@ -375,8 +375,8 @@ public:
 /** Removes all non zeros */
 inline void setZero() {
   m_data.clear();
-  memset(m_colStartIndex, 0, (m_outerSize + 1) * sizeof (Index));
-  memset(m_rowStartIndex, 0, (m_outerSize + 1) * sizeof (Index));
+  std::fill_n(m_colStartIndex, m_outerSize + 1, Index(0));
+  std::fill_n(m_rowStartIndex, m_outerSize + 1, Index(0));
 }
 /** \returns the number of non zero coefficients */

@@ -435,7 +435,7 @@
 }
 //zeros new data
-memset(this->_upperPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar));
+std::fill_n(this->_upperPtr() + start, bandIncrement - 1, Scalar(0));
 return m_data.upper(m_colStartIndex[inner]);
 } else {

@@ -466,7 +466,7 @@
 }
 //zeros new data
-memset(this->_lowerPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar));
+std::fill_n(this->_lowerPtr() + start, bandIncrement - 1, Scalar(0));
 return m_data.lower(m_rowStartIndex[outer]);
 } else {
 return m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));

@@ -493,7 +493,7 @@
 for (Index innerIdx = inner + 1; innerIdx < outerSize() + 1; innerIdx++) {
   m_rowStartIndex[innerIdx] += bandIncrement;
 }
-memset(this->_upperPtr() + m_rowStartIndex[inner] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar));
+std::fill_n(this->_upperPtr() + m_rowStartIndex[inner] + previousProfile + 1, bandIncrement - 1, Scalar(0));
 return m_data.upper(m_rowStartIndex[inner] + m_data.upperProfile(inner));
 } else {
 return m_data.upper(m_rowStartIndex[inner] + (outer - inner));

@@ -520,7 +520,7 @@
 for (Index innerIdx = outer + 1; innerIdx < outerSize() + 1; innerIdx++) {
   m_colStartIndex[innerIdx] += bandIncrement;
 }
-memset(this->_lowerPtr() + m_colStartIndex[outer] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar));
+std::fill_n(this->_lowerPtr() + m_colStartIndex[outer] + previousProfile + 1, bandIncrement - 1, Scalar(0));
 return m_data.lower(m_colStartIndex[outer] + m_data.lowerProfile(outer));
 } else {
 return m_data.lower(m_colStartIndex[outer] + (inner - outer));

@@ -619,8 +619,8 @@
 m_data.clear();
 m_outerSize = diagSize;
-memset(m_colStartIndex, 0, (cols + 1) * sizeof (Index));
-memset(m_rowStartIndex, 0, (rows + 1) * sizeof (Index));
+std::fill_n(m_colStartIndex, cols + 1, Index(0));
+std::fill_n(m_rowStartIndex, rows + 1, Index(0));
 }
 void resizeNonZeros(Index size) {


@@ -187,11 +187,11 @@ public:
 }
 inline void reset() {
-  memset(m_diag, 0, m_diagSize * sizeof (Scalar));
-  memset(m_upper, 0, m_upperSize * sizeof (Scalar));
-  memset(m_lower, 0, m_lowerSize * sizeof (Scalar));
-  memset(m_upperProfile, 0, m_diagSize * sizeof (Index));
-  memset(m_lowerProfile, 0, m_diagSize * sizeof (Index));
+  std::fill_n(m_diag, m_diagSize, Scalar(0));
+  std::fill_n(m_upper, m_upperSize, Scalar(0));
+  std::fill_n(m_lower, m_lowerSize, Scalar(0));
+  std::fill_n(m_upperProfile, m_diagSize, Index(0));
+  std::fill_n(m_lowerProfile, m_diagSize, Index(0));
 }
 void prune(Scalar reference, RealScalar epsilon = dummy_precision<RealScalar>()) {


@@ -25,10 +25,8 @@ static void test_1d()
 vec1(4) = 23; vec2(4) = 4;
 vec1(5) = 42; vec2(5) = 5;
-int col_major[6];
-int row_major[6];
-memset(col_major, 0, 6*sizeof(int));
-memset(row_major, 0, 6*sizeof(int));
+int col_major[6] = {0};
+int row_major[6] = {0};
 TensorMap<Tensor<int, 1> > vec3(col_major, 6);
 TensorMap<Tensor<int, 1, RowMajor> > vec4(row_major, 6);

@@ -88,10 +86,8 @@ static void test_2d()
 mat2(1,1) = 4;
 mat2(1,2) = 5;
-int col_major[6];
-int row_major[6];
-memset(col_major, 0, 6*sizeof(int));
-memset(row_major, 0, 6*sizeof(int));
+int col_major[6] = {0};
+int row_major[6] = {0};
 TensorMap<Tensor<int, 2> > mat3(row_major, 2, 3);
 TensorMap<Tensor<int, 2, RowMajor> > mat4(col_major, 2, 3);

@@ -148,10 +144,8 @@ static void test_3d()
 }
 }
-int col_major[2*3*7];
-int row_major[2*3*7];
-memset(col_major, 0, 2*3*7*sizeof(int));
-memset(row_major, 0, 2*3*7*sizeof(int));
+int col_major[2*3*7] = {0};
+int row_major[2*3*7] = {0};
 TensorMap<Tensor<int, 3> > mat3(col_major, 2, 3, 7);
 TensorMap<Tensor<int, 3, RowMajor> > mat4(row_major, 2, 3, 7);


@@ -14,6 +14,7 @@
 #define EIGEN_USE_GPU

 #include "main.h"
+#include "OffByOneScalar.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 #include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>

@@ -175,6 +176,44 @@ void test_3d_convolution(Context* context)
   context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
 }
+
+// Helper method to synchronize device.
+template<typename Device>
+void synchronize(Device& device) { /*nothing*/ }
+template<>
+void synchronize(Eigen::GpuDevice& device) {
+  device.synchronize();
+}
+
+template <typename DataType, typename TensorDevice>
+void test_device_memory(const TensorDevice& device) {
+  int count = 100;
+  Eigen::array<int, 1> tensorRange = {{count}};
+  Eigen::Tensor<DataType, 1> host(tensorRange);
+  Eigen::Tensor<DataType, 1> expected(tensorRange);
+  DataType* device_data = static_cast<DataType*>(device.allocate(count * sizeof(DataType)));
+
+  // memset
+  const char byte_value = static_cast<char>(0xAB);
+  device.memset(device_data, byte_value, count * sizeof(DataType));
+  device.memcpyDeviceToHost(host.data(), device_data, count * sizeof(DataType));
+  synchronize(device);
+  memset(expected.data(), byte_value, count * sizeof(DataType));
+  for (size_t i=0; i<count; i++) {
+    VERIFY_IS_EQUAL(host(i), expected(i));
+  }
+
+  // fill
+  DataType fill_value = DataType(7);
+  std::fill_n(expected.data(), count, fill_value);
+  device.fill(device_data, device_data + count, fill_value);
+  device.memcpyDeviceToHost(host.data(), device_data, count * sizeof(DataType));
+  synchronize(device);
+  for (int i=0; i<count; i++) {
+    VERIFY_IS_EQUAL(host(i), expected(i));
+  }
+
+  device.deallocate(device_data);
+}
 void test_cpu() {
   Eigen::Tensor<float, 3> in1(40,50,70);

@@ -266,6 +305,9 @@ void test_cpu() {
 }
 }
 }
+
+  test_device_memory<float>(context.device());
+  test_device_memory<OffByOneScalar<int>>(context.device());
 }

 void test_gpu() {

@@ -386,6 +428,8 @@ void test_gpu() {
 #endif
+  test_device_memory<float>(context.device());
+  test_device_memory<OffByOneScalar<int>>(context.device());
 }


@@ -18,26 +18,36 @@
 #define EIGEN_USE_SYCL

 #include "main.h"
+#include "OffByOneScalar.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 #include <stdint.h>
 #include <iostream>

 template <typename DataType, int DataLayout, typename IndexType>
 void test_device_memory(const Eigen::SyclDevice &sycl_device) {
-  std::cout << "Running on : "
-            << sycl_device.sycl_queue().get_device(). template get_info<cl::sycl::info::device::name>()
-            <<std::endl;
   IndexType sizeDim1 = 100;
   array<IndexType, 1> tensorRange = {{sizeDim1}};
   Tensor<DataType, 1, DataLayout,IndexType> in(tensorRange);
   Tensor<DataType, 1, DataLayout,IndexType> in1(tensorRange);
-  memset(in1.data(), 1, in1.size() * sizeof(DataType));
   DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
+
+  // memset
+  memset(in1.data(), 1, in1.size() * sizeof(DataType));
   sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType));
   sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
   for (IndexType i=0; i<in.size(); i++) {
     VERIFY_IS_EQUAL(in(i), in1(i));
   }
+
+  // fill
+  DataType value = DataType(7);
+  std::fill_n(in1.data(), in1.size(), value);
+  sycl_device.fill(gpu_in_data, gpu_in_data + in.size(), value);
+  sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
+  for (IndexType i=0; i<in.size(); i++) {
+    VERIFY_IS_EQUAL(in(i), in1(i));
+  }
+
   sycl_device.deallocate(gpu_in_data);
 }

@@ -73,5 +83,6 @@ template<typename DataType> void sycl_device_test_per_device(const cl::sycl::dev
 EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_device_test_per_device<float>(device));
+    CALL_SUBTEST(sycl_device_test_per_device<OffByOneScalar<int>>(device));
   }
 }