Mirror of https://gitlab.com/libeigen/eigen.git
Replace memset with fill to work for non-trivial scalars.
For custom scalars, zero is not necessarily represented by a zeroed-out memory block (e.g. GNU MPFR). We therefore cannot rely on `memset` when filling a matrix or tensor with zeroes. Instead, we should rely on `fill`, which for trivial types does end up getting converted to a `memset` under the hood (at least with gcc/clang). This requires adding a `fill(begin, end, v)` method to `TensorDevice`. Replaced all potentially bad instances of memset with fill. Fixes #2245.
parent e9c9a3130b
commit 1e6c6c1576
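
To see why the change matters, here is a minimal standalone sketch (not part of the commit; it reuses the `OffByOneScalar` test helper that the commit adds under `test/OffByOneScalar.h`):

#include <algorithm>
#include <cassert>
#include <cstring>
#include "OffByOneScalar.h"

int main() {
  OffByOneScalar<int> a[4];

  // memset writes raw zero bytes, so each element's internal value becomes 0,
  // which OffByOneScalar decodes as -1 -- not the zero that was requested.
  std::memset(a, 0, sizeof(a));
  assert(static_cast<int>(a[0]) == -1);

  // std::fill goes through the assignment operator, so zero is represented
  // correctly (internal value 1, decoded value 0). For trivial scalar types,
  // compilers typically lower this back down to a memset anyway.
  std::fill(a, a + 4, OffByOneScalar<int>(0));
  assert(static_cast<int>(a[0]) == 0);
  return 0;
}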
@@ -253,9 +253,10 @@ class SparseMatrix
     inline void setZero()
     {
       m_data.clear();
-      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(StorageIndex));
-      if(m_innerNonZeros)
-        memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex));
+      std::fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0));
+      if(m_innerNonZeros) {
+        std::fill_n(m_innerNonZeros, m_outerSize, StorageIndex(0));
+      }
     }

     /** Preallocates \a reserveSize non zeros.
@@ -641,7 +642,7 @@ class SparseMatrix
         std::free(m_innerNonZeros);
         m_innerNonZeros = 0;
       }
-      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(StorageIndex));
+      std::fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0));
     }

     /** \internal
@@ -1260,7 +1261,7 @@ typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Sca
     m_innerNonZeros = static_cast<StorageIndex*>(std::malloc(m_outerSize * sizeof(StorageIndex)));
     if(!m_innerNonZeros) internal::throw_std_bad_alloc();

-    memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex));
+    std::fill(m_innerNonZeros, m_innerNonZeros + m_outerSize, StorageIndex(0));

     // pack all inner-vectors to the end of the pre-allocated space
     // and allocate the entire free-space to the first inner-vector
@@ -564,9 +564,9 @@ for (int iter = 0; iter < 10; ++iter) {

     // Initialize the content of the memory pools to prevent asan from
     // complaining.
-    device_.memset(a_, 12, m_ * k_ * sizeof(T));
-    device_.memset(b_, 23, k_ * n_ * sizeof(T));
-    device_.memset(c_, 31, m_ * n_ * sizeof(T));
+    device_.fill(a_, a_ + m_ * k_, T(12));
+    device_.fill(b_, b_ + k_ * n_, T(23));
+    device_.fill(c_, c_ + m_ * n_, T(31));

   }

@@ -56,9 +56,9 @@ void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, T

   // Initialize the content of the memory pools to prevent asan from
   // complaining.
-  device_.memset(a_, 12, m_ * k_ * sizeof(T));
-  device_.memset(b_, 23, k_ * n_ * sizeof(T));
-  device_.memset(c_, 31, m_ * n_ * sizeof(T));
+  device_.fill(a_, m_ * k_, T(12));
+  device_.fill(b_, k_ * n_, T(23));
+  device_.fill(c_, m_ * n_, T(31));

   Eigen::array<TensorIndex, 2> sizeA;
   sizeA[0] = m_;
test/OffByOneScalar.h (new file, 28 lines)
@@ -0,0 +1,28 @@
+// A Scalar with internal representation T+1 so that zero is internally
+// represented by T(1). This is used to test memory fill.
+//
+template<typename T>
+class OffByOneScalar {
+ public:
+  OffByOneScalar() : val_(1) {}
+  OffByOneScalar(const OffByOneScalar& other) {
+    *this = other;
+  }
+  OffByOneScalar& operator=(const OffByOneScalar& other) {
+    val_ = other.val_;
+    return *this;
+  }
+
+  OffByOneScalar(T val) : val_(val + 1) {}
+  OffByOneScalar& operator=(T val) {
+    val_ = val + 1;
+  }
+
+  operator T() const {
+    return val_ - 1;
+  }
+
+ private:
+  T val_;
+};
@@ -762,7 +762,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator
     const Index resIncr(1);

     // zero out the result buffer (which must be of size at least rows * sizeof(Scalar)
-    m_device.memset(buffer, 0, rows * sizeof(Scalar));
+    m_device.fill(buffer, buffer + rows, Scalar(0));

     internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,false,RhsScalar,RhsMapper,false>::run(
         rows, cols, lhs, rhs,
@@ -869,7 +869,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator
     // If a contraction kernel does not support beta, explicitly initialize
     // output buffer with zeroes.
     if (!TensorContractionKernel::HasBeta) {
-      this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+      this->m_device.fill(buffer, buffer + m * n, Scalar(0));
     }

     for(Index i2=0; i2<m; i2+=mc)
@@ -1370,8 +1370,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // columns in right side
     const Index n = this->m_j_size;

-    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
-    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
+    this->m_device.fill(buffer, buffer + m * n, Scalar(0));

     typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
                                                    LeftEvaluator, left_nocontract_t,
@@ -912,9 +912,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
           // On 10000x2x10000 mm zeroing can easily take half of time. Zero (bn
           // x m) row. Safe to do here because all kernels that will write to
           // this memory depend on completion of this task. Note: don't call
-          // device_.memset() here. device_.memset() blocks on thread pool
+          // device_.fill() here. device_.fill() blocks on thread pool
           // worker thread, which can lead to underutilization and deadlocks.
-          memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
+          std::fill_n(buffer_ + n1 * bn_ * m_, bn(n1) * m_, Scalar(0));
         }
         kernel_.packRhs(&packed_rhs(n, k, n1, use_thread_local),
                         rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
@@ -39,6 +39,17 @@ struct DefaultDevice {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
     ::memset(buffer, c, n);
   }
+  template<typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+    // std::fill is not a device function, so resort to simple loop.
+    for (T* it = begin; it != end; ++it) {
+      *it = value;
+    }
+#else
+    std::fill(begin, end, value);
+#endif
+  }
   template<typename Type>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
     return data;
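
For reference, a small usage sketch of the new device-level `fill` entry point (hypothetical example, not part of the commit); the same call works for trivial and custom scalars alike:

#include <unsupported/Eigen/CXX11/Tensor>
#include "OffByOneScalar.h"

void zero_buffer_example() {
  Eigen::DefaultDevice device;
  using Scalar = OffByOneScalar<int>;
  // allocate() hands back raw bytes; fill() then sets every element through
  // Scalar's assignment operator instead of writing zero bytes over the block.
  Scalar* buf = static_cast<Scalar*>(device.allocate(100 * sizeof(Scalar)));
  device.fill(buf, buf + 100, Scalar(0));
  device.deallocate(buf);
}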
@@ -281,6 +281,31 @@ struct GpuDevice {
     EIGEN_UNUSED_VARIABLE(err)
     gpu_assert(err == gpuSuccess);
 #else
+    EIGEN_UNUSED_VARIABLE(buffer)
+    EIGEN_UNUSED_VARIABLE(c)
+    EIGEN_UNUSED_VARIABLE(n)
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  template<typename T>
+  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+    const size_t count = end - begin;
+    // Split value into bytes and run memset with stride.
+    const int value_size = sizeof(value);
+    char* buffer = (char*)begin;
+    char* value_bytes = (char*)(&value);
+    gpuError_t err;
+    EIGEN_UNUSED_VARIABLE(err)
+    for (int b=0; b<value_size; ++b) {
+      err = gpuMemset2DAsync(buffer+b, value_size, value_bytes[b], 1, count, stream_->stream());
+      gpu_assert(err == gpuSuccess);
+    }
+#else
+    EIGEN_UNUSED_VARIABLE(begin)
+    EIGEN_UNUSED_VARIABLE(end)
+    EIGEN_UNUSED_VARIABLE(value)
     eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
   }
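
The GPU path cannot invoke arbitrary constructors on device memory from the host, so it emulates `fill` with one strided 2D memset per byte of the value. A host-side sketch of the same idea (assumes `T` is trivially copyable; `fill_bytewise` is a hypothetical helper, not part of the commit):

#include <cstddef>

template <typename T>
void fill_bytewise(T* begin, T* end, const T& value) {
  const size_t count = static_cast<size_t>(end - begin);
  const unsigned char* value_bytes = reinterpret_cast<const unsigned char*>(&value);
  unsigned char* buffer = reinterpret_cast<unsigned char*>(begin);
  // One pass per byte of T. This mirrors gpuMemset2DAsync(buffer + b,
  // /*pitch=*/sizeof(T), value_bytes[b], /*width=*/1, /*height=*/count, ...):
  // the b-th byte of `value` is written into every element of the range.
  for (size_t b = 0; b < sizeof(T); ++b) {
    for (size_t i = 0; i < count; ++i) {
      buffer[i * sizeof(T) + b] = value_bytes[b];
    }
  }
}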
@@ -327,13 +327,27 @@ class QueueInterface {
     if (n == 0) {
       return;
     }
-    n /= sizeof(buffer_scalar_t);
     auto f = [&](cl::sycl::handler &cgh) {
-      auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
-      // The cast to uint8_t is here to match the behaviour of the standard
-      // memset. The cast to buffer_scalar_t is needed to match the type of the
-      // accessor (in case buffer_scalar_t is not uint8_t)
-      cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
+      // Get a typed range accesser to ensure we fill each byte, in case
+      // `buffer_scalar_t` is not (u)int8_t.
+      auto dst_acc = get_typed_range_accessor<write_mode, uint8_t>(cgh, data, n);
+      cgh.fill(dst_acc, static_cast<uint8_t>(c));
+    };
+    cl::sycl::event e;
+    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+    async_synchronize(e);
+  }
+
+  template<typename T>
+  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+    static const auto write_mode = cl::sycl::access::mode::discard_write;
+    if (begin == end) {
+      return;
+    }
+    const ptrdiff_t count = end - begin;
+    auto f = [&](cl::sycl::handler &cgh) {
+      auto dst_acc = get_typed_range_accessor<write_mode, T>(cgh, begin, count);
+      cgh.fill(dst_acc, value);
     };
     cl::sycl::event e;
     EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
@@ -359,6 +373,8 @@ class QueueInterface {

     auto original_buffer = pMapper.get_buffer(ptr);
     const ptrdiff_t offset = pMapper.get_offset(ptr);
+    eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)");
+    eigen_assert(original_buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)");
     const ptrdiff_t typed_offset = offset / sizeof(T);
     eigen_assert(typed_offset >= 0);
     const auto typed_size = original_buffer.get_size() / sizeof(T);
@@ -395,6 +411,40 @@ class QueueInterface {
         cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
   }
+
+  /// Get a range accessor to the virtual pointer's device memory with a
+  /// specified type and count.
+  template <cl::sycl::access::mode AcMd, typename T, typename Index>
+  EIGEN_STRONG_INLINE cl::sycl::accessor<
+      T, 1, AcMd, cl::sycl::access::target::global_buffer>
+  get_typed_range_accessor(cl::sycl::handler &cgh, const void *ptr,
+                           const Index count) const {
+    static const auto global_access = cl::sycl::access::target::global_buffer;
+    eigen_assert(count >= 0);
+    std::lock_guard<std::mutex> lock(pmapper_mutex_);
+    auto buffer = pMapper.get_buffer(ptr);
+    const ptrdiff_t offset = pMapper.get_offset(ptr);
+    eigen_assert(offset >= 0);
+
+    // Technically we should create a subbuffer for the desired range,
+    // then reinterpret that. However, I was not able to get changes to reflect
+    // in the original buffer (only the subbuffer and reinterpretted buffer).
+    // This current implementation now has the restriction that the buffer
+    // offset and original buffer size must be a multiple of sizeof(T).
+    // Note that get_range_accessor(void*) currently has the same restriction.
+    //
+    // auto subbuffer = cl::sycl::buffer<buffer_scalar_t, 1>(buffer,
+    //     cl::sycl::id<1>(offset), cl::sycl::range<1>(n_bytes));
+    eigen_assert(offset % sizeof(T) == 0 && "The offset must be a multiple of sizeof(T)");
+    eigen_assert(buffer.get_size() % sizeof(T) == 0 && "The buffer size must be a multiple of sizeof(T)");
+    const ptrdiff_t typed_offset = offset / sizeof(T);
+    const size_t typed_size = buffer.get_size() / sizeof(T);
+    auto reint = buffer.template reinterpret<
+        typename Eigen::internal::remove_const<T>::type>(
+        cl::sycl::range<1>(typed_size));
+    return reint.template get_access<AcMd, global_access>(
+        cgh, cl::sycl::range<1>(count), cl::sycl::id<1>(typed_offset));
+  }

   /// Creation of sycl accessor for a buffer. This function first tries to find
   /// the buffer in the buffer_map. If found it gets the accessor from it, if
   /// not, the function then adds an entry by creating a sycl buffer for that
@@ -912,15 +962,6 @@ struct SyclDevice : public SyclDeviceBase {
     return queue_stream()->get(data);
   }

-  /// attach existing buffer
-  EIGEN_STRONG_INLINE void *attach_buffer(
-      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
-    return queue_stream()->attach_buffer(buf);
-  }
-  /// detach buffer
-  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
-    queue_stream()->detach_buffer(p);
-  }
   EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
     return queue_stream()->get_offset(ptr);
   }
@@ -951,6 +992,11 @@ struct SyclDevice : public SyclDeviceBase {
   EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
     queue_stream()->memset(data, c, n);
   }
+  /// the fill function
+  template<typename T>
+  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+    queue_stream()->fill(begin, end, value);
+  }
   /// returning the sycl queue
   EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
     return queue_stream()->sycl_queue();
@@ -122,6 +122,11 @@ struct ThreadPoolDevice {
     ::memset(buffer, c, n);
   }

+  template<typename T>
+  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+    std::fill(begin, end, value);
+  }
+
   EIGEN_STRONG_INLINE int numThreads() const {
     return num_threads_;
   }
@@ -41,6 +41,7 @@
 #define gpuMalloc hipMalloc
 #define gpuFree hipFree
 #define gpuMemsetAsync hipMemsetAsync
+#define gpuMemset2DAsync hipMemset2DAsync
 #define gpuMemcpyAsync hipMemcpyAsync
 #define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
@@ -71,6 +72,7 @@
 #define gpuMalloc cudaMalloc
 #define gpuFree cudaFree
 #define gpuMemsetAsync cudaMemsetAsync
+#define gpuMemset2DAsync cudaMemset2DAsync
 #define gpuMemcpyAsync cudaMemcpyAsync
 #define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
 #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
@@ -26,6 +26,7 @@
 #undef gpuMalloc
 #undef gpuFree
 #undef gpuMemsetAsync
+#undef gpuMemset2DAsync
 #undef gpuMemcpyAsync
 #undef gpuMemcpyDeviceToDevice
 #undef gpuMemcpyDeviceToHost
@@ -375,8 +375,8 @@ public:
     /** Removes all non zeros */
     inline void setZero() {
         m_data.clear();
-        memset(m_colStartIndex, 0, (m_outerSize + 1) * sizeof (Index));
-        memset(m_rowStartIndex, 0, (m_outerSize + 1) * sizeof (Index));
+        std::fill_n(m_colStartIndex, m_outerSize + 1, Index(0));
+        std::fill_n(m_rowStartIndex, m_outerSize + 1, Index(0));
    }

    /** \returns the number of non zero coefficients */
@@ -435,7 +435,7 @@ public:
            }

            //zeros new data
-           memset(this->_upperPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar));
+           std::fill_n(this->_upperPtr() + start, bandIncrement - 1, Scalar(0));

            return m_data.upper(m_colStartIndex[inner]);
        } else {
@@ -466,7 +466,7 @@ public:
            }

            //zeros new data
-           memset(this->_lowerPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar));
+           std::fill_n(this->_lowerPtr() + start, bandIncrement - 1, Scalar(0));
            return m_data.lower(m_rowStartIndex[outer]);
        } else {
            return m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
@@ -493,7 +493,7 @@ public:
            for (Index innerIdx = inner + 1; innerIdx < outerSize() + 1; innerIdx++) {
                m_rowStartIndex[innerIdx] += bandIncrement;
            }
-           memset(this->_upperPtr() + m_rowStartIndex[inner] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar));
+           std::fill_n(this->_upperPtr() + m_rowStartIndex[inner] + previousProfile + 1, bandIncrement - 1, Scalar(0));
            return m_data.upper(m_rowStartIndex[inner] + m_data.upperProfile(inner));
        } else {
            return m_data.upper(m_rowStartIndex[inner] + (outer - inner));
@@ -520,7 +520,7 @@ public:
            for (Index innerIdx = outer + 1; innerIdx < outerSize() + 1; innerIdx++) {
                m_colStartIndex[innerIdx] += bandIncrement;
            }
-           memset(this->_lowerPtr() + m_colStartIndex[outer] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar));
+           std::fill_n(this->_lowerPtr() + m_colStartIndex[outer] + previousProfile + 1, bandIncrement - 1, Scalar(0));
            return m_data.lower(m_colStartIndex[outer] + m_data.lowerProfile(outer));
        } else {
            return m_data.lower(m_colStartIndex[outer] + (inner - outer));
@@ -619,8 +619,8 @@ public:
        m_data.clear();

        m_outerSize = diagSize;
-       memset(m_colStartIndex, 0, (cols + 1) * sizeof (Index));
-       memset(m_rowStartIndex, 0, (rows + 1) * sizeof (Index));
+       std::fill_n(m_colStartIndex, cols + 1, Index(0));
+       std::fill_n(m_rowStartIndex, rows + 1, Index(0));
    }

    void resizeNonZeros(Index size) {
@@ -187,11 +187,11 @@ public:
    }

    inline void reset() {
-       memset(m_diag, 0, m_diagSize * sizeof (Scalar));
-       memset(m_upper, 0, m_upperSize * sizeof (Scalar));
-       memset(m_lower, 0, m_lowerSize * sizeof (Scalar));
-       memset(m_upperProfile, 0, m_diagSize * sizeof (Index));
-       memset(m_lowerProfile, 0, m_diagSize * sizeof (Index));
+       std::fill_n(m_diag, m_diagSize, Scalar(0));
+       std::fill_n(m_upper, m_upperSize, Scalar(0));
+       std::fill_n(m_lower, m_lowerSize, Scalar(0));
+       std::fill_n(m_upperProfile, m_diagSize, Index(0));
+       std::fill_n(m_lowerProfile, m_diagSize, Index(0));
    }

    void prune(Scalar reference, RealScalar epsilon = dummy_precision<RealScalar>()) {
@@ -25,10 +25,8 @@ static void test_1d()
   vec1(4) = 23; vec2(4) = 4;
   vec1(5) = 42; vec2(5) = 5;

-  int col_major[6];
-  int row_major[6];
-  memset(col_major, 0, 6*sizeof(int));
-  memset(row_major, 0, 6*sizeof(int));
+  int col_major[6] = {0};
+  int row_major[6] = {0};
   TensorMap<Tensor<int, 1> > vec3(col_major, 6);
   TensorMap<Tensor<int, 1, RowMajor> > vec4(row_major, 6);

@@ -88,10 +86,8 @@ static void test_2d()
   mat2(1,1) = 4;
   mat2(1,2) = 5;

-  int col_major[6];
-  int row_major[6];
-  memset(col_major, 0, 6*sizeof(int));
-  memset(row_major, 0, 6*sizeof(int));
+  int col_major[6] = {0};
+  int row_major[6] = {0};
   TensorMap<Tensor<int, 2> > mat3(row_major, 2, 3);
   TensorMap<Tensor<int, 2, RowMajor> > mat4(col_major, 2, 3);

@@ -148,10 +144,8 @@ static void test_3d()
     }
   }

-  int col_major[2*3*7];
-  int row_major[2*3*7];
-  memset(col_major, 0, 2*3*7*sizeof(int));
-  memset(row_major, 0, 2*3*7*sizeof(int));
+  int col_major[2*3*7] = {0};
+  int row_major[2*3*7] = {0};
   TensorMap<Tensor<int, 3> > mat3(col_major, 2, 3, 7);
   TensorMap<Tensor<int, 3, RowMajor> > mat4(row_major, 2, 3, 7);

@@ -14,6 +14,7 @@
 #define EIGEN_USE_GPU

 #include "main.h"
+#include "OffByOneScalar.h"
 #include <unsupported/Eigen/CXX11/Tensor>

 #include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
@@ -175,6 +176,44 @@ void test_3d_convolution(Context* context)
   context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
 }

+// Helper method to synchronize device.
+template<typename Device>
+void synchronize(Device& device) { /*nothing*/ }
+template<>
+void synchronize(Eigen::GpuDevice& device) {
+  device.synchronize();
+}
+
+template <typename DataType, typename TensorDevice>
+void test_device_memory(const TensorDevice& device) {
+  int count = 100;
+  Eigen::array<int, 1> tensorRange = {{count}};
+  Eigen::Tensor<DataType, 1> host(tensorRange);
+  Eigen::Tensor<DataType, 1> expected(tensorRange);
+  DataType* device_data = static_cast<DataType*>(device.allocate(count * sizeof(DataType)));
+
+  // memset
+  const char byte_value = static_cast<char>(0xAB);
+  device.memset(device_data, byte_value, count * sizeof(DataType));
+  device.memcpyDeviceToHost(host.data(), device_data, count * sizeof(DataType));
+  synchronize(device);
+  memset(expected.data(), byte_value, count * sizeof(DataType));
+  for (size_t i=0; i<count; i++) {
+    VERIFY_IS_EQUAL(host(i), expected(i));
+  }
+
+  // fill
+  DataType fill_value = DataType(7);
+  std::fill_n(expected.data(), count, fill_value);
+  device.fill(device_data, device_data + count, fill_value);
+  device.memcpyDeviceToHost(host.data(), device_data, count * sizeof(DataType));
+  synchronize(device);
+  for (int i=0; i<count; i++) {
+    VERIFY_IS_EQUAL(host(i), expected(i));
+  }
+
+  device.deallocate(device_data);
+}

 void test_cpu() {
   Eigen::Tensor<float, 3> in1(40,50,70);
@@ -266,6 +305,9 @@ void test_cpu() {
       }
     }
   }
+
+  test_device_memory<float>(context.device());
+  test_device_memory<OffByOneScalar<int>>(context.device());
 }

 void test_gpu() {
@@ -386,6 +428,8 @@ void test_gpu() {

 #endif

+  test_device_memory<float>(context.device());
+  test_device_memory<OffByOneScalar<int>>(context.device());
 }

@@ -18,26 +18,36 @@
 #define EIGEN_USE_SYCL

 #include "main.h"
+#include "OffByOneScalar.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 #include <stdint.h>
 #include <iostream>

 template <typename DataType, int DataLayout, typename IndexType>
 void test_device_memory(const Eigen::SyclDevice &sycl_device) {
-  std::cout << "Running on : "
-            << sycl_device.sycl_queue().get_device(). template get_info<cl::sycl::info::device::name>()
-            <<std::endl;
   IndexType sizeDim1 = 100;
   array<IndexType, 1> tensorRange = {{sizeDim1}};
   Tensor<DataType, 1, DataLayout,IndexType> in(tensorRange);
   Tensor<DataType, 1, DataLayout,IndexType> in1(tensorRange);
-  memset(in1.data(), 1, in1.size() * sizeof(DataType));
   DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
+
+  // memset
+  memset(in1.data(), 1, in1.size() * sizeof(DataType));
   sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType));
   sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
   for (IndexType i=0; i<in.size(); i++) {
     VERIFY_IS_EQUAL(in(i), in1(i));
   }
+
+  // fill
+  DataType value = DataType(7);
+  std::fill_n(in1.data(), in1.size(), value);
+  sycl_device.fill(gpu_in_data, gpu_in_data + in.size(), value);
+  sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
+  for (IndexType i=0; i<in.size(); i++) {
+    VERIFY_IS_EQUAL(in(i), in1(i));
+  }
+
   sycl_device.deallocate(gpu_in_data);
 }

@@ -73,5 +83,6 @@ template<typename DataType> void sycl_device_test_per_device(const cl::sycl::dev
 EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
   for (const auto& device :Eigen::get_sycl_supported_devices()) {
     CALL_SUBTEST(sycl_device_test_per_device<float>(device));
+    CALL_SUBTEST(sycl_device_test_per_device<OffByOneScalar<int>>(device));
   }
 }