For GpuDevice::fill, use a single memset if all bytes are equal.

The original `fill` implementation introduced a 5x performance regression
on my NVIDIA Quadro K1200.  @rohitsan reported up to a 100x regression for
HIP.  Using a single memset when all value bytes are equal restores
performance.
This commit is contained in:
Antonio Sanchez 2021-07-09 11:53:11 -07:00 committed by Antonio Sánchez
parent 9c22795d65
commit 1fd5ce1002

View File

@@ -298,9 +298,23 @@ struct GpuDevice {
char* value_bytes = (char*)(&value);
gpuError_t err;
EIGEN_UNUSED_VARIABLE(err)
for (int b=0; b<value_size; ++b) {
err = gpuMemset2DAsync(buffer+b, value_size, value_bytes[b], 1, count, stream_->stream());
// If all value bytes are equal, then a single memset can be much faster.
bool use_single_memset = true;
for (int i=1; i<value_size; ++i) {
if (value_bytes[i] != value_bytes[0]) {
use_single_memset = false;
}
}
if (use_single_memset) {
err = gpuMemsetAsync(buffer, value_bytes[0], count * sizeof(T), stream_->stream());
gpu_assert(err == gpuSuccess);
} else {
for (int b=0; b<value_size; ++b) {
err = gpuMemset2DAsync(buffer+b, value_size, value_bytes[b], 1, count, stream_->stream());
gpu_assert(err == gpuSuccess);
}
}
#else
EIGEN_UNUSED_VARIABLE(begin)