From 1fd5ce1002a6f30e1169b529b291216a18be2f7e Mon Sep 17 00:00:00 2001
From: Antonio Sanchez
Date: Fri, 9 Jul 2021 11:53:11 -0700
Subject: [PATCH] For GpuDevice::fill, use a single memset if all bytes are
 equal.

The original `fill` implementation introduced a 5x regression on my
NVIDIA Quadro K1200. @rohitsan reported up to a 100x regression for HIP.
This restores performance.
---
 .../Eigen/CXX11/src/Tensor/TensorDeviceGpu.h   | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
index a9f951836..8ee4478cd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
@@ -298,9 +298,23 @@ struct GpuDevice {
     char* value_bytes = (char*)(&value);
     gpuError_t err;
     EIGEN_UNUSED_VARIABLE(err)
-    for (int b=0; b<value_size; ++b) {
-      err = gpuMemset2DAsync(begin+b, value_size, value_bytes[b], 1, count, stream_->stream());
+
+    // If all value bytes are equal, then a single memset can be much faster.
+    bool use_single_memset = true;
+    for (int i=1; i<value_size; ++i) {
+      if (value_bytes[i] != value_bytes[0]) {
+        use_single_memset = false;
+      }
+    }
+
+    if (use_single_memset) {
+      err = gpuMemsetAsync(begin, value_bytes[0], count * sizeof(T), stream_->stream());
       gpu_assert(err == gpuSuccess);
+    } else {
+      for (int b=0; b<value_size; ++b) {
+        err = gpuMemset2DAsync(begin+b, value_size, value_bytes[b], 1, count, stream_->stream());
+        gpu_assert(err == gpuSuccess);
+      }
     }
 #else
     EIGEN_UNUSED_VARIABLE(begin)