For GpuDevice::fill, use a single memset if all bytes are equal.

The original `fill` implementation introduced a 5x performance regression on my NVIDIA Quadro K1200, and @rohitsan reported up to a 100x regression on HIP. Using a single memset when all bytes of the fill value are equal restores the previous performance.
2025-08-01 09:42:01 +08:00 · 2021-07-09 11:53:11 -07:00 · 2021-07-09 11:53:11 -07:00 · 1fd5ce1002
commit 1fd5ce1002
parent 9c22795d65
1 changed files with 16 additions and 2 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
@@ -298,9 +298,23 @@ struct GpuDevice {
    char* value_bytes = (char*)(&value);
    gpuError_t err;
    EIGEN_UNUSED_VARIABLE(err)
-    for (int b=0; b<value_size; ++b) {
-      err = gpuMemset2DAsync(buffer+b, value_size, value_bytes[b], 1, count, stream_->stream());
+    
+    // If all value bytes are equal, then a single memset can be much faster.
+    bool use_single_memset = true;
+    for (int i=1; i<value_size; ++i) {
+      if (value_bytes[i] != value_bytes[0]) {
+        use_single_memset = false;
+      } 
+    }
+    
+    if (use_single_memset) {
+      err = gpuMemsetAsync(buffer, value_bytes[0], count * sizeof(T), stream_->stream());
      gpu_assert(err == gpuSuccess);
+    } else {
+      for (int b=0; b<value_size; ++b) {
+        err = gpuMemset2DAsync(buffer+b, value_size, value_bytes[b], 1, count, stream_->stream());
+        gpu_assert(err == gpuSuccess);
+      }
    }
 #else
    EIGEN_UNUSED_VARIABLE(begin)