diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h index a9f951836..8ee4478cd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h @@ -298,9 +298,23 @@ struct GpuDevice { char* value_bytes = (char*)(&value); gpuError_t err; EIGEN_UNUSED_VARIABLE(err) - for (int b=0; bstream()); + + // If all value bytes are equal, then a single memset can be much faster. + bool use_single_memset = true; + for (int i=1; istream()); gpu_assert(err == gpuSuccess); + } else { + for (int b=0; bstream()); + gpu_assert(err == gpuSuccess); + } } #else EIGEN_UNUSED_VARIABLE(begin)