From 2451679951cf6befb69204c211e4d84902dd86e4 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 19 May 2015 15:19:01 -0700
Subject: [PATCH] Avoid using the cuda memcpy for small tensor slices since the memcpy kernel is very expensive to launch

---
 .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 23 ++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index d15055727..fa1e6931c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -283,6 +283,26 @@ class TensorSlicingOp : public TensorBase struct MemcpyTriggerForSlicing {
+  EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { }
+  EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > threshold_; }
+
+ private:
+  Index threshold_;
+};
+
+// It is very expensive to start the memcpy kernel on GPU: we therefore only
+// use it for large copies.
+#ifdef EIGEN_USE_GPU
+template struct MemcpyTriggerForSlicing {
+  EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { }
+  EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; }
+};
+#endif
+}
+
 // Eval as rvalue
 template struct TensorEvaluator, Device>
@@ -364,7 +384,8 @@ struct TensorEvaluator, Devi
       }
     }
     // Use memcpy if it's going to be faster than using the regular evaluation.
-    if (contiguous_values > static_cast(2 * m_device.numThreads())) {
+    const MemcpyTriggerForSlicing trigger(m_device);
+    if (trigger(contiguous_values)) {
       Scalar* src = m_impl.data();
       for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
         Index offset = srcCoeff(i);