diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index f50f839fc..d6e2ab1a2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -11,7 +11,7 @@
 #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
 
 #ifdef EIGEN_USE_THREADS
-#include <future>
+#include "
 #endif
 
 namespace Eigen {
@@ -28,45 +28,49 @@ namespace internal {
 
 // Default strategy: the expression is evaluated with a single cpu thread.
 template<typename Expression, typename Device = DefaultDevice, bool Vectorizable = TensorEvaluator<Expression, Device>::PacketAccess>
-struct TensorExecutor
+class TensorExecutor
 {
+ public:
   typedef typename Expression::Index Index;
   EIGEN_DEVICE_FUNC static inline void run(const Expression& expr, const Device& device = Device())
   {
     TensorEvaluator<Expression, Device> evaluator(expr, device);
-    evaluator.evalSubExprsIfNeeded();
-
-    const Index size = evaluator.dimensions().TotalSize();
-    for (Index i = 0; i < size; ++i) {
-      evaluator.evalScalar(i);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign)
+    {
+      const Index size = evaluator.dimensions().TotalSize();
+      for (Index i = 0; i < size; ++i) {
+        evaluator.evalScalar(i);
+      }
     }
-
     evaluator.cleanup();
   }
 };
 
 template<typename Expression>
-struct TensorExecutor<Expression, DefaultDevice, true>
+class TensorExecutor<Expression, DefaultDevice, true>
 {
+ public:
   typedef typename Expression::Index Index;
   static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
   {
     TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
-    evaluator.evalSubExprsIfNeeded();
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign)
+    {
+      const Index size = evaluator.dimensions().TotalSize();
+      static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
+      const int VectorizedSize = (size / PacketSize) * PacketSize;
 
-    const Index size = evaluator.dimensions().TotalSize();
-    static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
-    const int VectorizedSize = (size / PacketSize) * PacketSize;
-
-    for (Index i = 0; i < VectorizedSize; i += PacketSize) {
-      evaluator.evalPacket(i);
+      for (Index i = 0; i < VectorizedSize; i += PacketSize) {
+        evaluator.evalPacket(i);
+      }
+      for (Index i = VectorizedSize; i < size; ++i) {
+        evaluator.evalScalar(i);
+      }
     }
-    for (Index i = VectorizedSize; i < size; ++i) {
-      evaluator.evalScalar(i);
-    }
-
     evaluator.cleanup();
   }
 };
@@ -107,38 +111,40 @@ struct EvalRange {
 };
 
 template<typename Expression, bool Vectorizable>
-struct TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
 {
+ public:
   typedef typename Expression::Index Index;
   static inline void run(const Expression& expr, const ThreadPoolDevice& device)
   {
     typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
     Evaluator evaluator(expr, device);
-    evaluator.evalSubExprsIfNeeded();
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign)
+    {
+      const Index size = evaluator.dimensions().TotalSize();
 
-    const Index size = evaluator.dimensions().TotalSize();
+      static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
 
-    static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
+      int blocksz = std::ceil(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
+      const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize)));
+      const Index numblocks = size / blocksize;
 
-    int blocksz = std::ceil(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
-    const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize)));
-    const Index numblocks = size / blocksize;
+      Index i = 0;
+      std::vector<std::future<void> > results;
+      results.reserve(numblocks);
+      for (int i = 0; i < numblocks; ++i) {
+        results.push_back(std::async(std::launch::async, &EvalRange<Evaluator, Index>::run, &evaluator, i*blocksize, (i+1)*blocksize));
+      }
 
-    Index i = 0;
-    std::vector<std::future<void> > results;
-    results.reserve(numblocks);
-    for (int i = 0; i < numblocks; ++i) {
-      results.push_back(std::async(std::launch::async, &EvalRange<Evaluator, Index>::run, &evaluator, i*blocksize, (i+1)*blocksize));
+      for (int i = 0; i < numblocks; ++i) {
+        results[i].get();
+      }
+
+      if (numblocks * blocksize < size) {
+        EvalRange<Evaluator, Index>::run(&evaluator, numblocks * blocksize, size);
+      }
     }
-
-    for (int i = 0; i < numblocks; ++i) {
-      results[i].get();
-    }
-
-    if (numblocks * blocksize < size) {
-      EvalRange<Evaluator, Index>::run(&evaluator, numblocks * blocksize, size);
-    }
-
     evaluator.cleanup();
   }
 };
@@ -157,19 +163,23 @@ __global__ void EigenMetaKernel(Evaluator eval, unsigned int size) {
 }
 
 template<typename Expression, bool Vectorizable>
-struct TensorExecutor<Expression, GpuDevice, Vectorizable>
+class TensorExecutor<Expression, GpuDevice, Vectorizable>
 {
+ public:
   typedef typename Expression::Index Index;
   static inline void run(const Expression& expr, const GpuDevice& device)
   {
     TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
-    evaluator.evalSubExprsIfNeeded();
-    const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
-    const int block_size = maxCudaThreadsPerBlock();
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign)
+    {
+      const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
+      const int block_size = maxCudaThreadsPerBlock();
 
-    const Index size = evaluator.dimensions().TotalSize();
-    EigenMetaKernel<TensorEvaluator<Expression, GpuDevice> > <<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
-    eigen_assert(cudaGetLastError() == cudaSuccess);
+      const Index size = evaluator.dimensions().TotalSize();
+      EigenMetaKernel<TensorEvaluator<Expression, GpuDevice> > <<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
+      assert(cudaGetLastError() == cudaSuccess);
+    }
     evaluator.cleanup();
  }
 };
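
Note: the behavioral core of this patch is that evalSubExprsIfNeeded() now takes a
destination pointer and returns a bool telling each executor whether it still has to run
its own assignment loop; cleanup() runs either way. The stand-alone sketch below (all
names such as ToyEvaluator and run_like_executor are invented for illustration, not
Eigen API) models that contract under those assumptions:

    // Toy model of the new contract: evalSubExprsIfNeeded(dest) returns false
    // when the result was already materialized into dest, so the caller may
    // skip its scalar loop entirely.
    #include <cstddef>
    #include <iostream>
    #include <vector>

    struct ToyEvaluator {
      std::vector<float> buffer;

      // Returns true when the caller must still assign coefficient by
      // coefficient; false when evaluation went straight into 'dest'.
      bool evalSubExprsIfNeeded(float* dest) {
        if (dest != NULL) return false;  // evaluated in place, nothing left to do
        buffer.resize(8);                // otherwise evaluate into a scratch buffer
        return true;
      }
      std::size_t size() const { return buffer.size(); }
      void evalScalar(std::size_t i) { buffer[i] = 2.0f * i; }
      void cleanup() { buffer.clear(); }
    };

    // Same control flow as the patched TensorExecutor::run: guard the loop
    // with needs_assign, but always call cleanup().
    void run_like_executor(ToyEvaluator& evaluator) {
      const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
      if (needs_assign) {
        for (std::size_t i = 0; i < evaluator.size(); ++i)
          evaluator.evalScalar(i);
      }
      evaluator.cleanup();
    }

    int main() {
      ToyEvaluator ev;
      run_like_executor(ev);
      std::cout << "needs_assign path done" << std::endl;
      return 0;
    }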
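Note: in the ThreadPoolDevice specialization, the block-size arithmetic rounds
ceil(size/numThreads) down to a multiple of PacketSize (but never below one packet) so
each async task evaluates whole packets; the remainder is handled inline after all
futures are joined. A minimal stand-alone sketch of that partitioning, with names local
to the example rather than taken from Eigen:

    #include <algorithm>
    #include <cmath>
    #include <future>
    #include <iostream>
    #include <vector>

    static void eval_range(std::vector<float>* out, int first, int last) {
      for (int i = first; i < last; ++i)
        (*out)[i] = static_cast<float>(i);  // stand-in for evalPacket/evalScalar
    }

    int main() {
      const int size = 1000;
      const int num_threads = 4;
      const int PacketSize = 8;  // e.g. 8 floats per AVX packet

      // Same arithmetic as the diff: ceil(size/threads), rounded down to a
      // PacketSize multiple, clamped to at least one packet.
      int blocksz = static_cast<int>(std::ceil(static_cast<float>(size) / num_threads)) + PacketSize - 1;
      const int blocksize = std::max(PacketSize, blocksz - (blocksz % PacketSize));
      const int numblocks = size / blocksize;

      std::vector<float> out(size);
      std::vector<std::future<void> > results;
      results.reserve(numblocks);
      for (int i = 0; i < numblocks; ++i)
        results.push_back(std::async(std::launch::async, &eval_range, &out, i * blocksize, (i + 1) * blocksize));
      for (int i = 0; i < numblocks; ++i)
        results[i].get();  // join every block task before handling the tail
      if (numblocks * blocksize < size)
        eval_range(&out, numblocks * blocksize, size);  // leftover tail, inline

      std::cout << "blocksize=" << blocksize << " numblocks=" << numblocks << std::endl;
      return 0;
    }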