Reworked the TensorExecutor code to support in place evaluation.

Benoit Steiner 2014-08-13 08:22:05 -07:00
parent 647622281e
commit 439feca139
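
Across all four executor specializations the change is the same: evalSubExprsIfNeeded() now takes a destination pointer and reports whether the executor still has to run its own assignment loop, which is what makes in-place evaluation possible. A minimal sketch of that contract, with invented evaluator internals (only evalSubExprsIfNeeded(), evalScalar() and cleanup() mirror real member names), could look like this:

#include <cstddef>

// Sketch only: illustrates the contract implied by the calls in the diff
// below, not the actual Eigen evaluator implementation.
struct SketchEvaluator {
  float* out;        // destination of the assignment
  const float* in;   // data the expression reads
  int size;

  // Returns true when the caller still has to run its assignment loop.
  // Returns false when the expression was already evaluated "in place"
  // into the supplied destination, so the loop can be skipped.
  bool evalSubExprsIfNeeded(float* dest) {
    if (dest != NULL) {
      for (int i = 0; i < size; ++i) dest[i] = 2.0f * in[i];
      return false;  // nothing left to assign
    }
    return true;     // caller drives evalScalar()/evalPacket()
  }

  void evalScalar(int i) { out[i] = 2.0f * in[i]; }
  void cleanup() {}
};

The executors below pass NULL because at the top level they have no buffer to offer; the intent appears to be that sub-expressions which can write straight into the assignment's destination return false, so the per-coefficient loop is skipped.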


@@ -11,7 +11,7 @@
 #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
 
 #ifdef EIGEN_USE_THREADS
-#include <future>
+#include <future>"
 #endif
 
 namespace Eigen {
@@ -28,45 +28,49 @@ namespace internal {
 // Default strategy: the expression is evaluated with a single cpu thread.
 template<typename Expression, typename Device = DefaultDevice, bool Vectorizable = TensorEvaluator<Expression, Device>::PacketAccess>
-struct TensorExecutor
+class TensorExecutor
 {
+ public:
   typedef typename Expression::Index Index;
   EIGEN_DEVICE_FUNC
   static inline void run(const Expression& expr, const Device& device = Device())
   {
     TensorEvaluator<Expression, Device> evaluator(expr, device);
-    evaluator.evalSubExprsIfNeeded();
-
-    const Index size = evaluator.dimensions().TotalSize();
-    for (Index i = 0; i < size; ++i) {
-      evaluator.evalScalar(i);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign)
+    {
+      const Index size = evaluator.dimensions().TotalSize();
+      for (Index i = 0; i < size; ++i) {
+        evaluator.evalScalar(i);
+      }
     }
     evaluator.cleanup();
   }
 };
 
 
 template<typename Expression>
-struct TensorExecutor<Expression, DefaultDevice, true>
+class TensorExecutor<Expression, DefaultDevice, true>
 {
+ public:
   typedef typename Expression::Index Index;
   static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
   {
     TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
-    evaluator.evalSubExprsIfNeeded();
-
-    const Index size = evaluator.dimensions().TotalSize();
-    static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
-    const int VectorizedSize = (size / PacketSize) * PacketSize;
-
-    for (Index i = 0; i < VectorizedSize; i += PacketSize) {
-      evaluator.evalPacket(i);
-    }
-    for (Index i = VectorizedSize; i < size; ++i) {
-      evaluator.evalScalar(i);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign)
+    {
+      const Index size = evaluator.dimensions().TotalSize();
+      static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
+      const int VectorizedSize = (size / PacketSize) * PacketSize;
+      for (Index i = 0; i < VectorizedSize; i += PacketSize) {
+        evaluator.evalPacket(i);
+      }
+      for (Index i = VectorizedSize; i < size; ++i) {
+        evaluator.evalScalar(i);
+      }
     }
     evaluator.cleanup();
   }
 };
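
For orientation (not part of the diff): the executor is what ultimately runs when a tensor expression is assigned to a tensor. A rough usage sketch, assuming the unsupported CXX11 Tensor module headers:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(256, 256), b(256, 256), c(256, 256);
  a.setRandom();
  b.setRandom();

  // The assignment builds an expression tree and hands it to
  // internal::TensorExecutor<...>::run() on the default (single-thread)
  // device; with packet access available this presumably lands in the
  // vectorized TensorExecutor<Expression, DefaultDevice, true>
  // specialization above.
  c = a + b;
  return 0;
}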
@@ -107,38 +111,40 @@ struct EvalRange<Evaluator, Index, true> {
 };
 
 template<typename Expression, bool Vectorizable>
-struct TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
 {
+ public:
   typedef typename Expression::Index Index;
   static inline void run(const Expression& expr, const ThreadPoolDevice& device)
   {
     typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
     Evaluator evaluator(expr, device);
-    evaluator.evalSubExprsIfNeeded();
-
-    const Index size = evaluator.dimensions().TotalSize();
-    static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
-
-    int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
-    const Index blocksize = std::max<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
-    const Index numblocks = size / blocksize;
-
-    Index i = 0;
-    vector<std::future<void> > results;
-    results.reserve(numblocks);
-    for (int i = 0; i < numblocks; ++i) {
-      results.push_back(std::async(std::launch::async, &EvalRange<Evaluator, Index>::run, &evaluator, i*blocksize, (i+1)*blocksize));
-    }
-
-    for (int i = 0; i < numblocks; ++i) {
-      results[i].get();
-    }
-
-    if (numblocks * blocksize < size) {
-      EvalRange<Evaluator, Index>::run(&evaluator, numblocks * blocksize, size);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign)
+    {
+      const Index size = evaluator.dimensions().TotalSize();
+      static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
+      int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
+      const Index blocksize = std::max<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
+      const Index numblocks = size / blocksize;
+
+      Index i = 0;
+      vector<std::future<void> > results;
+      results.reserve(numblocks);
+      for (int i = 0; i < numblocks; ++i) {
+        results.push_back(std::async(std::launch::async, &EvalRange<Evaluator, Index>::run, &evaluator, i*blocksize, (i+1)*blocksize));
+      }
+
+      for (int i = 0; i < numblocks; ++i) {
+        results[i].get();
+      }
+
+      if (numblocks * blocksize < size) {
+        EvalRange<Evaluator, Index>::run(&evaluator, numblocks * blocksize, size);
+      }
     }
     evaluator.cleanup();
   }
 };
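
A quick sanity check of the block-partitioning arithmetic above, using made-up numbers (1000 coefficients, 4 threads, PacketSize = 4; none of these come from the diff):

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  const long size        = 1000;  // illustrative tensor size
  const int  num_threads = 4;     // stand-in for device.numThreads()
  const int  PacketSize  = 4;     // stand-in for unpacket_traits<...>::size

  // Same formulas as in the ThreadPoolDevice specialization above.
  int blocksz = static_cast<int>(std::ceil(static_cast<float>(size) / num_threads)) + PacketSize - 1;
  const long blocksize = std::max<long>(PacketSize, blocksz - (blocksz % PacketSize));
  const long numblocks = size / blocksize;

  // Prints: blocksz=253 blocksize=252 numblocks=3 tail=244
  std::printf("blocksz=%d blocksize=%ld numblocks=%ld tail=%ld\n",
              blocksz, blocksize, numblocks, size - numblocks * blocksize);
  return 0;
}

So three async tasks each cover 252 coefficients, and the remaining 244 are handled by the calling thread through the trailing EvalRange::run call.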
@@ -157,19 +163,23 @@ __global__ void EigenMetaKernel(Evaluator eval, unsigned int size) {
 }
 
 template<typename Expression, bool Vectorizable>
-struct TensorExecutor<Expression, GpuDevice, Vectorizable>
+class TensorExecutor<Expression, GpuDevice, Vectorizable>
 {
+ public:
   typedef typename Expression::Index Index;
   static inline void run(const Expression& expr, const GpuDevice& device)
   {
     TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
-    evaluator.evalSubExprsIfNeeded();
-    const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
-    const int block_size = maxCudaThreadsPerBlock();
-
-    const Index size = evaluator.dimensions().TotalSize();
-    EigenMetaKernel<TensorEvaluator<Expression, GpuDevice> > <<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
-    eigen_assert(cudaGetLastError() == cudaSuccess);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign)
+    {
+      const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
+      const int block_size = maxCudaThreadsPerBlock();
+
+      const Index size = evaluator.dimensions().TotalSize();
+      EigenMetaKernel<TensorEvaluator<Expression, GpuDevice> > <<<num_blocks, block_size, 0, device.stream()>>>(evaluator, size);
+      assert(cudaGetLastError() == cudaSuccess);
+    }
     evaluator.cleanup();
   }
 };
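
For the launch configuration in the GPU specialization, a back-of-the-envelope check with made-up device limits (15 SMs, 2048 resident threads per SM, 1024 threads per block; these are illustrative, not queried from a real device):

#include <cstdio>

int main() {
  const int multiprocessors       = 15;    // stand-in for getNumCudaMultiProcessors()
  const int threads_per_multiproc = 2048;  // stand-in for maxCudaThreadsPerMultiProcessor()
  const int threads_per_block     = 1024;  // stand-in for maxCudaThreadsPerBlock()

  const int num_blocks = multiprocessors * threads_per_multiproc / threads_per_block;  // 30
  const int block_size = threads_per_block;                                            // 1024

  // 30 blocks * 1024 threads = 30720 threads: enough to keep every SM busy.
  std::printf("num_blocks=%d block_size=%d total_threads=%d\n",
              num_blocks, block_size, num_blocks * block_size);
  return 0;
}

The grid is sized to saturate the device rather than to match the tensor, so for larger tensors EigenMetaKernel presumably has each thread cover more than one of the `size` coefficients with this fixed launch configuration.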