From 07ac4f7e027cddd3457a34295420480f7e541ac5 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 14 Apr 2016 18:28:23 -0700 Subject: [PATCH 1/2] Eigen Tensor cost model part 2: Thread scheduling for standard evaluators and reductions. The cost model is turned off by default. --- .../Eigen/CXX11/src/Tensor/TensorCostModel.h | 6 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 5 + .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 91 ++++++++++----- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 104 +++++++----------- 4 files changed, 110 insertions(+), 96 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h index 32bc5d0b2..4e8f86674 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -10,9 +10,9 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H -#if !defined(EIGEN_USE_GPU) -#define EIGEN_USE_COST_MODEL -#endif +//#if !defined(EIGEN_USE_GPU) +//#define EIGEN_USE_COST_MODEL +//#endif namespace Eigen { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index f1f9a90df..293012646 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -189,6 +189,11 @@ struct TensorEvaluator return loadConstant(m_data+index); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + internal::unpacket_traits::size); + } + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index eabfd91fe..df9cc0998 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -59,9 +59,16 @@ class TensorExecutor { const Index size = array_prod(evaluator.dimensions()); const int PacketSize = unpacket_traits::PacketReturnType>::size; + // Manually unroll this loop since compilers don't do it. 
+ const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; + for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { + evaluator.evalPacket(i); + evaluator.evalPacket(i+PacketSize); + evaluator.evalPacket(i+2*PacketSize); + evaluator.evalPacket(i+3*PacketSize); + } const Index VectorizedSize = (size / PacketSize) * PacketSize; - - for (Index i = 0; i < VectorizedSize; i += PacketSize) { + for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { evaluator.evalPacket(i); } for (Index i = VectorizedSize; i < size; ++i) { @@ -78,8 +85,9 @@ class TensorExecutor #ifdef EIGEN_USE_THREADS template struct EvalRange { - static void run(Evaluator evaluator, const Index first, const Index last) { - eigen_assert(last > first); + static void run(void* evaluator_in, const Index first, const Index last) { + Evaluator evaluator(*static_cast(evaluator_in)); + eigen_assert(last >= first); for (Index i = first; i < last; ++i) { evaluator.evalScalar(i); } @@ -88,28 +96,45 @@ struct EvalRange { template struct EvalRange { - static void run(Evaluator evaluator, const Index first, const Index last) { - eigen_assert(last > first); + static void run(void* evaluator_in, const Index first, const Index last) { + Evaluator evaluator(*static_cast(evaluator_in)); + eigen_assert(last >= first); Index i = first; - static const int PacketSize = unpacket_traits::size; + const int PacketSize = unpacket_traits::size; if (last - first >= PacketSize) { eigen_assert(first % PacketSize == 0); - Index lastPacket = last - (last % PacketSize); - for (; i < lastPacket; i += PacketSize) { + Index last_chunk_offset = last - 4 * PacketSize; + // Manually unroll this loop since compilers don't do it. + for (; i <= last_chunk_offset; i += 4*PacketSize) { + evaluator.evalPacket(i); + evaluator.evalPacket(i+PacketSize); + evaluator.evalPacket(i+2*PacketSize); + evaluator.evalPacket(i+3*PacketSize); + } + last_chunk_offset = last - PacketSize; + for (; i <= last_chunk_offset; i += PacketSize) { evaluator.evalPacket(i); } } - for (; i < last; ++i) { evaluator.evalScalar(i); } } }; -template -class TensorExecutor -{ +// Used to make an std::function to add to the ThreadPool with less templating +// than EvalRange::Run. +// This requires that this and EvalRange takes a void* to the evaluator that can +// be downcast to the right type by the EvalRange. +template +inline void InvokeEvalRange(void (*run_fn)(void*, const Index, const Index), + void* evaluator, const Index first, const Index last) { + run_fn(evaluator, first, last); +} + +template +class TensorExecutor { public: typedef typename Expression::Index Index; static inline void run(const Expression& expr, const ThreadPoolDevice& device) @@ -119,24 +144,35 @@ class TensorExecutor const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { + const Index PacketSize = Vectorizable ? unpacket_traits::size : 1; const Index size = array_prod(evaluator.dimensions()); - - static const int PacketSize = Vectorizable ? 
unpacket_traits::size : 1; - - int blocksz = std::ceil(static_cast(size)/device.numThreads()) + PacketSize - 1; - const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); - const unsigned int numblocks = static_cast(size / blocksize); - - Barrier barrier(numblocks); - for (unsigned int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier(&barrier, &EvalRange::run, evaluator, i*blocksize, (i+1)*blocksize); + int num_threads = device.numThreads(); +#ifdef EIGEN_USE_COST_MODEL + if (num_threads > 1) { + num_threads = TensorCostModel::numThreads( + size, evaluator.costPerCoeff(Vectorizable), num_threads); } +#endif + if (num_threads == 1) { + EvalRange::run(&evaluator, 0, size); + } else { + Index blocksz = std::ceil(static_cast(size)/num_threads) + PacketSize - 1; + const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index numblocks = size / blocksize; - if (static_cast(numblocks) * blocksize < size) { - EvalRange::run(evaluator, numblocks * blocksize, size); + Barrier barrier(numblocks); + for (int i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier( + &barrier, &InvokeEvalRange, + &EvalRange::run, + static_cast(&evaluator), i * blocksize, + (i + 1) * blocksize); + } + if (numblocks * blocksize < size) { + EvalRange::run(&evaluator, numblocks * blocksize, size); + } + barrier.Wait(); } - - barrier.Wait(); } evaluator.cleanup(); } @@ -226,7 +262,6 @@ inline void TensorExecutor::run( #endif // __CUDACC__ #endif // EIGEN_USE_GPU - } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 1c9e7ab66..885295f0a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -214,7 +214,7 @@ struct FullReducer { static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) { const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); - *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); + *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); } }; @@ -222,18 +222,19 @@ struct FullReducer { #ifdef EIGEN_USE_THREADS // Multithreaded full reducers template + bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)> struct FullReducerShard { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer, typename Self::CoeffReturnType* output) { - *output = InnerMostDimReducer::reduce( + *output = InnerMostDimReducer::reduce( self, firstIndex, numValuesToReduce, reducer); } }; -template -struct FullReducer { +// Multithreaded full reducer +template +struct FullReducer { static const bool HasOptimizedImplementation = !Op::IsStateful; static const int PacketSize = unpacket_traits::size; @@ -247,79 +248,44 @@ struct FullReducer { *output = reducer.finalize(reducer.initialize()); return; } - const std::size_t num_threads = device.numThreads(); +#ifdef EIGEN_USE_COST_MODEL + const TensorOpCost cost = + self.m_impl.costPerCoeff(Vectorizable) + + TensorOpCost(0, 0, internal::functor_traits::Cost, Vectorizable, + PacketSize); + const int num_threads = TensorCostModel::numThreads( + num_coeffs, cost, device.numThreads()); +#else + const int num_threads = device.numThreads(); +#endif if (num_threads == 1) { - *output = 
InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); - return; - } else { - const Index blocksize = std::floor(static_cast(num_coeffs) / num_threads); - const unsigned int numblocks = blocksize > 0 ? static_cast(num_coeffs / blocksize) : 0; - eigen_assert(num_coeffs >= static_cast(numblocks) * blocksize); - - Barrier barrier(numblocks); - MaxSizeVector shards(numblocks, reducer.initialize()); - for (unsigned int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier(&barrier, &FullReducerShard::run, self, - i * blocksize, blocksize, reducer, &shards[i]); - } - - typename Self::CoeffReturnType finalShard; - if (static_cast(numblocks) * blocksize < num_coeffs) { - finalShard = InnerMostDimReducer::reduce( - self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer); - } else { - finalShard = reducer.initialize(); - } - barrier.Wait(); - for (unsigned int i = 0; i < numblocks; ++i) { - reducer.reduce(shards[i], &finalShard); - } - *output = reducer.finalize(finalShard); - } - } -}; - -template -struct FullReducer { - static const bool HasOptimizedImplementation = !Op::IsStateful; - static const int PacketSize = - unpacket_traits::size; - - // launch one reducer per thread and accumulate the result. - static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, - typename Self::CoeffReturnType* output) { - typedef typename Self::Index Index; - const Index num_coeffs = array_prod(self.m_impl.dimensions()); - if (num_coeffs == 0) { - *output = reducer.finalize(reducer.initialize()); + *output = + InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); return; } - const std::size_t num_threads = device.numThreads(); - if (num_threads == 1) { - *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); - return; - } - const Index blocksize = std::floor(static_cast(num_coeffs) / num_threads); - const unsigned int numblocks = blocksize > 0 ? static_cast(num_coeffs / blocksize) : 0; - eigen_assert(num_coeffs >= static_cast(numblocks) * blocksize); + const Index blocksize = + std::floor(static_cast(num_coeffs) / num_threads); + const Index numblocks = blocksize > 0 ? 
num_coeffs / blocksize : 0; + eigen_assert(num_coeffs >= numblocks * blocksize); Barrier barrier(numblocks); MaxSizeVector shards(numblocks, reducer.initialize()); - for (unsigned int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier(&barrier, &FullReducerShard::run, + for (Index i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier(&barrier, &FullReducerShard::run, self, i * blocksize, blocksize, reducer, &shards[i]); } typename Self::CoeffReturnType finalShard; - if (static_cast(numblocks) * blocksize < num_coeffs) { - finalShard = InnerMostDimReducer::reduce( - self, numblocks * blocksize, num_coeffs - numblocks * blocksize, reducer); + if (numblocks * blocksize < num_coeffs) { + finalShard = InnerMostDimReducer::reduce( + self, numblocks * blocksize, num_coeffs - numblocks * blocksize, + reducer); } else { finalShard = reducer.initialize(); } - barrier.Wait(); - for (unsigned int i = 0; i < numblocks; ++i) { + + for (Index i = 0; i < numblocks; ++i) { reducer.reduce(shards[i], &finalShard); } *output = reducer.finalize(finalShard); @@ -498,13 +464,21 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + static bool size_large_enough(Index total_size) { +#ifndef EIGEN_USE_COST_MODEL + return total_size > 1024 * 1024; +#else + return true || total_size; +#endif + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); // Use the FullReducer if possible. if (RunningFullReduction && internal::FullReducer::HasOptimizedImplementation && ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || - (!RunningOnGPU && (internal::array_prod(m_impl.dimensions()) > 1024 * 1024)))) { + (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) { bool need_assign = false; if (!data) { From 3718bf654bd173ae05f396f5d0cff1a4e15ef72d Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 15 Apr 2016 12:51:33 -0700 Subject: [PATCH 2/2] Get rid of void* casting when calling EvalRange::run. --- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 28 ++++++------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index df9cc0998..7a54f7a23 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -85,8 +85,8 @@ class TensorExecutor #ifdef EIGEN_USE_THREADS template struct EvalRange { - static void run(void* evaluator_in, const Index first, const Index last) { - Evaluator evaluator(*static_cast(evaluator_in)); + static void run(Evaluator* evaluator_in, const Index first, const Index last) { + Evaluator evaluator = *evaluator_in; eigen_assert(last >= first); for (Index i = first; i < last; ++i) { evaluator.evalScalar(i); @@ -96,10 +96,9 @@ struct EvalRange { template struct EvalRange { - static void run(void* evaluator_in, const Index first, const Index last) { - Evaluator evaluator(*static_cast(evaluator_in)); + static void run(Evaluator* evaluator_in, const Index first, const Index last) { + Evaluator evaluator = *evaluator_in; eigen_assert(last >= first); - Index i = first; const int PacketSize = unpacket_traits::size; if (last - first >= PacketSize) { @@ -123,16 +122,6 @@ struct EvalRange { } }; -// Used to make an std::function to add to the ThreadPool with less templating -// than EvalRange::Run. 
-// This requires that this and EvalRange takes a void* to the evaluator that can -// be downcast to the right type by the EvalRange. -template -inline void InvokeEvalRange(void (*run_fn)(void*, const Index, const Index), - void* evaluator, const Index first, const Index last) { - run_fn(evaluator, first, last); -} - template class TensorExecutor { public: @@ -163,13 +152,12 @@ class TensorExecutor { Barrier barrier(numblocks); for (int i = 0; i < numblocks; ++i) { device.enqueue_with_barrier( - &barrier, &InvokeEvalRange, - &EvalRange::run, - static_cast(&evaluator), i * blocksize, - (i + 1) * blocksize); + &barrier, &EvalRange::run, + &evaluator, i * blocksize, (i + 1) * blocksize); } if (numblocks * blocksize < size) { - EvalRange::run(&evaluator, numblocks * blocksize, size); + EvalRange::run( + &evaluator, numblocks * blocksize, size); } barrier.Wait(); }
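
Note (not part of the patch): below is a minimal standalone sketch of the block-partitioning arithmetic the patched ThreadPoolDevice executor performs before handing ranges to EvalRange::run. The helper name `partition`, the sample sizes, and the printf reporting are illustrative stand-ins; no Eigen headers or types are used, so the snippet compiles on its own.

    // Sketch: mirrors the blocksize / numblocks computation in
    // TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>::run above.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    using Index = long;

    // Partition [0, size) into blocks whose length is a multiple of PacketSize,
    // the same way the patched executor does before enqueueing each range.
    static void partition(Index size, int num_threads, Index PacketSize) {
      Index blocksz = static_cast<Index>(
          std::ceil(static_cast<double>(size) / num_threads)) + PacketSize - 1;
      const Index blocksize =
          std::max(PacketSize, blocksz - (blocksz % PacketSize));
      const Index numblocks = size / blocksize;

      // One "enqueue" per full block; a worker would evaluate [first, last).
      for (Index i = 0; i < numblocks; ++i) {
        std::printf("block %ld: [%ld, %ld)\n", i, i * blocksize,
                    (i + 1) * blocksize);
      }
      // Any remainder is evaluated on the calling thread, as in the patch.
      if (numblocks * blocksize < size) {
        std::printf("tail   : [%ld, %ld)\n", numblocks * blocksize, size);
      }
    }

    int main() {
      partition(/*size=*/1003, /*num_threads=*/4, /*PacketSize=*/8);
      return 0;
    }

Rounding the block size up to a multiple of PacketSize keeps every enqueued range packet-aligned, which is what lets the vectorized EvalRange specialization assert `first % PacketSize == 0` before unrolling over packets.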