From 7aa3557d31a5ccd82486de1e445c76b9b77a33a4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 12 May 2016 18:59:04 -0700 Subject: [PATCH 01/12] Fixed compilation errors triggered by old versions of gcc --- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 6 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 6 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 6 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h | 4 ++-- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index c771496e2..781ced7b8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -118,7 +118,7 @@ struct TensorEvaluator, Device> // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar // and store the result in a scalar. Instead one should reshape the scalar into a a N-D // tensor with N >= 1 of 1 element first and then broadcast. - EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); const InputDimensions& input_dims = m_impl.dimensions(); const Broadcast& broadcast = op.broadcast(); for (int i = 0; i < NumDims; ++i) { @@ -247,7 +247,7 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index originalIndex = index; @@ -299,7 +299,7 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index originalIndex = index; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index f7a9f4ed3..1ba7ef170 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -152,7 +152,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { - EIGEN_STATIC_ASSERT(NumInputDims >= 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(NumInputDims > m_dim.actualDim()); const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); @@ -202,7 +202,7 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || @@ -341,7 +341,7 @@ struct 
TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == 0) || (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index de2f67d74..f391fb9ee 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -189,7 +189,7 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index bfa65a607..75d54759b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -409,7 +409,7 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < internal::array_prod(dimensions())); Index inputIndices[] = {0, 0}; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 88b838b27..742339f9e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -106,7 +106,7 @@ struct TensorEvaluator, Device // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector // of 1 element first and then pad. 
- EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); // Compute dimensions m_dimensions = m_impl.dimensions(); @@ -261,7 +261,7 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index initialIndex = index; @@ -318,7 +318,7 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index initialIndex = index; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index a87e45330..886a254f6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -184,7 +184,7 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); Index output_stride_index = (static_cast(Layout) == static_cast(ColMajor)) ? NumDims - 1 : 0; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 52b7d216a..6c35bfdb6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -164,7 +164,7 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); Index inputIndices[] = {0, 0}; @@ -289,7 +289,7 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize()); Index inputIndices[] = {0, 0}; From c4fc8b70ecd453bec883f53ca17b94004f94528e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 13 May 2016 10:49:38 -0700 Subject: [PATCH 02/12] Removed unnecessary thread synchronization --- unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index b433a14c9..8c2baec14 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -177,7 +177,7 @@ static __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self } half2 accum = reducer.template initializePacket(); - Index max_iter = numext::mini((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2); + 
const Index max_iter = numext::mini((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2); for (Index i = 0; i < max_iter; i += BlockSize) { const Index index = first_index + 2*i; eigen_assert(index + 1 < num_coeffs); @@ -333,7 +333,7 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu for (Index j = 0; j < NumPerThread; j += unroll_times) { const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1); if (last_col >= num_coeffs_to_reduce) { - for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col +=blockDim.x) { + for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) { const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col); reducer.reduce(val, &reduced_val); } @@ -357,8 +357,6 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu atomicReduce(&(output[row]), reduced_val, reducer); } } - - __syncthreads(); } } From 069a0b04d76282f24f1941c3dac4a2629a0a57c1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 13 May 2016 14:32:17 -0700 Subject: [PATCH 03/12] Added benchmarks for contraction on CPU. --- bench/tensors/README | 9 +++-- bench/tensors/contraction_benchmarks_cpu.cc | 39 +++++++++++++++++++++ 2 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 bench/tensors/contraction_benchmarks_cpu.cc diff --git a/bench/tensors/README b/bench/tensors/README index 4398aa81b..de1480317 100644 --- a/bench/tensors/README +++ b/bench/tensors/README @@ -1,4 +1,6 @@ -Each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU. +The tensor benchmark suite is made of several parts. + +The first part is a generic suite, in which each benchmark comes in 2 flavors: one that runs on CPU, and one that runs on GPU. To compile the floating point CPU benchmarks, simply call: g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu @@ -6,7 +8,8 @@ g++ tensor_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG To compile the floating point GPU benchmarks, simply call: nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_35 -o benchmarks_gpu - -To compile the half float GPU benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code. +We also provide a version of the generic GPU tensor benchmarks that uses half floats (aka fp16) instead of regular floats. To compile these benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code. nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -arch compute_53 -o benchmarks_fp16_gpu +last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. 
To compile these benchmarks, call +g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu diff --git a/bench/tensors/contraction_benchmarks_cpu.cc b/bench/tensors/contraction_benchmarks_cpu.cc new file mode 100644 index 000000000..f9e57ad47 --- /dev/null +++ b/bench/tensors/contraction_benchmarks_cpu.cc @@ -0,0 +1,39 @@ +#define EIGEN_USE_THREADS + +#include + +#include "tensor_benchmarks.h" + +#define CREATE_THREAD_POOL(threads) \ +Eigen::ThreadPool pool(threads); \ +Eigen::ThreadPoolDevice device(&pool, threads); + + +// Contractions for number of threads ranging from 1 to 32 +// Dimensions are Rows, Cols, Depth +#define BM_ContractionCPU(D1, D2, D3) \ + static void BM_##Contraction##_##D1##x##D2##x##D3(int iters, int Threads) { \ + StopBenchmarkTiming(); \ + CREATE_THREAD_POOL(Threads); \ + BenchmarkSuite suite(device, D1, D2, D3); \ + suite.contraction(iters); \ + } \ + BENCHMARK_RANGE(BM_##Contraction##_##D1##x##D2##x##D3, 1, 32); + + +// Vector Matrix and Matrix Vector products +BM_ContractionCPU(1, 2000, 500); +BM_ContractionCPU(2000, 1, 500); + +// Various skinny matrices +BM_ContractionCPU(250, 3, 512); +BM_ContractionCPU(1500, 3, 512); + +BM_ContractionCPU(512, 800, 4); +BM_ContractionCPU(512, 80, 800); +BM_ContractionCPU(512, 80, 13522); +BM_ContractionCPU(1, 80, 13522); + +BM_ContractionCPU(3200, 512, 4); +BM_ContractionCPU(3200, 512, 80); +BM_ContractionCPU(3200, 80, 512); From 97605c7b27b389de597bcbc9153fedf5dff0c851 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 13 May 2016 17:11:29 -0700 Subject: [PATCH 04/12] New multithreaded contraction that doesn't rely on the thread pool to run the closure in the order in which they are enqueued. This is needed in order to switch to the new non blocking thread pool since this new thread pool can execute the closure in any order. 
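The scheduling therefore encodes the dependencies explicitly instead of relying on FIFO execution: every kernel and packing task keeps an atomic countdown of the notifications it still needs (previous kernel plus lhs and rhs packing), and whichever dependency finishes last observes the counter hit zero and kicks the task off itself, so correctness no longer depends on the order in which the pool runs the closures. The snippet below is a minimal standalone sketch of that dependency-counting pattern in plain C++11; the names (CountdownTask, notify) are invented for the illustration, and it is not Eigen code.

    // Sketch only: run a task once all of its dependencies have signalled it,
    // regardless of the order in which the signalling closures execute.
    #include <atomic>
    #include <cstdio>
    #include <functional>
    #include <thread>
    #include <vector>

    struct CountdownTask {
      CountdownTask(int deps, std::function<void()> fn)
          : pending(deps), body(std::move(fn)) {}
      std::atomic<int> pending;      // notifications still outstanding
      std::function<void()> body;

      // Each dependency calls this when it completes. The caller that drops
      // the counter to zero runs the task; every other caller just returns.
      void notify() {
        if (pending.fetch_sub(1, std::memory_order_acq_rel) == 1) body();
      }
    };

    int main() {
      // A task that waits for 3 notifications, similar to the
      // "previous kernel + 2 packing" count described in the patch comments.
      CountdownTask kernel(3, [] { std::printf("kernel runs last\n"); });

      // The dependencies complete on arbitrary threads, in arbitrary order.
      std::vector<std::thread> deps;
      for (int i = 0; i < 3; ++i)
        deps.emplace_back([&kernel, i] {
          std::printf("dependency %d done\n", i);
          kernel.notify();
        });
      for (std::thread& t : deps) t.join();
      return 0;
    }

In the patch itself this bookkeeping is done by the state_kernel_, state_packing_ready_ and state_switch_ counters of the Context class, and the thread that wins the countdown either runs the kernel inline or re-enqueues it through device_.enqueueNoNotification().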
--- .../src/Tensor/TensorContractionThreadPool.h | 666 +++++++++++++++++- 1 file changed, 665 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 300c37180..b33ab962e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -14,6 +14,8 @@ #ifdef EIGEN_USE_THREADS namespace Eigen { + +#ifndef EIGEN_USE_NONBLOCKING_THREAD_POOL namespace internal { template @@ -52,7 +54,7 @@ struct packRhsAndKernelArg { }; } // end namespace internal - +#endif // EIGEN_USE_NONBLOCKING_THREAD_POOL template struct TensorEvaluator, ThreadPoolDevice> : @@ -110,6 +112,627 @@ struct TensorEvaluator + void evalProduct(Scalar* buffer) const { + typedef + typename internal::remove_const::type + LhsScalar; + typedef + typename internal::remove_const::type + RhsScalar; + typedef typename internal::gebp_traits Traits; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + typedef internal::TensorContractionInputMapper< + LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, + contract_t, internal::packet_traits::size, + lhs_inner_dim_contiguous, false, Unaligned> + LhsMapper; + typedef internal::TensorContractionInputMapper< + RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t, + contract_t, internal::packet_traits::size, + rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> + RhsMapper; + typedef internal::blas_data_mapper OutputMapper; + typedef internal::gemm_pack_lhs + LhsPacker; + typedef internal::gemm_pack_rhs< + RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> + RhsPacker; + typedef internal::gebp_kernel + GebpKernel; + + const Index m = this->m_i_size; + const Index n = this->m_j_size; + const Index k = this->m_k_size; + if (m == 0 || n == 0 || k == 0) return; + + // Compute a set of algorithm parameters: + // - kernel block sizes (bm, bn, bk) + // - task grain sizes (number of kernels executed per task: gm, gn) + // - number of threads + // - sharding by row/column + // - parallel packing or first lhs then rhs + // and some derived parameters: + // - number of tasks (nm, nn, nk) + // - number of kernels (nm0, nn0) + // Unfortunately, all these parameters are tightly interdependent. + // So in some cases we first compute approximate values, then compute other + // values based on these approximations and then refine the approximations. + + // There are lots of heuristics here. There is some reasoning behind them, + // but ultimately they are just tuned on contraction benchmarks for + // different input configurations, thread counts and instruction sets. + // So feel free to question any of them. + + // Compute whether we want to shard by row or by column. + // This is a first approximation, it will be refined later. Since we don't + // know number of threads yet we use 2, because what's we are most + // interested in at this point is whether it makes sense to use + // parallelization at all or not. + bool shard_by_col = shardByCol(m, n, 2); + + // First approximation of kernel blocking sizes. + // Again, we don't know number of threads yet, so we use 2. 
+ Index bm, bn, bk; + if (shard_by_col) { + internal::TensorContractionBlocking + blocking(k, m, n, 2); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } else { + internal::TensorContractionBlocking + blocking(k, m, n, 2); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } + + // Compute optimal number of threads. + // Note: we use bk instead of k here because we are interested in amount of + // _parallelizable_ computations, and computations are not parallelizable + // across k dimension. + const TensorOpCost cost = + contractionCost(m, n, bm, bn, bk, shard_by_col, false); + Index num_threads = TensorCostModel::numThreads( + static_cast(n) * m, cost, this->m_device.numThreads()); + + // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost + // model is not tuned. Remove this when the cost model is tuned. + if (n == 1) num_threads = 1; + + if (num_threads == 1) { + // The single-threaded algorithm should be faster in this case. + if (n == 1) + this->template evalGemv(buffer); + else + this->template evalGemm(buffer); + return; + } + + // Now that we know number of threads, recalculate sharding and blocking. + shard_by_col = shardByCol(m, n, num_threads); + if (shard_by_col) { + internal::TensorContractionBlocking + blocking(k, m, n, num_threads); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } else { + internal::TensorContractionBlocking + blocking(k, m, n, num_threads); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } + + // Number of kernels for each dimension. + Index nm0 = divup(m, bm); + Index nn0 = divup(n, bn); + Index nk = divup(k, bk); + + // Calculate task grain size (number of kernels executed per task). + // This task size coarsening serves two purposes: + // 1. It reduces per-task overheads including synchronization overheads. + // 2. It allows to use caches better (reuse the same packed rhs in several + // consecutive kernels). + Index gm = 1; + Index gn = 1; + // If we are sharding by column, then we prefer to reduce rows first. + if (shard_by_col) { + gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); + gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); + } else { + gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); + gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); + } + // Number of tasks in each dimension. + Index nm = divup(nm0, gm); + Index nn = divup(nn0, gn); + + // Last by not least, decide whether we want to issue both lhs and rhs + // packing in parallel; or issue lhs packing first, and then issue rhs + // packing when lhs packing completes (for !shard_by_col lhs and rhs are + // swapped). Parallel packing allows more parallelism (for both packing and + // kernels), while sequential packing provides better locality (once + // a thread finishes rhs packing it proceed to kernels with that rhs). + // First, we are interested in parallel packing if there are few tasks. + bool parallel_pack = num_threads >= nm * nn; + // Also do parallel packing if all data fits into L2$. + if (m * bk * sizeof(LhsScalar) + n * bk * sizeof(RhsScalar) <= + l2CacheSize() * num_threads) + parallel_pack = true; + // But don't do it if we will use each rhs only once. Locality seems to be + // more important in this case. + if ((shard_by_col ? 
nm : nn) == 1) parallel_pack = false; + + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, + this->m_i_strides, this->m_left_contracting_strides, + this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, + this->m_j_strides, this->m_right_contracting_strides, + this->m_k_strides); + + Context(this->m_device, num_threads, lhs, rhs, buffer, m, n, + k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, + shard_by_col, parallel_pack) + .run(); + } + + // Context coordinates a single parallel gemm operation. + template + class Context { + public: + Context(const Device& device, int num_threads, LhsMapper& lhs, + RhsMapper& rhs, Scalar* buffer, Index m, Index n, Index k, Index bm, + Index bn, Index bk, Index nm, Index nn, Index nk, Index gm, + Index gn, Index nm0, Index nn0, bool shard_by_col, + bool parallel_pack) + : device_(device), + lhs_(lhs), + rhs_(rhs), + buffer_(buffer), + output_(buffer, m), + num_threads_(num_threads), + shard_by_col_(shard_by_col), + parallel_pack_(parallel_pack), + m_(m), + n_(n), + k_(k), + bm_(bm), + bn_(bn), + bk_(bk), + nm_(nm), + nn_(nn), + nk_(nk), + gm_(gm), + gn_(gn), + nm0_(nm0), + nn0_(nn0) + { + for (Index x = 0; x < P; x++) { + // Normal number of notifications for k slice switch is + // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only + // nm_ + nn_ notifications, because they will not receive notifications + // from preceeding kernels. + state_switch_[x] = + x == 0 + ? 1 + : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) + + (x == P - 1 ? nm_ * nn_ : 0); + state_packing_ready_[x] = + parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_); + state_kernel_[x] = new std::atomic*[nm_]; + for (Index m = 0; m < nm_; m++) { + state_kernel_[x][m] = new std::atomic[nn_]; + // Kernels generally receive 3 notifications (previous kernel + 2 + // packing), but the first slice won't get notifications from previous + // kernels. + for (Index n = 0; n < nn_; n++) + state_kernel_[x][m][n].store( + (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1), + std::memory_order_relaxed); + } + } + + // Allocate memory for packed rhs/lhs matrices. + size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + size_t lhs_size = + divup(bm_ * bk_ * sizeof(LhsScalar), align) * align; + size_t rhs_size = + divup(bn_ * bk_ * sizeof(RhsScalar), align) * align; + packed_mem_ = static_cast(internal::aligned_malloc( + (nm0_ * lhs_size + nn0_ * rhs_size) * std::min(nk_, P - 1))); + char* mem = static_cast(packed_mem_); + for (Index x = 0; x < numext::mini(nk_, P - 1); x++) { + packed_lhs_[x].resize(nm0_); + for (Index m = 0; m < nm0_; m++) { + packed_lhs_[x][m] = reinterpret_cast(mem); + mem += lhs_size; + } + packed_rhs_[x].resize(nn0_); + for (Index n = 0; n < nn0_; n++) { + packed_rhs_[x][n] = reinterpret_cast(mem); + mem += rhs_size; + } + } + } + + ~Context() { + for (Index x = 0; x < P; x++) { + for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m]; + delete[] state_kernel_[x]; + } + internal::aligned_free(packed_mem_); + } + + void run() { + // Kick off packing of the first slice. + signal_switch(0, 1); + // Wait for overall completion. + // TODO(dvyukov): this wait can lead to deadlock. + // If nthreads contractions are concurrently submitted from worker + // threads, this wait will block all worker threads and the system will + // deadlock. 
+ done_.Wait(); + } + + private: + Notification done_; + const Device& device_; + LhsMapper& lhs_; + RhsMapper& rhs_; + Scalar* const buffer_; + OutputMapper output_; + const int num_threads_; + const bool shard_by_col_; + const bool parallel_pack_; + // Matrix sizes. + const Index m_; + const Index n_; + const Index k_; + // Block sizes. + const Index bm_; + const Index bn_; + const Index bk_; + // Number of tasks. + const Index nm_; + const Index nn_; + const Index nk_; + // Task grain sizes (number of kernels executed per task). + const Index gm_; + const Index gn_; + // Number of blocks (this is different from ni_/nn_ because of task size + // coarsening). + const Index nm0_; + const Index nn0_; + + // Parallelization strategy. + // + // Blocks related to the same k block can run in parallel because they write + // to different output blocks. So we parallelize within k slices, this + // gives us parallelism level of m x n. Before we can start any kernels + // related to k-th slice, we need to issue m lhs packing tasks and n rhs + // packing tasks. + // + // However, there is a bottleneck when we are finishing kernels for k-th + // slice (at the very end there is only 1 runnable kernel). To mitigate this + // bottleneck we allow kernels from k-th and k+1-th slices to run in + // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same + // output block, so they must not run in parallel. + // + // This gives us the following dependency graph. + // On each k slice we have m x n kernel tasks, m lhs paking tasks and n rhs + // packing tasks. + // Kernel (m, n, k) can start when: + // - kernel (m, n, k-1) has finished + // - lhs packing (m, k) has finished + // - rhs packing (n, k) has finished + // Lhs/rhs packing can start when: + // - all k-1 packing has finished (artificially imposed to limit amount of + // parallel packing) + // + // On top of that we limit runnable tasks to two consecutive k slices. + // This is done to limit amount of memory we need for packed lhs/rhs + // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_). + // + // state_switch_ tracks when we are ready to switch to the next k slice. + // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n). + // These variable are rolling over 3 consecutive k slices: first two we are + // actively executing + one to track completion of kernels in the second + // slice. + static const Index P = 3; + void* packed_mem_; + std::vector packed_lhs_[P - 1]; + std::vector packed_rhs_[P - 1]; + std::atomic** state_kernel_[P]; + // state_switch_ is frequently modified by worker threads, while other + // fields are read-only after constructor. Let's move it to a separate cache + // line to reduce cache-coherency traffic. + char pad_[128]; + std::atomic state_packing_ready_[P]; + std::atomic state_switch_[P]; + + void pack_lhs(Index m, Index k) { + const Index mend = m * gm_ + gm(m); + for (Index m1 = m * gm_; m1 < mend; m1++) + LhsPacker()(packed_lhs_[k % (P - 1)][m1], + lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); + + if (!parallel_pack_ && shard_by_col_) { + signal_packing(k); + } else { + signal_switch(k + 1); + for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0); + } + } + + void pack_rhs(Index n, Index k) { + const Index nend = n * gn_ + gn(n); + for (Index n1 = n * gn_; n1 < nend; n1++) { + if (k == 0) { + // Zero the output memory in parallel. + // On 10000x2x10000 mm zeroing can easily take half of time. + // Zero (bn x m) row. 
Safe to do here because all kernels that will + // write to this memory depend on completion of this task. + // Note: don't call device_.memset() here. device_.memset() blocks on + // thread pool worker thread, which can lead to underutilization and + // deadlocks. + memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar)); + } + RhsPacker()(packed_rhs_[k % (P - 1)][n1], + rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); + } + + if (parallel_pack_ || shard_by_col_) { + signal_switch(k + 1); + for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0); + } else { + signal_packing(k); + } + } + + void kernel(Index m, Index n, Index k) { + // Note: order of iteration matters here. Iteration over m is innermost + // because we want to reuse the same packed rhs in consequetive tasks + // (rhs fits into L2$ while lhs only into L3$). + const Index nend = n * gn_ + gn(n); + const Index mend = m * gm_ + gm(m); + if (shard_by_col_) { + for (Index n1 = n * gn_; n1 < nend; n1++) { + for (Index m1 = m * gm_; m1 < mend; m1++) + GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), + packed_lhs_[k % (P - 1)][m1], + packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), + Scalar(1), -1, -1, 0, 0); + } + } else { + for (Index m1 = m * gm_; m1 < mend; m1++) + for (Index n1 = n * gn_; n1 < nend; n1++) { + GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), + packed_lhs_[k % (P - 1)][m1], + packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), + Scalar(1), -1, -1, 0, 0); + } + } + signal_kernel(m, n, k + 1, false); + signal_switch(k + 2); + } + + void signal_packing(Index k) { + eigen_assert(!parallel_pack_); + Index s = state_packing_ready_[k % P].fetch_sub(1); + eigen_assert(s > 0); + if (s != 1) return; + state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_; + enqueue_packing(k, shard_by_col_); + } + + void signal_kernel(Index m, Index n, Index k, bool sync) { + std::atomic* state = &state_kernel_[k % P][m][n]; + Index s = state->load(); + eigen_assert(s > 0); + if (s != 1 && state->fetch_sub(1) != 1) return; + state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed); + if (sync) + kernel(m, n, k); + else + device_.enqueueNoNotification([=]() { kernel(m, n, k); }); + } + + void signal_switch(Index k, Index v = 1) { + Index s = state_switch_[k % P].fetch_sub(v); + eigen_assert(s >= v); + if (s != v) return; + + // Ready to switch to the next k slice. + // Reset counter for the next iteration. + state_switch_[k % P] = + (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) + + nm_ * nn_; + if (k < nk_) { + // It is important to copy out nm_ and nn_, because once we kick off + // the last packing operation this and device_ can be destroyed. + Index nm = nm_; + Index nn = nn_; + // Issue lhs/rhs packing. Their completion will in turn kick off + // kernels. + if (parallel_pack_) { + enqueue_packing(k, !shard_by_col_); + enqueue_packing(k, shard_by_col_); + } else if (shard_by_col_) { + enqueue_packing(k, false); + } else { + enqueue_packing(k, true); + } + + // Termination handling. + // Because kernel completion signals k + 2 switch, we need to finish nk + // + 2 slices without issuing any tasks on nk + 1 slice. So here we + // pretend that all nk + 1 packing tasks just finish instantly; so that + // nk + 2 switch only waits for completion of nk kernels. + } else if (k == nk_) { + signal_switch(k + 1, + parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)); + } else { + done_.Notify(); + } + } + + // Enqueue all rhs/lhs packing for k-th slice. 
+ void enqueue_packing(Index k, bool rhs) { + enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs); + } + + void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) { + if (end - start == 1) { + if (rhs) + pack_rhs(start, k); + else + pack_lhs(start, k); + } else { + Index mid = (start + end) / 2; + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(mid, end, k, rhs); }); + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(start, mid, k, rhs); }); + } + } + + // Block sizes with accounting for potentially incomplete last block. + Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; } + Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; } + Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; } + // Task grain sizes accounting for potentially incomplete last task. + Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; } + Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; } + + Context(const Context&) = delete; + void operator=(const Context&) = delete; + }; + + // Decide whether we want to shard m x n contraction by columns or by rows. + static bool shardByCol(Index m, Index n, int num_threads) { + // Note: we are comparing both n and m against Traits::nr, it is not + // a mistake. We are trying to figure out how both n and m will fit into + // the main sharding dimension. + + // Sharding by column is the default + // ... unless there is enough data for vectorization over rows + if (m / num_threads >= Traits::nr && + // and not enough data for vectorization over columns + (n / num_threads < Traits::nr || + // ... or barely enough data for vectorization over columns, + // but it is not evenly dividable across threads + (n / num_threads < 4 * Traits::nr && + (n % (num_threads * Traits::nr)) != 0 && + // ... and it is evenly dividable across threads for rows + ((m % (num_threads * Traits::nr)) == 0 || + // .. or it is not evenly dividable for both dimensions but + // there is much more data over rows so that corner effects are + // mitigated. + (m / n >= 6))))) + return false; + // Wait, or if matrices are just substantially prolonged over the other + // dimension. + if (n / num_threads < 16 * Traits::nr && m > n * 32) return false; + return true; + } + + Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn, + int num_threads, bool shard_by_col) const { + Index gm = 1; + Index gm1 = 1; + Index nm0 = divup(m, bm); + Index nm1 = nm0; + for (;;) { + // Find the next candidate for m grain size. It needs to result in + // different number of blocks. E.g. if we have 10 kernels, we want to try + // 5 and 10, but not 6, 7, 8 and 9. + while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++; + if (gm1 > nm0) break; + // Check the candidate. + int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads, + shard_by_col); + if (res < 0) break; + nm1 = divup(nm0, gm1); + if (res == 0) continue; + // Commit new grain size. 
+ gm = gm1; + } + return gm; + } + + Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm, + int num_threads, bool shard_by_col) const { + Index gn = 1; + Index gn1 = 1; + Index nn0 = divup(n, bn); + Index nn1 = nn0; + for (;;) { + while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++; + if (gn1 > nn0) break; + int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads, + shard_by_col); + if (res < 0) break; + nn1 = divup(nn0, gn1); + if (res == 0) continue; + gn = gn1; + } + return gn; + } + + // checkGrain checks whether grain (gm, gn) is suitable and is better than + // (oldgm, oldgn). + int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm, + Index gn, Index oldgm, Index oldgn, int num_threads, + bool shard_by_col) const { + const TensorOpCost cost = + contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true); + double taskSize = TensorCostModel::taskSize( + static_cast(bm) * gm * bn * gn, cost); + // If the task is too small, then we agree on it regardless of anything + // else. Otherwise synchronization overheads will dominate. + if (taskSize < 1) return 1; + // If it is too large, then we reject it and all larger tasks. + if (taskSize > 2) return -1; + // Now we are in presumably good task size range. + // The main deciding factor here is parallelism. Consider that we have 12 + // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes. + // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4 + // of cores will be busy). While grain size 3 gives us 4 tasks, which gives + // us parallelism of 1 (we can load all cores). + Index nm0 = divup(m, bm); + Index nn0 = divup(n, bn); + Index new_tasks = divup(nm0, gm) * divup(nn0, gn); + double new_parallelism = static_cast(new_tasks) / + (divup(new_tasks, num_threads) * num_threads); + Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn); + double old_parallelism = static_cast(old_tasks) / + (divup(old_tasks, num_threads) * num_threads); + if (new_parallelism > old_parallelism || new_parallelism == 1) return 1; + return 0; + } + +#else // EIGEN_USE_NONBLOCKING_THREAD_POOL + template void evalProduct(Scalar* buffer) const { if (this->m_j_size == 1) { @@ -384,6 +1007,47 @@ struct TensorEvaluator(PacketType::size, + PacketType::size); + const int output_packet_size = internal::unpacket_traits::size; + const double kd = static_cast(bk); + // Peak VFMA bandwidth is 0.5. However if we have not enough data for + // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined + // experimentally. + double computeBandwidth = bk == 1 ? 4.0 : + (shard_by_col ? bn : bm) < Traits::nr || + (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5; +#ifndef EIGEN_VECTORIZE_FMA + // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors. + // However for MULPS/ADDPS we have dependent sequence of 2 such instructions, + // so overall bandwidth is 1.0. + if (computeBandwidth == 0.5) computeBandwidth = 1.0; +#endif + // Computations. + TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size); + // Output stores. + cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); + if (prepacked) { + // Packing and kernels are executed in different tasks. When we calculate + // task grain size we look only at kernel cost assuming that kernel + // is more expensive than packing. + return cost; + } + // Lhs/rhs loads + computations. 
+ TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n); + TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m); + // Lhs packing memory cost does not contribute considerably to overall + // execution time because lhs is prefetched early and accessed sequentially. + if (shard_by_col) + lhsCost.dropMemoryCost(); + else + rhsCost.dropMemoryCost(); + return cost + lhsCost + rhsCost; + } }; } // end namespace Eigen From 83dfb40f66e15c5a0c6af2d3c88357d65b76770d Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 13 May 2016 17:23:15 -0700 Subject: [PATCH 05/12] Turnon the new thread pool by default since it scales much better over multiple cores. It is still possible to revert to the old thread pool by compiling with the EIGEN_USE_SIMPLE_THREAD_POOL define. --- .../src/Tensor/TensorContractionThreadPool.h | 10 +-- .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 2 +- .../CXX11/src/Tensor/TensorReductionCuda.h | 88 +++++++++++++++++++ 3 files changed, 94 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index b33ab962e..88d485f38 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -15,7 +15,7 @@ namespace Eigen { -#ifndef EIGEN_USE_NONBLOCKING_THREAD_POOL +#ifdef EIGEN_USE_SIMPLE_THREAD_POOL namespace internal { template @@ -54,7 +54,7 @@ struct packRhsAndKernelArg { }; } // end namespace internal -#endif // EIGEN_USE_NONBLOCKING_THREAD_POOL +#endif // EIGEN_USE_SIMPLE_THREAD_POOL template struct TensorEvaluator, ThreadPoolDevice> : @@ -112,7 +112,7 @@ struct TensorEvaluator void evalProduct(Scalar* buffer) const { @@ -731,7 +731,7 @@ struct TensorEvaluator void evalProduct(Scalar* buffer) const { @@ -1007,7 +1007,7 @@ struct TensorEvaluator using ThreadPoolTempl = NonBlockingThreadPoolTempl; typedef NonBlockingThreadPool ThreadPool; #else diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 8c2baec14..63646dfc2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -360,6 +360,94 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu } } +#ifdef EIGEN_HAS_CUDA_FP16 +/* +template +__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, + half* output, half2* scratch) { + eigen_assert(blockDim.y == 1); + eigen_assert(blockDim.z == 1); + eigen_assert(gridDim.y == 1); + eigen_assert(gridDim.z == 1); + + const int unroll_times = 16; + eigen_assert(NumPerThread % unroll_times == 0); + eigen_assert(unroll_times % 2 == 0); + + const Index input_col_blocks = divup(num_coeffs_to_reduce, blockDim.x * NumPerThread); + const Index num_input_blocks = input_col_blocks * num_preserved_coeffs; + + const Index num_threads = blockDim.x * gridDim.x; + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (gridDim.x == 1) { + Index i = thread_id; + for (; i < num_preserved_coeffs; i += 2*num_threads) { + ((half2*)output)[i] = reducer.initializePacket(); + } + if (i + 1 < num_preserved_coeffs) { + output[i] = reducer.initialize(); + } + __syncthreads(); + } + + for (Index i = blockIdx.x; 
i < num_input_blocks; i += gridDim.x) { + const Index row = i / input_col_blocks; + + if (row + 1 < num_preserved_coeffs) { + const Index col_block = i % input_col_blocks; + const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x; + + half2 reduced_val1 = reducer.initializePacket(); + half2 reduced_val2 = reducer.initializePacket(); + + for (Index j = 0; j < NumPerThread; j += unroll_times) { + const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1); + if (last_col >= num_coeffs_to_reduce) { + Index col = col_begin + blockDim.x * j; + for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) { + const half2 val = input.m_impl.packet(row * num_coeffs_to_reduce + col); + reducer.reduce(val, &reduced_val); + // do the same for reduce val2 here + } + if (col < num_coeffs_to_reduce) { + // Peel; + const half last = input.m_impl.coeff(row * num_coeffs_to_reduce + col+1); + const half2 val = __halves2half2(last, reducer.initialize()); + reducer.reducePacket(val, &reduced_val); + } + break; + } else { + // Faster version of the loop with no branches after unrolling. +#pragma unroll + for (int k = 0; k < unroll_times; ++k) { + const Index col = col_begin + blockDim.x * (j + k); + reducer.reduce(input.m_impl.packet(row * num_coeffs_to_reduce + col), &reduced_val); + } + } + } + +#pragma unroll + for (int offset = warpSize/2; offset > 0; offset /= 2) { + reducer.reducePacket(__shfl_down(reduced_val, offset, warpSize), &reduced_val); + } + + if ((threadIdx.x & (warpSize - 1)) == 0) { + if (row + 1 < num_preserved_coeffs) { + atomicReduce(&(output[row]), reduced_val, reducer); + } + else { + atomicReduce(scratch, reduced_val, reducer); + } + } + } + } +} +*/ +#endif + template struct InnerReducer { // Unfortunately nvidia doesn't support well exotic types such as complex, From b789a26804e48c03e8cac3aae70d09557b6d8d8b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 16 May 2016 08:51:08 -0700 Subject: [PATCH 06/12] Fixed syntax error --- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 781ced7b8..9ea1c78eb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -354,11 +354,11 @@ struct TensorEvaluator, Device> if (NumDims > 0) { for (int i = NumDims - 1; i > 0; --i) { compute_cost += TensorOpCost::DivCost(); - if (internal::index_statically_eq()(i, 1)) { + if (internal::index_statically_eq(i, 1)) { compute_cost += TensorOpCost::MulCost() + TensorOpCost::AddCost(); } else { - if (!internal::index_statically_eq()(i, 1)) { + if (!internal::index_statically_eq(i, 1)) { compute_cost += TensorOpCost::MulCost() + TensorOpCost::ModCost() + TensorOpCost::AddCost(); From 83ef39e055dda0d6c1b0c84924a0733a68577aa3 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 16 May 2016 08:55:21 -0700 Subject: [PATCH 07/12] Turn on the cost model by default. This results in some significant speedups for smaller tensors. For example, below are the results for the various tensor reductions. 
Before: BM_colReduction_12T/10 1000000 1949 51.29 MFlops/s BM_colReduction_12T/80 100000 15636 409.29 MFlops/s BM_colReduction_12T/640 20000 95100 4307.01 MFlops/s BM_colReduction_12T/4K 500 4573423 5466.36 MFlops/s BM_colReduction_4T/10 1000000 1867 53.56 MFlops/s BM_colReduction_4T/80 500000 5288 1210.11 MFlops/s BM_colReduction_4T/640 10000 106924 3830.75 MFlops/s BM_colReduction_4T/4K 500 9946374 2513.48 MFlops/s BM_colReduction_8T/10 1000000 1912 52.30 MFlops/s BM_colReduction_8T/80 200000 8354 766.09 MFlops/s BM_colReduction_8T/640 20000 85063 4815.22 MFlops/s BM_colReduction_8T/4K 500 5445216 4591.19 MFlops/s BM_rowReduction_12T/10 1000000 2041 48.99 MFlops/s BM_rowReduction_12T/80 100000 15426 414.87 MFlops/s BM_rowReduction_12T/640 50000 39117 10470.98 MFlops/s BM_rowReduction_12T/4K 500 3034298 8239.14 MFlops/s BM_rowReduction_4T/10 1000000 1834 54.51 MFlops/s BM_rowReduction_4T/80 500000 5406 1183.81 MFlops/s BM_rowReduction_4T/640 50000 35017 11697.16 MFlops/s BM_rowReduction_4T/4K 500 3428527 7291.76 MFlops/s BM_rowReduction_8T/10 1000000 1925 51.95 MFlops/s BM_rowReduction_8T/80 200000 8519 751.23 MFlops/s BM_rowReduction_8T/640 50000 33441 12248.42 MFlops/s BM_rowReduction_8T/4K 1000 2852841 8763.19 MFlops/s After: BM_colReduction_12T/10 50000000 59 1678.30 MFlops/s BM_colReduction_12T/80 5000000 725 8822.71 MFlops/s BM_colReduction_12T/640 20000 90882 4506.93 MFlops/s BM_colReduction_12T/4K 500 4668855 5354.63 MFlops/s BM_colReduction_4T/10 50000000 59 1687.37 MFlops/s BM_colReduction_4T/80 5000000 737 8681.24 MFlops/s BM_colReduction_4T/640 50000 108637 3770.34 MFlops/s BM_colReduction_4T/4K 500 7912954 3159.38 MFlops/s BM_colReduction_8T/10 50000000 60 1657.21 MFlops/s BM_colReduction_8T/80 5000000 726 8812.48 MFlops/s BM_colReduction_8T/640 20000 91451 4478.90 MFlops/s BM_colReduction_8T/4K 500 5441692 4594.16 MFlops/s BM_rowReduction_12T/10 20000000 93 1065.28 MFlops/s BM_rowReduction_12T/80 2000000 950 6730.96 MFlops/s BM_rowReduction_12T/640 50000 38196 10723.48 MFlops/s BM_rowReduction_12T/4K 500 3019217 8280.29 MFlops/s BM_rowReduction_4T/10 20000000 93 1064.30 MFlops/s BM_rowReduction_4T/80 2000000 959 6667.71 MFlops/s BM_rowReduction_4T/640 50000 37433 10941.96 MFlops/s BM_rowReduction_4T/4K 500 3036476 8233.23 MFlops/s BM_rowReduction_8T/10 20000000 93 1072.47 MFlops/s BM_rowReduction_8T/80 2000000 959 6670.04 MFlops/s BM_rowReduction_8T/640 50000 38069 10759.37 MFlops/s BM_rowReduction_8T/4K 1000 2758988 9061.29 MFlops/s --- unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h index 4cb37a651..cb6fb4626 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -10,9 +10,8 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H -//#if !defined(EIGEN_USE_GPU) -//#define EIGEN_USE_COST_MODEL -//#endif +// Turn on the cost model by default +#define EIGEN_USE_COST_MODEL namespace Eigen { From a80d875916de350c1849cd97d8b2515f620911d4 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 16 May 2016 09:31:10 -0700 Subject: [PATCH 08/12] Added missing costPerCoeff method --- unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h 
index babafe108..d06f40cd8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -254,6 +254,14 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double compute_cost = 1.0 + + (m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost() + TensorOpCost::DivCost())); + return m_orig_impl.costPerCoeff(vectorized) + + m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost); + } + private: EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) { if (m_return_dim < 0) { From 2d74ef968261834031823716a239ecaafddd08e1 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 17 May 2016 07:20:11 -0700 Subject: [PATCH 09/12] Avoid float to double conversion --- test/array.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/array.cpp b/test/array.cpp index beaa62221..1f4afc1c6 100644 --- a/test/array.cpp +++ b/test/array.cpp @@ -525,7 +525,7 @@ template void array_complex(const ArrayType& m) // scalar by array division Scalar s1 = internal::random(); - const RealScalar tiny = sqrt(std::numeric_limits::epsilon()); + const RealScalar tiny = std::sqrt(std::numeric_limits::epsilon()); s1 += Scalar(tiny); m1 += ArrayType::Constant(rows,cols,Scalar(tiny)); VERIFY_IS_APPROX(s1/m1, s1 * m1.inverse()); From 92fc6add43655320120b4baad304997e0b06aa96 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 17 May 2016 07:21:22 -0700 Subject: [PATCH 10/12] Don't rely on c++11 extension when we don't have to. --- unsupported/test/cxx11_tensor_expr.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp index 4dd355e6e..77e24cb67 100644 --- a/unsupported/test/cxx11_tensor_expr.cpp +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -16,8 +16,8 @@ using Eigen::RowMajor; static void test_1d() { - Tensor vec1({6}); - Tensor vec2({6}); + Tensor vec1(6); + Tensor vec2(6); vec1(0) = 4.0; vec2(0) = 0.0; vec1(1) = 8.0; vec2(1) = 1.0; From 5fa27574dd29fc5753a0b9d39b1fe5c15668e658 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 17 May 2016 09:17:26 -0700 Subject: [PATCH 11/12] Allow vectorized padding on GPU. 
This helps speed things up a little Before: BM_padding/10 5000000 460 217.03 MFlops/s BM_padding/80 5000000 460 13899.40 MFlops/s BM_padding/640 5000000 461 888421.17 MFlops/s BM_padding/4K 5000000 460 54316322.55 MFlops/s After: BM_padding/10 5000000 454 220.20 MFlops/s BM_padding/80 5000000 455 14039.86 MFlops/s BM_padding/640 5000000 452 904968.83 MFlops/s BM_padding/4K 5000000 411 60750049.21 MFlops/s --- unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 742339f9e..266eea0a0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -93,7 +93,7 @@ struct TensorEvaluator, Device static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = true, From e7e64c327785ceffd9da018ee265d761991f9685 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 17 May 2016 09:24:35 -0700 Subject: [PATCH 12/12] Enable the use of the packet api to evaluate tensor broadcasts. This speed things up quite a bit: Before" M_broadcasting/10 500000 3690 27.10 MFlops/s BM_broadcasting/80 500000 4014 1594.24 MFlops/s BM_broadcasting/640 100000 14770 27731.35 MFlops/s BM_broadcasting/4K 5000 632711 39512.48 MFlops/s After: BM_broadcasting/10 500000 4287 23.33 MFlops/s BM_broadcasting/80 500000 4455 1436.41 MFlops/s BM_broadcasting/640 200000 10195 40173.01 MFlops/s BM_broadcasting/4K 5000 423746 58997.57 MFlops/s --- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 9ea1c78eb..5d67f69f3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -106,7 +106,7 @@ struct TensorEvaluator, Device> static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, + IsAligned = true, PacketAccess = TensorEvaluator::PacketAccess, Layout = TensorEvaluator::Layout, RawAccess = false