Turn on the new thread pool by default since it scales much better over multiple cores. It is still possible to revert to the old thread pool by compiling with the EIGEN_USE_SIMPLE_THREAD_POOL define.
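As a minimal usage sketch (not part of this commit; it assumes the usual unsupported Tensor module include path and the ThreadPool/ThreadPoolDevice API of this era), reverting to the old pool just means defining the macro before the Tensor headers are included, or passing -DEIGEN_USE_SIMPLE_THREAD_POOL to the compiler:

// Sketch: opting back into the old SimpleThreadPool after this change.
// Assumes the unsupported Tensor module layout; adjust the include path if needed.
#define EIGEN_USE_THREADS
#define EIGEN_USE_SIMPLE_THREAD_POOL  // without this define, the new non-blocking pool is now the default
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(4);                 // should resolve to the old SimpleThreadPool here
  Eigen::ThreadPoolDevice device(&pool, 4);
  Eigen::Tensor<float, 2> a(64, 64), b(64, 64), c(64, 64);
  a.setRandom();
  b.setRandom();
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};
  c.device(device) = a.contract(b, dims);    // exercises the threaded contraction path touched below
  return 0;
}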

Benoit Steiner 2016-05-13 17:23:15 -07:00
parent 97605c7b27
commit 83dfb40f66
3 changed files with 94 additions and 6 deletions

View File

@@ -15,7 +15,7 @@
namespace Eigen {
-#ifndef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifdef EIGEN_USE_SIMPLE_THREAD_POOL
namespace internal {
template<typename LhsScalar, typename LhsMapper, typename Index>
@@ -54,7 +54,7 @@ struct packRhsAndKernelArg {
};
} // end namespace internal
-#endif // EIGEN_USE_NONBLOCKING_THREAD_POOL
+#endif // EIGEN_USE_SIMPLE_THREAD_POOL
template<typename Indices, typename LeftArgType, typename RightArgType>
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
@@ -112,7 +112,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
TensorEvaluator(const XprType& op, const Device& device) :
Base(op, device) {}
-#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
bool rhs_inner_dim_reordered, int Alignment>
void evalProduct(Scalar* buffer) const {
@@ -731,7 +731,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
return 0;
}
-#else // EIGEN_USE_NONBLOCKING_THREAD_POOL
+#else // EIGEN_USE_SIMPLE_THREAD_POOL
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
void evalProduct(Scalar* buffer) const {
@@ -1007,7 +1007,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
}
}
}
-#endif // EIGEN_USE_NONBLOCKING_THREAD_POOL
+#endif // EIGEN_USE_SIMPLE_THREAD_POOL
TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
bool shard_by_col, bool prepacked) const {

View File

@@ -14,7 +14,7 @@ namespace Eigen {
// Use the SimpleThreadPool by default. We'll switch to the new non blocking
// thread pool later.
-#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
typedef NonBlockingThreadPool ThreadPool;
#else
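For reference, after this change the selection block in this header presumably reads roughly as follows. Only the new #ifndef line and the non-blocking branch appear in the hunk above; the contents of the #else branch are an assumption based on the pre-existing SimpleThreadPoolTempl/SimpleThreadPool names, so treat them as a sketch rather than the exact source:

#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
typedef NonBlockingThreadPool ThreadPool;
#else  // assumed contents of the fallback branch
template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
typedef SimpleThreadPool ThreadPool;
#endif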

View File

@@ -360,6 +360,94 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
}
}
#ifdef EIGEN_HAS_CUDA_FP16
/*
template <int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
half* output, half2* scratch) {
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);
eigen_assert(gridDim.y == 1);
eigen_assert(gridDim.z == 1);
const int unroll_times = 16;
eigen_assert(NumPerThread % unroll_times == 0);
eigen_assert(unroll_times % 2 == 0);
const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
const Index num_threads = blockDim.x * gridDim.x;
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
// Initialize the output values if they weren't initialized by the ReductionInitKernel
if (gridDim.x == 1) {
Index i = 2*thread_id;
for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
*((half2*)(output + i)) = reducer.template initializePacket<half2>();
}
if (i < num_preserved_coeffs) {
output[i] = reducer.initialize();
}
__syncthreads();
}
for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
const Index row = i / input_col_blocks;
if (row + 1 < num_preserved_coeffs) {
const Index col_block = i % input_col_blocks;
const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
half2 reduced_val1 = reducer.template initializePacket<half2>();
half2 reduced_val2 = reducer.template initializePacket<half2>();
for (Index j = 0; j < NumPerThread; j += unroll_times) {
const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
if (last_col >= num_coeffs_to_reduce) {
Index col = col_begin + blockDim.x * j;
for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
const half2 val = input.m_impl.packet(row * num_coeffs_to_reduce + col);
reducer.reducePacket(val, &reduced_val1);
// do the same for reduced_val2 here
}
if (col < num_coeffs_to_reduce) {
// Peel;
const half last = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
const half2 val = __halves2half2(last, reducer.initialize());
reducer.reducePacket(val, &reduced_val1);
}
break;
} else {
// Faster version of the loop with no branches after unrolling.
#pragma unroll
for (int k = 0; k < unroll_times; ++k) {
const Index col = col_begin + blockDim.x * (j + k);
reducer.reducePacket(input.m_impl.packet(row * num_coeffs_to_reduce + col), &reduced_val1);
}
}
}
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) {
reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
}
if ((threadIdx.x & (warpSize - 1)) == 0) {
if (row + 1 < num_preserved_coeffs) {
atomicReduce((half2*)(output + row), reduced_val1, reducer);
}
else {
atomicReduce(scratch, reduced_val1, reducer);
}
}
}
}
}
*/
#endif
template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
// Unfortunately nvidia doesn't support well exotic types such as complex,