Turn on the new thread pool by default since it scales much better over multiple cores. It is still possible to revert to the old thread pool by compiling with the EIGEN_USE_SIMPLE_THREAD_POOL define.
parent 97605c7b27
commit 83dfb40f66
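For context, a minimal sketch (not part of the commit) of the opt-out described in the commit message: the macro only has to be visible before the Tensor headers are included, or be passed on the compiler command line.

```cpp
// Sketch of the opt-out from the commit message. Assumes a C++11 translation
// unit with Eigen's unsupported Tensor module on the include path.
#define EIGEN_USE_SIMPLE_THREAD_POOL  // revert to the old SimpleThreadPool
#define EIGEN_USE_THREADS             // enable the ThreadPoolDevice backend
#include <unsupported/Eigen/CXX11/Tensor>

int main() { return 0; }  // equivalently: compile with -DEIGEN_USE_SIMPLE_THREAD_POOL
```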
@@ -15,7 +15,7 @@
 namespace Eigen {
 
-#ifndef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifdef EIGEN_USE_SIMPLE_THREAD_POOL
 namespace internal {
 
 template<typename LhsScalar, typename LhsMapper, typename Index>
@@ -54,7 +54,7 @@ struct packRhsAndKernelArg {
 };
 
 } // end namespace internal
-#endif // EIGEN_USE_NONBLOCKING_THREAD_POOL
+#endif // EIGEN_USE_SIMPLE_THREAD_POOL
 
 template<typename Indices, typename LeftArgType, typename RightArgType>
 struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
@@ -112,7 +112,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   TensorEvaluator(const XprType& op, const Device& device) :
       Base(op, device) {}
 
-#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
             bool rhs_inner_dim_reordered, int Alignment>
   void evalProduct(Scalar* buffer) const {
@@ -731,7 +731,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     return 0;
   }
 
-#else // EIGEN_USE_NONBLOCKING_THREAD_POOL
+#else // EIGEN_USE_SIMPLE_THREAD_POOL
 
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
   void evalProduct(Scalar* buffer) const {
@@ -1007,7 +1007,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       }
     }
   }
-#endif // EIGEN_USE_NONBLOCKING_THREAD_POOL
+#endif // EIGEN_USE_SIMPLE_THREAD_POOL
 
   TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
                                bool shard_by_col, bool prepacked) const {
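The evalProduct overloads guarded above are what a contraction assigned through a ThreadPoolDevice ends up executing. A minimal usage sketch that exercises this path (sizes and thread counts are arbitrary assumptions, not taken from the commit):

```cpp
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(64, 32), b(32, 48), c(64, 48);
  a.setRandom();
  b.setRandom();

  // ThreadPoolDevice wraps a pool; assigning through c.device(...) routes the
  // contraction to the threaded evaluator and hence to evalProduct.
  Eigen::ThreadPool pool(4 /* assumed worker count */);
  Eigen::ThreadPoolDevice device(&pool, 4);

  Eigen::array<Eigen::IndexPair<int>, 1> dims = {{Eigen::IndexPair<int>(1, 0)}};
  c.device(device) = a.contract(b, dims);
  return 0;
}
```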
@@ -14,7 +14,7 @@ namespace Eigen {
 
 // Use the SimpleThreadPool by default. We'll switch to the new non blocking
 // thread pool later.
-#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
 template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
 typedef NonBlockingThreadPool ThreadPool;
 #else
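With this hunk, Eigen::ThreadPool aliases the non-blocking pool unless EIGEN_USE_SIMPLE_THREAD_POOL is defined; both pools expose the same Schedule() interface, so user code is unchanged. A small sketch (worker count and task are illustrative assumptions):

```cpp
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

#include <future>
#include <iostream>

int main() {
  Eigen::ThreadPool pool(2 /* assumed worker count */);

  // Schedule a closure and wait for its result; the code is identical
  // whichever pool implementation the ThreadPool typedef resolves to.
  std::promise<int> result;
  std::future<int> done = result.get_future();
  pool.Schedule([&result] { result.set_value(42); });
  std::cout << done.get() << "\n";
  return 0;
}
```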
@@ -360,6 +360,94 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
   }
 }
 
+#ifdef EIGEN_HAS_CUDA_FP16
+/*
+template <int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+                                              half* output, half2* scratch) {
+  eigen_assert(blockDim.y == 1);
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.y == 1);
+  eigen_assert(gridDim.z == 1);
+
+  const int unroll_times = 16;
+  eigen_assert(NumPerThread % unroll_times == 0);
+  eigen_assert(unroll_times % 2 == 0);
+
+  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
+  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
+
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    Index i = thread_id;
+    for (; i < num_preserved_coeffs; i += 2*num_threads) {
+      ((half2*)output)[i] = reducer.initializePacket();
+    }
+    if (i + 1 < num_preserved_coeffs) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+    const Index row = i / input_col_blocks;
+
+    if (row + 1 < num_preserved_coeffs) {
+      const Index col_block = i % input_col_blocks;
+      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
+
+      half2 reduced_val1 = reducer.initializePacket();
+      half2 reduced_val2 = reducer.initializePacket();
+
+      for (Index j = 0; j < NumPerThread; j += unroll_times) {
+        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
+        if (last_col >= num_coeffs_to_reduce) {
+          Index col = col_begin + blockDim.x * j;
+          for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
+            const half2 val = input.m_impl.packet(row * num_coeffs_to_reduce + col);
+            reducer.reduce(val, &reduced_val);
+            // do the same for reduce val2 here
+          }
+          if (col < num_coeffs_to_reduce) {
+            // Peel;
+            const half last = input.m_impl.coeff(row * num_coeffs_to_reduce + col+1);
+            const half2 val = __halves2half2(last, reducer.initialize());
+            reducer.reducePacket(val, &reduced_val);
+          }
+          break;
+        } else {
+          // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+          for (int k = 0; k < unroll_times; ++k) {
+            const Index col = col_begin + blockDim.x * (j + k);
+            reducer.reduce(input.m_impl.packet(row * num_coeffs_to_reduce + col), &reduced_val);
+          }
+        }
+      }
+
+#pragma unroll
+      for (int offset = warpSize/2; offset > 0; offset /= 2) {
+        reducer.reducePacket(__shfl_down(reduced_val, offset, warpSize), &reduced_val);
+      }
+
+      if ((threadIdx.x & (warpSize - 1)) == 0) {
+        if (row + 1 < num_preserved_coeffs) {
+          atomicReduce(&(output[row]), reduced_val, reducer);
+        }
+        else {
+          atomicReduce(scratch, reduced_val, reducer);
+        }
+      }
+    }
+  }
+}
+*/
+#endif
+
 template <typename Self, typename Op>
 struct InnerReducer<Self, Op, GpuDevice> {
   // Unfortunately nvidia doesn't support well exotic types such as complex,
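The commented-out half-float kernel above follows a common CUDA pattern: each thread accumulates a strided slice of a row, a warp-shuffle tree folds the per-thread values, and one lane per warp issues an atomic update. Below is a stripped-down sketch of that pattern for plain floats. It is my own illustration, not code from this commit; it uses the same pre-CUDA-9 __shfl_down intrinsic as the kernel above and assumes blockDim.x is a multiple of warpSize and that `out` is zero-initialized by the caller.

```cuda
// Fold the values held by the lanes of one warp into lane 0.
__device__ float warp_sum(float val) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    // Add the value from the lane `offset` positions higher; after
    // log2(warpSize) steps lane 0 holds the warp-wide sum.
    val += __shfl_down(val, offset, warpSize);  // __shfl_down_sync on CUDA 9+
  }
  return val;
}

// One block per row; each warp contributes a single atomic update, mirroring
// the "one atomicReduce per warp" structure of the kernel above.
__global__ void row_sum(const float* in, float* out, int cols) {
  float acc = 0.0f;
  for (int col = threadIdx.x; col < cols; col += blockDim.x) {
    acc += in[blockIdx.x * cols + col];
  }
  acc = warp_sum(acc);
  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicAdd(&out[blockIdx.x], acc);
  }
}
```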