mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-05-22 20:47:35 +08:00
Doubled the speed of full reductions on GPUs.
This commit is contained in:
parent
3abd8470ca
commit
75a7fa1919
@@ -87,15 +87,15 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
|
|||||||
}
|
}
|
||||||
|
|
||||||
typename Self::CoeffReturnType accum = reducer.initialize();
|
typename Self::CoeffReturnType accum = reducer.initialize();
|
||||||
for (Index i = 0; i < NumPerThread; ++i) {
|
Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
|
||||||
const Index index = first_index + i * BlockSize;
|
for (Index i = 0; i < max_iter; i+=BlockSize) {
|
||||||
if (index >= num_coeffs) {
|
const Index index = first_index + i;
|
||||||
break;
|
eigen_assert(index < num_coeffs);
|
||||||
}
|
|
||||||
typename Self::CoeffReturnType val = input.m_impl.coeff(index);
|
typename Self::CoeffReturnType val = input.m_impl.coeff(index);
|
||||||
reducer.reduce(val, &accum);
|
reducer.reduce(val, &accum);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
||||||
reducer.reduce(__shfl_down(accum, offset), &accum);
|
reducer.reduce(__shfl_down(accum, offset), &accum);
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user