Mirror of https://gitlab.com/libeigen/eigen.git
Optimized the performance of narrow reductions on CUDA devices
commit 3284842045
parent e9bea614ec
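In short: the device-side reducers previously returned void and the evaluator unconditionally reported the result as fully computed; run() now returns a bool meaning "fall back to the generic evaluation path", and reductions of 32 or fewer coefficients take that fallback because launching a dedicated kernel costs more than it saves. A minimal sketch of the resulting contract, with a hypothetical name (NarrowAwareReducer is not Eigen's type; the real ones are InnerReducer/OuterReducer<Self, Op, GpuDevice>):

    // Sketch of the fallback contract this commit introduces (hypothetical name).
    struct NarrowAwareReducer {
      // Returns true when the caller must evaluate the reduction itself,
      // false when the dedicated kernel was launched and produced the result.
      static bool run(float* /*output*/, long num_coeffs_to_reduce) {
        if (num_coeffs_to_reduce <= 32) {
          return true;   // too narrow: a kernel launch costs more than it saves
        }
        // ... launch the CUDA reduction kernel here ...
        return false;    // the kernel handled it
      }
    };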
@@ -544,8 +544,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
         Op reducer(m_reducer);
-        internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
-        return false;
+        return internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
       }
 
       bool preserving_inner_dims = true;
@@ -561,8 +560,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
         Op reducer(m_reducer);
-        internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
-        return false;
+        return internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
       }
     }
     return true;
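The two hunks above are the evaluator half of the change: the hard-coded return false; ("the kernel produced the result") becomes a pass-through of the reducer's verdict, so returning true propagates the fallback to Eigen's generic per-coefficient evaluation. A self-contained sketch of that forwarding pattern, with stand-in names (device_reduce and eval_if_needed are illustrative, not Eigen's API):

    // Stand-in for InnerReducer<...>::run: true means "caller, do it yourself".
    static bool device_reduce(float* /*output*/, long num_values_to_reduce) {
      return num_values_to_reduce <= 32;  // narrow: skip the kernel
    }

    bool eval_if_needed(float* output, long num_values_to_reduce) {
      // Before: device_reduce(output, n); return false;
      // After:  forward the reducer's decision to the executor.
      return device_reduce(output, num_values_to_reduce);
    }

The hunks below are the matching change in the GpuDevice reducer specializations (TensorReductionCuda.h).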
@@ -230,9 +230,14 @@ struct InnerReducer<Self, Op, GpuDevice> {
     assert(false && "Should only be called to reduce floats on a gpu device");
   }
 
-  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+  static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
     typedef typename Self::Index Index;
 
+    // It's faster to use the usual code.
+    if (num_coeffs_to_reduce <= 32) {
+      return true;
+    }
+
     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
     const int block_size = 256;
     const int num_per_thread = 128;
@@ -255,6 +260,8 @@ struct InnerReducer<Self, Op, GpuDevice> {
 
     LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
                        num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;
   }
 };
 
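Between these two hunks, unchanged code derives the launch geometry from the constants shown (block_size = 256, num_per_thread = 128) to produce the num_blocks passed to LAUNCH_CUDA_KERNEL. That expression is elided here, so the following is only a plausible reconstruction of the ceil-divide arithmetic, not the file's code:

    // Plausible grid sizing: one block of 256 threads, each accumulating
    // ~128 inputs, covers 256 * 128 = 32768 coefficients; ceil-divide the
    // total so every input is assigned to some thread.
    long grid_size(long num_coeffs_to_reduce, long num_preserved_vals) {
      const long num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
      const long per_block  = 256L * 128L;              // block_size * num_per_thread
      return (num_coeffs + per_block - 1) / per_block;  // ceil(num_coeffs / per_block)
    }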
@@ -301,9 +308,14 @@ struct OuterReducer<Self, Op, GpuDevice> {
     assert(false && "Should only be called to reduce floats on a gpu device");
   }
 
-  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+  static EIGEN_DEVICE_FUNC bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
     typedef typename Self::Index Index;
 
+    // It's faster to use the usual code.
+    if (num_coeffs_to_reduce <= 32) {
+      return true;
+    }
+
     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
     const int block_size = 256;
     const int num_per_thread = 16;
@@ -326,6 +338,8 @@ struct OuterReducer<Self, Op, GpuDevice> {
 
     LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
                        num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;
   }
 };
 
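From the public Tensor API, a reduction narrow enough to hit the new cutoff looks like the sketch below: each output coefficient reduces only 8 inputs, well under 32, so on a GpuDevice the specialized kernel is now skipped in favor of the generic path. A minimal sketch (CPU evaluation shown for brevity; the dispatch this commit changes is device-side):

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      // 8 values per output coefficient: "narrow" by this commit's cutoff.
      Eigen::Tensor<float, 2> t(8, 1024);
      t.setRandom();
      Eigen::array<int, 1> dims{{0}};  // reduce the innermost dimension
      Eigen::Tensor<float, 1> sums = t.sum(dims);
      return 0;
    }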