Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-04-23 01:59:38 +08:00)
Consolidate multiple implementations of divup/div_up/div_ceil.
commit a96545777b (parent e8515f78ac)
@@ -1341,6 +1341,19 @@ double ceil(const double &x) { return ::ceil(x); }
 #endif
 #endif
 
+// Integer division with rounding up.
+// T is assumed to be an integer type with a>=0, and b>0
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR
+T div_ceil(const T &a, const T &b)
+{
+  EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
+  eigen_assert(a >= 0);
+  eigen_assert(b > 0);
+  // Note: This form is used because it cannot overflow.
+  return a == 0 ? 0 : (a - 1) / b + 1;
+}
+
 /** Log base 2 for 32 bits positive integers.
   * Conveniently returns 0 for x==0. */
 inline int log2(int x)
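For readers skimming the diff, here is a minimal standalone sketch (not part of the commit) of the semantics the new `numext::div_ceil` asserts above: integer-only ceiling division with `a >= 0` and `b > 0`, where a zero numerator maps to zero. `div_ceil_sketch` is a hypothetical stand-in used only for illustration.

```cpp
#include <cassert>

// Stand-in mirroring the expression added above: ceil(a / b) for integers,
// written as (a - 1) / b + 1 so the sum a + b - 1 is never formed.
template <typename T>
constexpr T div_ceil_sketch(T a, T b) {
  return a == 0 ? 0 : (a - 1) / b + 1;
}

int main() {
  assert(div_ceil_sketch(0, 4) == 0);  // zero numerator -> zero
  assert(div_ceil_sketch(7, 4) == 2);  // 7/4 rounds up to 2
  assert(div_ceil_sketch(8, 4) == 2);  // exact multiples stay unchanged
  assert(div_ceil_sketch(9, 4) == 3);  // 9/4 rounds up to 3
  return 0;
}
```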
@@ -90,8 +90,6 @@ class gemm_class {
   const Index a_stride, b_stride;
   const Index a_off, b_off;
 
-  static EIGEN_ALWAYS_INLINE constexpr int div_up(int a, int b) { return a == 0 ? 0 : (a - 1) / b + 1; }
-
   EIGEN_ALWAYS_INLINE void prefetch_a(const Scalar *a_addr) {
     _mm_prefetch((char *)(a_prefetch_size + a_addr - a_shift), _MM_HINT_T0);
   }
@@ -479,7 +477,7 @@ class gemm_class {
    *
    *   const Scalar *cox = (idx == 0) ? co1 : co2;
    *
-   *   const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
+   *   const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
    *   scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg);
    *   write_c<0, um_vecs, idx, a_unroll>(cox);
    *
@@ -498,7 +496,7 @@ class gemm_class {
   EIGEN_ALWAYS_INLINE void c_update_1count(Scalar *&cox) {
     if (pow >= 4) cox += ldc;
 
-    const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
+    const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
     auto &alpha_reg = zmm[alpha_load_reg];
 
     scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg);
@@ -644,7 +642,7 @@ class gemm_class {
   template <int uk, int max_b_unroll, int a_unroll, int b_unroll, bool ktail, bool fetch_x, bool c_fetch, bool no_a_preload = false>
   EIGEN_ALWAYS_INLINE void innerkernel_1uk(const Scalar *&aa, const Scalar *const &ao, const Scalar *const &bo,
                                            Scalar *&co2, int &fetchA_idx, int &fetchB_idx) {
-    const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
+    const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
 
     if (max_b_unroll >= 1)
       innerkernel_1pow<uk, 1, 0, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, fetchB_idx);
@@ -729,7 +727,7 @@ class gemm_class {
 
   template <int a_unroll, int b_unroll, int max_b_unroll>
   EIGEN_ALWAYS_INLINE void kloop(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
-    const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
+    const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
     if (!use_less_a_regs && k > 1)
       a_loads<0, 2, 0, um_vecs, a_unroll>(ao);
     else
@@ -422,15 +422,6 @@ template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b);
 
 using std::numeric_limits;
 
-// Integer division with rounding up.
-// T is assumed to be an integer type with a>=0, and b>0
-template<typename T>
-EIGEN_DEVICE_FUNC
-T div_ceil(const T &a, const T &b)
-{
-  return (a+b-1) / b;
-}
-
 // Handle integer comparisons of different signedness.
 template <typename X, typename Y, bool XIsInteger = NumTraits<X>::IsInteger, bool XIsSigned = NumTraits<X>::IsSigned,
           bool YIsInteger = NumTraits<Y>::IsInteger, bool YIsSigned = NumTraits<Y>::IsSigned>
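The block removed here computed the ceiling as `(a+b-1)/b`. As a hedged illustration of why the consolidated form is preferred, the sketch below (plain 32-bit unsigned arithmetic, not Eigen code) shows the naive sum wrapping near the top of the type while the `(a-1)/b + 1` form does not.

```cpp
#include <cstdint>
#include <iostream>

// Contrast of the two ceiling-division forms touched by this commit.
static std::uint32_t ceil_naive(std::uint32_t a, std::uint32_t b) {
  return (a + b - 1) / b;               // a + b - 1 can wrap around
}
static std::uint32_t ceil_safe(std::uint32_t a, std::uint32_t b) {
  return a == 0 ? 0 : (a - 1) / b + 1;  // never forms a + b
}

int main() {
  const std::uint32_t a = 0xFFFFFFF0u;   // close to the top of uint32_t
  const std::uint32_t b = 32u;
  std::cout << ceil_naive(a, b) << "\n"; // 0: the sum wrapped before dividing
  std::cout << ceil_safe(a, b) << "\n";  // 134217728: the true ceiling
  return 0;
}
```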
@@ -443,7 +443,7 @@ class TensorBlockMapper {
         const int dim = isColMajor ? i : NumDims - i - 1;
         m_block_dimensions[dim] =
             numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
-        coeff_to_allocate = divup(
+        coeff_to_allocate = numext::div_ceil(
             coeff_to_allocate,
             numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
       }
@@ -474,7 +474,7 @@ class TensorBlockMapper {
       const IndexType total_size_other_dims =
           total_size / m_block_dimensions[dim];
       const IndexType alloc_avail =
-          divup<IndexType>(target_block_size, total_size_other_dims);
+          numext::div_ceil<IndexType>(target_block_size, total_size_other_dims);
       if (alloc_avail == m_block_dimensions[dim]) {
         // Insufficient excess coefficients to allocate.
         break;
@@ -496,7 +496,7 @@ class TensorBlockMapper {
     // Calculate block counts by dimension and total block count.
     DSizes<IndexType, NumDims> block_count;
     for (int i = 0; i < NumDims; ++i) {
-      block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
+      block_count[i] = numext::div_ceil(m_tensor_dimensions[i], m_block_dimensions[i]);
     }
     m_total_block_count = array_prod(block_count);
 
@@ -898,7 +898,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       // First multiple after a. This is b when <= bcast_dim_left_index +
       // bcast_dim_size.
       const Index first_multiple =
-          divup<Index>(bcast_dim_left_index, input_bcast_dim_size) *
+          numext::div_ceil<Index>(bcast_dim_left_index, input_bcast_dim_size) *
           input_bcast_dim_size;
 
       if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
@@ -144,8 +144,8 @@ struct TensorContractionBlockMemAllocator {
                                            const Index bn) {
     Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
     BlockSizes sz;
-    sz.lhs_size = divup<Index>(bm * bk * sizeof(LhsScalar), align) * align;
-    sz.rhs_size = divup<Index>(bn * bk * sizeof(RhsScalar), align) * align;
+    sz.lhs_size = numext::div_ceil<Index>(bm * bk * sizeof(LhsScalar), align) * align;
+    sz.rhs_size = numext::div_ceil<Index>(bn * bk * sizeof(RhsScalar), align) * align;
     return sz;
   }
 };
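The two `sz.*_size` lines above use div_ceil followed by a multiplication to round a byte count up to a multiple of the alignment. A small sketch under assumed sizes (`round_up_to_align`, the 64-byte alignment, and the block shapes are illustrative, not taken from the commit):

```cpp
#include <cstddef>
#include <iostream>

// div_ceil-then-multiply: round `bytes` up to the next multiple of `align`.
static std::size_t round_up_to_align(std::size_t bytes, std::size_t align) {
  const std::size_t chunks = bytes == 0 ? 0 : (bytes - 1) / align + 1;
  return chunks * align;
}

int main() {
  // 48 x 32 floats = 6144 bytes, already a multiple of 64: unchanged.
  std::cout << round_up_to_align(48 * 32 * sizeof(float), 64) << "\n";  // 6144
  // 100 floats = 400 bytes round up to 448 with 64-byte alignment.
  std::cout << round_up_to_align(100 * sizeof(float), 64) << "\n";      // 448
  return 0;
}
```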
@@ -206,9 +206,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     }
 
     // Number of kernels for each dimension.
-    Index nm0 = divup(m, bm);
-    Index nn0 = divup(n, bn);
-    Index nk = divup(k, bk);
+    Index nm0 = numext::div_ceil(m, bm);
+    Index nn0 = numext::div_ceil(n, bn);
+    Index nk = numext::div_ceil(k, bk);
 
     // Calculate task grain size (number of kernels executed per task).
     // This task size coarsening serves two purposes:
@@ -226,8 +226,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
     }
     // Number of tasks in each dimension.
-    Index nm = divup(nm0, gm);
-    Index nn = divup(nn0, gn);
+    Index nm = numext::div_ceil(nm0, gm);
+    Index nn = numext::div_ceil(nn0, gn);
 
     // If there is enough concurrency in the sharding dimension, we choose not
     // to paralellize by the other dimension, and execute all kernels in sync
@@ -1130,9 +1130,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
           done(std::move(done_callback)),
           buffer_size_bytes(m * n * sizeof(Scalar)),
           block_size(blockSize(k, num_threads)),
-          num_blocks(divup<Index>(k, block_size)),
+          num_blocks(numext::div_ceil<Index>(k, block_size)),
           num_pending_blocks(internal::convert_index<int>(num_blocks)),
-          l0_ranges(divup<Index>(num_blocks, l0_size)),
+          l0_ranges(numext::div_ceil<Index>(num_blocks, l0_size)),
           l0_state(l0_ranges),
           block_buffers(num_blocks) {
       // Keep count of pending gemm tasks for each l0 range.
@@ -1434,10 +1434,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   static Index blockSize(Index k, int num_threads) {
     const auto round_up = [=](Index index) -> Index {
       const Index kmultiple = packet_size <= 8 ? 8 : packet_size;
-      return divup<Index>(index, kmultiple) * kmultiple;
+      return numext::div_ceil<Index>(index, kmultiple) * kmultiple;
     };
 
-    const Index target_block_size = round_up(divup<Index>(k, num_threads));
+    const Index target_block_size = round_up(numext::div_ceil<Index>(k, num_threads));
     const Index desired_min_block_size = 12 * packet_size;
 
     return numext::mini<Index>(
@@ -1485,19 +1485,19 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
                  int num_threads, bool shard_by_col) const {
     Index gm = 1;
     Index gm1 = 1;
-    Index nm0 = divup(m, bm);
+    Index nm0 = numext::div_ceil(m, bm);
     Index nm1 = nm0;
     for (;;) {
       // Find the next candidate for m grain size. It needs to result in
       // different number of blocks. E.g. if we have 10 kernels, we want to try
       // 5 and 10, but not 6, 7, 8 and 9.
-      while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++;
+      while (gm1 <= nm0 && nm1 == numext::div_ceil(nm0, gm1)) gm1++;
       if (gm1 > nm0) break;
       // Check the candidate.
       int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads,
                            shard_by_col);
       if (res < 0) break;
-      nm1 = divup(nm0, gm1);
+      nm1 = numext::div_ceil(nm0, gm1);
       if (res == 0) continue;
       // Commit new grain size.
       gm = gm1;
@@ -1509,15 +1509,15 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
                  int num_threads, bool shard_by_col) const {
     Index gn = 1;
     Index gn1 = 1;
-    Index nn0 = divup(n, bn);
+    Index nn0 = numext::div_ceil(n, bn);
     Index nn1 = nn0;
     for (;;) {
-      while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++;
+      while (gn1 <= nn0 && nn1 == numext::div_ceil(nn0, gn1)) gn1++;
       if (gn1 > nn0) break;
       int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads,
                            shard_by_col);
       if (res < 0) break;
-      nn1 = divup(nn0, gn1);
+      nn1 = numext::div_ceil(nn0, gn1);
       if (res == 0) continue;
       gn = gn1;
     }
@@ -1544,14 +1544,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4
     // of cores will be busy). While grain size 3 gives us 4 tasks, which gives
    // us parallelism of 1 (we can load all cores).
-    Index nm0 = divup(m, bm);
-    Index nn0 = divup(n, bn);
-    Index new_tasks = divup(nm0, gm) * divup(nn0, gn);
+    Index nm0 = numext::div_ceil(m, bm);
+    Index nn0 = numext::div_ceil(n, bn);
+    Index new_tasks = numext::div_ceil(nm0, gm) * numext::div_ceil(nn0, gn);
     double new_parallelism = static_cast<double>(new_tasks) /
-                             (divup<int>(new_tasks, num_threads) * num_threads);
-    Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn);
+                             (numext::div_ceil<int>(new_tasks, num_threads) * num_threads);
+    Index old_tasks = numext::div_ceil(nm0, oldgm) * numext::div_ceil(nn0, oldgn);
     double old_parallelism = static_cast<double>(old_tasks) /
-                             (divup<int>(old_tasks, num_threads) * num_threads);
+                             (numext::div_ceil<int>(old_tasks, num_threads) * num_threads);
     if (new_parallelism > old_parallelism || new_parallelism == 1) return 1;
     return 0;
   }
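The checkGrain hunk above compares grain sizes through the ratio tasks / (div_ceil(tasks, num_threads) * num_threads). A worked sketch of that metric, assuming 11 kernels in the sharded dimension and 4 threads to match the numbers in the comment:

```cpp
#include <iostream>

// Parallelism metric from checkGrain: tasks / (ceil(tasks / threads) * threads).
// It is 1 exactly when the tasks split evenly over the threads.
static double parallelism(int tasks, int threads) {
  const int rounds = (tasks + threads - 1) / threads;  // small ints, no overflow concern
  return static_cast<double>(tasks) / (rounds * threads);
}

int main() {
  const int kernels = 11, threads = 4;  // assumed sizes for illustration
  for (int grain = 2; grain <= 4; ++grain) {
    const int tasks = (kernels + grain - 1) / grain;  // div_ceil(kernels, grain)
    std::cout << "grain " << grain << ": " << tasks << " tasks, parallelism "
              << parallelism(tasks, threads) << "\n";
  }
  // grain 2 -> 6 tasks (0.75), grain 3 -> 4 tasks (1.0), grain 4 -> 3 tasks (0.75)
  return 0;
}
```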
@@ -223,7 +223,7 @@ struct ThreadPoolDevice {
                                         Index lastIdx) {
       while (lastIdx - firstIdx > block.size) {
         // Split into halves and schedule the second half on a different thread.
-        const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
+        const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, block.size) * block.size;
         pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); });
         lastIdx = midIdx;
       }
@@ -282,7 +282,7 @@ struct ThreadPoolDevice {
     ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) {
       while (lastIdx - firstIdx > block.size) {
         // Split into halves and schedule the second half on a different thread.
-        const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
+        const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, block.size) * block.size;
         pool_->Schedule(
             [ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); });
         lastIdx = midIdx;
@@ -357,7 +357,7 @@ struct ThreadPoolDevice {
     const Index max_oversharding_factor = 4;
     Index block_size = numext::mini(
         n, numext::maxi<Index>(
-               divup<Index>(n, max_oversharding_factor * numThreads()),
+               numext::div_ceil<Index>(n, max_oversharding_factor * numThreads()),
               block_size_f));
     const Index max_block_size = numext::mini(n, 2 * block_size);
 
@@ -367,13 +367,13 @@ struct ThreadPoolDevice {
       block_size = numext::mini(n, new_block_size);
     }
 
-    Index block_count = divup(n, block_size);
+    Index block_count = numext::div_ceil(n, block_size);
 
     // Calculate parallel efficiency as fraction of total CPU time used for
     // computations:
     double max_efficiency =
         static_cast<double>(block_count) /
-        (divup<int>(block_count, numThreads()) * numThreads());
+        (numext::div_ceil<int>(block_count, numThreads()) * numThreads());
 
     // Now try to increase block size up to max_block_size as long as it
     // doesn't decrease parallel efficiency.
@@ -381,7 +381,7 @@ struct ThreadPoolDevice {
         max_efficiency < 1.0 && prev_block_count > 1;) {
       // This is the next block size that divides size into a smaller number
       // of blocks than the current block_size.
-      Index coarser_block_size = divup(n, prev_block_count - 1);
+      Index coarser_block_size = numext::div_ceil(n, prev_block_count - 1);
       if (block_align) {
         Index new_block_size = block_align(coarser_block_size);
         eigen_assert(new_block_size >= coarser_block_size);
@@ -391,12 +391,12 @@ struct ThreadPoolDevice {
         break; // Reached max block size. Stop.
       }
       // Recalculate parallel efficiency.
-      const Index coarser_block_count = divup(n, coarser_block_size);
+      const Index coarser_block_count = numext::div_ceil(n, coarser_block_size);
       eigen_assert(coarser_block_count < prev_block_count);
       prev_block_count = coarser_block_count;
       const double coarser_efficiency =
          static_cast<double>(coarser_block_count) /
-          (divup<int>(coarser_block_count, numThreads()) * numThreads());
+          (numext::div_ceil<int>(coarser_block_count, numThreads()) * numThreads());
       if (coarser_efficiency + 0.01 >= max_efficiency) {
         // Taking it.
         block_size = coarser_block_size;
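The ThreadPoolDevice hunks above all feed the same heuristic: pick a block count, then coarsen the block size while parallel efficiency does not drop by more than 0.01. A condensed sketch of that loop with made-up inputs (`n`, `threads`, and the starting `block_size` are assumptions, and the cost-model and block_align steps are omitted):

```cpp
#include <iostream>

static long div_ceil(long a, long b) { return a == 0 ? 0 : (a - 1) / b + 1; }

int main() {
  const long n = 1000;    // assumed total number of work items
  const int threads = 8;  // assumed thread count
  long block_size = 48;   // assumed starting block size
  long block_count = div_ceil(n, block_size);
  double max_eff = double(block_count) / (div_ceil(block_count, threads) * threads);

  // Coarsen while efficiency does not drop noticeably, as in the hunks above.
  for (long prev = block_count; max_eff < 1.0 && prev > 1;) {
    const long coarser_size = div_ceil(n, prev - 1);  // next coarser block size
    const long coarser_count = div_ceil(n, coarser_size);
    prev = coarser_count;
    const double eff = double(coarser_count) / (div_ceil(coarser_count, threads) * threads);
    std::cout << "block_size " << coarser_size << " -> " << coarser_count
              << " blocks, efficiency " << eff << "\n";
    if (eff + 0.01 >= max_eff) {
      block_size = coarser_size;  // taking it
      if (eff > max_eff) max_eff = eff;
    }
  }
  std::cout << "chosen block_size: " << block_size << "\n";
  return 0;
}
```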
@@ -261,7 +261,7 @@ TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
   const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
   const size_t aligned_blocksize =
       align *
-      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
+      numext::div_ceil<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
 
   return {block_mapper, requirements.cost_per_coeff * block_size,
           aligned_blocksize};
@@ -661,7 +661,7 @@ EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Til
         block_size;
     const StorageIndex size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
-    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
+    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, numext::div_ceil<int>(size, block_size)), 1);
 
     LAUNCH_GPU_KERNEL(
         (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
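The GPU executor above clamps its grid size: at least one block (so empty tensors still launch), at most max_blocks, otherwise div_ceil(size, block_size). A tiny sketch with hypothetical sizes (`clamp_num_blocks` and the constants are illustrative only):

```cpp
#include <algorithm>
#include <iostream>

// Grid-size clamping in the style of the launch code above.
static int clamp_num_blocks(long size, long block_size, long max_blocks) {
  const long wanted = size == 0 ? 0 : (size - 1) / block_size + 1;  // div_ceil
  return static_cast<int>(std::max(std::min(max_blocks, wanted), 1L));
}

int main() {
  std::cout << clamp_num_blocks(0, 1024, 120) << "\n";         // 1: never launch zero blocks
  std::cout << clamp_num_blocks(5000, 1024, 120) << "\n";      // 5: div_ceil(5000, 1024)
  std::cout << clamp_num_blocks(10000000, 1024, 120) << "\n";  // 120: capped by max_blocks
  return 0;
}
```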
@@ -27,21 +27,6 @@ const T2& choose(Cond<false>, const T1&, const T2& second) {
   return second;
 }
 
-
-template <typename T, typename X, typename Y>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T divup(const X x, const Y y) {
-  // Note: This form is used because it cannot overflow.
-  return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
-}
-
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T divup(const T x, const T y) {
-  // Note: This form is used because it cannot overflow.
-  return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
-}
-
 template <size_t n> struct max_n_1 {
   static const size_t size = n;
 };
@@ -49,6 +34,11 @@ template <> struct max_n_1<0> {
   static const size_t size = 1;
 };
 
+template <typename T>
+EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+constexpr T divup(const T x, const T y) {
+  return Eigen::numext::div_ceil(x, y);
+}
 
 // Default packet types
 template <typename Scalar, typename Device>
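For downstream code, the practical effect of this hunk is that the old spelling keeps compiling (now with a deprecation warning) while forwarding to the consolidated implementation. A hedged usage sketch, assuming an Eigen checkout that contains this commit and the unsupported Tensor module on the include path:

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  const Eigen::Index n = 1000, block = 256;
  // Old spelling: still available, but deprecated after this commit.
  const Eigen::Index a = Eigen::divup(n, block);
  // New spelling: the single consolidated implementation.
  const Eigen::Index b = Eigen::numext::div_ceil(n, block);
  std::cout << a << " " << b << "\n";  // both print 4
  return 0;
}
```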
@@ -414,7 +414,7 @@ struct FullReductionLauncher<
     typedef typename Self::Index Index;
     const int block_size = 256;
     const int num_per_thread = 128;
-    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
 
     unsigned int* semaphore = NULL;
     if (num_blocks > 1) {
@@ -441,7 +441,7 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> {
 
     const int block_size = 256;
     const int num_per_thread = 128;
-    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
     half* scratch = static_cast<half*>(device.scratchpad());
 
     if (num_blocks > 1) {
@@ -507,7 +507,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reduce
   const int unroll_times = 16;
   eigen_assert(NumPerThread % unroll_times == 0);
 
-  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
+  const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
   const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
 
   const Index num_threads = blockDim.x * gridDim.x;
@@ -593,8 +593,8 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
   eigen_assert(NumPerThread % unroll_times == 0);
   eigen_assert(unroll_times % 2 == 0);
 
-  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
-  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
+  const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
+  const Index num_input_blocks = numext::div_ceil<Index>(input_col_blocks * num_preserved_coeffs, 2);
 
   const Index num_threads = blockDim.x * gridDim.x;
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
@@ -785,7 +785,7 @@ struct InnerReductionLauncher<
     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
     const int block_size = 256;
     const int num_per_thread = 128;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
     const int max_blocks = device.getNumGpuMultiProcessors() *
                            device.maxGpuThreadsPerMultiProcessor() / block_size;
     const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
@@ -793,7 +793,7 @@ struct InnerReductionLauncher<
     if (num_blocks > 1) {
       // We initialize the outputs outside the reduction kernel when we can't be sure that there
       // won't be a race conditions between multiple thread blocks.
-      const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
+      const int dyn_blocks2 = numext::div_ceil<int>(num_preserved_vals, 1024);
       const int max_blocks2 = device.getNumGpuMultiProcessors() *
                               device.maxGpuThreadsPerMultiProcessor() / 1024;
       const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
@@ -831,7 +831,7 @@ struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
     const int block_size = /*256*/128;
     const int num_per_thread = /*128*/64;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
     const int max_blocks = device.getNumGpuMultiProcessors() *
                            device.maxGpuThreadsPerMultiProcessor() / block_size;
     const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
@@ -900,7 +900,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reduce
   }
 
   // Do the reduction.
-  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
+  const Index max_iter = num_preserved_coeffs * numext::div_ceil<Index>(num_coeffs_to_reduce, NumPerThread);
   for (Index i = thread_id; i < max_iter; i += num_threads) {
     const Index input_col = i % num_preserved_coeffs;
     const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
@@ -953,7 +953,7 @@ struct OuterReducer<Self, Op, GpuDevice> {
     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
     const int block_size = 256;
     const int num_per_thread = 16;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
     const int max_blocks = device.getNumGpuMultiProcessors() *
                            device.maxGpuThreadsPerMultiProcessor() / block_size;
     const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
@@ -961,7 +961,7 @@ struct OuterReducer<Self, Op, GpuDevice> {
     if (num_blocks > 1) {
      // We initialize the outputs in the reduction kernel itself when we don't have to worry
       // about race conditions between multiple thread blocks.
-      const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
+      const int dyn_blocks2 = numext::div_ceil<int>(num_preserved_vals, 1024);
       const int max_blocks2 = device.getNumGpuMultiProcessors() *
                               device.maxGpuThreadsPerMultiProcessor() / 1024;
       const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
@@ -215,7 +215,7 @@ EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
   EIGEN_CONSTEXPR Index kBlockAlignment = 128;
   const Index items_per_cacheline =
       numext::maxi<Index>(1, kBlockAlignment / item_size);
-  return items_per_cacheline * divup(block_size, items_per_cacheline);
+  return items_per_cacheline * numext::div_ceil(block_size, items_per_cacheline);
 }
 
 template <typename Self>
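AdjustBlockSize above rounds a block size up to a whole number of cache-line-sized groups of items. A worked sketch with the 128-byte kBlockAlignment from the hunk and assumed item sizes (`adjust_block_size` is a stand-alone illustration, not the Eigen function):

```cpp
#include <algorithm>
#include <iostream>

// Round block_size up to a multiple of the items that fit in one 128-byte chunk.
static long adjust_block_size(long item_size, long block_size) {
  const long kBlockAlignment = 128;
  const long items_per_cacheline = std::max(1L, kBlockAlignment / item_size);
  const long groups = block_size == 0 ? 0 : (block_size - 1) / items_per_cacheline + 1;
  return items_per_cacheline * groups;  // div_ceil(block_size, items_per_cacheline) * items_per_cacheline
}

int main() {
  std::cout << adjust_block_size(4, 1000) << "\n";  // 4-byte items: 32 per chunk -> 1024
  std::cout << adjust_block_size(8, 1000) << "\n";  // 8-byte items: 16 per chunk -> 1008
  return 0;
}
```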