diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 3d801e9ed..26bfa40e2 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -1341,6 +1341,19 @@ double ceil(const double &x) { return ::ceil(x); }
 #endif
 
+// Integer division with rounding up.
+// T is assumed to be an integer type with a>=0, and b>0
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR
+T div_ceil(const T &a, const T &b)
+{
+  EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
+  eigen_assert(a >= 0);
+  eigen_assert(b > 0);
+  // Note: This form is used because it cannot overflow.
+  return a == 0 ? 0 : (a - 1) / b + 1;
+}
+
 /** Log base 2 for 32 bits positive integers.
   * Conveniently returns 0 for x==0. */
 inline int log2(int x)
diff --git a/Eigen/src/Core/arch/AVX512/GemmKernel.h b/Eigen/src/Core/arch/AVX512/GemmKernel.h
index 7220bfa9e..2df17040a 100644
--- a/Eigen/src/Core/arch/AVX512/GemmKernel.h
+++ b/Eigen/src/Core/arch/AVX512/GemmKernel.h
@@ -90,8 +90,6 @@ class gemm_class {
   const Index a_stride, b_stride;
   const Index a_off, b_off;
 
-  static EIGEN_ALWAYS_INLINE constexpr int div_up(int a, int b) { return a == 0 ? 0 : (a - 1) / b + 1; }
-
   EIGEN_ALWAYS_INLINE void prefetch_a(const Scalar *a_addr) {
     _mm_prefetch((char *)(a_prefetch_size + a_addr - a_shift), _MM_HINT_T0);
   }
@@ -479,7 +477,7 @@ class gemm_class {
    *
    *     const Scalar *cox = (idx == 0) ? co1 : co2;
    *
-   *     const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
+   *     const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
    *     scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg);
    *     write_c<0, um_vecs, idx, a_unroll>(cox);
    *
@@ -498,7 +496,7 @@ class gemm_class {
   EIGEN_ALWAYS_INLINE void c_update_1count(Scalar *&cox) {
     if (pow >= 4) cox += ldc;
 
-    const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
+    const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
     auto &alpha_reg = zmm[alpha_load_reg];
 
     scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg);
@@ -644,7 +642,7 @@ class gemm_class {
   template
   EIGEN_ALWAYS_INLINE void innerkernel_1uk(const Scalar *&aa, const Scalar *const &ao, const Scalar *const &bo,
                                            Scalar *&co2, int &fetchA_idx, int &fetchB_idx) {
-    const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
+    const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
 
     if (max_b_unroll >= 1)
       innerkernel_1pow(aa, ao, bo, co2, fetchA_idx, fetchB_idx);
@@ -729,7 +727,7 @@ class gemm_class {
   template
   EIGEN_ALWAYS_INLINE void kloop(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
-    const int um_vecs = div_up(a_unroll, nelems_in_cache_line);
+    const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
 
     if (!use_less_a_regs && k > 1)
       a_loads<0, 2, 0, um_vecs, a_unroll>(ao);
     else
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index fe6e5de86..8e4c278ff 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -422,15 +422,6 @@ template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b);
 
 using std::numeric_limits;
 
-// Integer division with rounding up.
-// T is assumed to be an integer type with a>=0, and b>0
-template<typename T>
-EIGEN_DEVICE_FUNC
-T div_ceil(const T &a, const T &b)
-{
-  return (a+b-1) / b;
-}
-
 // Handle integer comparisons of different signedness.
 template <typename X, typename Y, bool XIsInteger = NumTraits<X>::IsInteger, bool XIsSigned = NumTraits<X>::IsSigned,
           bool YIsInteger = NumTraits<Y>::IsInteger, bool YIsSigned = NumTraits<Y>::IsSigned>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
index aa460ba4f..4087a8bfe 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -443,7 +443,7 @@ class TensorBlockMapper {
       const int dim = isColMajor ? i : NumDims - i - 1;
       m_block_dimensions[dim] =
           numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
-      coeff_to_allocate = divup(
+      coeff_to_allocate = numext::div_ceil(
           coeff_to_allocate,
           numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
     }
@@ -474,7 +474,7 @@ class TensorBlockMapper {
         const IndexType total_size_other_dims =
             total_size / m_block_dimensions[dim];
         const IndexType alloc_avail =
-            divup(target_block_size, total_size_other_dims);
+            numext::div_ceil(target_block_size, total_size_other_dims);
         if (alloc_avail == m_block_dimensions[dim]) {
           // Insufficient excess coefficients to allocate.
           break;
         }
@@ -496,7 +496,7 @@ class TensorBlockMapper {
     // Calculate block counts by dimension and total block count.
     DSizes<IndexType, NumDims> block_count;
     for (int i = 0; i < NumDims; ++i) {
-      block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
+      block_count[i] = numext::div_ceil(m_tensor_dimensions[i], m_block_dimensions[i]);
     }
     m_total_block_count = array_prod(block_count);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 1cf5035b8..59e75e557 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -898,7 +898,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       // First multiple after a. This is b when <= bcast_dim_left_index +
       // bcast_dim_size.
       const Index first_multiple =
-          divup(bcast_dim_left_index, input_bcast_dim_size) *
+          numext::div_ceil(bcast_dim_left_index, input_bcast_dim_size) *
           input_bcast_dim_size;
 
       if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index f0520e8f7..ec34885f7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -144,8 +144,8 @@ struct TensorContractionBlockMemAllocator {
                                          const Index bn) {
     Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
     BlockSizes sz;
-    sz.lhs_size = divup(bm * bk * sizeof(LhsScalar), align) * align;
-    sz.rhs_size = divup(bn * bk * sizeof(RhsScalar), align) * align;
+    sz.lhs_size = numext::div_ceil(bm * bk * sizeof(LhsScalar), align) * align;
+    sz.rhs_size = numext::div_ceil(bn * bk * sizeof(RhsScalar), align) * align;
     return sz;
   }
 };
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 308c23bda..757680868 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -206,9 +206,9 @@ struct TensorEvaluator
-          num_blocks(divup(k, block_size)),
+          num_blocks(numext::div_ceil(k, block_size)),
           num_pending_blocks(internal::convert_index(num_blocks)),
-          l0_ranges(divup(num_blocks, l0_size)),
+          l0_ranges(numext::div_ceil(num_blocks, l0_size)),
           l0_state(l0_ranges),
           block_buffers(num_blocks) {
       // Keep count of pending gemm tasks for each l0 range.
@@ -1434,10 +1434,10 @@ struct TensorEvaluator
      const auto round_up = [=](Index index) -> Index {
        const Index kmultiple = packet_size <= 8 ? 8 : packet_size;
-        return divup(index, kmultiple) * kmultiple;
+        return numext::div_ceil(index, kmultiple) * kmultiple;
      };
 
-      const Index target_block_size = round_up(divup(k, num_threads));
+      const Index target_block_size = round_up(numext::div_ceil(k, num_threads));
       const Index desired_min_block_size = 12 * packet_size;
 
       return numext::mini(
@@ -1485,19 +1485,19 @@ struct TensorEvaluator
        if (gm1 > nm0) break;
        // Check the candidate.
        int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads, shard_by_col);
        if (res < 0) break;
-        nm1 = divup(nm0, gm1);
+        nm1 = numext::div_ceil(nm0, gm1);
        if (res == 0) continue;
        // Commit new grain size.
        gm = gm1;
@@ -1509,15 +1509,15 @@ struct TensorEvaluator
        if (gn1 > nn0) break;
        int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads, shard_by_col);
        if (res < 0) break;
-        nn1 = divup(nn0, gn1);
+        nn1 = numext::div_ceil(nn0, gn1);
        if (res == 0) continue;
        gn = gn1;
      }
@@ -1544,14 +1544,14 @@ struct TensorEvaluator
      double new_parallelism = static_cast<double>(new_tasks) /
-          (divup(new_tasks, num_threads) * num_threads);
-      Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn);
+          (numext::div_ceil(new_tasks, num_threads) * num_threads);
+      Index old_tasks = numext::div_ceil(nm0, oldgm) * numext::div_ceil(nn0, oldgn);
      double old_parallelism = static_cast<double>(old_tasks) /
-          (divup(old_tasks, num_threads) * num_threads);
+          (numext::div_ceil(old_tasks, num_threads) * num_threads);
      if (new_parallelism > old_parallelism || new_parallelism == 1) return 1;
      return 0;
    }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 6c7ad670f..53b66c02e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -223,7 +223,7 @@ struct ThreadPoolDevice {
                                        Index lastIdx) {
      while (lastIdx - firstIdx > block.size) {
        // Split into halves and schedule the second half on a different thread.
-        const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
+        const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, block.size) * block.size;
        pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); });
        lastIdx = midIdx;
      }
@@ -282,7 +282,7 @@ struct ThreadPoolDevice {
    ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) {
      while (lastIdx - firstIdx > block.size) {
        // Split into halves and schedule the second half on a different thread.
-        const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
+        const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, block.size) * block.size;
        pool_->Schedule(
            [ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); });
        lastIdx = midIdx;
@@ -357,7 +357,7 @@ struct ThreadPoolDevice {
    const Index max_oversharding_factor = 4;
    Index block_size = numext::mini(
        n, numext::maxi(
-               divup(n, max_oversharding_factor * numThreads()),
+               numext::div_ceil(n, max_oversharding_factor * numThreads()),
               block_size_f));
    const Index max_block_size = numext::mini(n, 2 * block_size);
 
@@ -367,13 +367,13 @@ struct ThreadPoolDevice {
      block_size = numext::mini(n, new_block_size);
    }
 
-    Index block_count = divup(n, block_size);
+    Index block_count = numext::div_ceil(n, block_size);
 
    // Calculate parallel efficiency as fraction of total CPU time used for
    // computations:
    double max_efficiency =
        static_cast<double>(block_count) /
-        (divup(block_count, numThreads()) * numThreads());
+        (numext::div_ceil(block_count, numThreads()) * numThreads());
 
    // Now try to increase block size up to max_block_size as long as it
    // doesn't decrease parallel efficiency.
@@ -381,7 +381,7 @@ struct ThreadPoolDevice {
         max_efficiency < 1.0 && prev_block_count > 1;) {
      // This is the next block size that divides size into a smaller number
      // of blocks than the current block_size.
-      Index coarser_block_size = divup(n, prev_block_count - 1);
+      Index coarser_block_size = numext::div_ceil(n, prev_block_count - 1);
      if (block_align) {
        Index new_block_size = block_align(coarser_block_size);
        eigen_assert(new_block_size >= coarser_block_size);
@@ -391,12 +391,12 @@ struct ThreadPoolDevice {
        break;  // Reached max block size. Stop.
      }
      // Recalculate parallel efficiency.
-      const Index coarser_block_count = divup(n, coarser_block_size);
+      const Index coarser_block_count = numext::div_ceil(n, coarser_block_size);
      eigen_assert(coarser_block_count < prev_block_count);
      prev_block_count = coarser_block_count;
      const double coarser_efficiency =
          static_cast<double>(coarser_block_count) /
-          (divup(coarser_block_count, numThreads()) * numThreads());
+          (numext::div_ceil(coarser_block_count, numThreads()) * numThreads());
      if (coarser_efficiency + 0.01 >= max_efficiency) {
        // Taking it.
        block_size = coarser_block_size;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 461abe44e..4eebbe70f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -261,7 +261,7 @@ TensorExecutorTilingContext GetTensorExecutorTilingContext(
  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
  const size_t aligned_blocksize =
      align *
-      divup(block_size * sizeof(typename Evaluator::Scalar), align);
+      numext::div_ceil(block_size * sizeof(typename Evaluator::Scalar), align);
 
  return {block_mapper, requirements.cost_per_coeff * block_size,
          aligned_blocksize};
@@ -661,7 +661,7 @@ EIGEN_STRONG_INLINE void TensorExecutor
-    const int num_blocks = numext::maxi(numext::mini(max_blocks, divup(size, block_size)), 1);
+    const int num_blocks = numext::maxi(numext::mini(max_blocks, numext::div_ceil(size, block_size)), 1);
 
    LAUNCH_GPU_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index b7c2cb856..524432e6a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -27,21 +27,6 @@ const T2& choose(Cond, const T1&, const T2& second) {
  return second;
 }
 
-template <typename T, typename X, typename Y>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T divup(const X x, const Y y) {
-  // Note: This form is used because it cannot overflow.
-  return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
-}
-
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T divup(const T x, const T y) {
-  // Note: This form is used because it cannot overflow.
-  return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
-}
-
 template <typename T, size_t n> struct max_n_1 {
  static const size_t size = n;
 };
@@ -49,6 +34,11 @@ template <> struct max_n_1<0> {
  static const size_t size = 1;
 };
 
+template <typename T>
+EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+constexpr T divup(const T x, const T y) {
+  return Eigen::numext::div_ceil(x, y);
+}
 
 // Default packet types
 template
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
index aee86fd0f..7348a7188 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -414,7 +414,7 @@ struct FullReductionLauncher<
    typedef typename Self::Index Index;
    const int block_size = 256;
    const int num_per_thread = 128;
-    const int num_blocks = divup(num_coeffs, block_size * num_per_thread);
+    const int num_blocks = numext::div_ceil(num_coeffs, block_size * num_per_thread);
 
    unsigned int* semaphore = NULL;
    if (num_blocks > 1) {
@@ -441,7 +441,7 @@ struct FullReductionLauncher {
    const int block_size = 256;
    const int num_per_thread = 128;
-    const int num_blocks = divup(num_coeffs, block_size * num_per_thread);
+    const int num_blocks = numext::div_ceil(num_coeffs, block_size * num_per_thread);
    half* scratch = static_cast<half*>(device.scratchpad());
 
    if (num_blocks > 1) {
@@ -507,7 +507,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reduce
  const int unroll_times = 16;
  eigen_assert(NumPerThread % unroll_times == 0);
 
-  const Index input_col_blocks = divup(num_coeffs_to_reduce, blockDim.x * NumPerThread);
+  const Index input_col_blocks = numext::div_ceil(num_coeffs_to_reduce, blockDim.x * NumPerThread);
  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
 
  const Index num_threads = blockDim.x * gridDim.x;
@@ -593,8 +593,8 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
  eigen_assert(NumPerThread % unroll_times == 0);
  eigen_assert(unroll_times % 2 == 0);
 
-  const Index input_col_blocks = divup(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
-  const Index num_input_blocks = divup(input_col_blocks * num_preserved_coeffs, 2);
+  const Index input_col_blocks = numext::div_ceil(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
+  const Index num_input_blocks = numext::div_ceil(input_col_blocks * num_preserved_coeffs, 2);
 
  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
@@ -785,7 +785,7 @@ struct InnerReductionLauncher<
    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 128;
-    const int dyn_blocks = divup(num_coeffs, block_size * num_per_thread);
+    const int dyn_blocks = numext::div_ceil(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini(max_blocks, dyn_blocks);
@@ -793,7 +793,7 @@ struct InnerReductionLauncher<
    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be a race conditions between multiple thread blocks.
-      const int dyn_blocks2 = divup(num_preserved_vals, 1024);
+      const int dyn_blocks2 = numext::div_ceil(num_preserved_vals, 1024);
      const int max_blocks2 = device.getNumGpuMultiProcessors() *
                              device.maxGpuThreadsPerMultiProcessor() / 1024;
      const int num_blocks2 = numext::mini(max_blocks2, dyn_blocks2);
@@ -831,7 +831,7 @@ struct InnerReductionLauncher {
    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = /*256*/128;
    const int num_per_thread = /*128*/64;
-    const int dyn_blocks = divup(num_coeffs, block_size * num_per_thread);
+    const int dyn_blocks = numext::div_ceil(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini(max_blocks, dyn_blocks);
@@ -900,7 +900,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reduce
  }
 
  // Do the reduction.
-  const Index max_iter = num_preserved_coeffs * divup(num_coeffs_to_reduce, NumPerThread);
+  const Index max_iter = num_preserved_coeffs * numext::div_ceil(num_coeffs_to_reduce, NumPerThread);
  for (Index i = thread_id; i < max_iter; i += num_threads) {
    const Index input_col = i % num_preserved_coeffs;
    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
@@ -953,7 +953,7 @@ struct OuterReducer {
    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 16;
-    const int dyn_blocks = divup(num_coeffs, block_size * num_per_thread);
+    const int dyn_blocks = numext::div_ceil(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini(max_blocks, dyn_blocks);
@@ -961,7 +961,7 @@ struct OuterReducer {
    if (num_blocks > 1) {
      // We initialize the outputs in the reduction kernel itself when we don't have to worry
      // about race conditions between multiple thread blocks.
-      const int dyn_blocks2 = divup(num_preserved_vals, 1024);
+      const int dyn_blocks2 = numext::div_ceil(num_preserved_vals, 1024);
      const int max_blocks2 = device.getNumGpuMultiProcessors() *
                              device.maxGpuThreadsPerMultiProcessor() / 1024;
      const int num_blocks2 = numext::mini(max_blocks2, dyn_blocks2);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 169a7a2ae..4f4a93ed1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -215,7 +215,7 @@ EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
  EIGEN_CONSTEXPR Index kBlockAlignment = 128;
  const Index items_per_cacheline =
      numext::maxi(1, kBlockAlignment / item_size);
-  return items_per_cacheline * divup(block_size, items_per_cacheline);
+  return items_per_cacheline * numext::div_ceil(block_size, items_per_cacheline);
 }
 
 template
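
For reference only, and not part of the patch above: a minimal standalone sketch of the overflow note carried into numext::div_ceil ("This form is used because it cannot overflow"). The helper names div_ceil_safe and div_ceil_naive below are hypothetical stand-ins for the comparison; the patched code itself exposes Eigen::numext::div_ceil. The (a + b - 1) / b form used by the removed Meta.h helper wraps around when a is near the type's maximum, while the a == 0 ? 0 : (a - 1) / b + 1 form never computes an intermediate value larger than a.

// Illustration only, assuming the two ceiling-division forms shown in the patch.
#include <cstdint>
#include <iostream>
#include <limits>

// Hypothetical local copy of the form kept by the patch: never exceeds a, so it cannot overflow.
template <typename T>
constexpr T div_ceil_safe(T a, T b) {
  return a == 0 ? 0 : (a - 1) / b + 1;
}

// Hypothetical local copy of the removed form: a + b - 1 can wrap around for large a.
template <typename T>
constexpr T div_ceil_naive(T a, T b) {
  return (a + b - 1) / b;
}

int main() {
  const std::uint32_t a = std::numeric_limits<std::uint32_t>::max() - 1;  // 4294967294
  const std::uint32_t b = 256;
  std::cout << "safe : " << div_ceil_safe(a, b) << "\n";   // 16777216, the correct ceiling
  std::cout << "naive: " << div_ceil_naive(a, b) << "\n";  // 0, because a + b - 1 wraps around
  return 0;
}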