mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-12 03:39:01 +08:00
Add a simple cost model to prevent Eigen's parallel GEMM from using too many threads when the inner dimension is small.
Timing for square matrices is unchanged, but both CPU and Wall time are significantly improved for skinny matrices. The benchmarks below are for multiplying NxK * KxN matrices with test names of the form BM_OuterishProd/N/K.

Improvements in Wall time:

Run on [redacted] (12 X 3501 MHz CPUs); 2016-10-05T17:40:02.462497196-07:00
CPU: Intel Haswell with HyperThreading (6 cores) dL1:32KB dL2:256KB dL3:15MB
Benchmark                   Base (ns)    New (ns)  Improvement
------------------------------------------------------------------
BM_OuterishProd/64/1             3088        1610       +47.9%
BM_OuterishProd/64/4             3562        2414       +32.2%
BM_OuterishProd/64/32            8861        7815       +11.8%
BM_OuterishProd/128/1           11363        6504       +42.8%
BM_OuterishProd/128/4           11128        9794       +12.0%
BM_OuterishProd/128/64          27691       27396        +1.1%
BM_OuterishProd/256/1           33214       28123       +15.3%
BM_OuterishProd/256/4           34312       36818        -7.3%
BM_OuterishProd/256/128        174866      176398        -0.9%
BM_OuterishProd/512/1         7963684      104224       +98.7%
BM_OuterishProd/512/4         7987913      112867       +98.6%
BM_OuterishProd/512/256       8198378     1306500       +84.1%
BM_OuterishProd/1k/1          7356256      324432       +95.6%
BM_OuterishProd/1k/4          8129616      331621       +95.9%
BM_OuterishProd/1k/512       27265418     7517538       +72.4%

Improvements in CPU time:

Run on [redacted] (12 X 3501 MHz CPUs); 2016-10-05T17:40:02.462497196-07:00
CPU: Intel Haswell with HyperThreading (6 cores) dL1:32KB dL2:256KB dL3:15MB
Benchmark                   Base (ns)    New (ns)  Improvement
------------------------------------------------------------------
BM_OuterishProd/64/1             6169        1608       +73.9%
BM_OuterishProd/64/4             7117        2412       +66.1%
BM_OuterishProd/64/32           17702       15616       +11.8%
BM_OuterishProd/128/1           45415        6498       +85.7%
BM_OuterishProd/128/4           44459        9786       +78.0%
BM_OuterishProd/128/64         110657      109489        +1.1%
BM_OuterishProd/256/1          265158       28101       +89.4%
BM_OuterishProd/256/4          274234      183885       +32.9%
BM_OuterishProd/256/128       1397160     1408776        -0.8%
BM_OuterishProd/512/1        78947048      520703       +99.3%
BM_OuterishProd/512/4        86955578     1349742       +98.4%
BM_OuterishProd/512/256      74701613    15584661       +79.1%
BM_OuterishProd/1k/1         78352601     3877911       +95.1%
BM_OuterishProd/1k/4         78521643     3966221       +94.9%
BM_OuterishProd/1k/512      258104736    89480530       +65.3%
This commit is contained in:
parent
80b5133789
commit
48c635e223
@ -10,7 +10,7 @@
|
|||||||
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
|
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
|
||||||
#define EIGEN_GENERAL_MATRIX_MATRIX_H
|
#define EIGEN_GENERAL_MATRIX_MATRIX_H
|
||||||
|
|
||||||
namespace Eigen {
|
namespace Eigen {
|
||||||
|
|
||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
@ -24,7 +24,7 @@ template<
|
|||||||
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
|
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
|
||||||
{
|
{
|
||||||
typedef gebp_traits<RhsScalar,LhsScalar> Traits;
|
typedef gebp_traits<RhsScalar,LhsScalar> Traits;
|
||||||
|
|
||||||
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||||
static EIGEN_STRONG_INLINE void run(
|
static EIGEN_STRONG_INLINE void run(
|
||||||
Index rows, Index cols, Index depth,
|
Index rows, Index cols, Index depth,
|
||||||
@ -54,7 +54,7 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh
|
|||||||
{
|
{
|
||||||
|
|
||||||
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
||||||
|
|
||||||
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||||
static void run(Index rows, Index cols, Index depth,
|
static void run(Index rows, Index cols, Index depth,
|
||||||
const LhsScalar* _lhs, Index lhsStride,
|
const LhsScalar* _lhs, Index lhsStride,
|
||||||
@ -85,13 +85,13 @@ static void run(Index rows, Index cols, Index depth,
|
|||||||
// this is the parallel version!
|
// this is the parallel version!
|
||||||
Index tid = omp_get_thread_num();
|
Index tid = omp_get_thread_num();
|
||||||
Index threads = omp_get_num_threads();
|
Index threads = omp_get_num_threads();
|
||||||
|
|
||||||
LhsScalar* blockA = blocking.blockA();
|
LhsScalar* blockA = blocking.blockA();
|
||||||
eigen_internal_assert(blockA!=0);
|
eigen_internal_assert(blockA!=0);
|
||||||
|
|
||||||
std::size_t sizeB = kc*nc;
|
std::size_t sizeB = kc*nc;
|
||||||
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0);
|
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0);
|
||||||
|
|
||||||
// For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
|
// For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
|
||||||
for(Index k=0; k<depth; k+=kc)
|
for(Index k=0; k<depth; k+=kc)
|
||||||
{
|
{
|
||||||
@ -114,7 +114,7 @@ static void run(Index rows, Index cols, Index depth,
|
|||||||
|
|
||||||
// Notify the other threads that the part A'_i is ready to go.
|
// Notify the other threads that the part A'_i is ready to go.
|
||||||
info[tid].sync = k;
|
info[tid].sync = k;
|
||||||
|
|
||||||
// Computes C_i += A' * B' per A'_i
|
// Computes C_i += A' * B' per A'_i
|
||||||
for(Index shift=0; shift<threads; ++shift)
|
for(Index shift=0; shift<threads; ++shift)
|
||||||
{
|
{
|
||||||
@ -161,7 +161,7 @@ static void run(Index rows, Index cols, Index depth,
|
|||||||
|
|
||||||
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
|
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
|
||||||
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
|
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
|
||||||
|
|
||||||
const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols;
|
const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols;
|
||||||
|
|
||||||
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
|
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
|
||||||
@ -172,24 +172,24 @@ static void run(Index rows, Index cols, Index depth,
|
|||||||
for(Index k2=0; k2<depth; k2+=kc)
|
for(Index k2=0; k2<depth; k2+=kc)
|
||||||
{
|
{
|
||||||
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
|
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
|
||||||
|
|
||||||
// OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
|
// OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
|
||||||
// => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
|
// => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
|
||||||
// Note that this panel will be read as many times as the number of blocks in the rhs's
|
// Note that this panel will be read as many times as the number of blocks in the rhs's
|
||||||
// horizontal panel which is, in practice, a very low number.
|
// horizontal panel which is, in practice, a very low number.
|
||||||
pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc);
|
pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc);
|
||||||
|
|
||||||
// For each kc x nc block of the rhs's horizontal panel...
|
// For each kc x nc block of the rhs's horizontal panel...
|
||||||
for(Index j2=0; j2<cols; j2+=nc)
|
for(Index j2=0; j2<cols; j2+=nc)
|
||||||
{
|
{
|
||||||
const Index actual_nc = (std::min)(j2+nc,cols)-j2;
|
const Index actual_nc = (std::min)(j2+nc,cols)-j2;
|
||||||
|
|
||||||
// We pack the rhs's block into a sequential chunk of memory (L2 caching)
|
// We pack the rhs's block into a sequential chunk of memory (L2 caching)
|
||||||
// Note that this block will be read a very high number of times, which is equal to the number of
|
// Note that this block will be read a very high number of times, which is equal to the number of
|
||||||
// micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
|
// micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
|
||||||
if((!pack_rhs_once) || i2==0)
|
if((!pack_rhs_once) || i2==0)
|
||||||
pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
|
pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
|
||||||
|
|
||||||
// Everything is packed, we can now call the panel * block kernel:
|
// Everything is packed, we can now call the panel * block kernel:
|
||||||
gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
|
gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
|
||||||
}
|
}
|
||||||
@ -229,7 +229,7 @@ struct gemm_functor
|
|||||||
(Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
|
(Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
|
||||||
m_actualAlpha, m_blocking, info);
|
m_actualAlpha, m_blocking, info);
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef typename Gemm::Traits Traits;
|
typedef typename Gemm::Traits Traits;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
@ -313,7 +313,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
|
|||||||
this->m_blockB = reinterpret_cast<RhsScalar*>((internal::UIntPtr(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
|
this->m_blockB = reinterpret_cast<RhsScalar*>((internal::UIntPtr(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void initParallel(Index, Index, Index, Index)
|
void initParallel(Index, Index, Index, Index)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
@ -359,14 +359,14 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
|
|||||||
m_sizeA = this->m_mc * this->m_kc;
|
m_sizeA = this->m_mc * this->m_kc;
|
||||||
m_sizeB = this->m_kc * this->m_nc;
|
m_sizeB = this->m_kc * this->m_nc;
|
||||||
}
|
}
|
||||||
|
|
||||||
void initParallel(Index rows, Index cols, Index depth, Index num_threads)
|
void initParallel(Index rows, Index cols, Index depth, Index num_threads)
|
||||||
{
|
{
|
||||||
this->m_mc = Transpose ? cols : rows;
|
this->m_mc = Transpose ? cols : rows;
|
||||||
this->m_nc = Transpose ? rows : cols;
|
this->m_nc = Transpose ? rows : cols;
|
||||||
this->m_kc = depth;
|
this->m_kc = depth;
|
||||||
|
|
||||||
eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0);
|
eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0);
|
||||||
Index m = this->m_mc;
|
Index m = this->m_mc;
|
||||||
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc, num_threads);
|
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc, num_threads);
|
||||||
m_sizeA = this->m_mc * this->m_kc;
|
m_sizeA = this->m_mc * this->m_kc;
|
||||||
@ -401,7 +401,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
|
|||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
template<typename Lhs, typename Rhs>
|
template<typename Lhs, typename Rhs>
|
||||||
struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
||||||
: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> >
|
: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> >
|
||||||
@ -409,21 +409,21 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
|||||||
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
|
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
|
||||||
typedef typename Lhs::Scalar LhsScalar;
|
typedef typename Lhs::Scalar LhsScalar;
|
||||||
typedef typename Rhs::Scalar RhsScalar;
|
typedef typename Rhs::Scalar RhsScalar;
|
||||||
|
|
||||||
typedef internal::blas_traits<Lhs> LhsBlasTraits;
|
typedef internal::blas_traits<Lhs> LhsBlasTraits;
|
||||||
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
|
typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
|
||||||
typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
|
typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
|
||||||
|
|
||||||
typedef internal::blas_traits<Rhs> RhsBlasTraits;
|
typedef internal::blas_traits<Rhs> RhsBlasTraits;
|
||||||
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
|
typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
|
||||||
typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
|
typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
|
MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> lazyproduct;
|
typedef generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> lazyproduct;
|
||||||
|
|
||||||
template<typename Dst>
|
template<typename Dst>
|
||||||
static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||||
{
|
{
|
||||||
@ -453,7 +453,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
|||||||
else
|
else
|
||||||
scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
|
scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Dest>
|
template<typename Dest>
|
||||||
static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha)
|
static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha)
|
||||||
{
|
{
|
||||||
@ -481,7 +481,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
|||||||
|
|
||||||
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
|
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
|
||||||
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
|
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
|
||||||
(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), Dest::Flags&RowMajorBit);
|
(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -10,7 +10,7 @@
|
|||||||
#ifndef EIGEN_PARALLELIZER_H
|
#ifndef EIGEN_PARALLELIZER_H
|
||||||
#define EIGEN_PARALLELIZER_H
|
#define EIGEN_PARALLELIZER_H
|
||||||
|
|
||||||
namespace Eigen {
|
namespace Eigen {
|
||||||
|
|
||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ template<typename Index> struct GemmParallelInfo
|
|||||||
};
|
};
|
||||||
|
|
||||||
template<bool Condition, typename Functor, typename Index>
|
template<bool Condition, typename Functor, typename Index>
|
||||||
void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose)
|
void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose)
|
||||||
{
|
{
|
||||||
// TODO when EIGEN_USE_BLAS is defined,
|
// TODO when EIGEN_USE_BLAS is defined,
|
||||||
// we should still enable OMP for other scalar types
|
// we should still enable OMP for other scalar types
|
||||||
@ -92,6 +92,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
|
|||||||
// the matrix product when multithreading is enabled. This is a temporary
|
// the matrix product when multithreading is enabled. This is a temporary
|
||||||
// fix to support row-major destination matrices. This whole
|
// fix to support row-major destination matrices. This whole
|
||||||
// parallelizer mechanism has to be redesigned anyway.
|
// parallelizer mechanism has to be redesigned anyway.
|
||||||
|
EIGEN_UNUSED_VARIABLE(depth);
|
||||||
EIGEN_UNUSED_VARIABLE(transpose);
|
EIGEN_UNUSED_VARIABLE(transpose);
|
||||||
func(0,rows, 0,cols);
|
func(0,rows, 0,cols);
|
||||||
#else
|
#else
|
||||||
@ -106,6 +107,12 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
|
|||||||
// FIXME this has to be fine tuned
|
// FIXME this has to be fine tuned
|
||||||
Index size = transpose ? rows : cols;
|
Index size = transpose ? rows : cols;
|
||||||
Index pb_max_threads = std::max<Index>(1,size / 32);
|
Index pb_max_threads = std::max<Index>(1,size / 32);
|
||||||
|
// compute the maximal number of threads from the total amount of work:
|
||||||
|
double work = static_cast<double>(rows) * static_cast<double>(cols) *
|
||||||
|
static_cast<double>(depth);
|
||||||
|
double kMinTaskSize = 50000; // Heuristic.
|
||||||
|
max_threads = std::max<Index>(1, std::min<Index>(max_threads, work / kMinTaskSize));
|
||||||
|
|
||||||
// compute the number of threads we are going to use
|
// compute the number of threads we are going to use
|
||||||
Index threads = std::min<Index>(nbThreads(), pb_max_threads);
|
Index threads = std::min<Index>(nbThreads(), pb_max_threads);
|
||||||
|
|
||||||
@ -120,19 +127,19 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
|
|||||||
|
|
||||||
if(transpose)
|
if(transpose)
|
||||||
std::swap(rows,cols);
|
std::swap(rows,cols);
|
||||||
|
|
||||||
ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);
|
ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);
|
||||||
|
|
||||||
#pragma omp parallel num_threads(threads)
|
#pragma omp parallel num_threads(threads)
|
||||||
{
|
{
|
||||||
Index i = omp_get_thread_num();
|
Index i = omp_get_thread_num();
|
||||||
// Note that the actual number of threads might be lower than the number of requested ones.
|
// Note that the actual number of threads might be lower than the number of requested ones.
|
||||||
Index actual_threads = omp_get_num_threads();
|
Index actual_threads = omp_get_num_threads();
|
||||||
|
|
||||||
Index blockCols = (cols / actual_threads) & ~Index(0x3);
|
Index blockCols = (cols / actual_threads) & ~Index(0x3);
|
||||||
Index blockRows = (rows / actual_threads);
|
Index blockRows = (rows / actual_threads);
|
||||||
blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
|
blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
|
||||||
|
|
||||||
Index r0 = i*blockRows;
|
Index r0 = i*blockRows;
|
||||||
Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows;
|
Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows;
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user