fix computation of blocking sizes for small triangular matrices

This commit is contained in:
Gael Guennebaud 2010-06-24 11:50:28 +02:00
parent 0068d3ccf6
commit d44fce501b
3 changed files with 12 additions and 14 deletions

View File

@ -101,7 +101,7 @@ inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
* - the number of scalars that fit into a packet (when vectorization is enabled). * - the number of scalars that fit into a packet (when vectorization is enabled).
* *
* \sa setCpuCacheSizes */ * \sa setCpuCacheSizes */
template<typename LhsScalar, typename RhsScalar> template<typename LhsScalar, typename RhsScalar, int KcFactor>
void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n) void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
{ {
// Explanations: // Explanations:
@ -114,7 +114,7 @@ void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrd
std::ptrdiff_t l1, l2; std::ptrdiff_t l1, l2;
enum { enum {
kdiv = 2 * ei_product_blocking_traits<RhsScalar>::nr kdiv = KcFactor * 2 * ei_product_blocking_traits<RhsScalar>::nr
* ei_packet_traits<RhsScalar>::size * sizeof(RhsScalar), * ei_packet_traits<RhsScalar>::size * sizeof(RhsScalar),
mr = ei_product_blocking_traits<LhsScalar>::mr, mr = ei_product_blocking_traits<LhsScalar>::mr,
mr_mask = (0xffffffff/mr)*mr mr_mask = (0xffffffff/mr)*mr
@ -127,6 +127,12 @@ void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrd
n = n; n = n;
} }
/** Convenience overload of computeProductBlockingSizes for callers that do not
  * specify a KcFactor: it forwards to the version taking an explicit KcFactor
  * template argument, using KcFactor = 1 (so the kdiv constant computed there
  * is not scaled).
  *
  * The arguments \a k, \a m and \a n are passed through by reference unchanged. */
template<typename LhsScalar, typename RhsScalar>
inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
{
// Three explicit template arguments select the KcFactor-parameterized version.
computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
}
#ifdef EIGEN_HAS_FUSE_CJMADD #ifdef EIGEN_HAS_FUSE_CJMADD
#define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C); #define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C);
#else #else

View File

@ -117,9 +117,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,true,
Index kc = depth; // cache block size along the K direction Index kc = depth; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction Index mc = rows; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction Index nc = cols; // cache block size along the N direction
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
// it is better to use smaller blocks along the diagonal
kc /= 4;
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
@ -245,9 +243,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index kc = depth; // cache block size along the K direction Index kc = depth; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction Index mc = rows; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction Index nc = cols; // cache block size along the N direction
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
// it is better to use smaller blocks along the diagonal
kc /= 4;
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;

View File

@ -66,9 +66,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStora
Index kc = size; // cache block size along the K direction Index kc = size; // cache block size along the K direction
Index mc = size; // cache block size along the M direction Index mc = size; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction Index nc = cols; // cache block size along the N direction
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
// it is better to use smaller blocks along the diagonal
kc /= 4;
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
@ -206,9 +204,7 @@ struct ei_triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStor
Index kc = size; // cache block size along the K direction Index kc = size; // cache block size along the K direction
Index mc = size; // cache block size along the M direction Index mc = size; // cache block size along the M direction
Index nc = rows; // cache block size along the N direction Index nc = rows; // cache block size along the N direction
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
// it is better to use smaller blocks along the diagonal
kc /= 4;
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*size; std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*size;