diff --git a/Eigen/src/LU/PartialLU.h b/Eigen/src/LU/PartialLU.h index 337438e2f..1cca1c7db 100644 --- a/Eigen/src/LU/PartialLU.h +++ b/Eigen/src/LU/PartialLU.h @@ -201,98 +201,153 @@ PartialLU::PartialLU(const MatrixType& matrix) compute(matrix); } -/** \internal performs the LU decomposition in place of the matrix \a lu. - * In addition, this function returns the row transpositions in the - * vector \a row_transpositions which must have a size equal to the number - * of columns of the matrix \a lu, and an integer \a nb_transpositions - * which returns the actual number of transpositions. - */ -template -void ei_lu_unblocked(MatrixType& lu, IntVector& row_transpositions, int& nb_transpositions) -{ - const int rows = lu.rows(); - const int size = std::min(lu.rows(),lu.cols()); - nb_transpositions = 0; - for(int k = 0; k < size; ++k) - { - int row_of_biggest_in_col; - lu.block(k,k,rows-k,1).cwise().abs().maxCoeff(&row_of_biggest_in_col); - row_of_biggest_in_col += k; - row_transpositions.coeffRef(k) = row_of_biggest_in_col; - - if(k != row_of_biggest_in_col) - { - lu.row(k).swap(lu.row(row_of_biggest_in_col)); - ++nb_transpositions; - } - - if(k -void ei_lu_blocked(MatrixType& lu, IntVector& row_transpositions, int& nb_transpositions) +template +struct ei_partial_lu_impl { - const int size = lu.rows(); - - // automatically adjust the number of subdivisions to the size - // of the matrix so that there is enough sub blocks: - int blockSize = size/8; - blockSize = (blockSize/16)*16; - blockSize = std::min(std::max(blockSize,8), 256); - // if the matrix is too small, no blocking: - if(size<32) - blockSize = size; - - nb_transpositions = 0; - for(int k = 0; k < size; k+=blockSize) - { - int bs = std::min(size-k,blockSize); - int ps = size - k; - int rs = size - k - bs; - // partition the matrix: - // A00 | A01 | A02 - // lu = A10 | A11 | A12 - // A20 | A21 | A22 - Block A_0(lu,0,0,size,k); - Block A11_21(lu,k,k,ps,bs); - Block A_2(lu,0,k+bs,size,rs); - Block A11(lu,k,k,bs,bs); - Block A12(lu,k,k+bs,bs,rs); - Block A21(lu,k+bs,k,rs,bs); - Block A22(lu,k+bs,k+bs,rs,rs); + // FIXME add a stride to Map, so that the following mapping becomes easier, + // another option would be to create an expression being able to automatically + // warp any Map, Matrix, and Block expressions as a unique type, but since that's exactly + // a Map + stride, why not adding a stride to Map, and convenient ctors from a Matrix, + // and Block. + typedef Map > MapLU; + typedef Block MatrixType; + typedef Block BlockType; - VectorBlock row_transpositions_in_panel(row_transpositions,k,bs); - int nb_transpositions_in_panel; - ei_lu_unblocked(A11_21, row_transpositions_in_panel, nb_transpositions_in_panel); - nb_transpositions_in_panel += nb_transpositions_in_panel; - - // update permutations and apply them to A10 - for(int i=k;i().solveInPlace(A12); + if(k != row_of_biggest_in_col) + { + lu.row(k).swap(lu.row(row_of_biggest_in_col)); + ++nb_transpositions; + } - A22 -= A21 * A12; + if(k > > + */ + static void blocked_lu(int rows, int cols, Scalar* lu_data, int luStride, int* row_transpositions, int& nb_transpositions, int maxBlockSize=256) + { + MapLU lu1(lu_data,StorageOrder==RowMajor?rows:luStride,StorageOrder==RowMajor?luStride:cols); + MatrixType lu(lu1,0,0,rows,cols); + + + const int size = std::min(rows,cols); + + // if the matrix is too small, no blocking: + if(size<=16) + { + unblocked_lu(lu, row_transpositions, nb_transpositions); + return; + } + + // automatically adjust the number of subdivisions to the size + // of the matrix so that there is enough sub blocks: + int blockSize; + { + blockSize = size/8; + blockSize = (blockSize/16)*16; + blockSize = std::min(std::max(blockSize,8), maxBlockSize); + } + + nb_transpositions = 0; + for(int k = 0; k < size; k+=blockSize) + { + int bs = std::min(size-k,blockSize); + int ps = size - k; + int rs = size - k - bs; + // partition the matrix: + // A00 | A01 | A02 + // lu = A10 | A11 | A12 + // A20 | A21 | A22 + BlockType A_0(lu,0,0,size,k); + BlockType A_2(lu,0,k+bs,size,rs); + BlockType A11(lu,k,k,bs,bs); + BlockType A12(lu,k,k+bs,bs,rs); + BlockType A21(lu,k+bs,k,rs,bs); + BlockType A22(lu,k+bs,k+bs,rs,rs); + + int nb_transpositions_in_panel; + // recursively calls the blocked LU algorithm with a very small + // blocking size: + blocked_lu(ps, bs, &lu.coeffRef(k,k), luStride, + row_transpositions+k, nb_transpositions_in_panel, 16); + nb_transpositions_in_panel += nb_transpositions_in_panel; + + // update permutations and apply them to A10 + for(int i=k;i().solveInPlace(A12); + + A22 -= A21 * A12; + } + } + } +}; + +/** \internal performs the LU decomposition with partial pivoting in-place. + */ +template +void ei_partial_lu_inplace(MatrixType& lu, IntVector& row_transpositions, int& nb_transpositions) +{ + ei_assert(lu.cols() == row_transpositions.size()); + ei_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1); + + ei_partial_lu_impl + + ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.stride(), &row_transpositions.coeffRef(0), nb_transpositions); } template @@ -307,7 +362,7 @@ void PartialLU::compute(const MatrixType& matrix) IntColVectorType rows_transpositions(size); int nb_transpositions; - ei_lu_blocked(m_lu, rows_transpositions, nb_transpositions); + ei_partial_lu_inplace(m_lu, rows_transpositions, nb_transpositions); m_det_p = (nb_transpositions%2) ? -1 : 1; for(int k = 0; k < size; ++k) m_p.coeffRef(k) = k; diff --git a/test/inverse.cpp b/test/inverse.cpp index b4eef73b6..65dfbc73e 100644 --- a/test/inverse.cpp +++ b/test/inverse.cpp @@ -81,13 +81,16 @@ template void inverse(const MatrixType& m) void test_inverse() { + int s; for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST( inverse(Matrix()) ); CALL_SUBTEST( inverse(Matrix2d()) ); CALL_SUBTEST( inverse(Matrix3f()) ); CALL_SUBTEST( inverse(Matrix4f()) ); - CALL_SUBTEST( inverse(MatrixXf(72,72)) ); - CALL_SUBTEST( inverse(MatrixXcd(56,56)) ); + s = ei_random(50,320); + CALL_SUBTEST( inverse(MatrixXf(s,s)) ); + s = ei_random(25,100); + CALL_SUBTEST( inverse(MatrixXcd(s,s)) ); } // test some tricky cases for 4x4 matrices