This commit is contained in:
Gael Guennebaud 2015-03-04 10:18:08 +01:00
commit 8fdcaded5e
10 changed files with 323 additions and 108 deletions

View File

@ -272,7 +272,7 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
}
//---------- double ----------
#if EIGEN_ARCH_ARM64
#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
static uint64x2_t p2ul_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x0, 0x8000000000000000);

View File

@ -55,6 +55,34 @@ struct functor_traits<scalar_abs_op<Scalar> >
};
};
/** \internal
* \brief Template functor to compute the score of a scalar, to chose a pivot
*
* \sa class CwiseUnaryOp
*/
template<typename Scalar> struct scalar_score_coeff_op : scalar_abs_op<Scalar>
{
typedef void Score_is_abs;
};
template<typename Scalar>
struct functor_traits<scalar_score_coeff_op<Scalar> > : functor_traits<scalar_abs_op<Scalar> > {};
/* Avoid recomputing abs when we know the score and they are the same. Not a true Eigen functor. */
template<typename Scalar, typename=void> struct abs_knowing_score
{
EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score)
typedef typename NumTraits<Scalar>::Real result_type;
template<typename Score>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a, const Score&) const { using std::abs; return abs(a); }
};
template<typename Scalar> struct abs_knowing_score<Scalar, typename scalar_score_coeff_op<Scalar>::Score_is_abs>
{
EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score)
typedef typename NumTraits<Scalar>::Real result_type;
template<typename Scal>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scal&, const result_type& a) const { return a; }
};
/** \internal
* \brief Template functor to compute the squared absolute value of a scalar
*

View File

@ -86,19 +86,21 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
EIGEN_UNUSED_VARIABLE(num_threads);
enum {
kr = 8,
mr = Traits::mr,
nr = Traits::nr
};
k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
if (k > kr) k -= k % kr;
m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
if (m > mr) m -= m % mr;
n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
if (n > nr) n -= n % nr;
return;
if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
EIGEN_UNUSED_VARIABLE(num_threads);
enum {
kr = 8,
mr = Traits::mr,
nr = Traits::nr
};
k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
if (k > kr) k -= k % kr;
m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
if (m > mr) m -= m % mr;
n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
if (n > nr) n -= n % nr;
return;
}
#endif
// Explanations:
@ -1479,17 +1481,17 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
for(Index k=0; k<peeled_kc; k+=pk)
{
EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
RhsPacket B_0;
#define EIGEN_GEBGP_ONESTEP(K) \
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
traits.madd(A0, B_0, C0, B_0); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
} while(false);
EIGEN_GEBGP_ONESTEP(0);
@ -1504,7 +1506,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
blB += pk*RhsProgress;
blA += pk*1*Traits::LhsProgress;
EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
}
// process remaining peeled loop

View File

@ -459,14 +459,16 @@ FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
// biggest coefficient in the remaining bottom-right corner (starting at row k, col k)
Index row_of_biggest_in_corner, col_of_biggest_in_corner;
RealScalar biggest_in_corner;
typedef internal::scalar_score_coeff_op<Scalar> Scoring;
typedef typename Scoring::result_type Score;
Score biggest_in_corner;
biggest_in_corner = m_lu.bottomRightCorner(rows-k, cols-k)
.cwiseAbs()
.unaryExpr(Scoring())
.maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
row_of_biggest_in_corner += k; // correct the values! since they were computed in the corner,
col_of_biggest_in_corner += k; // need to add k to them.
if(biggest_in_corner==RealScalar(0))
if(biggest_in_corner==Score(0))
{
// before exiting, make sure to initialize the still uninitialized transpositions
// in a sane state without destroying what we already have.
@ -479,7 +481,8 @@ FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
break;
}
if(biggest_in_corner > m_maxpivot) m_maxpivot = biggest_in_corner;
RealScalar abs_pivot = internal::abs_knowing_score<Scalar>()(m_lu(row_of_biggest_in_corner, col_of_biggest_in_corner), biggest_in_corner);
if(abs_pivot > m_maxpivot) m_maxpivot = abs_pivot;
// Now that we've found the pivot, we need to apply the row/col swaps to
// bring it to the location (k,k).

View File

@ -275,6 +275,8 @@ struct partial_lu_impl
*/
static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions)
{
typedef scalar_score_coeff_op<Scalar> Scoring;
typedef typename Scoring::result_type Score;
const Index rows = lu.rows();
const Index cols = lu.cols();
const Index size = (std::min)(rows,cols);
@ -286,13 +288,13 @@ struct partial_lu_impl
Index rcols = cols-k-1;
Index row_of_biggest_in_col;
RealScalar biggest_in_corner
= lu.col(k).tail(rows-k).cwiseAbs().maxCoeff(&row_of_biggest_in_col);
Score biggest_in_corner
= lu.col(k).tail(rows-k).unaryExpr(Scoring()).maxCoeff(&row_of_biggest_in_col);
row_of_biggest_in_col += k;
row_transpositions[k] = PivIndex(row_of_biggest_in_col);
if(biggest_in_corner != RealScalar(0))
if(biggest_in_corner != Score(0))
{
if(k != row_of_biggest_in_col)
{

View File

@ -443,13 +443,15 @@ FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(cons
for (Index k = 0; k < size; ++k)
{
Index row_of_biggest_in_corner, col_of_biggest_in_corner;
RealScalar biggest_in_corner;
typedef internal::scalar_score_coeff_op<Scalar> Scoring;
typedef typename Scoring::result_type Score;
biggest_in_corner = m_qr.bottomRightCorner(rows-k, cols-k)
.cwiseAbs()
.maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
Score score = m_qr.bottomRightCorner(rows-k, cols-k)
.unaryExpr(Scoring())
.maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
row_of_biggest_in_corner += k;
col_of_biggest_in_corner += k;
RealScalar biggest_in_corner = internal::abs_knowing_score<Scalar>()(m_qr(row_of_biggest_in_corner, col_of_biggest_in_corner), score);
if(k==0) biggest = biggest_in_corner;
// if the corner is negligible, then we have less than full rank, and we can finish early

View File

@ -1,3 +1,12 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include <iostream>
#include <cstdint>
#include <cstdlib>
@ -559,8 +568,8 @@ void show_usage_and_exit(int argc, char* argv[],
int main(int argc, char* argv[])
{
cout.precision(3);
cerr.precision(3);
cout.precision(4);
cerr.precision(4);
vector<unique_ptr<action_t>> available_actions;
available_actions.emplace_back(new partition_action_t);

View File

@ -1,11 +1,22 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include <iostream>
#include <cstdint>
#include <cstdlib>
#include <vector>
#include <fstream>
#include <memory>
bool eigen_use_specific_block_size;
int eigen_block_size_k, eigen_block_size_m, eigen_block_size_n;
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n
@ -82,16 +93,25 @@ struct benchmark_t
{
uint16_t compact_product_size;
uint16_t compact_block_size;
bool use_default_block_size;
float gflops;
benchmark_t()
: compact_product_size(0)
, compact_block_size(0)
, gflops(0)
, use_default_block_size(false)
{}
benchmark_t(size_t pk, size_t pm, size_t pn,
size_t bk, size_t bm, size_t bn)
: compact_product_size(compact_size_triple(pk, pm, pn))
, compact_block_size(compact_size_triple(bk, bm, bn))
, use_default_block_size(false)
, gflops(0)
{}
benchmark_t(size_t pk, size_t pm, size_t pn)
: compact_product_size(compact_size_triple(pk, pm, pn))
, compact_block_size(0)
, use_default_block_size(true)
, gflops(0)
{}
@ -100,10 +120,12 @@ struct benchmark_t
ostream& operator<<(ostream& s, const benchmark_t& b)
{
s << hex;
s << b.compact_product_size
<< " " << b.compact_block_size;
s << dec;
s << hex << b.compact_product_size << dec;
if (b.use_default_block_size) {
s << " default";
} else {
s << " " << hex << b.compact_block_size << dec;
}
s << " " << b.gflops;
return s;
}
@ -121,14 +143,18 @@ bool operator<(const benchmark_t& b1, const benchmark_t& b2)
void benchmark_t::run()
{
// expand our compact benchmark params into proper triples
size_triple_t productsizes(compact_product_size);
size_triple_t blocksizes(compact_block_size);
// feed eigen with our custom blocking params
eigen_block_size_k = blocksizes.k;
eigen_block_size_m = blocksizes.m;
eigen_block_size_n = blocksizes.n;
if (use_default_block_size) {
eigen_use_specific_block_size = false;
} else {
// feed eigen with our custom blocking params
eigen_use_specific_block_size = true;
size_triple_t blocksizes(compact_block_size);
eigen_block_size_k = blocksizes.k;
eigen_block_size_m = blocksizes.m;
eigen_block_size_n = blocksizes.n;
}
// set up the matrix pool
@ -231,9 +257,23 @@ string type_name<double>()
return "double";
}
void show_usage_and_exit(const char *progname)
struct action_t
{
cerr << "usage: " << progname << " [--min-working-set-size=N]" << endl << endl;
virtual const char* invokation_name() const { abort(); return nullptr; }
virtual void run() const { abort(); }
virtual ~action_t() {}
};
void show_usage_and_exit(int argc, char* argv[],
const vector<unique_ptr<action_t>>& available_actions)
{
cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl;
cerr << "available actions:" << endl << endl;
for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
cerr << " " << (*it)->invokation_name() << endl;
}
cerr << endl;
cerr << "options:" << endl << endl;
cerr << " --min-working-set-size=N:" << endl;
cerr << " Set the minimum working set size to N bytes." << endl;
cerr << " This is rounded up as needed to a multiple of matrix size." << endl;
@ -245,56 +285,8 @@ void show_usage_and_exit(const char *progname)
exit(1);
}
int main(int argc, char* argv[])
void run_benchmarks(vector<benchmark_t>& benchmarks)
{
for (int i = 1; i < argc; i++) {
if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
const char* equals_sign = strchr(argv[i], '=');
min_working_set_size = strtoul(equals_sign+1, nullptr, 10);
} else {
cerr << "unrecognized option: " << argv[i] << endl << endl;
show_usage_and_exit(argv[0]);
}
}
cout.precision(4);
print_cpuinfo();
cout << "benchmark parameters:" << endl;
cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl;
cout << "scalar type: " << type_name<MatrixType::Scalar>() << endl;
cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
cout << "minsize = " << minsize << endl;
cout << "maxsize = " << maxsize << endl;
cout << "measurement_repetitions = " << measurement_repetitions << endl;
cout << "min_accurate_time = " << min_accurate_time << endl;
cout << "min_working_set_size = " << min_working_set_size;
if (min_working_set_size == 0) {
cout << " (try to outsize caches)";
}
cout << endl << endl;
// assemble the array of benchmarks without running them at first
vector<benchmark_t> benchmarks;
for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
benchmark_t b(ksize, msize, nsize, kblock, mblock, nblock);
benchmarks.push_back(b);
}
}
}
}
}
}
}
// randomly shuffling benchmarks allows us to get accurate enough progress info,
// as now the cheap/expensive benchmarks are randomly mixed so they average out.
random_shuffle(benchmarks.begin(), benchmarks.end());
@ -315,14 +307,23 @@ int main(int argc, char* argv[])
if (i > 10) {
cerr << ", ETA ";
float eta = float(time_now - time_start) * (1.0f - ratio_done) / ratio_done;
if (eta > 3600)
cerr << eta/3600 << " hours";
else if (eta > 60)
cerr << eta/60 << " minutes";
else cerr << eta << " seconds";
int eta = int(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done);
int eta_remainder = eta;
if (eta_remainder > 3600) {
int hours = eta_remainder / 3600;
cerr << hours << " h ";
eta_remainder -= hours * 3600;
}
if (eta_remainder > 60) {
int minutes = eta_remainder / 60;
cerr << minutes << " min ";
eta_remainder -= minutes * 60;
}
if (eta < 600 && eta_remainder) {
cerr << eta_remainder << " s";
}
}
cerr << " \r" << flush;
cerr << " \r" << flush;
}
// This is where we actually run a benchmark!
@ -348,9 +349,116 @@ int main(int argc, char* argv[])
}
}
// Output data.
cout << "BEGIN MEASUREMENTS" << endl;
for (auto it = best_benchmarks.begin(); it != best_benchmarks.end(); ++it) {
cout << *it << endl;
}
// keep and return only the best benchmarks
benchmarks = best_benchmarks;
}
struct measure_all_pot_sizes_action_t : action_t
{
virtual const char* invokation_name() const { return "measure-all-pot-sizes"; }
virtual void run() const
{
vector<benchmark_t> benchmarks;
for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
for (size_t kblock = minsize; kblock <= ksize; kblock *= 2) {
for (size_t mblock = minsize; mblock <= msize; mblock *= 2) {
for (size_t nblock = minsize; nblock <= nsize; nblock *= 2) {
benchmarks.emplace_back(ksize, msize, nsize, kblock, mblock, nblock);
}
}
}
}
}
}
}
run_benchmarks(benchmarks);
cout << "BEGIN MEASUREMENTS ALL POT SIZES" << endl;
for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
cout << *it << endl;
}
}
};
struct measure_default_sizes_action_t : action_t
{
virtual const char* invokation_name() const { return "measure-default-sizes"; }
virtual void run() const
{
vector<benchmark_t> benchmarks;
for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
for (size_t msize = minsize; msize <= maxsize; msize *= 2) {
for (size_t nsize = minsize; nsize <= maxsize; nsize *= 2) {
benchmarks.emplace_back(ksize, msize, nsize);
}
}
}
}
run_benchmarks(benchmarks);
cout << "BEGIN MEASUREMENTS DEFAULT SIZES" << endl;
for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
cout << *it << endl;
}
}
};
int main(int argc, char* argv[])
{
cout.precision(4);
cerr.precision(4);
vector<unique_ptr<action_t>> available_actions;
available_actions.emplace_back(new measure_all_pot_sizes_action_t);
available_actions.emplace_back(new measure_default_sizes_action_t);
auto action = available_actions.end();
if (argc <= 1) {
show_usage_and_exit(argc, argv, available_actions);
}
for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
if (!strcmp(argv[1], (*it)->invokation_name())) {
action = it;
break;
}
}
if (action == available_actions.end()) {
show_usage_and_exit(argc, argv, available_actions);
}
for (int i = 2; i < argc; i++) {
if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
const char* equals_sign = strchr(argv[i], '=');
min_working_set_size = strtoul(equals_sign+1, nullptr, 10);
} else {
cerr << "unrecognized option: " << argv[i] << endl << endl;
show_usage_and_exit(argc, argv, available_actions);
}
}
print_cpuinfo();
cout << "benchmark parameters:" << endl;
cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl;
cout << "scalar type: " << type_name<MatrixType::Scalar>() << endl;
cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
cout << "minsize = " << minsize << endl;
cout << "maxsize = " << maxsize << endl;
cout << "measurement_repetitions = " << measurement_repetitions << endl;
cout << "min_accurate_time = " << min_accurate_time << endl;
cout << "min_working_set_size = " << min_working_set_size;
if (min_working_set_size == 0) {
cout << " (try to outsize caches)";
}
cout << endl << endl;
(*action)->run();
}

View File

@ -157,6 +157,67 @@ inline adouble abs2(const adouble& x) { return x*x; }
#endif // ADOLCSUPPORT_H
\endcode
This other example adds support for the \c mpq_class type from <a href="https://gmplib.org/">GMP</a>. It shows in particular how to change the way Eigen picks the best pivot during LU factorization. It selects the coefficient with the highest score, where the score is by default the absolute value of a number, but we can define a different score, for instance to prefer pivots with a more compact representation (this is an example, not a recommendation). Note that the scores should always be non-negative and only zero is allowed to have a score of zero. Also, this can interact badly with thresholds for inexact scalar types.
\code
#include <gmpxx.h>
#include <Eigen/Core>
#include <boost/operators.hpp>
namespace Eigen {
template<class> struct NumTraits;
template<> struct NumTraits<mpq_class>
{
typedef mpq_class Real;
typedef mpq_class NonInteger;
typedef mpq_class Nested;
static inline Real epsilon() { return 0; }
static inline Real dummy_precision() { return 0; }
enum {
IsInteger = 0,
IsSigned = 1,
IsComplex = 0,
RequireInitialization = 1,
ReadCost = 6,
AddCost = 150,
MulCost = 100
};
};
namespace internal {
template<>
struct significant_decimals_impl<mpq_class>
{
// Infinite precision when printing
static inline int run() { return 0; }
};
template<> struct scalar_score_coeff_op<mpq_class> {
struct result_type : boost::totally_ordered1<result_type> {
std::size_t len;
result_type(int i = 0) : len(i) {} // Eigen uses Score(0) and Score()
result_type(mpq_class const& q) :
len(mpz_size(q.get_num_mpz_t())+
mpz_size(q.get_den_mpz_t())-1) {}
friend bool operator<(result_type x, result_type y) {
// 0 is the worst possible pivot
if (x.len == 0) return y.len > 0;
if (y.len == 0) return false;
// Prefer a pivot with a small representation
return x.len > y.len;
}
friend bool operator==(result_type x, result_type y) {
// Only used to test if the score is 0
return x.len == y.len;
}
};
result_type operator()(mpq_class const& x) const { return x; }
};
}
}
\endcode
\sa \ref TopicPreprocessorDirectives

View File

@ -260,7 +260,7 @@ static void test_type_casting()
mat1.setRandom();
mat2.setRandom();
mat3 = mat1.template cast<double>();
mat3 = mat1.cast<double>();
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
@ -269,7 +269,7 @@ static void test_type_casting()
}
}
mat3 = mat2.template cast<double>();
mat3 = mat2.cast<double>();
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {