Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-09-16 03:13:14 +08:00)
* Draft of an eigenvalues solver (does not support complex and does not re-use the QR decomposition)
* Rewrite the cache friendly product to have only one instance per scalar type! This significantly speeds up compilation time and reduces executable size. The current drawback is that some trivial expressions, like conjugate or negate, might be evaluated.
* Renamed "cache optimal" to "cache friendly"
* Added the ability to directly access matrix data of some expressions via:
  - the stride()/_stride() methods
  - the DirectAccessBit flag (replaces ReferencableBit)

commit 45cda6704a (parent dca416cace)
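The central idea of the product rewrite is that the heavy kernel is instantiated once per scalar type rather than once per expression type: expressions are reduced to raw pointers plus strides before the kernel is entered, which is why trivial wrappers such as conjugate or negate may have to be evaluated first. Below is a minimal, hypothetical sketch of that interface shape; strided_product is illustrative only and omits the L2 blocking, packing and SSE packet arithmetic that the real ei_cache_friendly_product (added later in this commit) performs.

#include <cstdio>

// Hypothetical, simplified analogue of ei_cache_friendly_product:
// one instantiation per Scalar; the matrices are already reduced to
// pointers and strides. res is column-major and is accumulated into,
// so the caller must zero-initialize it (as the real call site does
// with res.setZero()).
template<typename Scalar>
void strided_product(int rows, int cols, int depth,
                     bool lhsRowMajor, const Scalar* lhs, int lhsStride,
                     bool rhsRowMajor, const Scalar* rhs, int rhsStride,
                     Scalar* res, int resStride)
{
  for (int j = 0; j < cols; ++j)
    for (int i = 0; i < rows; ++i)
    {
      Scalar acc = Scalar(0);
      for (int k = 0; k < depth; ++k)
        acc += (lhsRowMajor ? lhs[i*lhsStride + k] : lhs[i + k*lhsStride])
             * (rhsRowMajor ? rhs[k*rhsStride + j] : rhs[k + j*rhsStride]);
      res[i + j*resStride] += acc;
    }
}

int main()
{
  double lhs[4] = {1, 2, 3, 4};  // 2x2, column-major: [[1,3],[2,4]]
  double rhs[4] = {5, 6, 7, 8};  // 2x2, column-major: [[5,7],[6,8]]
  double res[4] = {0, 0, 0, 0};
  strided_product(2, 2, 2, false, lhs, 2, false, rhs, 2, res, 2);
  std::printf("%g %g\n%g %g\n", res[0], res[2], res[1], res[3]); // 23 31 / 34 46
  return 0;
}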
Eigen/Core
@@ -18,9 +18,9 @@
 #ifdef __SSE3__
 #include <pmmintrin.h>
 #endif
-//#ifdef __SSSE3__
-//#include <tmmintrin.h>
-//#endif
+#ifdef __SSSE3__
+#include <tmmintrin.h>
+#endif
 #elif (defined __ALTIVEC__)
 #define EIGEN_VECTORIZE
 #define EIGEN_VECTORIZE_ALTIVEC
@@ -69,7 +69,7 @@ namespace Eigen {
 #include "src/Core/CwiseBinaryOp.h"
 #include "src/Core/CwiseUnaryOp.h"
 #include "src/Core/CwiseNullaryOp.h"
-#if (defined EIGEN_WIP_PRODUCT) && (defined EIGEN_VECTORIZE)
+#if (defined EIGEN_WIP_PRODUCT)
 #include "src/Core/ProductWIP.h"
 #else
 #include "src/Core/Product.h"
Eigen/QR
@@ -6,6 +6,7 @@
 namespace Eigen {

 #include "Eigen/src/QR/QR.h"
+#include "Eigen/src/QR/EigenSolver.h"

 } // namespace Eigen

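The EigenSolver.h pulled in here is added at the end of this commit; its draft API (a constructor that computes the decomposition, plus accessors) suggests usage along the following lines. This is a hedged sketch against that declaration, with the commit message's caveat that complex eigenvalues are not supported; the Matrix3d typedef and comma initializer are assumed available in this era of the library.

#include <iostream>
#include <Eigen/Core>
#include <Eigen/QR>   // now also provides EigenSolver

using namespace Eigen;

int main()
{
  Matrix3d m;
  m << 2, 1, 0,
       1, 2, 1,
       0, 1, 2;                     // symmetric: tridiagonalization + tql2 path
  EigenSolver<Matrix3d> solver(m);  // the constructor runs _compute()
  std::cout << solver.eigenvalues().transpose() << std::endl;  // real parts only
  std::cout << solver.eigenvectors() << std::endl;
  return 0;
}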
Block.h
@@ -71,7 +71,7 @@ struct ei_traits<Block<MatrixType, BlockRows, BlockCols> >
 || (ColsAtCompileTime != Dynamic && MatrixType::ColsAtCompileTime == Dynamic))
 ? ~LargeBit
 : ~(unsigned int)0,
-Flags = MatrixType::Flags & (DefaultLostFlagMask | VectorizableBit | ReferencableBit) & FlagsMaskLargeBit,
+Flags = MatrixType::Flags & (DefaultLostFlagMask | VectorizableBit | DirectAccessBit) & FlagsMaskLargeBit,
 CoeffReadCost = MatrixType::CoeffReadCost
 };
 };
@@ -132,6 +132,8 @@ template<typename MatrixType, int BlockRows, int BlockCols> class Block
 int _rows() const { return m_blockRows.value(); }
 int _cols() const { return m_blockCols.value(); }

+int _stride(void) const { return m_matrix.stride(); }
+
 Scalar& _coeffRef(int row, int col)
 {
 return m_matrix.const_cast_derived()
353 Eigen/src/Core/CacheFriendlyProduct.h (new file)
@@ -0,0 +1,353 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_CACHE_FRIENDLY_PRODUCT_H
+#define EIGEN_CACHE_FRIENDLY_PRODUCT_H
+
+template<typename Scalar>
+static void ei_cache_friendly_product(
+int _rows, int _cols, int depth,
+bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride,
+bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride,
+bool resRowMajor, Scalar* res, int resStride)
+{
+const Scalar* __restrict__ lhs;
+const Scalar* __restrict__ rhs;
+int lhsStride, rhsStride, rows, cols;
+bool lhsRowMajor;
+
+if (resRowMajor)
+{
+lhs = _rhs;
+rhs = _lhs;
+lhsStride = _rhsStride;
+rhsStride = _lhsStride;
+cols = _rows;
+rows = _cols;
+lhsRowMajor = _rhsRowMajor;
+ei_assert(_lhsRowMajor);
+}
+else
+{
+lhs = _lhs;
+rhs = _rhs;
+lhsStride = _lhsStride;
+rhsStride = _rhsStride;
+rows = _rows;
+cols = _cols;
+lhsRowMajor = _lhsRowMajor;
+ei_assert(!_rhsRowMajor);
+}
+
+typedef typename ei_packet_traits<Scalar>::type PacketType;
+
+enum {
+PacketSize = sizeof(PacketType)/sizeof(Scalar),
+#if (defined __i386__)
+// i386 architecture provides only 8 xmm registers,
+// so let's reduce the max number of rows processed at once.
+MaxBlockRows = 4,
+MaxBlockRows_ClampingMask = 0xFFFFFC,
+#else
+MaxBlockRows = 8,
+MaxBlockRows_ClampingMask = 0xFFFFF8,
+#endif
+// maximal size of the blocks fitted in L2 cache
+MaxL2BlockSize = EIGEN_TUNE_FOR_L2_CACHE_SIZE / sizeof(Scalar)
+};
+
+
+//const bool rhsIsAligned = (PacketSize==1) || (((rhsStride%PacketSize) == 0) && (size_t(rhs)%16==0));
+const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0));
+
+const int remainingSize = depth % PacketSize;
+const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries
+const int l2BlockRows = MaxL2BlockSize > rows ? rows : MaxL2BlockSize;
+const int l2BlockCols = MaxL2BlockSize > cols ? cols : MaxL2BlockSize;
+const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize;
+Scalar* __restrict__ block = (Scalar*)alloca(sizeof(Scalar)*l2BlockRows*size);
+Scalar* __restrict__ rhsCopy = (Scalar*)alloca(sizeof(Scalar)*l2BlockSize);
+
+// loops on each L2 cache friendly blocks of the result
+for(int l2i=0; l2i<rows; l2i+=l2BlockRows)
+{
+const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows);
+const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask; // end of the rows aligned to bw
+const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW; // number of remaining rows
+//const int l2blockRowEndBWPlusOne = l2blockRowEndBW + (l2blockRemainingRows?0:MaxBlockRows);
+
+// build a cache friendly blocky matrix
+int count = 0;
+
+// copy l2blocksize rows of m_lhs to blocks of ps x bw
+asm("#eigen begin buildblocks");
+for(int l2k=0; l2k<size; l2k+=l2BlockSize)
+{
+const int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
+
+for (int i = l2i; i<l2blockRowEndBW/*PlusOne*/; i+=MaxBlockRows)
+{
+// TODO merge the if l2blockRemainingRows
+// const int blockRows = std::min(i+MaxBlockRows, rows) - i;
+
+for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
+{
+// TODO write these loops using meta unrolling
+// negligible for large matrices but useful for small ones
+if (lhsRowMajor)
+{
+for (int w=0; w<MaxBlockRows; ++w)
+for (int s=0; s<PacketSize; ++s)
+block[count++] = lhs[(i+w)*lhsStride + (k+s)];
+}
+else
+{
+for (int w=0; w<MaxBlockRows; ++w)
+for (int s=0; s<PacketSize; ++s)
+block[count++] = lhs[(i+w) + (k+s)*lhsStride];
+}
+}
+}
+if (l2blockRemainingRows>0)
+{
+for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
+{
+if (lhsRowMajor)
+{
+for (int w=0; w<l2blockRemainingRows; ++w)
+for (int s=0; s<PacketSize; ++s)
+block[count++] = lhs[(l2blockRowEndBW+w)*lhsStride + (k+s)];
+}
+else
+{
+for (int w=0; w<l2blockRemainingRows; ++w)
+for (int s=0; s<PacketSize; ++s)
+block[count++] = lhs[(l2blockRowEndBW+w) + (k+s)*lhsStride];
+}
+}
+}
+}
+asm("#eigen end buildblocks");
+
+for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
+{
+int l2blockColEnd = std::min(l2j+l2BlockCols, cols);
+
+for(int l2k=0; l2k<size; l2k+=l2BlockSize)
+{
+// acumulate bw rows of lhs time a single column of rhs to a bw x 1 block of res
+int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
+
+// for each bw x 1 result's block
+for(int l1i=l2i; l1i<l2blockRowEndBW; l1i+=MaxBlockRows)
+{
+for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
+{
+int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*MaxBlockRows;
+const Scalar* __restrict__ localB = &block[offsetblock];
+
+const Scalar* __restrict__ rhsColumn = &(rhs[l1j*rhsStride]);
+
+// copy unaligned rhs data
+// YES it seems to be faster to copy some part of rhs multiple times
+// to aligned memory rather than using unligned load.
+// Moreover this avoids a "if" in the most nested loop :)
+if (PacketSize>1 && size_t(rhsColumn)%16)
+{
+int count = 0;
+for (int k = l2k; k<l2blockSizeEnd; ++k)
+{
+rhsCopy[count++] = rhsColumn[k];
+}
+rhsColumn = &(rhsCopy[-l2k]);
+}
+
+PacketType dst[MaxBlockRows];
+dst[0] = ei_pset1(Scalar(0.));
+dst[1] = dst[0];
+dst[2] = dst[0];
+dst[3] = dst[0];
+if (MaxBlockRows==8)
+{
+dst[4] = dst[0];
+dst[5] = dst[0];
+dst[6] = dst[0];
+dst[7] = dst[0];
+}
+
+PacketType tmp;
+
+asm("#eigen begincore");
+for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
+{
+tmp = ei_pload(&rhsColumn[k]);
+
+dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows ])), dst[0]);
+dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+ PacketSize])), dst[1]);
+dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+2*PacketSize])), dst[2]);
+dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+3*PacketSize])), dst[3]);
+if (MaxBlockRows==8)
+{
+dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+4*PacketSize])), dst[4]);
+dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+5*PacketSize])), dst[5]);
+dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+6*PacketSize])), dst[6]);
+dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*MaxBlockRows+7*PacketSize])), dst[7]);
+}
+}
+
+Scalar* __restrict__ localRes = &(res[l1i + l1j*resStride]);
+
+if (PacketSize>1 && resIsAligned)
+{
+ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(dst)));
+if (PacketSize==2)
+ei_pstore(&(localRes[2]), ei_padd(ei_pload(&(localRes[2])), ei_preduxp(&(dst[2]))));
+if (MaxBlockRows==8)
+{
+ei_pstore(&(localRes[4]), ei_padd(ei_pload(&(localRes[4])), ei_preduxp(&(dst[4]))));
+if (PacketSize==2)
+ei_pstore(&(localRes[6]), ei_padd(ei_pload(&(localRes[6])), ei_preduxp(&(dst[6]))));
+}
+}
+else
+{
+localRes[0] += ei_predux(dst[0]);
+localRes[1] += ei_predux(dst[1]);
+localRes[2] += ei_predux(dst[2]);
+localRes[3] += ei_predux(dst[3]);
+if (MaxBlockRows==8)
+{
+localRes[4] += ei_predux(dst[4]);
+localRes[5] += ei_predux(dst[5]);
+localRes[6] += ei_predux(dst[6]);
+localRes[7] += ei_predux(dst[7]);
+}
+}
+asm("#eigen endcore");
+}
+}
+if (l2blockRemainingRows>0)
+{
+int offsetblock = l2k * (l2blockRowEnd-l2i) + (l2blockRowEndBW-l2i)*(l2blockSizeEnd-l2k) - l2k*l2blockRemainingRows;
+const Scalar* localB = &block[offsetblock];
+
+asm("#eigen begin dynkernel");
+for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
+{
+const Scalar* __restrict__ rhsColumn = &(rhs[l1j*rhsStride]);
+
+// copy unaligned rhs data
+if (PacketSize>1 && size_t(rhsColumn)%16)
+{
+int count = 0;
+for (int k = l2k; k<l2blockSizeEnd; ++k)
+{
+rhsCopy[count++] = rhsColumn[k];
+}
+rhsColumn = &(rhsCopy[-l2k]);
+}
+
+PacketType dst[MaxBlockRows];
+dst[0] = ei_pset1(Scalar(0.));
+dst[1] = dst[0];
+dst[2] = dst[0];
+dst[3] = dst[0];
+if (MaxBlockRows>4)
+{
+dst[4] = dst[0];
+dst[5] = dst[0];
+dst[6] = dst[0];
+dst[7] = dst[0];
+}
+
+// let's declare a few other temporary registers
+PacketType tmp;
+
+for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
+{
+tmp = ei_pload(&rhsColumn[k]);
+
+dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows ])), dst[0]);
+if (l2blockRemainingRows>=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+ PacketSize])), dst[1]);
+if (l2blockRemainingRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+2*PacketSize])), dst[2]);
+if (l2blockRemainingRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+3*PacketSize])), dst[3]);
+if (MaxBlockRows>4)
+{
+if (l2blockRemainingRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+4*PacketSize])), dst[4]);
+if (l2blockRemainingRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+5*PacketSize])), dst[5]);
+if (l2blockRemainingRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+6*PacketSize])), dst[6]);
+if (l2blockRemainingRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+7*PacketSize])), dst[7]);
+}
+}
+
+Scalar* __restrict__ localRes = &(res[l2blockRowEndBW + l1j*resStride]);
+
+// process the remaining rows once at a time
+localRes[0] += ei_predux(dst[0]);
+if (l2blockRemainingRows>=2) localRes[1] += ei_predux(dst[1]);
+if (l2blockRemainingRows>=3) localRes[2] += ei_predux(dst[2]);
+if (l2blockRemainingRows>=4) localRes[3] += ei_predux(dst[3]);
+if (MaxBlockRows>4)
+{
+if (l2blockRemainingRows>=5) localRes[4] += ei_predux(dst[4]);
+if (l2blockRemainingRows>=6) localRes[5] += ei_predux(dst[5]);
+if (l2blockRemainingRows>=7) localRes[6] += ei_predux(dst[6]);
+if (l2blockRemainingRows>=8) localRes[7] += ei_predux(dst[7]);
+}
+
+asm("#eigen end dynkernel");
+}
+}
+}
+}
+}
+if (PacketSize>1 && remainingSize)
+{
+if (lhsRowMajor)
+{
+for (int j=0; j<cols; ++j)
+for (int i=0; i<rows; ++i)
+{
+Scalar tmp = lhs[i*lhsStride+size] * rhs[j*rhsStride+size];
+for (int k=1; k<remainingSize; ++k)
+tmp += lhs[i*lhsStride+size+k] * rhs[j*rhsStride+size+k];
+res[i+j*resStride] += tmp;
+}
+}
+else
+{
+for (int j=0; j<cols; ++j)
+for (int i=0; i<rows; ++i)
+{
+Scalar tmp = lhs[i+size*lhsStride] * rhs[j*rhsStride+size];
+for (int k=1; k<remainingSize; ++k)
+tmp += lhs[i+(size+k)*lhsStride] * rhs[j*rhsStride+size+k];
+res[i+j*resStride] += tmp;
+}
+}
+}
+}
+
+
+#endif // EIGEN_CACHE_FRIENDLY_PRODUCT_H
Map.h
@@ -47,7 +47,7 @@ struct ei_traits<Map<MatrixType> >
 ColsAtCompileTime = MatrixType::ColsAtCompileTime,
 MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
 MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-Flags = MatrixType::Flags & (DefaultLostFlagMask | ReferencableBit),
+Flags = MatrixType::Flags & (DefaultLostFlagMask | DirectAccessBit),
 CoeffReadCost = NumTraits<Scalar>::ReadCost
 };
 };
MathFunctions.h
@@ -46,7 +46,7 @@ inline int ei_log(int) { ei_assert(false); return 0; }
 inline int ei_sin(int) { ei_assert(false); return 0; }
 inline int ei_cos(int) { ei_assert(false); return 0; }

-#if EIGEN_GNUC_AT_LEAST(4,3)
+#if EIGEN_GNUC_AT_LEAST(4,2)
 inline int ei_pow(int x, int y) { return int(std::pow(double(x), y)); }
 #else
 inline int ei_pow(int x, int y) { return std::pow(x, y); }
Matrix.h
@@ -100,6 +100,14 @@ class Matrix : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Flags, _MaxRows,
 int _rows() const { return m_storage.rows(); }
 int _cols() const { return m_storage.cols(); }

+int _stride(void) const
+{
+if(Flags & RowMajorBit)
+return m_storage.cols();
+else
+return m_storage.rows();
+}
+
 const Scalar& _coeff(int row, int col) const
 {
 if(Flags & RowMajorBit)
MatrixBase.h
@@ -185,7 +185,7 @@ template<typename Derived> class MatrixBase

 /** Overloaded for optimal product evaluation */
 template<typename Derived1, typename Derived2>
-Derived& lazyAssign(const Product<Derived1,Derived2,CacheOptimalProduct>& product);
+Derived& lazyAssign(const Product<Derived1,Derived2,CacheFriendlyProduct>& product);

 CommaInitializer operator<< (const Scalar& s);

@@ -419,6 +419,13 @@ template<typename Derived> class MatrixBase

 const Lazy<Derived> lazy() const;
 const Temporary<Derived> temporary() const;
+
+/** \returns number of elements to skip to pass from one row (resp. column) to another
+  * for a row-major (resp. column-major) matrix.
+  * Combined with coeffRef() and the compile times flags, it allows a direct access to the data
+  * of the underlying matrix.
+  */
+int stride(void) const { return derived()._stride(); }
 //@}

 /// \name Coefficient-wise operations
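To make the stride() convention documented in the MatrixBase hunk above concrete: for a column-major matrix, stride() is the number of elements separating consecutive columns, so the coefficient (i, j) lives at data[i + j*stride]. A tiny standalone illustration (plain C++, nothing Eigen-specific):

#include <cstdio>

int main()
{
  const int rows = 3, cols = 2;
  double data[rows * cols] = {1, 2, 3, 4, 5, 6}; // column-major storage
  const int stride = rows;                       // what _stride() returns here
  // coeff(i, j) == data[i + j*stride]
  std::printf("coeff(1,1) = %g\n", data[1 + 1*stride]); // prints 5
  return 0;
}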
Product.h
@@ -60,6 +60,12 @@ struct ei_product_unroller<Index, 0, Lhs, Rhs>
 static void run(int, int, const Lhs&, const Rhs&, typename Lhs::Scalar&) {}
 };

+template<typename Lhs, typename Rhs>
+struct ei_product_unroller<0, Dynamic, Lhs, Rhs>
+{
+static void run(int, int, const Lhs&, const Rhs&, typename Lhs::Scalar&) {}
+};
+
 template<bool RowMajor, int Index, int Size, typename Lhs, typename Rhs, typename PacketScalar>
 struct ei_packet_product_unroller;

@@ -113,6 +119,12 @@ struct ei_packet_product_unroller<false, Index, Dynamic, Lhs, Rhs, PacketScalar>
 static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {}
 };

+template<typename Lhs, typename Rhs, typename PacketScalar>
+struct ei_packet_product_unroller<false, 0, Dynamic, Lhs, Rhs, PacketScalar>
+{
+static void run(int, int, const Lhs&, const Rhs&, PacketScalar&) {}
+};
+
 template<typename Product, bool RowMajor = true> struct ProductPacketCoeffImpl {
 inline static typename Product::PacketScalar execute(const Product& product, int row, int col)
 { return product._packetCoeffRowMajor(row,col); }
@@ -142,7 +154,7 @@ template<typename Lhs, typename Rhs> struct ei_product_eval_mode
 enum{ value = Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
 && Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
 && (!( (Lhs::Flags&RowMajorBit) && ((Rhs::Flags&RowMajorBit) ^ RowMajorBit)))
-? CacheOptimalProduct : NormalProduct };
+? CacheFriendlyProduct : NormalProduct };
 };

 template<typename Lhs, typename Rhs, int EvalMode>
@@ -166,7 +178,7 @@ struct ei_traits<Product<Lhs, Rhs, EvalMode> >
 _LhsVectorizable = (!(LhsFlags & RowMajorBit)) && (LhsFlags & VectorizableBit) && (RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
 _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 1 : 0,
 _RowMajor = (RhsFlags & RowMajorBit)
-&& (EvalMode==(int)CacheOptimalProduct ? (int)LhsFlags & RowMajorBit : (!_LhsVectorizable)),
+&& (EvalMode==(int)CacheFriendlyProduct ? (int)LhsFlags & RowMajorBit : (!_LhsVectorizable)),
 _LostBits = DefaultLostFlagMask & ~(
 (_RowMajor ? 0 : RowMajorBit)
 | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
@@ -312,7 +324,7 @@ MatrixBase<Derived>::operator*=(const MatrixBase<OtherDerived> &other)

 template<typename Derived>
 template<typename Lhs, typename Rhs>
-Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheOptimalProduct>& product)
+Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheFriendlyProduct>& product)
 {
 product.template _cacheOptimalEval<Derived, Aligned>(derived(),
 #ifdef EIGEN_VECTORIZE
ProductWIP.h
@@ -26,9 +26,7 @@
 #ifndef EIGEN_PRODUCT_H
 #define EIGEN_PRODUCT_H

-#ifndef EIGEN_VECTORIZE
-#error you must enable vectorization to try this experimental product implementation
-#endif
+#include "CacheFriendlyProduct.h"

 template<int Index, int Size, typename Lhs, typename Rhs>
 struct ei_product_unroller
@@ -145,7 +143,7 @@ template<typename Lhs, typename Rhs> struct ei_product_eval_mode
 {
 enum{ value = Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
 && Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-? CacheOptimalProduct : NormalProduct };
+? CacheFriendlyProduct : NormalProduct };
 };

 template<typename T> class ei_product_eval_to_column_major
@@ -173,7 +171,22 @@ template<typename T, int n=1> struct ei_product_nested_rhs
 typename ei_meta_if<
 (ei_traits<T>::Flags & EvalBeforeNestingBit)
 || (ei_traits<T>::Flags & RowMajorBit)
-|| (!(ei_traits<T>::Flags & ReferencableBit))
+|| (!(ei_traits<T>::Flags & DirectAccessBit))
+|| (n+1) * NumTraits<typename ei_traits<T>::Scalar>::ReadCost < (n-1) * T::CoeffReadCost,
+typename ei_product_eval_to_column_major<T>::type,
+const T&
+>::ret
+>::ret type;
+};
+
+template<typename T, int n=1> struct ei_product_nested_lhs
+{
+typedef typename ei_meta_if<
+ei_is_temporary<T>::ret && !(ei_traits<T>::Flags & RowMajorBit),
+T,
+typename ei_meta_if<
+(ei_traits<T>::Flags & EvalBeforeNestingBit)
+|| (!(ei_traits<T>::Flags & DirectAccessBit))
 || (n+1) * NumTraits<typename ei_traits<T>::Scalar>::ReadCost < (n-1) * T::CoeffReadCost,
 typename ei_product_eval_to_column_major<T>::type,
 const T&
@@ -187,9 +200,12 @@ struct ei_traits<Product<Lhs, Rhs, EvalMode> >
 typedef typename Lhs::Scalar Scalar;
 // the cache friendly product evals lhs once only
 // FIXME what to do if we chose to dynamically call the normal product from the cache friendly one for small matrices ?
-typedef typename ei_nested<Lhs, EvalMode==CacheOptimalProduct ? 0 : Rhs::ColsAtCompileTime>::type LhsNested;
+typedef typename ei_meta_if<EvalMode==CacheFriendlyProduct,
+typename ei_product_nested_lhs<Rhs,0>::type,
+typename ei_nested<Lhs,Rhs::ColsAtCompileTime>::type>::ret LhsNested;
+
 // NOTE that rhs must be ColumnMajor, so we might need a special nested type calculation
-typedef typename ei_meta_if<EvalMode==CacheOptimalProduct,
+typedef typename ei_meta_if<EvalMode==CacheFriendlyProduct,
 typename ei_product_nested_rhs<Rhs,Lhs::RowsAtCompileTime>::type,
 typename ei_nested<Rhs,Lhs::RowsAtCompileTime>::type>::ret RhsNested;
 typedef typename ei_unref<LhsNested>::type _LhsNested;
@@ -209,7 +225,7 @@ struct ei_traits<Product<Lhs, Rhs, EvalMode> >
 _LhsVectorizable = (!(LhsFlags & RowMajorBit)) && (LhsFlags & VectorizableBit) && (RowsAtCompileTime % ei_packet_traits<Scalar>::size == 0),
 _Vectorizable = (_LhsVectorizable || _RhsVectorizable) ? 0 : 0,
 _RowMajor = (RhsFlags & RowMajorBit)
-&& (EvalMode==(int)CacheOptimalProduct ? (int)LhsFlags & RowMajorBit : (!_LhsVectorizable)),
+&& (EvalMode==(int)CacheFriendlyProduct ? (int)LhsFlags & RowMajorBit : (!_LhsVectorizable)),
 _LostBits = DefaultLostFlagMask & ~(
 (_RowMajor ? 0 : RowMajorBit)
 | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
@@ -241,19 +257,7 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
 typedef typename ei_traits<Product>::_RhsNested _RhsNested;

 enum {
-PacketSize = ei_packet_traits<Scalar>::size,
-#if (defined __i386__)
-// i386 architectures provides only 8 xmmm register,
-// so let's reduce the max number of rows processed at once.
-// NOTE that so far the maximal supported value is 8.
-MaxBlockRows = 4,
-MaxBlockRows_ClampingMask = 0xFFFFFC,
-#else
-MaxBlockRows = 8,
-MaxBlockRows_ClampingMask = 0xFFFFF8,
-#endif
-// maximal size of the blocks fitted in L2 cache
-MaxL2BlockSize = EIGEN_TUNE_FOR_L2_CACHE_SIZE / sizeof(Scalar)
+PacketSize = ei_packet_traits<Scalar>::size
 };

 Product(const Lhs& lhs, const Rhs& rhs)
@@ -327,14 +331,8 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
 }

 /** \internal */
-template<typename DestDerived, int RhsAlignment, int ResAlignment>
-void _cacheFriendlyEvalImpl(DestDerived& res) const __attribute__ ((noinline));
-
-/** \internal */
-template<typename DestDerived, int RhsAlignment, int ResAlignment, int BlockRows>
-void _cacheFriendlyEvalKernel(DestDerived& res,
-int l2i, int l2j, int l2k, int l1i,
-int l2blockRowEnd, int l2blockColEnd, int l2blockSizeEnd, const Scalar* block) const EIGEN_DONT_INLINE;
+template<typename DestDerived, int RhsAlignment>
+void _cacheFriendlyEvalImpl(DestDerived& res) const EIGEN_DONT_INLINE;

 protected:
 const LhsNested m_lhs;
@@ -370,7 +368,7 @@ MatrixBase<Derived>::operator*=(const MatrixBase<OtherDerived> &other)

 template<typename Derived>
 template<typename Lhs, typename Rhs>
-Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheOptimalProduct>& product)
+Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,CacheFriendlyProduct>& product)
 {
 product._cacheFriendlyEval(derived());
 return derived();
@@ -380,326 +378,16 @@ template<typename Lhs, typename Rhs, int EvalMode>
 template<typename DestDerived>
 void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEval(DestDerived& res) const
 {
-const bool rhsIsAligned = (m_lhs.cols()%PacketSize == 0);
-const bool resIsAligned = ((_rows()%PacketSize) == 0);
-
-if (rhsIsAligned && resIsAligned)
-_cacheFriendlyEvalImpl<DestDerived, Aligned, Aligned>(res);
-else if (rhsIsAligned && (!resIsAligned))
-_cacheFriendlyEvalImpl<DestDerived, Aligned, UnAligned>(res);
-else if ((!rhsIsAligned) && resIsAligned)
-_cacheFriendlyEvalImpl<DestDerived, UnAligned, Aligned>(res);
-else
-_cacheFriendlyEvalImpl<DestDerived, UnAligned, UnAligned>(res);
-
-}
-
-template<typename Lhs, typename Rhs, int EvalMode>
-template<typename DestDerived, int RhsAlignment, int ResAlignment, int BlockRows>
-void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalKernel(DestDerived& res,
-int l2i, int l2j, int l2k, int l1i,
-int l2blockRowEnd, int l2blockColEnd, int l2blockSizeEnd, const Scalar* block) const
-{
-asm("#eigen begin kernel");
-
-ei_internal_assert(BlockRows<=8);
-
-// NOTE: sounds like we cannot rely on meta-unrolling to access dst[I] without enforcing GCC
-// to create the dst's elements in memory, hence killing the performance.
-
-for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
-{
-int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*BlockRows;
-const Scalar* localB = &block[offsetblock];
-
-// int l1jsize = l1j * m_lhs.cols(); //TODO find a better way to optimize address computation ?
-Scalar* rhsColumn = &(m_rhs.const_cast_derived().coeffRef(0, l1j));
-
-// don't worry, dst is a set of registers
-PacketScalar dst[BlockRows];
-dst[0] = ei_pset1(Scalar(0.));
-switch(BlockRows)
-{
-case 8: dst[7] = dst[0];
-case 7: dst[6] = dst[0];
-case 6: dst[5] = dst[0];
-case 5: dst[4] = dst[0];
-case 4: dst[3] = dst[0];
-case 3: dst[2] = dst[0];
-case 2: dst[1] = dst[0];
-default: break;
-}
-
-// let's declare a few other temporary registers
-PacketScalar tmp, tmp1;
-
-// unaligned loads are expensive, therefore let's preload the next element in advance
-if (RhsAlignment==UnAligned)
-//tmp1 = ei_ploadu(&m_rhs.data()[l1jsize+l2k]);
-tmp1 = ei_ploadu(&rhsColumn[l2k]);
-
-for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
-{
-// FIXME if we don't cache l1j*m_lhs.cols() then the performance are poor,
-// let's directly access to the data
-//PacketScalar tmp = m_rhs.template packetCoeff<Aligned>(k, l1j);
-if (RhsAlignment==Aligned)
-{
-//tmp = ei_pload(&m_rhs.data()[l1jsize + k]);
-tmp = ei_pload(&rhsColumn[k]);
-}
-else
-{
-tmp = tmp1;
-if (k+PacketSize<l2blockSizeEnd)
-//tmp1 = ei_ploadu(&m_rhs.data()[l1jsize + k+PacketSize]);
-tmp1 = ei_ploadu(&rhsColumn[k+PacketSize]);
-}
-
-dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows ])), dst[0]);
-if (BlockRows>=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+ PacketSize])), dst[1]);
-if (BlockRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+2*PacketSize])), dst[2]);
-if (BlockRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+3*PacketSize])), dst[3]);
-if (BlockRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+4*PacketSize])), dst[4]);
-if (BlockRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+5*PacketSize])), dst[5]);
-if (BlockRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+6*PacketSize])), dst[6]);
-if (BlockRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*BlockRows+7*PacketSize])), dst[7]);
-}
-
-enum {
-// Number of rows we can reduce per packet
-PacketRows = (ResAlignment==Aligned && PacketSize>1) ? (BlockRows / PacketSize) : 0,
-// First row index from which we have to to do redux once at a time
-RemainingStart = PacketSize * PacketRows
-};
-
-// we have up to 4 packets (for doubles: 8 rows / 2)
-if (PacketRows>=1)
-res.template writePacketCoeff<Aligned>(l1i, l1j,
-ei_padd(res.template packetCoeff<Aligned>(l1i, l1j), ei_preduxp(&(dst[0]))));
-if (PacketRows>=2)
-res.template writePacketCoeff<Aligned>(l1i+PacketSize, l1j,
-ei_padd(res.template packetCoeff<Aligned>(l1i+PacketSize, l1j), ei_preduxp(&(dst[PacketSize]))));
-if (PacketRows>=3)
-res.template writePacketCoeff<Aligned>(l1i+2*PacketSize, l1j,
-ei_padd(res.template packetCoeff<Aligned>(l1i+2*PacketSize, l1j), ei_preduxp(&(dst[2*PacketSize]))));
-if (PacketRows>=4)
-res.template writePacketCoeff<Aligned>(l1i+3*PacketSize, l1j,
-ei_padd(res.template packetCoeff<Aligned>(l1i+3*PacketSize, l1j), ei_preduxp(&(dst[3*PacketSize]))));
-
-// process the remaining rows one at a time
-if (RemainingStart<=0 && BlockRows>=1) res.coeffRef(l1i+0, l1j) += ei_predux(dst[0]);
-if (RemainingStart<=1 && BlockRows>=2) res.coeffRef(l1i+1, l1j) += ei_predux(dst[1]);
-if (RemainingStart<=2 && BlockRows>=3) res.coeffRef(l1i+2, l1j) += ei_predux(dst[2]);
-if (RemainingStart<=3 && BlockRows>=4) res.coeffRef(l1i+3, l1j) += ei_predux(dst[3]);
-if (RemainingStart<=4 && BlockRows>=5) res.coeffRef(l1i+4, l1j) += ei_predux(dst[4]);
-if (RemainingStart<=5 && BlockRows>=6) res.coeffRef(l1i+5, l1j) += ei_predux(dst[5]);
-if (RemainingStart<=6 && BlockRows>=7) res.coeffRef(l1i+6, l1j) += ei_predux(dst[6]);
-if (RemainingStart<=7 && BlockRows>=8) res.coeffRef(l1i+7, l1j) += ei_predux(dst[7]);
-
-asm("#eigen end kernel");
-}
-}
-
-template<typename Lhs, typename Rhs, int EvalMode>
-template<typename DestDerived, int RhsAlignment, int ResAlignment>
-void Product<Lhs,Rhs,EvalMode>::_cacheFriendlyEvalImpl(DestDerived& res) const
-{
-// FIXME find a way to optimize: (an_xpr) + (a * b)
-// then we don't need to clear res and avoid and additional mat-mat sum
 #ifndef EIGEN_WIP_PRODUCT_DIRTY
-// std::cout << "wip product\n";
 res.setZero();
 #endif

-const int rows = _rows();
-const int cols = _cols();
-const int remainingSize = m_lhs.cols()%PacketSize;
-const int size = m_lhs.cols() - remainingSize; // third dimension of the product clamped to packet boundaries
-const int l2BlockRows = MaxL2BlockSize > _rows() ? _rows() : MaxL2BlockSize;
-const int l2BlockCols = MaxL2BlockSize > _cols() ? _cols() : MaxL2BlockSize;
-const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize;
-//Scalar* __restrict__ block = new Scalar[l2blocksize*size];;
-Scalar* __restrict__ block = (Scalar*)alloca(sizeof(Scalar)*l2BlockRows*size);
-
-// loops on each L2 cache friendly blocks of the result
-for(int l2i=0; l2i<_rows(); l2i+=l2BlockRows)
-{
-const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows);
-const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask; // end of the rows aligned to bw
-const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW; // number of remaining rows
-
-// build a cache friendly block
-int count = 0;
-
-// copy l2blocksize rows of m_lhs to blocks of ps x bw
-for(int l2k=0; l2k<size; l2k+=l2BlockSize)
-{
-const int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
-
-for (int i = l2i; i<l2blockRowEndBW; i+=MaxBlockRows)
-{
-for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
-{
-// TODO write these two loops using meta unrolling
-// negligible for large matrices but useful for small ones
-for (int w=0; w<MaxBlockRows; ++w)
-for (int s=0; s<PacketSize; ++s)
-block[count++] = m_lhs.coeff(i+w,k+s);
-}
-}
-if (l2blockRemainingRows>0)
-{
-for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
-{
-for (int w=0; w<l2blockRemainingRows; ++w)
-for (int s=0; s<PacketSize; ++s)
-block[count++] = m_lhs.coeff(l2blockRowEndBW+w,k+s);
-}
-}
-}
-
-for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
-{
-int l2blockColEnd = std::min(l2j+l2BlockCols, cols);
-
-for(int l2k=0; l2k<size; l2k+=l2BlockSize)
-{
-// acumulate a bw rows of lhs time a single column of rhs to a bw x 1 block of res
-int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
-
-// for each bw x 1 result's block
-for(int l1i=l2i; l1i<l2blockRowEndBW; l1i+=MaxBlockRows)
-{
-_cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, MaxBlockRows>(
-res, l2i, l2j, l2k, l1i, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block);
-#if 0
-for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
-{
-int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*MaxBlockRows;
-const Scalar* localB = &block[offsetblock];
-
-int l1jsize = l1j * m_lhs.cols(); //TODO find a better way to optimize address computation ?
-
-PacketScalar dst[bw];
-dst[0] = ei_pset1(Scalar(0.));
-dst[1] = dst[0];
-dst[2] = dst[0];
-dst[3] = dst[0];
-if (MaxBlockRows==8)
-{
-dst[4] = dst[0];
-dst[5] = dst[0];
-dst[6] = dst[0];
-dst[7] = dst[0];
-}
-PacketScalar b0, b1, tmp;
-// TODO in unaligned mode, preload the next element
-// PacketScalar tmp1 = _mm_load_ps(&m_rhs.derived().data()[l1jsize+l2k]);
-asm("#eigen begincore");
-for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
-{
-// PacketScalar tmp = m_rhs.template packetCoeff<Aligned>(k, l1j);
-// TODO make this branching compile time (costly for doubles)
-if (rhsIsAligned)
-tmp = ei_pload(&m_rhs.derived().data()[l1jsize + k]);
-else
-tmp = ei_ploadu(&m_rhs.derived().data()[l1jsize + k]);
-
-b0 = ei_pload(&(localB[k*bw]));
-b1 = ei_pload(&(localB[k*bw+ps]));
-dst[0] = ei_pmadd(tmp, b0, dst[0]);
-b0 = ei_pload(&(localB[k*bw+2*ps]));
-dst[1] = ei_pmadd(tmp, b1, dst[1]);
-b1 = ei_pload(&(localB[k*bw+3*ps]));
-dst[2] = ei_pmadd(tmp, b0, dst[2]);
-if (MaxBlockRows==8)
-b0 = ei_pload(&(localB[k*bw+4*ps]));
-dst[3] = ei_pmadd(tmp, b1, dst[3]);
-if (MaxBlockRows==8)
-{
-b1 = ei_pload(&(localB[k*bw+5*ps]));
-dst[4] = ei_pmadd(tmp, b0, dst[4]);
-b0 = ei_pload(&(localB[k*bw+6*ps]));
-dst[5] = ei_pmadd(tmp, b1, dst[5]);
-b1 = ei_pload(&(localB[k*bw+7*ps]));
-dst[6] = ei_pmadd(tmp, b0, dst[6]);
-dst[7] = ei_pmadd(tmp, b1, dst[7]);
-}
-}
-
-// if (resIsAligned)
-{
-res.template writePacketCoeff<Aligned>(l1i, l1j, ei_padd(res.template packetCoeff<Aligned>(l1i, l1j), ei_preduxp(dst)));
-if (PacketSize==2)
-res.template writePacketCoeff<Aligned>(l1i+2,l1j, ei_padd(res.template packetCoeff<Aligned>(l1i+2,l1j), ei_preduxp(&(dst[2]))));
-if (MaxBlockRows==8)
-{
-res.template writePacketCoeff<Aligned>(l1i+4,l1j, ei_padd(res.template packetCoeff<Aligned>(l1i+4,l1j), ei_preduxp(&(dst[4]))));
-if (PacketSize==2)
-res.template writePacketCoeff<Aligned>(l1i+6,l1j, ei_padd(res.template packetCoeff<Aligned>(l1i+6,l1j), ei_preduxp(&(dst[6]))));
-}
-}
-// else
-// {
-// // TODO uncommenting this code kill the perf, even though it is never called !!
-// // this is because dst cannot be a set of registers only
-// // TODO optimize this loop
-// // TODO is it better to do one redux at once or packet reduxes + unaligned store ?
-// for (int w = 0; w<bw; ++w)
-// res.coeffRef(l1i+w, l1j) += ei_predux(dst[w]);
-// std::cout << "!\n";
-// }
-
-asm("#eigen endcore");
-}
-#endif
-}
-if (l2blockRemainingRows>0)
-{
-// this is an attempt to build an array of kernels, but I did not manage to get it compiles
-// typedef void (*Kernel)(DestDerived& , int, int, int, int, int, int, int, const Scalar*);
-// Kernel kernels[8];
-// kernels[0] = (Kernel)(&Product<Lhs,Rhs,EvalMode>::template _cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 1>);
-// kernels[l2blockRemainingRows](res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block);
-
-switch(l2blockRemainingRows)
-{
-case 1:_cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 1>(
-res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break;
-case 2:_cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 2>(
-res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break;
-case 3:_cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 3>(
-res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break;
-case 4:_cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 4>(
-res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break;
-case 5:_cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 5>(
-res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break;
-case 6:_cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 6>(
-res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break;
-case 7:_cacheFriendlyEvalKernel<DestDerived, RhsAlignment, ResAlignment, 7>(
-res, l2i, l2j, l2k, l2blockRowEndBW, l2blockRowEnd, l2blockColEnd, l2blockSizeEnd, block); break;
-default:
-ei_internal_assert(false && "internal error"); break;
-}
-}
-}
-}
-}
-
-// handle the part which cannot be processed by the vectorized path
-if (remainingSize)
-{
-res += Product<
-Block<typename ei_unconst<_LhsNested>::type,Dynamic,Dynamic>,
-Block<typename ei_unconst<_RhsNested>::type,Dynamic,Dynamic>,
-NormalProduct>(
-m_lhs.block(0,size, _rows(), remainingSize),
-m_rhs.block(size,0, remainingSize, _cols())).lazy();
-// res += m_lhs.block(0,size, _rows(), remainingSize)._lazyProduct(m_rhs.block(size,0, remainingSize, _cols()));
-}
-
-// delete[] block;
+ei_cache_friendly_product<Scalar>(
+_rows(), _cols(), m_lhs.cols(),
+_LhsNested::Flags&RowMajorBit, &(m_lhs.const_cast_derived().coeffRef(0,0)), m_lhs.stride(),
+_RhsNested::Flags&RowMajorBit, &(m_rhs.const_cast_derived().coeffRef(0,0)), m_rhs.stride(),
+Flags&RowMajorBit, &(res.coeffRef(0,0)), res.stride()
+);
 }

 #endif // EIGEN_PRODUCT_H
Transpose.h
@@ -69,6 +69,8 @@ template<typename MatrixType> class Transpose
 int _rows() const { return m_matrix.cols(); }
 int _cols() const { return m_matrix.rows(); }

+int _stride(void) const { return m_matrix.stride(); }
+
 Scalar& _coeffRef(int row, int col)
 {
 return m_matrix.const_cast_derived().coeffRef(col, row);
Triangular.h
@@ -67,7 +67,7 @@ struct ei_traits<Triangular<Mode, MatrixType> >
 ColsAtCompileTime = MatrixType::ColsAtCompileTime,
 MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
 MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-Flags = (_MatrixTypeNested::Flags & ~(VectorizableBit | Like1DArrayBit)) | Mode,
+Flags = (_MatrixTypeNested::Flags & ~(VectorizableBit | Like1DArrayBit | DirectAccessBit)) | Mode,
 CoeffReadCost = _MatrixTypeNested::CoeffReadCost
 };
 };
Constants.h
@@ -43,18 +43,18 @@ const unsigned int NullDiagBit = 0x40; ///< means all diagonal coefficients
 const unsigned int UnitDiagBit = 0x80; ///< means all diagonal coefficients are equal to 1
 const unsigned int NullLowerBit = 0x200; ///< means the strictly triangular lower part is 0
 const unsigned int NullUpperBit = 0x400; ///< means the strictly triangular upper part is 0
-const unsigned int ReferencableBit = 0x800; ///< means the expression is writable through MatrixBase::coeffRef(int,int)
+const unsigned int DirectAccessBit = 0x800; ///< means the underlying matrix data can be direclty accessed

 enum { Upper=NullLowerBit, Lower=NullUpperBit };
 enum { Aligned=0, UnAligned=1 };

 // list of flags that are lost by default
-const unsigned int DefaultLostFlagMask = ~(VectorizableBit | Like1DArrayBit | ReferencableBit
+const unsigned int DefaultLostFlagMask = ~(VectorizableBit | Like1DArrayBit | DirectAccessBit
 | NullDiagBit | UnitDiagBit | NullLowerBit | NullUpperBit);

 enum { ConditionalJumpCost = 5 };
 enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
 enum DirectionType { Vertical, Horizontal };
-enum ProductEvaluationMode { NormalProduct, CacheOptimalProduct, LazyProduct};
+enum ProductEvaluationMode { NormalProduct, CacheFriendlyProduct, LazyProduct};

 #endif // EIGEN_CONSTANTS_H
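The flag plumbing throughout this commit follows one pattern: each expression's traits AND the nested type's Flags with a mask, so DirectAccessBit survives only where direct data access is genuinely possible (Block, Map and Transpose keep it; Triangular masks it out). A toy illustration of that masking; the bit values besides DirectAccessBit and the simplified DefaultLostFlagMask are illustrative, not the real Constants.h definitions:

#include <cstdio>

const unsigned int VectorizableBit = 0x100;  // illustrative value only
const unsigned int DirectAccessBit = 0x800;  // value from the new Constants.h
const unsigned int DefaultLostFlagMask = ~(VectorizableBit | DirectAccessBit); // simplified

int main()
{
  unsigned int matrixFlags = VectorizableBit | DirectAccessBit;
  // Block<> / Map<> style: re-grant DirectAccessBit through the mask
  unsigned int blockFlags = matrixFlags & (DefaultLostFlagMask | DirectAccessBit);
  // Triangular<> style: strip it, since its coefficients are not plain memory
  unsigned int triFlags = matrixFlags & ~(VectorizableBit | DirectAccessBit);
  std::printf("block keeps direct access: %d\n", (blockFlags & DirectAccessBit) != 0);
  std::printf("triangular keeps direct access: %d\n", (triFlags & DirectAccessBit) != 0);
  return 0;
}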
848
Eigen/src/QR/EigenSolver.h
Normal file
848
Eigen/src/QR/EigenSolver.h
Normal file
@ -0,0 +1,848 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra. Eigen itself is part of the KDE project.
//
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.

#ifndef EIGEN_EIGENSOLVER_H
#define EIGEN_EIGENSOLVER_H

/** \class EigenSolver
  *
  * \brief Eigenvalues/eigenvectors solver
  *
  * \param MatrixType the type of the matrix of which we are computing the eigendecomposition
  *
  * \note this code was adapted from JAMA (public domain)
  *
  * \sa MatrixBase::eigenvalues()
  */
template<typename _MatrixType> class EigenSolver
{
  public:

    typedef _MatrixType MatrixType;
    typedef typename MatrixType::Scalar Scalar;
    typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> VectorType;

    EigenSolver(const MatrixType& matrix)
      : m_eivec(matrix.rows(), matrix.cols()),
        m_eivalr(matrix.cols()), m_eivali(matrix.cols()),
        m_H(matrix.rows(), matrix.cols()),
        m_ort(matrix.cols())
    {
      _compute(matrix);
    }

    MatrixType eigenvectors(void) const { return m_eivec; }

    VectorType eigenvalues(void) const { return m_eivalr; }

  private:

    void _compute(const MatrixType& matrix);

    void tridiagonalization(void);
    void tql2(void);

    void orthes(void);
    void hqr2(void);

  protected:
    MatrixType m_eivec;
    VectorType m_eivalr, m_eivali;
    MatrixType m_H;
    VectorType m_ort;
    bool m_isSymmetric;
};
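
// A minimal usage sketch (illustrative only; MatrixXd/VectorXd and the
// lowercase random() factory are assumed to be available as elsewhere in
// this tree):
//
//   MatrixXd m = MatrixXd::random(5,5);
//   EigenSolver<MatrixXd> solver(m);
//   MatrixXd vecs = solver.eigenvectors();
//   VectorXd vals = solver.eigenvalues(); // real parts only in this draft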

template<typename MatrixType>
void EigenSolver<MatrixType>::_compute(const MatrixType& matrix)
{
  assert(matrix.cols() == matrix.rows());

  m_isSymmetric = true;
  int n = matrix.cols();
  for (int j = 0; (j < n) && m_isSymmetric; j++) {
    for (int i = 0; (i < j) && m_isSymmetric; i++) {
      m_isSymmetric = (matrix(i,j) == matrix(j,i));
    }
  }

  m_eivalr.resize(n,1);
  m_eivali.resize(n,1);

  if (m_isSymmetric)
  {
    m_eivec = matrix;

    // Tridiagonalize.
    tridiagonalization();

    // Diagonalize.
    tql2();
  }
  else
  {
    m_H = matrix;
    m_ort.resize(n, 1);

    // Reduce to Hessenberg form.
    orthes();

    // Reduce Hessenberg to real Schur form.
    hqr2();
  }
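  // (presumably leftover debug output from this draft: it prints the
  // imaginary parts, which the public API does not expose yet)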
  std::cout << m_eivali.transpose() << "\n";
}

// Symmetric Householder reduction to tridiagonal form.
template<typename MatrixType>
void EigenSolver<MatrixType>::tridiagonalization(void)
{
  // This is derived from the Algol procedures tred2 by
  // Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
  // Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
  // Fortran subroutine in EISPACK.

  int n = m_eivec.cols();
  m_eivalr = m_eivec.row(m_eivalr.size()-1);

  // Householder reduction to tridiagonal form.
  for (int i = n-1; i > 0; i--)
  {
    // Scale to avoid under/overflow.
    Scalar scale = 0.0;
    Scalar h = 0.0;
    scale = m_eivalr.start(i).cwiseAbs().sum();

    if (scale == 0.0)
    {
      m_eivali[i] = m_eivalr[i-1];
      m_eivalr.start(i) = m_eivec.row(i-1).start(i);
      m_eivec.corner(TopLeft, i, i) = m_eivec.corner(TopLeft, i, i).diagonal().asDiagonal();
    }
    else
    {
      // Generate Householder vector.
      m_eivalr.start(i) /= scale;
      h = m_eivalr.start(i).cwiseAbs2().sum();

      Scalar f = m_eivalr[i-1];
      Scalar g = ei_sqrt(h);
      if (f > 0)
        g = -g;
      m_eivali[i] = scale * g;
      h = h - f * g;
      m_eivalr[i-1] = f - g;
      m_eivali.start(i).setZero();

      // Apply similarity transformation to remaining columns.
      for (int j = 0; j < i; j++)
      {
        f = m_eivalr[j];
        m_eivec(j,i) = f;
        g = m_eivali[j] + m_eivec(j,j) * f;
        int bSize = i-j-1;
        if (bSize>0)
        {
          g += (m_eivec.col(j).block(j+1, bSize).transpose() * m_eivalr.block(j+1, bSize))(0,0);
          m_eivali.block(j+1, bSize) += m_eivec.col(j).block(j+1, bSize) * f;
        }
        m_eivali[j] = g;
      }

      f = (m_eivali.start(i).transpose() * m_eivalr.start(i))(0,0);
      m_eivali.start(i) = (m_eivali.start(i) - (f / (h + h)) * m_eivalr.start(i))/h;

      m_eivec.corner(TopLeft, i, i).lower() -=
        ( (m_eivali.start(i) * m_eivalr.start(i).transpose()).lazy()
        + (m_eivalr.start(i) * m_eivali.start(i).transpose()).lazy());

      m_eivalr.start(i) = m_eivec.row(i-1).start(i);
      m_eivec.row(i).start(i).setZero();
    }
    m_eivalr[i] = h;
  }

  // Accumulate transformations.
  for (int i = 0; i < n-1; i++)
  {
    m_eivec(n-1,i) = m_eivec(i,i);
    m_eivec(i,i) = 1.0;
    Scalar h = m_eivalr[i+1];
    // FIXME this does not look very stable ;)
    if (h != 0.0)
    {
      m_eivalr.start(i+1) = m_eivec.col(i+1).start(i+1) / h;
      m_eivec.corner(TopLeft, i+1, i+1) -= m_eivalr.start(i+1)
        * ( m_eivec.col(i+1).start(i+1).transpose() * m_eivec.corner(TopLeft, i+1, i+1) );
    }
    m_eivec.col(i+1).start(i+1).setZero();
  }
  m_eivalr = m_eivec.row(m_eivalr.size()-1);
  m_eivec.row(m_eivalr.size()-1).setZero();
  m_eivec(n-1,n-1) = 1.0;
  m_eivali[0] = 0.0;
}
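
// Note on the reduction above: with the EISPACK scaling, h ends up equal
// to u^T u / 2 for the Householder vector u, so each step applies the
// reflector P = I - u u^T / h = I - 2 u u^T / (u^T u) as a similarity
// transformation P A P, zeroing row/column i beyond the first subdiagonal;
// after n-1 steps the diagonal sits in m_eivalr and the subdiagonal in
// m_eivali.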

// Symmetric tridiagonal QL algorithm.
template<typename MatrixType>
void EigenSolver<MatrixType>::tql2(void)
{
  // This is derived from the Algol procedures tql2, by
  // Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
  // Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
  // Fortran subroutine in EISPACK.

  int n = m_eivalr.size();

  for (int i = 1; i < n; i++) {
    m_eivali[i-1] = m_eivali[i];
  }
  m_eivali[n-1] = 0.0;

  Scalar f = 0.0;
  Scalar tst1 = 0.0;
  Scalar eps = std::pow(2.0,-52.0);
  for (int l = 0; l < n; l++)
  {
    // Find small subdiagonal element
    tst1 = std::max(tst1,ei_abs(m_eivalr[l]) + ei_abs(m_eivali[l]));
    int m = l;

    while ( (m < n) && (ei_abs(m_eivali[m]) > eps*tst1) )
      m++;

    // If m == l, m_eivalr[l] is an eigenvalue,
    // otherwise, iterate.
    if (m > l)
    {
      int iter = 0;
      do
      {
        iter = iter + 1;

        // Compute implicit shift
        Scalar g = m_eivalr[l];
        Scalar p = (m_eivalr[l+1] - g) / (2.0 * m_eivali[l]);
        Scalar r = hypot(p,1.0);
        if (p < 0)
          r = -r;

        m_eivalr[l] = m_eivali[l] / (p + r);
        m_eivalr[l+1] = m_eivali[l] * (p + r);
        Scalar dl1 = m_eivalr[l+1];
        Scalar h = g - m_eivalr[l];
        if (l+2<n)
          m_eivalr.end(n-l-2) -= VectorType::constant(n-l-2, h);
        f = f + h;

        // Implicit QL transformation.
        p = m_eivalr[m];
        Scalar c = 1.0;
        Scalar c2 = c;
        Scalar c3 = c;
        Scalar el1 = m_eivali[l+1];
        Scalar s = 0.0;
        Scalar s2 = 0.0;
        for (int i = m-1; i >= l; i--)
        {
          c3 = c2;
          c2 = c;
          s2 = s;
          g = c * m_eivali[i];
          h = c * p;
          r = hypot(p,m_eivali[i]);
          m_eivali[i+1] = s * r;
          s = m_eivali[i] / r;
          c = p / r;
          p = c * m_eivalr[i] - s * g;
          m_eivalr[i+1] = h + s * (c * g + s * m_eivalr[i]);

          // Accumulate transformation.
          for (int k = 0; k < n; k++)
          {
            h = m_eivec(k,i+1);
            m_eivec(k,i+1) = s * m_eivec(k,i) + c * h;
            m_eivec(k,i) = c * m_eivec(k,i) - s * h;
          }
        }
        p = -s * s2 * c3 * el1 * m_eivali[l] / dl1;
        m_eivali[l] = s * p;
        m_eivalr[l] = c * p;

        // Check for convergence.
      } while (ei_abs(m_eivali[l]) > eps*tst1);
    }
    m_eivalr[l] = m_eivalr[l] + f;
    m_eivali[l] = 0.0;
  }

  // Sort eigenvalues and corresponding vectors.
  // TODO use a better sort algorithm !!
  for (int i = 0; i < n-1; i++)
  {
    int k = i;
    Scalar minValue = m_eivalr[i];
    for (int j = i+1; j < n; j++)
    {
      if (m_eivalr[j] < minValue)
      {
        k = j;
        minValue = m_eivalr[j];
      }
    }
    if (k != i)
    {
      std::swap(m_eivalr[i], m_eivalr[k]);
      m_eivec.col(i).swap(m_eivec.col(k));
    }
  }
}
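
// Note on the implicit shift above: with d = m_eivalr and e = m_eivali,
//   p = (d[l+1] - d[l]) / (2 e[l]),   r = hypot(p, 1),
// the shift mu = d[l] - e[l] / (p + sign(p) r) is Wilkinson's shift, i.e.
// the eigenvalue of the 2x2 block [d[l] e[l]; e[l] d[l+1]] closest to d[l];
// it makes the QL sweeps converge (almost always) cubically.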

// Nonsymmetric reduction to Hessenberg form.
template<typename MatrixType>
void EigenSolver<MatrixType>::orthes(void)
{
  // This is derived from the Algol procedures orthes and ortran,
  // by Martin and Wilkinson, Handbook for Auto. Comp.,
  // Vol.ii-Linear Algebra, and the corresponding
  // Fortran subroutines in EISPACK.

  int n = m_eivec.cols();
  int low = 0;
  int high = n-1;

  for (int m = low+1; m <= high-1; m++)
  {
    // Scale column.
    Scalar scale = m_H.block(m, m-1, high-m+1, 1).cwiseAbs().sum();
    if (scale != 0.0)
    {
      // Compute Householder transformation.
      Scalar h = 0.0;
      // FIXME could be rewritten, but this one looks better wrt cache
      for (int i = high; i >= m; i--)
      {
        m_ort[i] = m_H(i,m-1)/scale;
        h += m_ort[i] * m_ort[i];
      }
      Scalar g = ei_sqrt(h);
      if (m_ort[m] > 0)
        g = -g;
      h = h - m_ort[m] * g;
      m_ort[m] = m_ort[m] - g;

      // Apply Householder similarity transformation
      // H = (I-u*u'/h)*H*(I-u*u'/h)
      int bSize = high-m+1;
      m_H.block(m, m, bSize, n-m) -= ((m_ort.block(m, bSize)/h)
        * (m_ort.block(m, bSize).transpose() * m_H.block(m, m, bSize, n-m)).lazy()).lazy();

      m_H.block(0, m, high+1, bSize) -= ((m_H.block(0, m, high+1, bSize) * m_ort.block(m, bSize)).lazy()
        * (m_ort.block(m, bSize)/h).transpose()).lazy();

      m_ort[m] = scale*m_ort[m];
      m_H(m,m-1) = scale*g;
    }
  }

  // Accumulate transformations (Algol's ortran).
  m_eivec.setIdentity();

  for (int m = high-1; m >= low+1; m--)
  {
    if (m_H(m,m-1) != 0.0)
    {
      m_ort.block(m+1, high-m) = m_H.col(m-1).block(m+1, high-m);

      int bSize = high-m+1;
      m_eivec.block(m, m, bSize, bSize) += ( (m_ort.block(m, bSize) / (m_H(m,m-1) * m_ort[m] ) )
        * (m_ort.block(m, bSize).transpose() * m_eivec.block(m, m, bSize, bSize)).lazy());
    }
  }
}
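
// Background: an upper Hessenberg matrix is zero below its first
// subdiagonal. Since H = P^T A P with P orthogonal (accumulated into
// m_eivec by the ortran part), H has the same eigenvalues as the input,
// and the QR iteration in hqr2() only has to chase a single subdiagonal
// instead of a full lower triangle.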

// Complex scalar division.
template<typename Scalar>
std::complex<Scalar> cdiv(Scalar xr, Scalar xi, Scalar yr, Scalar yi)
{
  Scalar r,d;
  if (ei_abs(yr) > ei_abs(yi))
  {
    r = yi/yr;
    d = yr + r*yi;
    return std::complex<Scalar>((xr + r*xi)/d, (xi - r*xr)/d);
  }
  else
  {
    r = yr/yi;
    d = yi + r*yr;
    return std::complex<Scalar>((r*xr + xi)/d, (r*xi - xr)/d);
  }
}
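
// The branches above implement Smith's algorithm for complex division:
// scaling by the larger of |yr| and |yi| keeps the intermediate products
// from overflowing whenever the quotient itself is representable. Worked
// check with (1+2i)/(3+4i), which takes the else branch:
//   r = 3/4,  d = 4 + (3/4)*3 = 25/4,
//   real = (3/4*1 + 2) / (25/4) = 11/25,  imag = (3/4*2 - 1) / (25/4) = 2/25,
// which matches (1+2i)(3-4i)/25 = (11+2i)/25.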

// Nonsymmetric reduction from Hessenberg to real Schur form.
template<typename MatrixType>
void EigenSolver<MatrixType>::hqr2(void)
{
  // This is derived from the Algol procedure hqr2,
  // by Martin and Wilkinson, Handbook for Auto. Comp.,
  // Vol.ii-Linear Algebra, and the corresponding
  // Fortran subroutine in EISPACK.

  // Initialize
  int nn = m_eivec.cols();
  int n = nn-1;
  int low = 0;
  int high = nn-1;
  Scalar eps = pow(2.0,-52.0);
  Scalar exshift = 0.0;
  Scalar p=0,q=0,r=0,s=0,z=0,t,w,x,y;

  // Store roots isolated by balanc and compute matrix norm
  // FIXME to be efficient the following would require a triangular reduction code
  // Scalar norm = m_H.upper().cwiseAbs().sum() + m_H.corner(BottomLeft,n,n).diagonal().cwiseAbs().sum();
  Scalar norm = 0.0;
  for (int j = 0; j < nn; j++)
  {
    // FIXME what's the purpose of the following since the condition is always false
    if ((j < low) || (j > high))
    {
      m_eivalr[j] = m_H(j,j);
      m_eivali[j] = 0.0;
    }
    norm += m_H.col(j).start(std::min(j+1,nn)).cwiseAbs().sum();
  }

  // Outer loop over eigenvalue index
  int iter = 0;
  while (n >= low)
  {
    // Look for single small sub-diagonal element
    int l = n;
    while (l > low)
    {
      s = ei_abs(m_H(l-1,l-1)) + ei_abs(m_H(l,l));
      if (s == 0.0)
        s = norm;
      if (ei_abs(m_H(l,l-1)) < eps * s)
        break;
      l--;
    }

    // Check for convergence
    // One root found
    if (l == n)
    {
      m_H(n,n) = m_H(n,n) + exshift;
      m_eivalr[n] = m_H(n,n);
      m_eivali[n] = 0.0;
      n--;
      iter = 0;
    }
    else if (l == n-1) // Two roots found
    {
      w = m_H(n,n-1) * m_H(n-1,n);
      p = (m_H(n-1,n-1) - m_H(n,n)) / 2.0;
      q = p * p + w;
      z = ei_sqrt(ei_abs(q));
      m_H(n,n) = m_H(n,n) + exshift;
      m_H(n-1,n-1) = m_H(n-1,n-1) + exshift;
      x = m_H(n,n);

      // Scalar pair
      if (q >= 0)
      {
        if (p >= 0)
          z = p + z;
        else
          z = p - z;

        m_eivalr[n-1] = x + z;
        m_eivalr[n] = m_eivalr[n-1];
        if (z != 0.0)
          m_eivalr[n] = x - w / z;

        m_eivali[n-1] = 0.0;
        m_eivali[n] = 0.0;
        x = m_H(n,n-1);
        s = ei_abs(x) + ei_abs(z);
        p = x / s;
        q = z / s;
        r = ei_sqrt(p * p+q * q);
        p = p / r;
        q = q / r;

        // Row modification
        for (int j = n-1; j < nn; j++)
        {
          z = m_H(n-1,j);
          m_H(n-1,j) = q * z + p * m_H(n,j);
          m_H(n,j) = q * m_H(n,j) - p * z;
        }

        // Column modification
        for (int i = 0; i <= n; i++)
        {
          z = m_H(i,n-1);
          m_H(i,n-1) = q * z + p * m_H(i,n);
          m_H(i,n) = q * m_H(i,n) - p * z;
        }

        // Accumulate transformations
        for (int i = low; i <= high; i++)
        {
          z = m_eivec(i,n-1);
          m_eivec(i,n-1) = q * z + p * m_eivec(i,n);
          m_eivec(i,n) = q * m_eivec(i,n) - p * z;
        }
      }
      else // Complex pair
      {
        m_eivalr[n-1] = x + p;
        m_eivalr[n] = x + p;
        m_eivali[n-1] = z;
        m_eivali[n] = -z;
      }
      n = n - 2;
      iter = 0;
    }
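    // Note on the 2x2 deflation above: p and q are the half trace-difference
    // and the discriminant of the trailing 2x2 block, whose eigenvalues are
    // m_H(n,n) + p +- sqrt(q). For q >= 0 this gives two real roots, computed
    // as x + z and x - w/z to avoid cancellation; for q < 0 it gives the
    // complex conjugate pair x + p +- i*z.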
    else // No convergence yet
    {
      // Form shift
      x = m_H(n,n);
      y = 0.0;
      w = 0.0;
      if (l < n)
      {
        y = m_H(n-1,n-1);
        w = m_H(n,n-1) * m_H(n-1,n);
      }

      // Wilkinson's original ad hoc shift
      if (iter == 10)
      {
        exshift += x;
        for (int i = low; i <= n; i++)
          m_H(i,i) -= x;
        s = ei_abs(m_H(n,n-1)) + ei_abs(m_H(n-1,n-2));
        x = y = 0.75 * s;
        w = -0.4375 * s * s;
      }

      // MATLAB's new ad hoc shift
      if (iter == 30)
      {
        s = (y - x) / 2.0;
        s = s * s + w;
        if (s > 0)
        {
          s = ei_sqrt(s);
          if (y < x)
            s = -s;
          s = x - w / ((y - x) / 2.0 + s);
          for (int i = low; i <= n; i++)
            m_H(i,i) -= s;
          exshift += s;
          x = y = w = 0.964;
        }
      }

      iter = iter + 1; // (Could check iteration count here.)

      // Look for two consecutive small sub-diagonal elements
      int m = n-2;
      while (m >= l)
      {
        z = m_H(m,m);
        r = x - z;
        s = y - z;
        p = (r * s - w) / m_H(m+1,m) + m_H(m,m+1);
        q = m_H(m+1,m+1) - z - r - s;
        r = m_H(m+2,m+1);
        s = ei_abs(p) + ei_abs(q) + ei_abs(r);
        p = p / s;
        q = q / s;
        r = r / s;
        if (m == l) {
          break;
        }
        if (ei_abs(m_H(m,m-1)) * (ei_abs(q) + ei_abs(r)) <
            eps * (ei_abs(p) * (ei_abs(m_H(m-1,m-1)) + ei_abs(z) +
            ei_abs(m_H(m+1,m+1)))))
        {
          break;
        }
        m--;
      }

      for (int i = m+2; i <= n; i++)
      {
        m_H(i,i-2) = 0.0;
        if (i > m+2)
          m_H(i,i-3) = 0.0;
      }

      // Double QR step involving rows l:n and columns m:n
      for (int k = m; k <= n-1; k++)
      {
        int notlast = (k != n-1);
        if (k != m) {
          p = m_H(k,k-1);
          q = m_H(k+1,k-1);
          r = (notlast ? m_H(k+2,k-1) : 0.0);
          x = ei_abs(p) + ei_abs(q) + ei_abs(r);
          if (x != 0.0)
          {
            p = p / x;
            q = q / x;
            r = r / x;
          }
        }

        if (x == 0.0)
          break;

        s = ei_sqrt(p * p + q * q + r * r);

        if (p < 0)
          s = -s;

        if (s != 0)
        {
          if (k != m)
            m_H(k,k-1) = -s * x;
          else if (l != m)
            m_H(k,k-1) = -m_H(k,k-1);

          p = p + s;
          x = p / s;
          y = q / s;
          z = r / s;
          q = q / p;
          r = r / p;

          // Row modification
          for (int j = k; j < nn; j++)
          {
            p = m_H(k,j) + q * m_H(k+1,j);
            if (notlast)
            {
              p = p + r * m_H(k+2,j);
              m_H(k+2,j) = m_H(k+2,j) - p * z;
            }
            m_H(k,j) = m_H(k,j) - p * x;
            m_H(k+1,j) = m_H(k+1,j) - p * y;
          }

          // Column modification
          for (int i = 0; i <= std::min(n,k+3); i++)
          {
            p = x * m_H(i,k) + y * m_H(i,k+1);
            if (notlast)
            {
              p = p + z * m_H(i,k+2);
              m_H(i,k+2) = m_H(i,k+2) - p * r;
            }
            m_H(i,k) = m_H(i,k) - p;
            m_H(i,k+1) = m_H(i,k+1) - p * q;
          }

          // Accumulate transformations
          for (int i = low; i <= high; i++)
          {
            p = x * m_eivec(i,k) + y * m_eivec(i,k+1);
            if (notlast)
            {
              p = p + z * m_eivec(i,k+2);
              m_eivec(i,k+2) = m_eivec(i,k+2) - p * r;
            }
            m_eivec(i,k) = m_eivec(i,k) - p;
            m_eivec(i,k+1) = m_eivec(i,k+1) - p * q;
          }
        } // (s != 0)
      } // k loop
    } // check convergence
  } // while (n >= low)
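
  // Note on the loop above: this is the Francis double-shift QR step. The
  // two shifts are the (possibly complex conjugate) eigenvalues of the
  // trailing 2x2 block, applied implicitly: only the first column of
  // (H - s1*I)*(H - s2*I), which is real, is formed (the p, q, r triple),
  // and the resulting bulge is chased down the subdiagonal with 3x3
  // Householder reflectors, so the whole iteration stays in real arithmetic.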

  // Backsubstitute to find vectors of upper triangular form
  if (norm == 0.0)
  {
    return;
  }

  for (n = nn-1; n >= 0; n--)
  {
    p = m_eivalr[n];
    q = m_eivali[n];

    // Scalar vector
    if (q == 0)
    {
      int l = n;
      m_H(n,n) = 1.0;
      for (int i = n-1; i >= 0; i--)
      {
        w = m_H(i,i) - p;
        r = (m_H.row(i).end(nn-l) * m_H.col(n).end(nn-l))(0,0);

        if (m_eivali[i] < 0.0)
        {
          z = w;
          s = r;
        }
        else
        {
          l = i;
          if (m_eivali[i] == 0.0)
          {
            if (w != 0.0)
              m_H(i,n) = -r / w;
            else
              m_H(i,n) = -r / (eps * norm);
          }
          else // Solve real equations
          {
            x = m_H(i,i+1);
            y = m_H(i+1,i);
            q = (m_eivalr[i] - p) * (m_eivalr[i] - p) + m_eivali[i] * m_eivali[i];
            t = (x * s - z * r) / q;
            m_H(i,n) = t;
            if (ei_abs(x) > ei_abs(z))
              m_H(i+1,n) = (-r - w * t) / x;
            else
              m_H(i+1,n) = (-s - y * t) / z;
          }

          // Overflow control
          t = ei_abs(m_H(i,n));
          if ((eps * t) * t > 1)
            m_H.col(n).end(nn-i) /= t;
        }
      }
    }
    else if (q < 0) // Complex vector
    {
      std::complex<Scalar> cc;
      int l = n-1;

      // Last vector component imaginary so matrix is triangular
      if (ei_abs(m_H(n,n-1)) > ei_abs(m_H(n-1,n)))
      {
        m_H(n-1,n-1) = q / m_H(n,n-1);
        m_H(n-1,n) = -(m_H(n,n) - p) / m_H(n,n-1);
      }
      else
      {
        cc = cdiv<Scalar>(0.0,-m_H(n-1,n),m_H(n-1,n-1)-p,q);
        m_H(n-1,n-1) = ei_real(cc);
        m_H(n-1,n) = ei_imag(cc);
      }
      m_H(n,n-1) = 0.0;
      m_H(n,n) = 1.0;
      for (int i = n-2; i >= 0; i--)
      {
        Scalar ra,sa,vr,vi;
        ra = (m_H.row(i).end(nn-l) * m_H.col(n-1).end(nn-l)).lazy()(0,0);
        sa = (m_H.row(i).end(nn-l) * m_H.col(n).end(nn-l)).lazy()(0,0);
        w = m_H(i,i) - p;

        if (m_eivali[i] < 0.0)
        {
          z = w;
          r = ra;
          s = sa;
        }
        else
        {
          l = i;
          if (m_eivali[i] == 0)
          {
            cc = cdiv(-ra,-sa,w,q);
            m_H(i,n-1) = ei_real(cc);
            m_H(i,n) = ei_imag(cc);
          }
          else
          {
            // Solve complex equations
            x = m_H(i,i+1);
            y = m_H(i+1,i);
            vr = (m_eivalr[i] - p) * (m_eivalr[i] - p) + m_eivali[i] * m_eivali[i] - q * q;
            vi = (m_eivalr[i] - p) * 2.0 * q;
            if ((vr == 0.0) && (vi == 0.0))
              vr = eps * norm * (ei_abs(w) + ei_abs(q) + ei_abs(x) + ei_abs(y) + ei_abs(z));

            cc = cdiv(x*r-z*ra+q*sa,x*s-z*sa-q*ra,vr,vi);
            m_H(i,n-1) = ei_real(cc);
            m_H(i,n) = ei_imag(cc);
            if (ei_abs(x) > (ei_abs(z) + ei_abs(q)))
            {
              m_H(i+1,n-1) = (-ra - w * m_H(i,n-1) + q * m_H(i,n)) / x;
              m_H(i+1,n) = (-sa - w * m_H(i,n) - q * m_H(i,n-1)) / x;
            }
            else
            {
              cc = cdiv(-r-y*m_H(i,n-1),-s-y*m_H(i,n),z,q);
              m_H(i+1,n-1) = ei_real(cc);
              m_H(i+1,n) = ei_imag(cc);
            }
          }

          // Overflow control
          t = std::max(ei_abs(m_H(i,n-1)),ei_abs(m_H(i,n)));
          if ((eps * t) * t > 1)
            m_H.block(i, n-1, nn-i, 2) /= t;
        }
      }
    }
  }

  // Vectors of isolated roots
  for (int i = 0; i < nn; i++)
  {
    // FIXME again, what's the purpose of this test?
    // In this algo low==0 and high==nn-1 !!
    if (i < low || i > high)
    {
      m_eivec.row(i).end(nn-i) = m_H.row(i).end(nn-i);
    }
  }

  // Back transformation to get eigenvectors of original matrix
  int bRows = high-low+1;
  for (int j = nn-1; j >= low; j--)
  {
    int bSize = std::min(j,high)-low+1;
    m_eivec.col(j).block(low, bRows) = (m_eivec.block(low, low, bRows, bSize) * m_H.col(j).block(low, bSize));
  }
}

#endif // EIGEN_EIGENSOLVER_H
@ -95,10 +95,11 @@ void QR<MatrixType>::_compute(const MatrixType& matrix)
       m_qr(k,k) += 1.0;

       // apply transformation to remaining columns
-      for (int j = k+1; j < cols; j++)
+      int remainingCols = cols - k - 1;
+      if (remainingCols>0)
       {
-        Scalar s = -(m_qr.col(k).end(remainingSize).transpose() * m_qr.col(j).end(remainingSize))(0,0) / m_qr(k,k);
-        m_qr.col(j).end(remainingSize) += s * m_qr.col(k).end(remainingSize);
+        m_qr.corner(BottomRight, remainingSize, remainingCols) -= (1./m_qr(k,k)) * m_qr.col(k).end(remainingSize)
+          * (m_qr.col(k).end(remainingSize).transpose() * m_qr.corner(BottomRight, remainingSize, remainingCols));
       }
     }
     m_norms[k] = -nrm;
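
The rewritten hunk folds the per-column loop into a single rank-1 update: with
v = m_qr.col(k).end(remainingSize) the Householder direction and d = m_qr(k,k),
the trailing block B becomes B -= (1/d) * v * (v^T * B). The arithmetic is
unchanged, but as one matrix product it can go through the new cache friendly
product path instead of cols-k-1 separate column traversals. A rough
free-standing sketch of the same update (hypothetical helper, not part of the
patch), written element-wise over a column-major array:

  // apply B -= (1/d) * v * (v^T * B), one column at a time
  void applyHouseholder(double* B, const double* v, int rows, int cols, double d)
  {
    for (int j = 0; j < cols; ++j) {
      double s = 0.0;
      for (int i = 0; i < rows; ++i) s += v[i] * B[i + j*rows]; // (v^T B)(j)
      s /= d;
      for (int i = 0; i < rows; ++i) B[i + j*rows] -= v[i] * s;
    }
  }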