commit 101cc03176
Mark Borgerding 2010-03-07 23:46:26 -05:00
125 changed files with 4603 additions and 2195 deletions

View File

@ -98,11 +98,27 @@ if(CMAKE_COMPILER_IS_GNUCXX)
message("Enabling SSE4.2 in tests/examples") message("Enabling SSE4.2 in tests/examples")
endif() endif()
option(EIGEN_TEST_ALTIVEC "Enable/Disable altivec in tests/examples" OFF) option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF)
if(EIGEN_TEST_ALTIVEC) if(EIGEN_TEST_ALTIVEC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
message("Enabling AltiVec in tests/examples") message("Enabling AltiVec in tests/examples")
endif() endif()
option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
if(EIGEN_TEST_NEON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=softfp -mfpu=neon -mcpu=cortex-a8")
message("Enabling NEON in tests/examples")
endif()
check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP)
if(COMPILER_SUPPORT_OPENMP)
option(EIGEN_TEST_OPENMP "Enable/Disable OpenMP in tests/examples" OFF)
if(EIGEN_TEST_OPENMP)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
message("Enabling OpenMP in tests/examples")
endif()
endif()
endif(CMAKE_COMPILER_IS_GNUCXX)
if(MSVC)
@ -124,7 +140,13 @@ option(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION "Disable explicit vectorization in t
if(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION)
add_definitions(-DEIGEN_DONT_VECTORIZE=1)
message("Disabling vectorization in tests/examples")
endif()
option(EIGEN_TEST_NO_EXPLICIT_ALIGNMENT "Disable explicit alignment (hence vectorization) in tests/examples" OFF)
if(EIGEN_TEST_NO_EXPLICIT_ALIGNMENT)
add_definitions(-DEIGEN_DONT_ALIGN=1)
message("Disabling alignment in tests/examples")
endif()
option(EIGEN_TEST_C++0x "Enables all C++0x features." OFF)

View File

@ -2,7 +2,7 @@
// for linear algebra.
//
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
// Copyright (C) 2007-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
@ -29,6 +29,26 @@
// first thing Eigen does: prevent MSVC from committing suicide
#include "src/Core/util/DisableMSVCWarnings.h"
// then include this file where all our macros are defined. It's really important to do it first because
// it's where we do all the alignment settings (platform detection and honoring the user's will if he
// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization.
#include "src/Core/util/Macros.h"
// if alignment is disabled, then disable vectorization. Note: EIGEN_ALIGN is the proper check, it takes into
// account both the user's will (EIGEN_DONT_ALIGN) and our own platform checks
#if !EIGEN_ALIGN
#ifndef EIGEN_DONT_VECTORIZE
#define EIGEN_DONT_VECTORIZE
#endif
#endif
// disable vectorization on LLVM: it's not yet ready for that.
#ifdef __clang__
#ifndef EIGEN_DONT_VECTORIZE
#define EIGEN_DONT_VECTORIZE
#endif
#endif
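
A hedged user-side sketch of the rule just encoded: defining EIGEN_DONT_ALIGN before the first Eigen include should leave EIGEN_DONT_VECTORIZE defined as well, since Macros.h folds EIGEN_DONT_ALIGN into EIGEN_ALIGN. Illustrative only, not part of the library:

// user translation unit
#define EIGEN_DONT_ALIGN
#include <Eigen/Core>
#ifndef EIGEN_DONT_VECTORIZE
#error EIGEN_DONT_ALIGN should have implied EIGEN_DONT_VECTORIZE
#endif
int main() { return 0; }
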
#ifdef _MSC_VER
#include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
#if (_MSC_VER >= 1500) // 2008 or later
@ -40,41 +60,53 @@
#endif
#endif
#ifdef __GNUC__
#define EIGEN_GNUC_AT_LEAST(x,y) ((__GNUC__>=x && __GNUC_MINOR__>=y) || __GNUC__>x)
#else
#define EIGEN_GNUC_AT_LEAST(x,y) 0
#endif
// Remember that usage of defined() in a #define is undefined by the standard
#if (defined __SSE2__) && ( (!defined __GNUC__) || EIGEN_GNUC_AT_LEAST(4,2) )
#define EIGEN_SSE2_BUT_NOT_OLD_GCC
#endif
#ifdef EIGEN_DONT_ALIGN
#define EIGEN_DONT_VECTORIZE
#endif
#ifdef __clang__
#define EIGEN_DONT_VECTORIZE
#endif
#ifndef EIGEN_DONT_VECTORIZE
#if defined (EIGEN_SSE2_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
// Defines symbols for compile-time detection of which instructions are
// used.
// EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_SSE
#define EIGEN_VECTORIZE_SSE2
// Detect sse3/ssse3/sse4:
// gcc and icc defines __SSE3__, ..,
// there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
// want to force the use of those instructions with msvc.
#ifdef __SSE3__
#define EIGEN_VECTORIZE_SSE3
#endif
#ifdef __SSSE3__
#define EIGEN_VECTORIZE_SSSE3
#endif
#ifdef __SSE4_1__
#define EIGEN_VECTORIZE_SSE4_1
#endif
#ifdef __SSE4_2__
#define EIGEN_VECTORIZE_SSE4_2
#endif
// include files
#include <emmintrin.h>
#include <xmmintrin.h>
#ifdef EIGEN_VECTORIZE_SSE3
#include <pmmintrin.h>
#endif
#ifdef EIGEN_VECTORIZE_SSSE3
#include <tmmintrin.h>
#endif
#ifdef EIGEN_VECTORIZE_SSE4_1
#include <smmintrin.h>
#endif
#ifdef EIGEN_VECTORIZE_SSE4_2
#include <nmmintrin.h>
#endif
#elif defined __ALTIVEC__
@ -86,21 +118,39 @@
#undef bool
#undef vector
#undef pixel
#elif defined __ARM_NEON__
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_NEON
#include <arm_neon.h>
#endif
#endif
#ifdef _OPENMP
#define EIGEN_HAS_OPENMP
#endif
#ifdef EIGEN_HAS_OPENMP
#include <omp.h>
#endif
#include <cerrno>
#include <cstdlib>
#include <cmath>
#include <complex>
#include <cassert>
#include <functional>
#include <iosfwd>
#include <cstring>
#include <string>
#include <limits>
// for min/max:
#include <algorithm>
// for outputting debug info
#ifdef EIGEN_DEBUG_ASSIGN
#include<iostream>
#endif
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_NO_EXCEPTIONS)
#define EIGEN_EXCEPTIONS
#endif
@ -119,8 +169,31 @@
// defined in bits/termios.h
#undef B0
// defined in some GNU standard header
#undef minor
namespace Eigen {
inline static const char *SimdInstructionSetsInUse(void) {
#if defined(EIGEN_VECTORIZE_SSE4_2)
return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_SSE4_1)
return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
#elif defined(EIGEN_VECTORIZE_SSSE3)
return "SSE, SSE2, SSE3, SSSE3";
#elif defined(EIGEN_VECTORIZE_SSE3)
return "SSE, SSE2, SSE3";
#elif defined(EIGEN_VECTORIZE_SSE2)
return "SSE, SSE2";
#elif defined(EIGEN_VECTORIZE_ALTIVEC)
return "AltiVec";
#elif defined(EIGEN_VECTORIZE_NEON)
return "ARM NEON";
#else
return "None";
#endif
}
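
A minimal use of the helper defined above, assuming a program compiled against this revision:

#include <Eigen/Core>
#include <iostream>
int main()
{
  // Prints e.g. "SSE, SSE2" when built with -msse2, or "None" without SIMD flags.
  std::cout << Eigen::SimdInstructionSetsInUse() << std::endl;
  return 0;
}
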
// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
// ensure QNX/QCC support
using std::size_t;
@ -138,7 +211,6 @@ using std::size_t;
/** The type used to identify a dense storage. */
struct Dense {};
#include "src/Core/util/Macros.h"
#include "src/Core/util/Constants.h" #include "src/Core/util/Constants.h"
#include "src/Core/util/ForwardDeclarations.h" #include "src/Core/util/ForwardDeclarations.h"
#include "src/Core/util/Meta.h" #include "src/Core/util/Meta.h"
@ -155,11 +227,11 @@ struct Dense {};
#include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/SSE/MathFunctions.h"
#elif defined EIGEN_VECTORIZE_ALTIVEC #elif defined EIGEN_VECTORIZE_ALTIVEC
#include "src/Core/arch/AltiVec/PacketMath.h" #include "src/Core/arch/AltiVec/PacketMath.h"
#elif defined EIGEN_VECTORIZE_NEON
#include "src/Core/arch/NEON/PacketMath.h"
#endif
#include "src/Core/arch/Default/Settings.h"
#include "src/Core/Functors.h" #include "src/Core/Functors.h"
#include "src/Core/DenseBase.h" #include "src/Core/DenseBase.h"
@ -188,6 +260,7 @@ struct Dense {};
#include "src/Core/Dot.h" #include "src/Core/Dot.h"
#include "src/Core/StableNorm.h" #include "src/Core/StableNorm.h"
#include "src/Core/MapBase.h" #include "src/Core/MapBase.h"
#include "src/Core/Stride.h"
#include "src/Core/Map.h" #include "src/Core/Map.h"
#include "src/Core/Block.h" #include "src/Core/Block.h"
#include "src/Core/VectorBlock.h" #include "src/Core/VectorBlock.h"
@ -209,6 +282,7 @@ struct Dense {};
#include "src/Core/TriangularMatrix.h" #include "src/Core/TriangularMatrix.h"
#include "src/Core/SelfAdjointView.h" #include "src/Core/SelfAdjointView.h"
#include "src/Core/SolveTriangular.h" #include "src/Core/SolveTriangular.h"
#include "src/Core/products/Parallelizer.h"
#include "src/Core/products/CoeffBasedProduct.h" #include "src/Core/products/CoeffBasedProduct.h"
#include "src/Core/products/GeneralBlockPanelKernel.h" #include "src/Core/products/GeneralBlockPanelKernel.h"
#include "src/Core/products/GeneralMatrixVector.h" #include "src/Core/products/GeneralMatrixVector.h"
@ -222,28 +296,6 @@ struct Dense {};
#include "src/Core/products/TriangularSolverMatrix.h" #include "src/Core/products/TriangularSolverMatrix.h"
#include "src/Core/BandMatrix.h" #include "src/Core/BandMatrix.h"
/** \defgroup Array_Module Array module
* \ingroup Core_Module
* This module provides several handy features to manipulate matrices as simple array of values.
* In addition to listed classes, it defines various methods of the Cwise interface
* (accessible from MatrixBase::cwise()), including:
* - matrix-scalar sum,
* - coeff-wise comparison operators,
* - sin, cos, sqrt, pow, exp, log, square, cube, inverse (reciprocal).
*
* This module also provides various MatrixBase methods, including:
* - boolean reductions: \ref MatrixBase::all() "all", \ref MatrixBase::any() "any", \ref MatrixBase::count() "count",
* - \ref MatrixBase::Random() "random matrix initialization",
* - a \ref MatrixBase::select() "select" function mimicking the trivariate ?: operator,
* - \ref MatrixBase::colwise() "column-wise" and \ref MatrixBase::rowwise() "row-wise" reductions,
* - \ref MatrixBase::reverse() "matrix reverse",
* - \ref MatrixBase::lpNorm() "generic matrix norm".
*
* \code
* #include <Eigen/Core>
* \endcode
*/
#include "src/Array/Functors.h" #include "src/Array/Functors.h"
#include "src/Array/BooleanRedux.h" #include "src/Array/BooleanRedux.h"
#include "src/Array/Select.h" #include "src/Array/Select.h"

View File

@ -26,7 +26,7 @@
#define EIGEN2SUPPORT_H
#if (!defined(EIGEN2_SUPPORT)) || (!defined(EIGEN_CORE_H))
#error Eigen2 support must be enabled by defining EIGEN2_SUPPORT before including any Eigen header
#endif
#include "src/Core/util/DisableMSVCWarnings.h"
@ -36,6 +36,7 @@ namespace Eigen {
/** \defgroup Eigen2Support_Module Eigen2 support module
  * This module provides a couple of deprecated functions improving the compatibility with Eigen2.
  *
* To use it, define EIGEN2_SUPPORT before including any Eigen header
  * \code
  * #define EIGEN2_SUPPORT
  * \endcode
@ -51,4 +52,7 @@ namespace Eigen {
#include "src/Core/util/EnableMSVCWarnings.h" #include "src/Core/util/EnableMSVCWarnings.h"
// Eigen2 used to include iostream
#include<iostream>
#endif // EIGEN2SUPPORT_H

View File

@ -213,6 +213,9 @@ class Array
void swap(ArrayBase<OtherDerived> EIGEN_REF_TO_TEMPORARY other)
{ this->_swap(other.derived()); }
inline int innerStride() const { return 1; }
inline int outerStride() const { return this->innerSize(); }
#ifdef EIGEN_ARRAY_PLUGIN
#include EIGEN_ARRAY_PLUGIN
#endif

View File

@ -55,7 +55,8 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
inline int rows() const { return m_expression.rows(); }
inline int cols() const { return m_expression.cols(); }
inline int outerStride() const { return m_expression.outerStride(); }
inline int innerStride() const { return m_expression.innerStride(); }
inline const CoeffReturnType coeff(int row, int col) const
{
@ -139,7 +140,8 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
inline int rows() const { return m_expression.rows(); }
inline int cols() const { return m_expression.cols(); }
inline int outerStride() const { return m_expression.outerStride(); }
inline int innerStride() const { return m_expression.innerStride(); }
inline const CoeffReturnType coeff(int row, int col) const
{

View File

@ -81,11 +81,11 @@ template<typename MatrixType, int Direction> class Reverse
typedef typename MatrixType::template MakeBase< Reverse<MatrixType, Direction> >::Type Base;
EIGEN_DENSE_PUBLIC_INTERFACE(Reverse)
using Base::IsRowMajor;
protected:
enum {
PacketSize = ei_packet_traits<Scalar>::size,
IsRowMajor = Flags & RowMajorBit,
IsColMajor = !IsRowMajor,
ReverseRow = (Direction == Vertical) || (Direction == BothDirections),
ReverseCol = (Direction == Horizontal) || (Direction == BothDirections),

View File

@ -62,14 +62,21 @@ template<typename _MatrixType> class LDLT
typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
typedef Matrix<int, 1, MatrixType::RowsAtCompileTime> IntRowVectorType;
/** \brief Default Constructor.
  *
  * The default constructor is useful in cases in which the user intends to
  * perform decompositions via LDLT::compute(const MatrixType&).
  */
LDLT() : m_matrix(), m_p(), m_transpositions(), m_isInitialized(false) {}
/** \brief Default Constructor with memory preallocation
*
* Like the default constructor but with preallocation of the internal data
* according to the specified problem \a size.
* \sa LDLT()
*/
LDLT(int size) : m_matrix(size,size), m_p(size), m_transpositions(size), m_isInitialized(false) {}
LDLT(const MatrixType& matrix)
: m_matrix(matrix.rows(), matrix.cols()),
m_p(matrix.rows()),
@ -148,6 +155,8 @@ template<typename _MatrixType> class LDLT
return m_matrix;
}
MatrixType reconstructedMatrix() const;
inline int rows() const { return m_matrix.rows(); }
inline int cols() const { return m_matrix.cols(); }
@ -175,6 +184,10 @@ LDLT<MatrixType>& LDLT<MatrixType>::compute(const MatrixType& a)
m_matrix = a;
m_p.resize(size);
m_transpositions.resize(size);
m_isInitialized = false;
if (size <= 1) {
m_p.setZero();
m_transpositions.setZero();
@ -202,11 +215,8 @@ LDLT<MatrixType>& LDLT<MatrixType>::compute(const MatrixType& a)
{
// The biggest overall is the point of reference to which further diagonals
// are compared; if any diagonal is negligible compared
// to the largest overall, the algorithm bails.
cutoff = ei_abs(NumTraits<Scalar>::epsilon() * biggest_in_corner);
m_sign = ei_real(m_matrix.diagonal().coeff(index_of_biggest_in_corner)) > 0 ? 1 : -1;
}
@ -231,17 +241,9 @@ LDLT<MatrixType>& LDLT<MatrixType>::compute(const MatrixType& a)
continue;
}
RealScalar Djj = ei_real(m_matrix.coeff(j,j) - m_matrix.row(j).head(j).dot(m_matrix.col(j).head(j)));
m_matrix.coeffRef(j,j) = Djj;
// Finish early if the matrix is not full rank.
if(ei_abs(Djj) < cutoff)
{
for(int i = j; i < size; i++) m_transpositions.coeffRef(i) = i;
break;
}
int endSize = size - j - 1;
if (endSize > 0) {
_temporary.tail(endSize).noalias() = m_matrix.block(j+1,0, endSize, j)
@ -250,9 +252,12 @@ LDLT<MatrixType>& LDLT<MatrixType>::compute(const MatrixType& a)
m_matrix.row(j).tail(endSize) = m_matrix.row(j).tail(endSize).conjugate()
- _temporary.tail(endSize).transpose();
if(ei_abs(Djj) > cutoff)
{
m_matrix.col(j).tail(endSize) = m_matrix.row(j).tail(endSize) / Djj;
}
}
}
// Reverse applied swaps to get P matrix.
for(int k = 0; k < size; ++k) m_p.coeffRef(k) = k;
@ -315,6 +320,31 @@ bool LDLT<MatrixType>::solveInPlace(MatrixBase<Derived> &bAndX) const
return true;
}
/** \returns the matrix represented by the decomposition,
* i.e., it returns the product: P^T L D L^* P.
* This function is provided for debug purpose. */
template<typename MatrixType>
MatrixType LDLT<MatrixType>::reconstructedMatrix() const
{
ei_assert(m_isInitialized && "LDLT is not initialized.");
const int size = m_matrix.rows();
MatrixType res(size,size);
res.setIdentity();
// PI
for(int i = 0; i < size; ++i) res.row(m_transpositions.coeff(i)).swap(res.row(i));
// L^* P
res = matrixL().adjoint() * res;
// D(L^*P)
res = vectorD().asDiagonal() * res;
// L(DL^*P)
res = matrixL() * res;
// P^T (LDL^*P)
for (int i = size-1; i >= 0; --i) res.row(m_transpositions.coeff(i)).swap(res.row(i));
return res;
}
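
A short sketch of the debug use case this targets, also exercising the new preallocating constructor; the input construction and tolerance are assumptions, not part of the API:

#include <Eigen/Core>
#include <Eigen/Cholesky>
#include <cassert>
int main()
{
  Eigen::MatrixXd a = Eigen::MatrixXd::Random(4,4);
  a = (a * a.adjoint()).eval();          // self-adjoint input for LDLT
  Eigen::LDLT<Eigen::MatrixXd> ldlt(4);  // preallocate for a 4x4 problem
  ldlt.compute(a);
  // The reconstruction should match the input up to round-off.
  assert((ldlt.reconstructedMatrix() - a).norm() < 1e-10 * a.norm());
  return 0;
}
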
/** \cholesky_module
  * \returns the Cholesky decomposition with full pivoting without square root of \c *this
  */

View File

@ -133,6 +133,8 @@ template<typename _MatrixType, int _UpLo> class LLT
return m_matrix;
}
MatrixType reconstructedMatrix() const;
inline int rows() const { return m_matrix.rows(); }
inline int cols() const { return m_matrix.cols(); }
@ -295,6 +297,16 @@ bool LLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
return true;
}
/** \returns the matrix represented by the decomposition,
* i.e., it returns the product: L L^*.
* This function is provided for debug purpose. */
template<typename MatrixType, int _UpLo>
MatrixType LLT<MatrixType,_UpLo>::reconstructedMatrix() const
{
ei_assert(m_isInitialized && "LLT is not initialized.");
return matrixL() * matrixL().adjoint().toDenseMatrix();
}
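
The LLT counterpart can be sanity-checked the same way; lltRoundTrips is a hypothetical helper, assuming a positive-definite input:

#include <Eigen/Core>
#include <Eigen/Cholesky>
// Returns true when L L^* reproduces the input to within tol.
bool lltRoundTrips(const Eigen::MatrixXd& a, double tol = 1e-10)
{
  Eigen::LLT<Eigen::MatrixXd> llt(a);
  return (llt.reconstructedMatrix() - a).norm() <= tol * a.norm();
}
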
/** \cholesky_module
  * \returns the LLT decomposition of \c *this
  */

View File

@ -2,7 +2,7 @@
// for linear algebra.
//
// Copyright (C) 2007 Michael Olbrich <michael.olbrich@gmx.net>
// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
//
// Eigen is free software; you can redistribute it and/or
@ -37,34 +37,35 @@ struct ei_assign_traits
public:
enum {
DstIsAligned = Derived::Flags & AlignedBit,
DstHasDirectAccess = Derived::Flags & DirectAccessBit,
SrcIsAligned = OtherDerived::Flags & AlignedBit,
JointAlignment = DstIsAligned && SrcIsAligned ? Aligned : Unaligned
};
private:
enum {
InnerSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::SizeAtCompileTime)
          : int(Derived::Flags)&RowMajorBit ? int(Derived::ColsAtCompileTime)
          : int(Derived::RowsAtCompileTime),
InnerMaxSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::MaxSizeAtCompileTime)
             : int(Derived::Flags)&RowMajorBit ? int(Derived::MaxColsAtCompileTime)
             : int(Derived::MaxRowsAtCompileTime),
MaxSizeAtCompileTime = Derived::SizeAtCompileTime,
PacketSize = ei_packet_traits<typename Derived::Scalar>::size
};
enum {
StorageOrdersAgree = (int(Derived::IsRowMajor) == int(OtherDerived::IsRowMajor)),
MightVectorize = StorageOrdersAgree
&& (int(Derived::Flags) & int(OtherDerived::Flags) & ActualPacketAccessBit),
MayInnerVectorize = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
&& int(DstIsAligned) && int(SrcIsAligned),
MayLinearize = StorageOrdersAgree && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
&& (DstIsAligned || MaxSizeAtCompileTime == Dynamic),
/* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
so it's only good for large enough sizes. */
MaySliceVectorize = MightVectorize && DstHasDirectAccess && int(InnerMaxSize)>=3*PacketSize
/* slice vectorization can be slow, so we only want it if the slices are big, which is
indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
in a fixed-size matrix */
@ -104,16 +105,18 @@ public:
: int(NoUnrolling)
};
#ifdef EIGEN_DEBUG_ASSIGN
static void debug()
{
EIGEN_DEBUG_VAR(DstIsAligned)
EIGEN_DEBUG_VAR(SrcIsAligned)
EIGEN_DEBUG_VAR(JointAlignment)
EIGEN_DEBUG_VAR(InnerSize)
EIGEN_DEBUG_VAR(InnerMaxSize)
EIGEN_DEBUG_VAR(PacketSize)
EIGEN_DEBUG_VAR(StorageOrdersAgree)
EIGEN_DEBUG_VAR(MightVectorize)
EIGEN_DEBUG_VAR(MayLinearize)
EIGEN_DEBUG_VAR(MayInnerVectorize)
EIGEN_DEBUG_VAR(MayLinearVectorize)
EIGEN_DEBUG_VAR(MaySliceVectorize)
@ -123,6 +126,7 @@ public:
EIGEN_DEBUG_VAR(MayUnrollInner)
EIGEN_DEBUG_VAR(Unrolling)
}
#endif
};
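
Since debug() now only exists under EIGEN_DEBUG_ASSIGN, the decision enums can be dumped like this; ei_assign_traits is an internal name in this revision, so treat this as a diagnostic sketch rather than supported API:

#define EIGEN_DEBUG_ASSIGN  // must precede the first Eigen include
#include <Eigen/Core>
int main()
{
  // Prints DstIsAligned, MightVectorize, MayLinearVectorize, Unrolling, ...
  Eigen::ei_assign_traits<Eigen::MatrixXf, Eigen::MatrixXf>::debug();
  return 0;
}
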
/***************************************************************************
@ -137,17 +141,13 @@ template<typename Derived1, typename Derived2, int Index, int Stop>
struct ei_assign_DefaultTraversal_CompleteUnrolling
{
enum {
outer = Index / Derived1::InnerSizeAtCompileTime,
inner = Index % Derived1::InnerSizeAtCompileTime
};
EIGEN_STRONG_INLINE static void run(Derived1 &dst, const Derived2 &src)
{
dst.copyCoeffByOuterInner(outer, inner, src);
ei_assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src);
}
};
@ -161,13 +161,10 @@ struct ei_assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, Stop, St
template<typename Derived1, typename Derived2, int Index, int Stop>
struct ei_assign_DefaultTraversal_InnerUnrolling
{
EIGEN_STRONG_INLINE static void run(Derived1 &dst, const Derived2 &src, int outer)
{
dst.copyCoeffByOuterInner(outer, Index, src);
ei_assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src, outer);
}
};
@ -205,18 +202,14 @@ template<typename Derived1, typename Derived2, int Index, int Stop>
struct ei_assign_innervec_CompleteUnrolling
{
enum {
outer = Index / Derived1::InnerSizeAtCompileTime,
inner = Index % Derived1::InnerSizeAtCompileTime,
JointAlignment = ei_assign_traits<Derived1,Derived2>::JointAlignment
};
EIGEN_STRONG_INLINE static void run(Derived1 &dst, const Derived2 &src)
{
dst.template copyPacketByOuterInner<Derived2, Aligned, JointAlignment>(outer, inner, src);
ei_assign_innervec_CompleteUnrolling<Derived1, Derived2,
Index+ei_packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src);
}
@ -231,13 +224,11 @@ struct ei_assign_innervec_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
template<typename Derived1, typename Derived2, int Index, int Stop>
struct ei_assign_innervec_InnerUnrolling
{
EIGEN_STRONG_INLINE static void run(Derived1 &dst, const Derived2 &src, int outer)
{
dst.template copyPacketByOuterInner<Derived2, Aligned, Aligned>(outer, Index, src);
ei_assign_innervec_InnerUnrolling<Derived1, Derived2,
Index+ei_packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src, outer);
}
};
@ -267,14 +258,9 @@ struct ei_assign_impl<Derived1, Derived2, DefaultTraversal, NoUnrolling>
{
const int innerSize = dst.innerSize();
const int outerSize = dst.outerSize();
for(int outer = 0; outer < outerSize; ++outer)
for(int inner = 0; inner < innerSize; ++inner)
dst.copyCoeffByOuterInner(outer, inner, src);
}
};
@ -293,12 +279,10 @@ struct ei_assign_impl<Derived1, Derived2, DefaultTraversal, InnerUnrolling>
{
EIGEN_STRONG_INLINE static void run(Derived1 &dst, const Derived2 &src)
{
const int outerSize = dst.outerSize();
for(int outer = 0; outer < outerSize; ++outer)
ei_assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, 0, Derived1::InnerSizeAtCompileTime>
::run(dst, src, outer);
}
};
@ -339,14 +323,9 @@ struct ei_assign_impl<Derived1, Derived2, InnerVectorizedTraversal, NoUnrolling>
const int innerSize = dst.innerSize();
const int outerSize = dst.outerSize();
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
for(int outer = 0; outer < outerSize; ++outer)
for(int inner = 0; inner < innerSize; inner+=packetSize)
dst.template copyPacketByOuterInner<Derived2, Aligned, Aligned>(outer, inner, src);
}
};
@ -365,12 +344,10 @@ struct ei_assign_impl<Derived1, Derived2, InnerVectorizedTraversal, InnerUnrolli
{
EIGEN_STRONG_INLINE static void run(Derived1 &dst, const Derived2 &src)
{
const int outerSize = dst.outerSize();
for(int outer = 0; outer < outerSize; ++outer)
ei_assign_innervec_InnerUnrolling<Derived1, Derived2, 0, Derived1::InnerSizeAtCompileTime>
::run(dst, src, outer);
}
};
@ -406,7 +383,7 @@ struct ei_unaligned_assign_impl<false>
template<typename Derived1, typename Derived2>
struct ei_assign_impl<Derived1, Derived2, LinearVectorizedTraversal, NoUnrolling>
{
EIGEN_STRONG_INLINE static void run(Derived1 &dst, const Derived2 &src)
{
const int size = dst.size();
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
@ -418,7 +395,7 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorizedTraversal, NoUnrolling
for(int index = alignedStart; index < alignedEnd; index += packetSize)
{
dst.template copyPacket<Derived2, Aligned, ei_assign_traits<Derived1,Derived2>::JointAlignment>(index, src);
}
ei_unaligned_assign_impl<>::run(src,dst,alignedEnd,size);
@ -452,40 +429,24 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorizedTraversal, NoUnrolling>
const int packetAlignedMask = packetSize - 1;
const int innerSize = dst.innerSize();
const int outerSize = dst.outerSize();
const int alignedStep = (packetSize - dst.outerStride() % packetSize) & packetAlignedMask;
int alignedStart = ei_assign_traits<Derived1,Derived2>::DstIsAligned ? 0
: ei_first_aligned(&dst.coeffRef(0,0), innerSize);
for(int outer = 0; outer < outerSize; ++outer)
{
const int alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
// do the non-vectorizable part of the assignment
for(int inner = 0; inner<alignedStart ; ++inner)
dst.copyCoeffByOuterInner(outer, inner, src);
// do the vectorizable part of the assignment
for(int inner = alignedStart; inner<alignedEnd; inner+=packetSize)
dst.template copyPacketByOuterInner<Derived2, Aligned, Unaligned>(outer, inner, src);
// do the non-vectorizable part of the assignment
for(int inner = alignedEnd; inner<innerSize ; ++inner)
dst.copyCoeffByOuterInner(outer, inner, src);
alignedStart = std::min<int>((alignedStart+alignedStep)%packetSize, innerSize);
}

View File

@ -2,7 +2,7 @@
// for linear algebra.
//
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
@ -100,8 +100,8 @@ template<typename MatrixType, int BlockRows, int BlockCols, int _DirectAccessSta
// The case a 1x1 matrix seems ambiguous, but the result is the same anyway.
m_startRow( (BlockRows==1) && (BlockCols==MatrixType::ColsAtCompileTime) ? i : 0),
m_startCol( (BlockRows==MatrixType::RowsAtCompileTime) && (BlockCols==1) ? i : 0),
m_blockRows(BlockRows==1 ? 1 : matrix.rows()),
m_blockCols(BlockCols==1 ? 1 : matrix.cols())
{
ei_assert( (i>=0) && (
((BlockRows==1) && (BlockCols==MatrixType::ColsAtCompileTime) && i<matrix.rows())
@ -112,7 +112,7 @@ template<typename MatrixType, int BlockRows, int BlockCols, int _DirectAccessSta
*/
inline Block(const MatrixType& matrix, int startRow, int startCol)
: m_matrix(matrix), m_startRow(startRow), m_startCol(startCol),
m_blockRows(BlockRows), m_blockCols(BlockCols)
{
EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
ei_assert(startRow >= 0 && BlockRows >= 1 && startRow + BlockRows <= matrix.rows()
@ -196,8 +196,8 @@ template<typename MatrixType, int BlockRows, int BlockCols, int _DirectAccessSta
#ifdef EIGEN_PARSED_BY_DOXYGEN
/** \sa MapBase::data() */
inline const Scalar* data() const;
inline int innerStride() const;
inline int outerStride() const;
#endif
protected:
@ -260,17 +260,23 @@ class Block<MatrixType,BlockRows,BlockCols,HasDirectAccess>
&& startCol >= 0 && blockCols >= 0 && startCol + blockCols <= matrix.cols());
}
/** \sa MapBase::innerStride() */
inline int innerStride() const
{
return RowsAtCompileTime==1 ? m_matrix.colStride()
: ColsAtCompileTime==1 ? m_matrix.rowStride()
: m_matrix.innerStride();
}
/** \sa MapBase::outerStride() */
inline int outerStride() const
{
return IsVectorAtCompileTime ? this->size() : m_matrix.outerStride();
} }
#ifndef __SUNPRO_CC
// FIXME sunstudio is not friendly with the above friend...
// META-FIXME there is no 'friend' keyword around here. Is this obsolete?
protected:
#endif
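
A hedged illustration of the new Block stride rules: for a dynamic block of a column-major matrix, the inner stride is the parent's (1) and the outer stride is the parent's outer stride, i.e. its row count:

#include <Eigen/Core>
#include <cassert>
int main()
{
  Eigen::MatrixXd m(8, 6);  // column-major by default
  assert(m.block(1, 1, 3, 2).innerStride() == 1);
  assert(m.block(1, 1, 3, 2).outerStride() == 8);
  return 0;
}
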

View File

@ -25,6 +25,24 @@
#ifndef EIGEN_COEFFS_H
#define EIGEN_COEFFS_H
template<typename Derived>
EIGEN_STRONG_INLINE int DenseBase<Derived>::rowIndexByOuterInner(int outer, int inner)
{
return int(Derived::RowsAtCompileTime) == 1 ? 0
: int(Derived::ColsAtCompileTime) == 1 ? inner
: int(Derived::Flags)&RowMajorBit ? outer
: inner;
}
template<typename Derived>
EIGEN_STRONG_INLINE int DenseBase<Derived>::colIndexByOuterInner(int outer, int inner)
{
return int(Derived::ColsAtCompileTime) == 1 ? 0
: int(Derived::RowsAtCompileTime) == 1 ? inner
: int(Derived::Flags)&RowMajorBit ? inner
: outer;
}
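
A standalone sketch of the mapping above, using hypothetical free functions rather than Eigen internals: outer walks the storage-major direction, inner the minor one, and vectors ignore storage order:

#include <cassert>
// Mirrors rowIndexByOuterInner/colIndexByOuterInner for plain ints.
int rowIndex(bool rowMajor, int rows, int cols, int outer, int inner)
{ return rows == 1 ? 0 : cols == 1 ? inner : rowMajor ? outer : inner; }
int colIndex(bool rowMajor, int rows, int cols, int outer, int inner)
{ return cols == 1 ? 0 : rows == 1 ? inner : rowMajor ? inner : outer; }
int main()
{
  // Column-major 3x2: outer indexes columns, inner indexes rows.
  assert(rowIndex(false, 3, 2, 1, 2) == 2 && colIndex(false, 3, 2, 1, 2) == 1);
  // Row-major 3x2: outer indexes rows, inner indexes columns.
  assert(rowIndex(true, 3, 2, 1, 1) == 1 && colIndex(true, 3, 2, 1, 1) == 1);
  return 0;
}
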
/** Short version: don't use this function, use
  * \link operator()(int,int) const \endlink instead.
  *
@ -48,6 +66,14 @@ EIGEN_STRONG_INLINE const typename DenseBase<Derived>::CoeffReturnType DenseBase
return derived().coeff(row, col);
}
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::CoeffReturnType DenseBase<Derived>
::coeffByOuterInner(int outer, int inner) const
{
return coeff(rowIndexByOuterInner(outer, inner),
colIndexByOuterInner(outer, inner));
}
/** \returns the coefficient at given the given row and column.
  *
  * \sa operator()(int,int), operator[](int) const
@ -84,6 +110,14 @@ EIGEN_STRONG_INLINE typename ei_traits<Derived>::Scalar& DenseBase<Derived>
return derived().coeffRef(row, col);
}
template<typename Derived>
EIGEN_STRONG_INLINE typename ei_traits<Derived>::Scalar& DenseBase<Derived>
::coeffRefByOuterInner(int outer, int inner)
{
return coeffRef(rowIndexByOuterInner(outer, inner),
colIndexByOuterInner(outer, inner));
}
/** \returns a reference to the coefficient at given the given row and column.
  *
  * \sa operator()(int,int) const, operator[](int)
@ -205,42 +239,42 @@ EIGEN_STRONG_INLINE typename ei_traits<Derived>::Scalar& DenseBase<Derived>
/** equivalent to operator[](0). */
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::CoeffReturnType DenseBase<Derived>
::x() const { return (*this)[0]; }
/** equivalent to operator[](1). */
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::CoeffReturnType DenseBase<Derived>
::y() const { return (*this)[1]; }
/** equivalent to operator[](2). */
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::CoeffReturnType DenseBase<Derived>
::z() const { return (*this)[2]; }
/** equivalent to operator[](3). */
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::CoeffReturnType DenseBase<Derived>
::w() const { return (*this)[3]; }
/** equivalent to operator[](0). */
template<typename Derived>
EIGEN_STRONG_INLINE typename ei_traits<Derived>::Scalar& DenseBase<Derived>
::x() { return (*this)[0]; }
/** equivalent to operator[](1). */
template<typename Derived>
EIGEN_STRONG_INLINE typename ei_traits<Derived>::Scalar& DenseBase<Derived>
::y() { return (*this)[1]; }
/** equivalent to operator[](2). */
template<typename Derived>
EIGEN_STRONG_INLINE typename ei_traits<Derived>::Scalar& DenseBase<Derived>
::z() { return (*this)[2]; }
/** equivalent to operator[](3). */
template<typename Derived>
EIGEN_STRONG_INLINE typename ei_traits<Derived>::Scalar& DenseBase<Derived>
::w() { return (*this)[3]; }
/** \returns the packet of coefficients starting at the given row and column. It is your responsibility
@ -261,6 +295,15 @@ DenseBase<Derived>::packet(int row, int col) const
return derived().template packet<LoadMode>(row,col);
}
template<typename Derived>
template<int LoadMode>
EIGEN_STRONG_INLINE typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type
DenseBase<Derived>::packetByOuterInner(int outer, int inner) const
{
return packet<LoadMode>(rowIndexByOuterInner(outer, inner),
colIndexByOuterInner(outer, inner));
}
/** Stores the given packet of coefficients, at the given row and column of this expression. It is your responsibility
  * to ensure that a packet really starts there. This method is only available on expressions having the
  * PacketAccessBit.
@ -279,6 +322,16 @@ EIGEN_STRONG_INLINE void DenseBase<Derived>::writePacket
derived().template writePacket<StoreMode>(row,col,x);
}
template<typename Derived>
template<int StoreMode>
EIGEN_STRONG_INLINE void DenseBase<Derived>::writePacketByOuterInner
(int outer, int inner, const typename ei_packet_traits<typename ei_traits<Derived>::Scalar>::type& x)
{
writePacket<StoreMode>(rowIndexByOuterInner(outer, inner),
colIndexByOuterInner(outer, inner),
x);
}
/** \returns the packet of coefficients starting at the given index. It is your responsibility
  * to ensure that a packet really starts there. This method is only available on expressions having the
  * PacketAccessBit and the LinearAccessBit.
@ -346,6 +399,16 @@ EIGEN_STRONG_INLINE void DenseBase<Derived>::copyCoeff(int index, const DenseBas
derived().coeffRef(index) = other.derived().coeff(index);
}
template<typename Derived>
template<typename OtherDerived>
EIGEN_STRONG_INLINE void DenseBase<Derived>::copyCoeffByOuterInner(int outer, int inner, const DenseBase<OtherDerived>& other)
{
const int row = Derived::rowIndexByOuterInner(outer,inner);
const int col = Derived::colIndexByOuterInner(outer,inner);
// derived() is important here: copyCoeff() may be reimplemented in Derived!
derived().copyCoeff(row, col, other);
}
/** \internal Copies the packet at position (row,col) of other into *this.
  *
  * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
@ -379,6 +442,16 @@ EIGEN_STRONG_INLINE void DenseBase<Derived>::copyPacket(int index, const DenseBa
other.derived().template packet<LoadMode>(index));
}
template<typename Derived>
template<typename OtherDerived, int StoreMode, int LoadMode>
EIGEN_STRONG_INLINE void DenseBase<Derived>::copyPacketByOuterInner(int outer, int inner, const DenseBase<OtherDerived>& other)
{
const int row = Derived::rowIndexByOuterInner(outer,inner);
const int col = Derived::colIndexByOuterInner(outer,inner);
// derived() is important here: copyCoeff() may be reimplemented in Derived!
derived().copyPacket<OtherDerived, StoreMode, LoadMode>(row, col, other);
}
template<typename Derived, bool JustReturnZero>
struct ei_first_aligned_impl
{

View File

@ -124,7 +124,14 @@ template<typename Derived> class DenseBase
  * constructed from this one. See the \ref flags "list of flags".
  */
IsRowMajor = RowsAtCompileTime==1 ? 1
: ColsAtCompileTime==1 ? 0
: int(Flags) & RowMajorBit, /**< True if this expression has row-major effective addressing.
For non-vectors, it is like reading the RowMajorBit on the Flags. For vectors, this is
overriden by the convention that row-vectors are row-major and column-vectors are column-major. */
InnerSizeAtCompileTime = int(IsVectorAtCompileTime) ? SizeAtCompileTime
: int(Flags)&RowMajorBit ? ColsAtCompileTime : RowsAtCompileTime,
CoeffReadCost = ei_traits<Derived>::CoeffReadCost,
/**< This is a rough measure of how expensive it is to read one coefficient from
@ -160,31 +167,98 @@ template<typename Derived> class DenseBase
  * In other words, this function returns
  * \code rows()==1 || cols()==1 \endcode
  * \sa rows(), cols(), IsVectorAtCompileTime. */
inline bool isVector() const { return rows()==1 || cols()==1; }
/** \returns the size of the storage major dimension,
* i.e., the number of columns for a columns major matrix, and the number of rows otherwise */
int outerSize() const { return (int(Flags)&RowMajorBit) ? this->rows() : this->cols(); }
/** \returns the size of the inner dimension according to the storage order,
* i.e., the number of rows for a columns major matrix, and the number of cols otherwise */
int innerSize() const { return (int(Flags)&RowMajorBit) ? this->cols() : this->rows(); }
/** \returns the outer size.
  *
* \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension
* with respect to the storage order, i.e., the number of columns for a column-major matrix,
* and the number of rows for a row-major matrix. */
int outerSize() const
{
return IsVectorAtCompileTime ? 1
: (int(Flags)&RowMajorBit) ? this->rows() : this->cols();
}
/** \returns the inner size.
*
* \note For a vector, this is just the size. For a matrix (non-vector), this is the minor dimension
* with respect to the storage order, i.e., the number of rows for a column-major matrix,
* and the number of columns for a row-major matrix. */
int innerSize() const
{
return IsVectorAtCompileTime ? this->size()
: (int(Flags)&RowMajorBit) ? this->cols() : this->rows();
}
/** Only plain matrices/arrays, not expressions, may be resized; therefore the only useful resize methods are
* Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does
  * nothing else.
  */
void resize(int size)
{
ei_assert(size == this->size()
&& "DenseBase::resize() does not actually allow to resize.");
} }
/** Only plain matrices/arrays, not expressions, may be resized; therefore the only useful resize methods are
  * Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does
  * nothing else.
  */
void resize(int rows, int cols)
{
ei_assert(rows == this->rows() && cols == this->cols()
&& "DenseBase::resize() does not actually allow to resize.");
}
/** \returns the pointer increment between two consecutive elements.
*
* \note For vectors, the storage order is ignored. For matrices (non-vectors), we're looking
* at the increment between two consecutive elements within a slice in the inner direction.
*
* \sa outerStride(), rowStride(), colStride()
*/
inline int innerStride() const
{
EIGEN_STATIC_ASSERT(int(Flags)&DirectAccessBit,
THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES)
return derived().innerStride();
}
/** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns
* in a column-major matrix).
*
* \note For vectors, the storage order is ignored, there is only one inner slice, and so this method returns 1.
* For matrices (non-vectors), the notion of inner slice depends on the storage order.
*
* \sa innerStride(), rowStride(), colStride()
*/
inline int outerStride() const
{
EIGEN_STATIC_ASSERT(int(Flags)&DirectAccessBit,
THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES)
return derived().outerStride();
}
inline int stride() const
{
return IsVectorAtCompileTime ? innerStride() : outerStride();
}
/** \returns the pointer increment between two consecutive rows.
*
* \sa innerStride(), outerStride(), colStride()
*/
inline int rowStride() const
{
return IsRowMajor ? outerStride() : innerStride();
}
/** \returns the pointer increment between two consecutive columns.
*
* \sa innerStride(), outerStride(), rowStride()
*/
inline int colStride() const
{
return IsRowMajor ? innerStride() : outerStride();
}
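As a sanity check on the four accessors just introduced, a minimal sketch, assuming a plain column-major matrix (for which innerStride()==1 and outerStride()==rows()):

#include <Eigen/Core>
#include <cassert>
using namespace Eigen;

int main()
{
  MatrixXd m(4,5);                      // column-major by default
  assert(m.innerStride() == 1);         // step between coefficients within a column
  assert(m.outerStride() == m.rows());  // step from one column to the next
  assert(m.rowStride()   == 1);         // column-major: rows are the inner direction
  assert(m.colStride()   == m.outerStride());
  return 0;
}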
#ifndef EIGEN_PARSED_BY_DOXYGEN
@ -206,6 +280,15 @@ template<typename Derived> class DenseBase
typedef Block<Derived, 1, ei_traits<Derived>::ColsAtCompileTime> RowXpr;
#endif // not EIGEN_PARSED_BY_DOXYGEN
const CoeffReturnType x() const;
const CoeffReturnType y() const;
const CoeffReturnType z() const;
const CoeffReturnType w() const;
Scalar& x();
Scalar& y();
Scalar& z();
Scalar& w();
/** Copies \a other into *this. \returns a reference to *this. */
template<typename OtherDerived>
Derived& operator=(const DenseBase<OtherDerived>& other);
@ -242,9 +325,11 @@ template<typename Derived> class DenseBase
CommaInitializer<Derived> operator<< (const DenseBase<OtherDerived>& other);

const CoeffReturnType coeff(int row, int col) const;
const CoeffReturnType coeffByOuterInner(int outer, int inner) const;
const CoeffReturnType operator()(int row, int col) const;

Scalar& coeffRef(int row, int col);
Scalar& coeffRefByOuterInner(int outer, int inner);
Scalar& operator()(int row, int col);

const CoeffReturnType coeff(int index) const;
@ -259,17 +344,30 @@ template<typename Derived> class DenseBase
template<typename OtherDerived>
void copyCoeff(int row, int col, const DenseBase<OtherDerived>& other);
template<typename OtherDerived>
void copyCoeffByOuterInner(int outer, int inner, const DenseBase<OtherDerived>& other);
template<typename OtherDerived>
void copyCoeff(int index, const DenseBase<OtherDerived>& other);
template<typename OtherDerived, int StoreMode, int LoadMode>
void copyPacket(int row, int col, const DenseBase<OtherDerived>& other);
template<typename OtherDerived, int StoreMode, int LoadMode>
void copyPacketByOuterInner(int outer, int inner, const DenseBase<OtherDerived>& other);
template<typename OtherDerived, int StoreMode, int LoadMode>
void copyPacket(int index, const DenseBase<OtherDerived>& other);
private:
static int rowIndexByOuterInner(int outer, int inner);
static int colIndexByOuterInner(int outer, int inner);
public:
#endif // not EIGEN_PARSED_BY_DOXYGEN

template<int LoadMode>
PacketScalar packet(int row, int col) const;
template<int LoadMode>
PacketScalar packetByOuterInner(int outer, int inner) const;
template<int StoreMode>
void writePacket(int row, int col, const PacketScalar& x);
template<int StoreMode>
void writePacketByOuterInner(int outer, int inner, const PacketScalar& x);
template<int LoadMode>
PacketScalar packet(int index) const;
@ -409,13 +507,6 @@ template<typename Derived> class DenseBase
template<typename OtherDerived>
void swap(DenseBase<OtherDerived> EIGEN_REF_TO_TEMPORARY other);
/** \returns number of elements to skip to pass from one row (resp. column) to another
* for a row-major (resp. column-major) matrix.
* Combined with coeffRef() and the \ref flags flags, it allows a direct access to the data
* of the underlying matrix.
*/
inline int stride() const { return derived().stride(); }
inline const NestByValue<Derived> nestByValue() const;
inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
inline ForceAlignedAccess<Derived> forceAlignedAccess();


@ -75,23 +75,6 @@ class DenseStorageBase : public _Base<Derived>
EIGEN_STRONG_INLINE int rows() const { return m_storage.rows(); }
EIGEN_STRONG_INLINE int cols() const { return m_storage.cols(); }
/** Returns the leading dimension (for matrices) or the increment (for vectors) to be used with data().
*
* More precisely:
* - for a column major matrix it returns the number of elements between two successive columns
* - for a row major matrix it returns the number of elements between two successive rows
* - for a vector it returns the number of elements between two successive coefficients
* This function has to be used together with the MapBase::data() function.
*
* \sa data() */
EIGEN_STRONG_INLINE int stride() const
{
if(IsVectorAtCompileTime)
return 1;
else
return (Flags & RowMajorBit) ? m_storage.cols() : m_storage.rows();
}
EIGEN_STRONG_INLINE const Scalar& coeff(int row, int col) const
{
  if(Flags & RowMajorBit)
@ -253,12 +236,12 @@ class DenseStorageBase : public _Base<Derived>
{
  if(RowsAtCompileTime == 1)
  {
    ei_assert(other.rows() == 1 || other.cols() == 1);
    resize(1, other.size());
  }
  else if(ColsAtCompileTime == 1)
  {
    ei_assert(other.rows() == 1 || other.cols() == 1);
    resize(other.size(), 1);
  }
  else resize(other.rows(), other.cols());
@ -379,6 +362,9 @@ class DenseStorageBase : public _Base<Derived>
 * while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
 * \a data pointers.
 *
* These methods do not allow to specify strides. If you need to specify strides, you have to
* use the Map class directly.
*
 * \see class Map
 */
//@{
@ -544,12 +530,22 @@ struct ei_conservative_resize_like_impl
{
  if (_this.rows() == rows && _this.cols() == cols) return;
  EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)

  if ( ( Derived::IsRowMajor && _this.cols() == cols) ||  // row-major and we change only the number of rows
       (!Derived::IsRowMajor && _this.rows() == rows) )   // column-major and we change only the number of columns
  {
    _this.derived().m_storage.conservativeResize(rows*cols,rows,cols);
  }
  else
  {
    // The storage order does not allow us to use reallocation.
    typename Derived::PlainObject tmp(rows,cols);
    const int common_rows = std::min(rows, _this.rows());
    const int common_cols = std::min(cols, _this.cols());
    tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
    _this.derived().swap(tmp);
  }
}
static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other)
{
@ -563,12 +559,27 @@ struct ei_conservative_resize_like_impl
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived)
if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows
(!Derived::IsRowMajor && _this.rows() == other.rows()) ) // column-major and we change only the number of columns
{
const int new_rows = other.rows() - _this.rows();
const int new_cols = other.cols() - _this.cols();
_this.derived().m_storage.conservativeResize(other.size(),other.rows(),other.cols());
if (new_rows>0)
_this.corner(BottomRight, new_rows, other.cols()) = other.corner(BottomRight, new_rows, other.cols());
else if (new_cols>0)
_this.corner(BottomRight, other.rows(), new_cols) = other.corner(BottomRight, other.rows(), new_cols);
}
else
{
// The storage order does not allow us to use reallocation.
    typename Derived::PlainObject tmp(other);
    const int common_rows = std::min(tmp.rows(), _this.rows());
    const int common_cols = std::min(tmp.cols(), _this.cols());
    tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
    _this.derived().swap(tmp);
  }
}
};

template <typename Derived, typename OtherDerived>
@ -576,22 +587,23 @@ struct ei_conservative_resize_like_impl<Derived,OtherDerived,true>
{
  static void run(DenseBase<Derived>& _this, int size)
  {
    const int new_rows = Derived::RowsAtCompileTime==1 ? 1 : size;
    const int new_cols = Derived::RowsAtCompileTime==1 ? size : 1;
    _this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
  }
  static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other)
  {
    if (_this.rows() == other.rows() && _this.cols() == other.cols()) return;

    const int num_new_elements = other.size() - _this.size();

    const int new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows();
    const int new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1;
    _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);

    if (num_new_elements > 0)
      _this.tail(num_new_elements) = other.tail(num_new_elements);
  }
};
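A usage sketch of the two paths above, assuming the public conservativeResize() entry points that forward to this helper: when the storage order allows it, the data is reallocated in place and existing coefficients are kept.

#include <Eigen/Core>
using namespace Eigen;

int main()
{
  MatrixXf m = MatrixXf::Constant(2,3, 1.f);
  m.conservativeResize(2,5);  // column-major, row count unchanged: reallocation path;
                              // the first 3 columns keep their values

  VectorXf v = VectorXf::Ones(4);
  v.conservativeResize(6);    // vector path: the first 4 coefficients are kept
  return 0;
}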


@ -1,7 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2006-2008, 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
@ -25,224 +25,35 @@
#ifndef EIGEN_DOT_H
#define EIGEN_DOT_H
/***************************************************************************
* Part 1 : the logic deciding a strategy for vectorization and unrolling
***************************************************************************/

template<typename Derived1, typename Derived2>
struct ei_dot_traits
{
public:
  enum {
    Traversal = (int(Derived1::Flags)&int(Derived2::Flags)&ActualPacketAccessBit)
                && (int(Derived1::Flags)&int(Derived2::Flags)&LinearAccessBit)
                  ? LinearVectorizedTraversal
                  : DefaultTraversal
  };
private:
typedef typename Derived1::Scalar Scalar;
enum {
PacketSize = ei_packet_traits<Scalar>::size,
Cost = Derived1::SizeAtCompileTime * (Derived1::CoeffReadCost + Derived2::CoeffReadCost + NumTraits<Scalar>::MulCost)
+ (Derived1::SizeAtCompileTime-1) * NumTraits<Scalar>::AddCost,
UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
};
public:
enum {
Unrolling = Cost <= UnrollingLimit
? CompleteUnrolling
: NoUnrolling
};
};
/***************************************************************************
* Part 2 : unrollers
***************************************************************************/
/*** no vectorization ***/
template<typename Derived1, typename Derived2, int Start, int Length>
struct ei_dot_novec_unroller
{
enum {
HalfLength = Length/2
};
typedef typename Derived1::Scalar Scalar;
inline static Scalar run(const Derived1& v1, const Derived2& v2)
{
return ei_dot_novec_unroller<Derived1, Derived2, Start, HalfLength>::run(v1, v2)
+ ei_dot_novec_unroller<Derived1, Derived2, Start+HalfLength, Length-HalfLength>::run(v1, v2);
}
};
template<typename Derived1, typename Derived2, int Start>
struct ei_dot_novec_unroller<Derived1, Derived2, Start, 1>
{
typedef typename Derived1::Scalar Scalar;
inline static Scalar run(const Derived1& v1, const Derived2& v2)
{
return ei_conj(v1.coeff(Start)) * v2.coeff(Start);
}
};
/*** vectorization ***/
template<typename Derived1, typename Derived2, int Index, int Stop,
bool LastPacket = (Stop-Index == ei_packet_traits<typename Derived1::Scalar>::size)>
struct ei_dot_vec_unroller
{
typedef typename Derived1::Scalar Scalar;
typedef typename ei_packet_traits<Scalar>::type PacketScalar;
enum {
row1 = Derived1::RowsAtCompileTime == 1 ? 0 : Index,
col1 = Derived1::RowsAtCompileTime == 1 ? Index : 0,
row2 = Derived2::RowsAtCompileTime == 1 ? 0 : Index,
col2 = Derived2::RowsAtCompileTime == 1 ? Index : 0
};
inline static PacketScalar run(const Derived1& v1, const Derived2& v2)
{
return ei_pmadd(
v1.template packet<Aligned>(row1, col1),
v2.template packet<Aligned>(row2, col2),
ei_dot_vec_unroller<Derived1, Derived2, Index+ei_packet_traits<Scalar>::size, Stop>::run(v1, v2)
);
}
};
template<typename Derived1, typename Derived2, int Index, int Stop>
struct ei_dot_vec_unroller<Derived1, Derived2, Index, Stop, true>
{
enum {
row1 = Derived1::RowsAtCompileTime == 1 ? 0 : Index,
col1 = Derived1::RowsAtCompileTime == 1 ? Index : 0,
row2 = Derived2::RowsAtCompileTime == 1 ? 0 : Index,
col2 = Derived2::RowsAtCompileTime == 1 ? Index : 0,
alignment1 = (Derived1::Flags & AlignedBit) ? Aligned : Unaligned,
alignment2 = (Derived2::Flags & AlignedBit) ? Aligned : Unaligned
};
typedef typename Derived1::Scalar Scalar;
typedef typename ei_packet_traits<Scalar>::type PacketScalar;
inline static PacketScalar run(const Derived1& v1, const Derived2& v2)
{
return ei_pmul(v1.template packet<alignment1>(row1, col1), v2.template packet<alignment2>(row2, col2));
}
};
/***************************************************************************
* Part 3 : implementation of all cases
***************************************************************************/
template<typename Derived1, typename Derived2,
int Traversal = ei_dot_traits<Derived1, Derived2>::Traversal,
int Unrolling = ei_dot_traits<Derived1, Derived2>::Unrolling
>
struct ei_dot_impl;

template<typename Derived1, typename Derived2>
struct ei_dot_impl<Derived1, Derived2, DefaultTraversal, NoUnrolling>
{
  typedef typename Derived1::Scalar Scalar;
  static Scalar run(const Derived1& v1, const Derived2& v2)
  {
    ei_assert(v1.size()>0 && "you are using a non initialized vector");
    Scalar res;
    res = ei_conj(v1.coeff(0)) * v2.coeff(0);
    for(int i = 1; i < v1.size(); ++i)
      res += ei_conj(v1.coeff(i)) * v2.coeff(i);
    return res;
  }
};
template<typename Derived1, typename Derived2>
struct ei_dot_impl<Derived1, Derived2, DefaultTraversal, CompleteUnrolling>
: public ei_dot_novec_unroller<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
{};
template<typename Derived1, typename Derived2>
struct ei_dot_impl<Derived1, Derived2, LinearVectorizedTraversal, NoUnrolling>
{
  typedef typename Derived1::Scalar Scalar;
  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
  static Scalar run(const Derived1& v1, const Derived2& v2)
  {
    const int size = v1.size();
const int packetSize = ei_packet_traits<Scalar>::size;
const int alignedSize = (size/packetSize)*packetSize;
enum {
alignment1 = (Derived1::Flags & AlignedBit) ? Aligned : Unaligned,
alignment2 = (Derived2::Flags & AlignedBit) ? Aligned : Unaligned
};
Scalar res;
// do the vectorizable part of the sum
if(size >= packetSize)
{
PacketScalar packet_res = ei_pmul(
v1.template packet<alignment1>(0),
v2.template packet<alignment2>(0)
);
for(int index = packetSize; index<alignedSize; index += packetSize)
{
packet_res = ei_pmadd(
v1.template packet<alignment1>(index),
v2.template packet<alignment2>(index),
packet_res
);
}
res = ei_predux(packet_res);
// now we must do the rest without vectorization.
if(alignedSize == size) return res;
}
else // too small to vectorize anything.
// since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
{
res = Scalar(0);
}
// do the remainder of the vector
for(int index = alignedSize; index < size; ++index)
{
res += v1.coeff(index) * v2.coeff(index);
}
return res;
  }
};
template<typename Derived1, typename Derived2>
struct ei_dot_impl<Derived1, Derived2, LinearVectorizedTraversal, CompleteUnrolling>
{
typedef typename Derived1::Scalar Scalar;
typedef typename ei_packet_traits<Scalar>::type PacketScalar;
enum {
PacketSize = ei_packet_traits<Scalar>::size,
Size = Derived1::SizeAtCompileTime,
VectorizedSize = (Size / PacketSize) * PacketSize
};
static Scalar run(const Derived1& v1, const Derived2& v2)
{
Scalar res = ei_predux(ei_dot_vec_unroller<Derived1, Derived2, 0, VectorizedSize>::run(v1, v2));
if (VectorizedSize != Size)
res += ei_dot_novec_unroller<Derived1, Derived2, VectorizedSize, Size-VectorizedSize>::run(v1, v2);
return res;
}
};
/***************************************************************************
* Part 4 : implementation of MatrixBase methods
***************************************************************************/

// helper function for dot(). The problem is that if we put that in the body of dot(), then upon calling dot
// with mismatched types, the compiler emits errors about failing to instantiate cwiseProduct BEFORE
// looking at the static assertions. Thus this is a trick to get better compile errors.
template<typename T, typename U,
// the NeedToTranspose condition here is taken straight from Assign.h
         bool NeedToTranspose = T::IsVectorAtCompileTime
                && U::IsVectorAtCompileTime
                && ((int(T::RowsAtCompileTime) == 1 && int(U::ColsAtCompileTime) == 1)
                      |  // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
                         // revert to || as soon as not needed anymore.
                    (int(T::ColsAtCompileTime) == 1 && int(U::RowsAtCompileTime) == 1))
>
struct ei_dot_nocheck
{
  static inline typename ei_traits<T>::Scalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
  {
    return a.conjugate().cwiseProduct(b).sum();
  }
};

template<typename T, typename U>
struct ei_dot_nocheck<T, U, true>
{
  static inline typename ei_traits<T>::Scalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
  {
    return a.adjoint().cwiseProduct(b).sum();
  }
};
/** \returns the dot product of *this with other.
  *
  * \only_for_vectors
@ -266,10 +77,7 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
  ei_assert(size() == other.size());

  return ei_dot_nocheck<Derived,OtherDerived>::run(*this, other);
}
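The conjugation convention implemented by ei_dot_nocheck can be checked with a short sketch (an illustration only): dot() conjugates its left operand.

#include <Eigen/Core>
#include <complex>
using namespace Eigen;

int main()
{
  Vector2cf a, b;
  a << std::complex<float>(1,2), std::complex<float>(0,1);
  b << std::complex<float>(3,0), std::complex<float>(1,1);
  // same expression as the generic ei_dot_nocheck::run above:
  bool ok = (a.dot(b) == a.conjugate().cwiseProduct(b).sum());
  return ok ? 0 : 1;
}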
/** \returns the squared \em l2 norm of *this, i.e., for vectors, the dot product of *this with itself.


@ -60,7 +60,8 @@ template<typename ExpressionType, unsigned int Added, unsigned int Removed> clas
inline int rows() const { return m_matrix.rows(); }
inline int cols() const { return m_matrix.cols(); }
inline int outerStride() const { return m_matrix.outerStride(); }
inline int innerStride() const { return m_matrix.innerStride(); }
inline const Scalar coeff(int row, int col) const
{


@ -52,7 +52,8 @@ template<typename ExpressionType> class ForceAlignedAccess
inline int rows() const { return m_expression.rows(); }
inline int cols() const { return m_expression.cols(); }
inline int outerStride() const { return m_expression.outerStride(); }
inline int innerStride() const { return m_expression.innerStride(); }
inline const CoeffReturnType coeff(int row, int col) const
{


@ -179,7 +179,7 @@ struct ei_functor_traits<ei_scalar_quotient_op<Scalar> > {
enum {
  Cost = 2 * NumTraits<Scalar>::MulCost,
  PacketAccess = ei_packet_traits<Scalar>::size>1
                 #if (defined EIGEN_VECTORIZE)
                 && NumTraits<Scalar>::HasFloatingPoint
                 #endif
};


@ -1,7 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2007-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
//
// Eigen is free software; you can redistribute it and/or
@ -33,10 +33,35 @@
 * \param MatrixType the equivalent matrix type of the mapped data
 * \param Options specifies whether the pointer is \c Aligned, or \c Unaligned.
 *                The default is \c Unaligned.
 * \param StrideType optionally specifies strides. By default, Map assumes the memory layout
 *                   of an ordinary, contiguous array. This can be overridden by specifying strides.
 *                   The type passed here must be a specialization of the Stride template, see examples below.
 *
 * This class represents a matrix or vector expression mapping an existing array of data.
 * It can be used to let Eigen interface without any overhead with non-Eigen data structures,
 * such as plain C arrays or structures from other libraries. By default, it assumes that the
 * data is laid out contiguously in memory. You can however override this by explicitly specifying
 * inner and outer strides.
*
* Here's an example of simply mapping a contiguous array as a column-major matrix:
* \include Map_simple.cpp
* Output: \verbinclude Map_simple.out
*
* If you need to map non-contiguous arrays, you can do so by specifying strides:
*
* Here's an example of mapping an array as a vector, specifying an inner stride, that is, the pointer
* increment between two consecutive coefficients. Here, we're specifying the inner stride as a compile-time
* fixed value.
* \include Map_inner_stride.cpp
* Output: \verbinclude Map_inner_stride.out
*
* Here's an example of mapping an array while specifying an outer stride. Here, since we're mapping
* as a column-major matrix, 'outer stride' means the pointer increment between two consecutive columns.
* Here, we're specifying the outer stride as a runtime parameter.
* \include Map_outer_stride.cpp
* Output: \verbinclude Map_outer_stride.out
*
* For more details and for an example of specifying both an inner and an outer stride, see class Stride.
 *
 * \b Tip: to change the array of data mapped by a Map object, you can use the C++
 * placement new syntax:
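The snippet announced here is not reproduced in this hunk; a minimal sketch of the placement-new idiom, with hypothetical buffers:

#include <Eigen/Core>
#include <new>
using namespace Eigen;

int main()
{
  float data1[4] = {1,2,3,4};
  float data2[4] = {5,6,7,8};

  Map<Vector4f> v(data1);
  new (&v) Map<Vector4f>(data2);  // v now maps data2; no data is copied
  return 0;
}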
@ -48,48 +73,86 @@
 *
 * \sa Matrix::Map()
 */
template<typename MatrixType, int Options, typename StrideType>
struct ei_traits<Map<MatrixType, Options, StrideType> >
  : public ei_traits<MatrixType>
{
  typedef typename MatrixType::Scalar Scalar;
  enum {
    InnerStride = StrideType::InnerStrideAtCompileTime,
    OuterStride = StrideType::OuterStrideAtCompileTime,
HasNoInnerStride = InnerStride <= 1,
HasNoOuterStride = OuterStride == 0,
HasNoStride = HasNoInnerStride && HasNoOuterStride,
IsAligned = int(int(Options)&Aligned)==Aligned,
IsDynamicSize = MatrixType::SizeAtCompileTime==Dynamic,
KeepsPacketAccess = bool(HasNoInnerStride)
&& ( bool(IsDynamicSize)
|| HasNoOuterStride
|| ( OuterStride!=Dynamic && ((int(OuterStride)*sizeof(Scalar))%16)==0 ) ),
Flags0 = ei_traits<MatrixType>::Flags,
Flags1 = IsAligned ? int(Flags0) | AlignedBit : int(Flags0) & ~AlignedBit,
Flags2 = HasNoStride ? int(Flags1) : int(Flags1 & ~LinearAccessBit),
Flags = KeepsPacketAccess ? int(Flags2) : (int(Flags2) & ~PacketAccessBit)
}; };
};

template<typename MatrixType, int Options, typename StrideType> class Map
  : public MapBase<Map<MatrixType, Options, StrideType>,
                   typename MatrixType::template MakeBase<
                     Map<MatrixType, Options, StrideType>
                   >::Type>
{
  public:

    typedef MapBase<Map,typename MatrixType::template MakeBase<Map>::Type> Base;
    EIGEN_DENSE_PUBLIC_INTERFACE(Map)
    inline int innerStride() const
    {
      return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
    }

    inline int outerStride() const
    {
      return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
           : IsVectorAtCompileTime ? this->size()
           : int(Flags)&RowMajorBit ? this->cols()
           : this->rows();
    }
/** Constructor in the fixed-size case.
*
* \param data pointer to the array to map
* \param stride optional Stride object, passing the strides.
*/
inline Map(const Scalar* data, const StrideType& stride = StrideType())
: Base(data), m_stride(stride) {}
/** Constructor in the dynamic-size vector case.
*
* \param data pointer to the array to map
* \param size the size of the vector expression
* \param stride optional Stride object, passing the strides.
*/
inline Map(const Scalar* data, int size, const StrideType& stride = StrideType())
: Base(data, size), m_stride(stride) {}
/** Constructor in the dynamic-size matrix case.
*
* \param data pointer to the array to map
* \param rows the number of rows of the matrix expression
* \param cols the number of columns of the matrix expression
* \param stride optional Stride object, passing the strides.
*/
inline Map(const Scalar* data, int rows, int cols, const StrideType& stride = StrideType())
: Base(data, rows, cols), m_stride(stride) {}
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
protected:
StrideType m_stride;
};
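For illustration, a hypothetical usage sketch of the strided Map; it assumes the InnerStride/OuterStride convenience specializations of the Stride template introduced alongside this change:

#include <Eigen/Core>
using namespace Eigen;

int main()
{
  float data[8] = {1,2,3,4,5,6,7,8};

  // every other coefficient, as a size-4 vector: compile-time inner stride of 2 -> 1,3,5,7
  Map<VectorXf, Unaligned, InnerStride<2> > v(data, 4);

  // a 2x3 column-major matrix whose columns start every 3 scalars: run-time outer stride
  Map<MatrixXf, Unaligned, OuterStride<> > m(data, 2, 3, OuterStride<>(3));

  return 0;
}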
template<typename _Scalar, int _Rows, int _Cols, int _StorageOrder, int _MaxRows, int _MaxCols>


@ -1,7 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2007-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
//
// Eigen is free software; you can redistribute it and/or
@ -37,9 +37,7 @@ template<typename Derived, typename Base> class MapBase
{
  public:

    // typedef MatrixBase<Derived> Base;
    enum {
      IsRowMajor = (int(ei_traits<Derived>::Flags) & RowMajorBit) ? 1 : 0,
      RowsAtCompileTime = ei_traits<Derived>::RowsAtCompileTime,
      ColsAtCompileTime = ei_traits<Derived>::ColsAtCompileTime,
      SizeAtCompileTime = Base::SizeAtCompileTime
@ -48,94 +46,75 @@ template<typename Derived, typename Base> class MapBase
typedef typename ei_traits<Derived>::Scalar Scalar;
typedef typename Base::PacketScalar PacketScalar;
using Base::derived;
using Base::innerStride;
using Base::outerStride;
using Base::rowStride;
using Base::colStride;
inline int rows() const { return m_rows.value(); }
inline int cols() const { return m_cols.value(); }
/** Returns the leading dimension (for matrices) or the increment (for vectors) to be used with data().
*
* More precisely:
* - for a column major matrix it returns the number of elements between two successive columns
* - for a row major matrix it returns the number of elements between two successive rows
* - for a vector it returns the number of elements between two successive coefficients
* This function has to be used together with the MapBase::data() function.
*
* \sa MapBase::data() */
inline int stride() const { return derived().stride(); }
/** Returns a pointer to the first coefficient of the matrix or vector.
  *
  * \note When addressing this data, make sure to honor the strides returned by innerStride() and outerStride().
  *
  * \sa innerStride(), outerStride()
  */
inline const Scalar* data() const { return m_data; }
inline const Scalar& coeff(int row, int col) const
{
  return m_data[col * colStride() + row * rowStride()];
}
inline Scalar& coeffRef(int row, int col)
{
  return const_cast<Scalar*>(m_data)[col * colStride() + row * rowStride()];
}
inline const Scalar& coeff(int index) const
{
  ei_assert(Derived::IsVectorAtCompileTime || (ei_traits<Derived>::Flags & LinearAccessBit));
  return m_data[index * innerStride()];
}
inline Scalar& coeffRef(int index)
{
  ei_assert(Derived::IsVectorAtCompileTime || (ei_traits<Derived>::Flags & LinearAccessBit));
  return const_cast<Scalar*>(m_data)[index * innerStride()];
}
template<int LoadMode>
inline PacketScalar packet(int row, int col) const
{
  return ei_ploadt<Scalar, LoadMode>
           (m_data + (col * colStride() + row * rowStride()));
}
template<int LoadMode>
inline PacketScalar packet(int index) const
{
  return ei_ploadt<Scalar, LoadMode>(m_data + index * innerStride());
}
template<int StoreMode>
inline void writePacket(int row, int col, const PacketScalar& x)
{
  ei_pstoret<Scalar, PacketScalar, StoreMode>
    (const_cast<Scalar*>(m_data) + (col * colStride() + row * rowStride()), x);
}
template<int StoreMode>
inline void writePacket(int index, const PacketScalar& x)
{
  ei_pstoret<Scalar, PacketScalar, StoreMode>
    (const_cast<Scalar*>(m_data) + index * innerStride(), x);
}
inline MapBase(const Scalar* data) : m_data(data), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
{
  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
  checkSanity();
}
inline MapBase(const Scalar* data, int size) inline MapBase(const Scalar* data, int size)
@ -146,7 +125,7 @@ template<typename Derived, typename Base> class MapBase
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  ei_assert(size >= 0);
  ei_assert(data == 0 || SizeAtCompileTime == Dynamic || SizeAtCompileTime == size);
  checkSanity();
}
inline MapBase(const Scalar* data, int rows, int cols) inline MapBase(const Scalar* data, int rows, int cols)
@ -155,7 +134,7 @@ template<typename Derived, typename Base> class MapBase
  ei_assert( (data == 0)
          || (   rows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
              && cols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols)));
  checkSanity();
}
Derived& operator=(const MapBase& other) Derived& operator=(const MapBase& other)
@ -167,10 +146,12 @@ template<typename Derived, typename Base> class MapBase
protected:

  void checkSanity() const
  {
    ei_assert( ((!(ei_traits<Derived>::Flags&AlignedBit))
                || ((size_t(m_data)&0xf)==0)) && "data is not aligned");
    ei_assert( ((!(ei_traits<Derived>::Flags&PacketAccessBit))
                || (innerStride()==1)) && "packet access incompatible with inner stride greater than 1");
  }
const Scalar* EIGEN_RESTRICT m_data;


@ -1,7 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
//
// Eigen is free software; you can redistribute it and/or
@ -318,6 +318,9 @@ class Matrix
void swap(MatrixBase<OtherDerived> EIGEN_REF_TO_TEMPORARY other)
{ this->_swap(other.derived()); }
inline int innerStride() const { return 1; }
inline int outerStride() const { return this->innerSize(); }
/////////// Geometry module ///////////

template<typename OtherDerived>
@ -331,6 +334,9 @@ class Matrix
#endif

protected:
template <typename Derived, typename OtherDerived, bool IsVector>
friend struct ei_conservative_resize_like_impl;
using Base::m_storage;
};


@ -169,15 +169,6 @@ template<typename Derived> class MatrixBase
Derived& lazyAssign(const ProductBase<ProductDerived, Lhs,Rhs>& other);
#endif // not EIGEN_PARSED_BY_DOXYGEN
const CoeffReturnType x() const;
const CoeffReturnType y() const;
const CoeffReturnType z() const;
const CoeffReturnType w() const;
Scalar& x();
Scalar& y();
Scalar& z();
Scalar& w();
template<typename OtherDerived>
Derived& operator+=(const MatrixBase<OtherDerived>& other);
template<typename OtherDerived>


@ -3,6 +3,7 @@
//
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
// Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
@ -92,6 +93,7 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class ei_matr
inline void swap(ei_matrix_storage& other) { std::swap(m_data,other.m_data); }
inline static int rows(void) {return _Rows;}
inline static int cols(void) {return _Cols;}
inline void conservativeResize(int,int,int) {}
inline void resize(int,int,int) {}
inline const T *data() const { return m_data.array; }
inline T *data() { return m_data.array; }
@ -107,6 +109,7 @@ template<typename T, int _Rows, int _Cols, int _Options> class ei_matrix_storage
inline void swap(ei_matrix_storage& ) {}
inline static int rows(void) {return _Rows;}
inline static int cols(void) {return _Cols;}
inline void conservativeResize(int,int,int) {}
inline void resize(int,int,int) {}
inline const T *data() const { return 0; }
inline T *data() { return 0; }
@ -127,11 +130,8 @@ template<typename T, int Size, int _Options> class ei_matrix_storage<T, Size, Dy
{ std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
inline int rows(void) const {return m_rows;}
inline int cols(void) const {return m_cols;}
inline void conservativeResize(int, int rows, int cols) { m_rows = rows; m_cols = cols; }
inline void resize(int, int rows, int cols) { m_rows = rows; m_cols = cols; }
inline const T *data() const { return m_data.array; }
inline T *data() { return m_data.array; }
};
@ -149,10 +149,8 @@ template<typename T, int Size, int _Cols, int _Options> class ei_matrix_storage<
inline void swap(ei_matrix_storage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
inline int rows(void) const {return m_rows;}
inline int cols(void) const {return _Cols;}
inline void conservativeResize(int, int rows, int) { m_rows = rows; }
inline void resize(int, int rows, int) { m_rows = rows; }
inline const T *data() const { return m_data.array; }
inline T *data() { return m_data.array; }
};
@ -170,10 +168,8 @@ template<typename T, int Size, int _Rows, int _Options> class ei_matrix_storage<
inline void swap(ei_matrix_storage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
inline int rows(void) const {return _Rows;}
inline int cols(void) const {return m_cols;}
inline void conservativeResize(int, int, int cols) { m_cols = cols; }
inline void resize(int, int, int cols) { m_cols = cols; }
inline const T *data() const { return m_data.array; }
inline T *data() { return m_data.array; }
};
@ -196,6 +192,12 @@ template<typename T, int _Options> class ei_matrix_storage<T, Dynamic, Dynamic,
{ std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
inline int rows(void) const {return m_rows;}
inline int cols(void) const {return m_cols;}
inline void conservativeResize(int size, int rows, int cols)
{
m_data = ei_conditional_aligned_realloc_new<T,(_Options&DontAlign)==0>(m_data, size, m_rows*m_cols);
m_rows = rows;
m_cols = cols;
}
void resize(int size, int rows, int cols)
{
  if(size != m_rows*m_cols)
@ -228,6 +230,11 @@ template<typename T, int _Rows, int _Options> class ei_matrix_storage<T, Dynamic
inline void swap(ei_matrix_storage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
inline static int rows(void) {return _Rows;}
inline int cols(void) const {return m_cols;}
inline void conservativeResize(int size, int, int cols)
{
m_data = ei_conditional_aligned_realloc_new<T,(_Options&DontAlign)==0>(m_data, size, _Rows*m_cols);
m_cols = cols;
}
void resize(int size, int, int cols)
{
  if(size != _Rows*m_cols)
@ -259,6 +266,11 @@ template<typename T, int _Cols, int _Options> class ei_matrix_storage<T, Dynamic
inline void swap(ei_matrix_storage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
inline int rows(void) const {return m_rows;}
inline static int cols(void) {return _Cols;}
inline void conservativeResize(int size, int rows, int)
{
m_data = ei_conditional_aligned_realloc_new<T,(_Options&DontAlign)==0>(m_data, size, m_rows*_Cols);
m_rows = rows;
}
void resize(int size, int rows, int)
{
  if(size != m_rows*_Cols)


@ -53,7 +53,8 @@ template<typename ExpressionType> class NestByValue
inline int rows() const { return m_expression.rows(); }
inline int cols() const { return m_expression.cols(); }
inline int outerStride() const { return m_expression.outerStride(); }
inline int innerStride() const { return m_expression.innerStride(); }
inline const CoeffReturnType coeff(int row, int col) const
{


@ -47,7 +47,7 @@
 * \sa class DiagonalMatrix
 */
template<int SizeAtCompileTime, int MaxSizeAtCompileTime = SizeAtCompileTime> class PermutationMatrix;
template<typename PermutationType, typename MatrixType, int Side, bool Transposed=false> struct ei_permut_matrix_product_retval;

template<int SizeAtCompileTime, int MaxSizeAtCompileTime>
struct ei_traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime> >
@ -132,6 +132,9 @@ class PermutationMatrix : public EigenBase<PermutationMatrix<SizeAtCompileTime,
/** \returns the number of columns */
inline int cols() const { return m_indices.size(); }
/** \returns the size of a side of the respective square matrix, i.e., the number of indices */
inline int size() const { return m_indices.size(); }
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename DenseDerived>
void evalTo(MatrixBase<DenseDerived>& other) const
@ -213,16 +216,29 @@ class PermutationMatrix : public EigenBase<PermutationMatrix<SizeAtCompileTime,
  return *this;
}

/** \returns the inverse permutation matrix.
*
* \note \note_try_to_help_rvo
*/
inline Transpose<PermutationMatrix> inverse() const
{ return *this; }
/** \returns the transpose permutation matrix.
*
* \note \note_try_to_help_rvo
*/
inline Transpose<PermutationMatrix> transpose() const
{ return *this; }
/**** multiplication helpers to hopefully get RVO ****/
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<int OtherSize, int OtherMaxSize>
PermutationMatrix(const Transpose<PermutationMatrix<OtherSize,OtherMaxSize> >& other)
  : m_indices(other.nestedPermutation().size())
{
  for (int i=0; i<rows();++i) m_indices.coeffRef(other.nestedPermutation().indices().coeff(i)) = i;
}
protected:
enum Product_t {Product};
PermutationMatrix(Product_t, const PermutationMatrix& lhs, const PermutationMatrix& rhs)
  : m_indices(lhs.m_indices.size())
@ -233,12 +249,7 @@ class PermutationMatrix : public EigenBase<PermutationMatrix<SizeAtCompileTime,
#endif

public:
/** \returns the inverse permutation matrix.
*
* \note \note_try_to_help_rvo
*/
inline PermutationMatrix inverse() const
{ return PermutationMatrix(Inverse, *this); }
/** \returns the product permutation matrix.
  *
  * \note \note_try_to_help_rvo
@ -247,6 +258,22 @@ class PermutationMatrix : public EigenBase<PermutationMatrix<SizeAtCompileTime,
inline PermutationMatrix operator*(const PermutationMatrix<OtherSize, OtherMaxSize>& other) const
{ return PermutationMatrix(Product, *this, other); }
/** \returns the product of a permutation with another inverse permutation.
*
* \note \note_try_to_help_rvo
*/
template<int OtherSize, int OtherMaxSize>
inline PermutationMatrix operator*(const Transpose<PermutationMatrix<OtherSize,OtherMaxSize> >& other) const
{ return PermutationMatrix(Product, *this, other.eval()); }
/** \returns the product of an inverse permutation with another permutation.
*
* \note \note_try_to_help_rvo
*/
template<int OtherSize, int OtherMaxSize> friend
inline PermutationMatrix operator*(const Transpose<PermutationMatrix<OtherSize,OtherMaxSize> >& other, const PermutationMatrix& perm)
{ return PermutationMatrix(Product, other.eval(), perm); }
protected:

  IndicesType m_indices;
@ -277,15 +304,15 @@ operator*(const PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime> &perm
    (permutation, matrix.derived());
}
template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
struct ei_traits<ei_permut_matrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
{
  typedef typename MatrixType::PlainObject ReturnType;
};
template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
struct ei_permut_matrix_product_retval
  : public ReturnByValue<ei_permut_matrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
{
  typedef typename ei_cleantype<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
@ -299,21 +326,46 @@ struct ei_permut_matrix_product_retval
template<typename Dest> inline void evalTo(Dest& dst) const
{
  const int n = Side==OnTheLeft ? rows() : cols();
if(ei_is_same_type<MatrixTypeNestedCleaned,Dest>::ret && ei_extract_data(dst) == ei_extract_data(m_matrix))
{
// apply the permutation inplace
Matrix<bool,PermutationType::RowsAtCompileTime,1,0,PermutationType::MaxRowsAtCompileTime> mask(m_permutation.size());
mask.fill(false);
int r = 0;
while(r < m_permutation.size())
{
// search for the next seed
while(r<m_permutation.size() && mask[r]) r++;
if(r>=m_permutation.size())
break;
// we got one, let's follow it until we are back to the seed
int k0 = r++;
int kPrev = k0;
mask.coeffRef(k0) = true;
for(int k=m_permutation.indices().coeff(k0); k!=k0; k=m_permutation.indices().coeff(k))
{
Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>(dst, k)
.swap(Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
(dst,((Side==OnTheLeft) ^ Transposed) ? k0 : kPrev));
mask.coeffRef(k) = true;
kPrev = k;
}
}
}
else
{
      for(int i = 0; i < n; ++i)
      {
        Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
               (dst, ((Side==OnTheLeft) ^ Transposed) ? m_permutation.indices().coeff(i) : i)
        =
        Block<MatrixTypeNestedCleaned,Side==OnTheLeft ? 1 : MatrixType::RowsAtCompileTime,Side==OnTheRight ? 1 : MatrixType::ColsAtCompileTime>
               (m_matrix, ((Side==OnTheRight) ^ Transposed) ? m_permutation.indices().coeff(i) : i);
      }
    }
  }
@ -322,4 +374,78 @@ struct ei_permut_matrix_product_retval
  const typename MatrixType::Nested m_matrix;
};
/* Template partial specialization for transposed/inverse permutations */
template<int SizeAtCompileTime, int MaxSizeAtCompileTime>
struct ei_traits<Transpose<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime> > >
: ei_traits<Matrix<int,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
{};
template<int SizeAtCompileTime, int MaxSizeAtCompileTime>
class Transpose<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime> >
: public EigenBase<Transpose<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime> > >
{
typedef PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime> PermutationType;
typedef typename PermutationType::IndicesType IndicesType;
public:
#ifndef EIGEN_PARSED_BY_DOXYGEN
typedef ei_traits<PermutationType> Traits;
typedef Matrix<int,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime>
DenseMatrixType;
enum {
Flags = Traits::Flags,
CoeffReadCost = Traits::CoeffReadCost,
RowsAtCompileTime = Traits::RowsAtCompileTime,
ColsAtCompileTime = Traits::ColsAtCompileTime,
MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
MaxColsAtCompileTime = Traits::MaxColsAtCompileTime
};
typedef typename Traits::Scalar Scalar;
#endif
Transpose(const PermutationType& p) : m_permutation(p) {}
inline int rows() const { return m_permutation.rows(); }
inline int cols() const { return m_permutation.cols(); }
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename DenseDerived>
void evalTo(MatrixBase<DenseDerived>& other) const
{
other.setZero();
for (int i=0; i<rows();++i)
other.coeffRef(i, m_permutation.indices().coeff(i)) = typename DenseDerived::Scalar(1);
}
#endif
/** \return the equivalent permutation matrix */
PermutationType eval() const { return *this; }
DenseMatrixType toDenseMatrix() const { return *this; }
/** \returns the matrix with the inverse permutation applied to the columns.
*/
template<typename Derived> friend
inline const ei_permut_matrix_product_retval<PermutationType, Derived, OnTheRight, true>
operator*(const MatrixBase<Derived>& matrix, const Transpose& trPerm)
{
return ei_permut_matrix_product_retval<PermutationType, Derived, OnTheRight, true>(trPerm.m_permutation, matrix.derived());
}
/** \returns the matrix with the inverse permutation applied to the rows.
*/
template<typename Derived>
inline const ei_permut_matrix_product_retval<PermutationType, Derived, OnTheLeft, true>
operator*(const MatrixBase<Derived>& matrix) const
{
return ei_permut_matrix_product_retval<PermutationType, Derived, OnTheLeft, true>(m_permutation, matrix.derived());
}
const PermutationType& nestedPermutation() const { return m_permutation; }
protected:
const PermutationType& m_permutation;
};
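Since a permutation matrix is orthogonal, its transpose is its inverse, and the two operator* overloads above apply that inverse without ever forming a dense matrix. A hypothetical usage sketch (names and sizes are illustrative; it assumes the dynamic-size PermutationMatrix and comma-initializer of this period):

    #include <Eigen/Dense>
    using namespace Eigen;

    void roundtrip()
    {
      PermutationMatrix<Dynamic, Dynamic> P(3);
      P.indices() << 1, 2, 0;            // some permutation of {0,1,2}
      Matrix3f A = Matrix3f::Random();
      Matrix3f B = P * A;                // rows of A, permuted
      Matrix3f C = P.transpose() * B;    // inverse permutation: C == A exactly
    }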
#endif // EIGEN_PERMUTATIONMATRIX_H #endif // EIGEN_PERMUTATIONMATRIX_H
@ -84,7 +84,7 @@ public:
* based on the three dimensions of the product. * based on the three dimensions of the product.
* This is a compile time mapping from {1,Small,Large}^3 -> {product types} */ * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */
// FIXME I'm not sure the current mapping is the ideal one. // FIXME I'm not sure the current mapping is the ideal one.
template<int Rows, int Cols> struct ei_product_type_selector<Rows, Cols, 1> { enum { ret = OuterProduct }; }; template<int M, int N> struct ei_product_type_selector<M,N,1> { enum { ret = OuterProduct }; };
template<int Depth> struct ei_product_type_selector<1, 1, Depth> { enum { ret = InnerProduct }; }; template<int Depth> struct ei_product_type_selector<1, 1, Depth> { enum { ret = InnerProduct }; };
template<> struct ei_product_type_selector<1, 1, 1> { enum { ret = InnerProduct }; }; template<> struct ei_product_type_selector<1, 1, 1> { enum { ret = InnerProduct }; };
template<> struct ei_product_type_selector<Small,1, Small> { enum { ret = CoeffBasedProductMode }; }; template<> struct ei_product_type_selector<Small,1, Small> { enum { ret = CoeffBasedProductMode }; };
@ -93,12 +93,12 @@ template<> struct ei_product_type_selector<Small,Small,Small>
template<> struct ei_product_type_selector<Small, Small, 1> { enum { ret = LazyCoeffBasedProductMode }; }; template<> struct ei_product_type_selector<Small, Small, 1> { enum { ret = LazyCoeffBasedProductMode }; };
template<> struct ei_product_type_selector<Small, Large, 1> { enum { ret = LazyCoeffBasedProductMode }; }; template<> struct ei_product_type_selector<Small, Large, 1> { enum { ret = LazyCoeffBasedProductMode }; };
template<> struct ei_product_type_selector<Large, Small, 1> { enum { ret = LazyCoeffBasedProductMode }; }; template<> struct ei_product_type_selector<Large, Small, 1> { enum { ret = LazyCoeffBasedProductMode }; };
template<> struct ei_product_type_selector<1, Large,Small> { enum { ret = GemvProduct }; }; template<> struct ei_product_type_selector<1, Large,Small> { enum { ret = CoeffBasedProductMode }; };
template<> struct ei_product_type_selector<1, Large,Large> { enum { ret = GemvProduct }; }; template<> struct ei_product_type_selector<1, Large,Large> { enum { ret = GemvProduct }; };
template<> struct ei_product_type_selector<1, Small,Large> { enum { ret = GemvProduct }; }; template<> struct ei_product_type_selector<1, Small,Large> { enum { ret = CoeffBasedProductMode }; };
template<> struct ei_product_type_selector<Large,1, Small> { enum { ret = GemvProduct }; }; template<> struct ei_product_type_selector<Large,1, Small> { enum { ret = CoeffBasedProductMode }; };
template<> struct ei_product_type_selector<Large,1, Large> { enum { ret = GemvProduct }; }; template<> struct ei_product_type_selector<Large,1, Large> { enum { ret = GemvProduct }; };
template<> struct ei_product_type_selector<Small,1, Large> { enum { ret = GemvProduct }; }; template<> struct ei_product_type_selector<Small,1, Large> { enum { ret = CoeffBasedProductMode }; };
template<> struct ei_product_type_selector<Small,Small,Large> { enum { ret = GemmProduct }; }; template<> struct ei_product_type_selector<Small,Small,Large> { enum { ret = GemmProduct }; };
template<> struct ei_product_type_selector<Large,Small,Large> { enum { ret = GemmProduct }; }; template<> struct ei_product_type_selector<Large,Small,Large> { enum { ret = GemmProduct }; };
template<> struct ei_product_type_selector<Small,Large,Large> { enum { ret = GemmProduct }; }; template<> struct ei_product_type_selector<Small,Large,Large> { enum { ret = GemmProduct }; };
@ -298,7 +298,7 @@ struct ei_gemv_selector<OnTheLeft,StorageOrder,BlasCompatible>
{ {
Transpose<Dest> destT(dest); Transpose<Dest> destT(dest);
ei_gemv_selector<OnTheRight,!StorageOrder,BlasCompatible> ei_gemv_selector<OnTheRight,!StorageOrder,BlasCompatible>
::run(GeneralProduct<Transpose<typename ProductType::_RhsNested>,Transpose<typename ProductType::_LhsNested> > ::run(GeneralProduct<Transpose<typename ProductType::_RhsNested>,Transpose<typename ProductType::_LhsNested>, GemvProduct>
(prod.rhs().transpose(), prod.lhs().transpose()), destT, alpha); (prod.rhs().transpose(), prod.lhs().transpose()), destT, alpha);
} }
}; };
@ -336,7 +336,7 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true>
ei_cache_friendly_product_colmajor_times_vector ei_cache_friendly_product_colmajor_times_vector
<LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate>( <LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate>(
dest.size(), dest.size(),
&actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.stride(), &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(),
actualRhs, actualDest, actualAlpha); actualRhs, actualDest, actualAlpha);
if (!EvalToDest) if (!EvalToDest)
@ -381,7 +381,7 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,true>
ei_cache_friendly_product_rowmajor_times_vector ei_cache_friendly_product_rowmajor_times_vector
<LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate>( <LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate>(
&actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.stride(), &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(),
rhs_data, prod.rhs().size(), dest, actualAlpha); rhs_data, prod.rhs().size(), dest, actualAlpha);
if (!DirectlyUseRhs) ei_aligned_stack_delete(Scalar, rhs_data, prod.rhs().size()); if (!DirectlyUseRhs) ei_aligned_stack_delete(Scalar, rhs_data, prod.rhs().size());
@ -427,6 +427,10 @@ template<typename OtherDerived>
inline const typename ProductReturnType<Derived,OtherDerived>::Type inline const typename ProductReturnType<Derived,OtherDerived>::Type
MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
{ {
// A note regarding the function declaration: in MSVC, this function will sometimes
// not be inlined, since ei_matrix_storage is an unwindable object for dynamic
// matrices and product types hold a member to store the result.
// Thus, tagging this function with EIGEN_STRONG_INLINE does not help.
enum { enum {
ProductIsValid = Derived::ColsAtCompileTime==Dynamic ProductIsValid = Derived::ColsAtCompileTime==Dynamic
|| OtherDerived::RowsAtCompileTime==Dynamic || OtherDerived::RowsAtCompileTime==Dynamic
@ -40,7 +40,7 @@ struct ei_redux_traits
private: private:
enum { enum {
PacketSize = ei_packet_traits<typename Derived::Scalar>::size, PacketSize = ei_packet_traits<typename Derived::Scalar>::size,
InnerMaxSize = int(Derived::Flags)&RowMajorBit InnerMaxSize = int(Derived::IsRowMajor)
? Derived::MaxColsAtCompileTime ? Derived::MaxColsAtCompileTime
: Derived::MaxRowsAtCompileTime : Derived::MaxRowsAtCompileTime
}; };
@ -100,15 +100,15 @@ template<typename Func, typename Derived, int Start>
struct ei_redux_novec_unroller<Func, Derived, Start, 1> struct ei_redux_novec_unroller<Func, Derived, Start, 1>
{ {
enum { enum {
col = Start / Derived::RowsAtCompileTime, outer = Start / Derived::InnerSizeAtCompileTime,
row = Start % Derived::RowsAtCompileTime inner = Start % Derived::InnerSizeAtCompileTime
}; };
typedef typename Derived::Scalar Scalar; typedef typename Derived::Scalar Scalar;
EIGEN_STRONG_INLINE static Scalar run(const Derived &mat, const Func&) EIGEN_STRONG_INLINE static Scalar run(const Derived &mat, const Func&)
{ {
return mat.coeff(row, col); return mat.coeffByOuterInner(outer, inner);
} }
}; };
@ -148,12 +148,8 @@ struct ei_redux_vec_unroller<Func, Derived, Start, 1>
{ {
enum { enum {
index = Start * ei_packet_traits<typename Derived::Scalar>::size, index = Start * ei_packet_traits<typename Derived::Scalar>::size,
row = int(Derived::Flags)&RowMajorBit outer = index / int(Derived::InnerSizeAtCompileTime),
? index / int(Derived::ColsAtCompileTime) inner = index % int(Derived::InnerSizeAtCompileTime),
: index % Derived::RowsAtCompileTime,
col = int(Derived::Flags)&RowMajorBit
? index % int(Derived::ColsAtCompileTime)
: index / Derived::RowsAtCompileTime,
alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned
}; };
@ -162,7 +158,7 @@ struct ei_redux_vec_unroller<Func, Derived, Start, 1>
EIGEN_STRONG_INLINE static PacketScalar run(const Derived &mat, const Func&) EIGEN_STRONG_INLINE static PacketScalar run(const Derived &mat, const Func&)
{ {
return mat.template packet<alignment>(row, col); return mat.template packetByOuterInner<alignment>(outer, inner);
} }
}; };
@ -184,12 +180,12 @@ struct ei_redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
{ {
ei_assert(mat.rows()>0 && mat.cols()>0 && "you are using a non initialized matrix"); ei_assert(mat.rows()>0 && mat.cols()>0 && "you are using a non initialized matrix");
Scalar res; Scalar res;
res = mat.coeff(0, 0); res = mat.coeffByOuterInner(0, 0);
for(int i = 1; i < mat.rows(); ++i) for(int i = 1; i < mat.innerSize(); ++i)
res = func(res, mat.coeff(i, 0)); res = func(res, mat.coeffByOuterInner(0, i));
for(int j = 1; j < mat.cols(); ++j) for(int i = 1; i < mat.outerSize(); ++i)
for(int i = 0; i < mat.rows(); ++i) for(int j = 0; j < mat.innerSize(); ++j)
res = func(res, mat.coeff(i, j)); res = func(res, mat.coeffByOuterInner(i, j));
return res; return res;
} }
}; };
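The rewrite above replaces explicit (row, col) loops by storage-order-agnostic (outer, inner) loops: for a column-major matrix the inner index runs down a column, for a row-major one along a row, and coeffByOuterInner hides the distinction. A scalar model of the traversal order (not Eigen code), written out for the column-major case:

    // Reduces a dense column-major rows x cols array in exactly the order the
    // DefaultTraversal loop above visits it: the rest of the first column,
    // then the remaining columns (outer index) element by element (inner index).
    float reduce_colmajor(const float* data, int rows, int cols)
    {
      float res = data[0];                       // coeffByOuterInner(0, 0)
      for (int i = 1; i < rows; ++i)
        res = res + data[i];                     // func(res, coeffByOuterInner(0, i))
      for (int o = 1; o < cols; ++o)
        for (int i = 0; i < rows; ++i)
          res = res + data[o * rows + i];        // func(res, coeffByOuterInner(o, i))
      return res;
    }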
@ -253,8 +249,7 @@ struct ei_redux_impl<Func, Derived, SliceVectorizedTraversal, NoUnrolling>
const int innerSize = mat.innerSize(); const int innerSize = mat.innerSize();
const int outerSize = mat.outerSize(); const int outerSize = mat.outerSize();
enum { enum {
packetSize = ei_packet_traits<Scalar>::size, packetSize = ei_packet_traits<Scalar>::size
isRowMajor = Derived::Flags&RowMajorBit?1:0
}; };
const int packetedInnerSize = ((innerSize)/packetSize)*packetSize; const int packetedInnerSize = ((innerSize)/packetSize)*packetSize;
Scalar res; Scalar res;
@ -263,13 +258,12 @@ struct ei_redux_impl<Func, Derived, SliceVectorizedTraversal, NoUnrolling>
PacketScalar packet_res = mat.template packet<Unaligned>(0,0); PacketScalar packet_res = mat.template packet<Unaligned>(0,0);
for(int j=0; j<outerSize; ++j) for(int j=0; j<outerSize; ++j)
for(int i=0; i<packetedInnerSize; i+=int(packetSize)) for(int i=0; i<packetedInnerSize; i+=int(packetSize))
packet_res = func.packetOp(packet_res, mat.template packet<Unaligned> packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned>(j,i));
(isRowMajor?j:i, isRowMajor?i:j));
res = func.predux(packet_res); res = func.predux(packet_res);
for(int j=0; j<outerSize; ++j) for(int j=0; j<outerSize; ++j)
for(int i=packetedInnerSize; i<innerSize; ++i) for(int i=packetedInnerSize; i<innerSize; ++i)
res = func(res, mat.coeff(isRowMajor?j:i, isRowMajor?i:j)); res = func(res, mat.coeffByOuterInner(j,i));
} }
else // too small to vectorize anything. else // too small to vectorize anything.
// since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize. // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
@ -34,14 +34,9 @@ struct ei_traits<ReturnByValue<Derived> >
: public ei_traits<typename ei_traits<Derived>::ReturnType> : public ei_traits<typename ei_traits<Derived>::ReturnType>
{ {
enum { enum {
// FIXME had to remove the DirectAccessBit for usage like // We're disabling DirectAccess because e.g. the constructor of
// matrix.inverse().block(...) // the Block-with-DirectAccess expression requires a coeffRef() method.
// because the Block ctor with direct access // Also, we don't want to have to implement the stride stuff.
// wants to call coeffRef() to get an address, and that fails (infinite recursion) as ReturnByValue
// doesn't implement coeffRef().
// The fact that I had to do that shows that when doing xpr.block() with a non-direct-access xpr,
// even if xpr has the EvalBeforeNestingBit, the block() doesn't use direct access on the evaluated
// xpr.
Flags = (ei_traits<typename ei_traits<Derived>::ReturnType>::Flags Flags = (ei_traits<typename ei_traits<Derived>::ReturnType>::Flags
| EvalBeforeNestingBit) & ~DirectAccessBit | EvalBeforeNestingBit) & ~DirectAccessBit
}; };
@ -75,7 +75,8 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
inline int rows() const { return m_matrix.rows(); } inline int rows() const { return m_matrix.rows(); }
inline int cols() const { return m_matrix.cols(); } inline int cols() const { return m_matrix.cols(); }
inline int stride() const { return m_matrix.stride(); } inline int outerStride() const { return m_matrix.outerStride(); }
inline int innerStride() const { return m_matrix.innerStride(); }
/** \sa MatrixBase::coeff() /** \sa MatrixBase::coeff()
* \warning the coordinates must fit into the referenced triangular part * \warning the coordinates must fit into the referenced triangular part
@ -57,7 +57,8 @@ template<typename BinaryOp, typename MatrixType> class SelfCwiseBinaryOp
inline int rows() const { return m_matrix.rows(); } inline int rows() const { return m_matrix.rows(); }
inline int cols() const { return m_matrix.cols(); } inline int cols() const { return m_matrix.cols(); }
inline int stride() const { return m_matrix.stride(); } inline int outerStride() const { return m_matrix.outerStride(); }
inline int innerStride() const { return m_matrix.innerStride(); }
inline const Scalar* data() const { return m_matrix.data(); } inline const Scalar* data() const { return m_matrix.data(); }
// note that this function is needed by assign to correctly align loads/stores // note that this function is needed by assign to correctly align loads/stores
@ -82,7 +82,7 @@ struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,RowMajor
VectorBlock<Rhs,Dynamic> target(other,startRow,actualPanelWidth); VectorBlock<Rhs,Dynamic> target(other,startRow,actualPanelWidth);
ei_cache_friendly_product_rowmajor_times_vector<LhsProductTraits::NeedToConjugate,false>( ei_cache_friendly_product_rowmajor_times_vector<LhsProductTraits::NeedToConjugate,false>(
&(actualLhs.const_cast_derived().coeffRef(startRow,startCol)), actualLhs.stride(), &(actualLhs.const_cast_derived().coeffRef(startRow,startCol)), actualLhs.outerStride(),
&(other.coeffRef(startCol)), r, &(other.coeffRef(startCol)), r,
target, Scalar(-1)); target, Scalar(-1));
} }
@ -147,7 +147,7 @@ struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,ColMajor
// 2 - it is slightly faster at runtime // 2 - it is slightly faster at runtime
ei_cache_friendly_product_colmajor_times_vector<LhsProductTraits::NeedToConjugate,false>( ei_cache_friendly_product_colmajor_times_vector<LhsProductTraits::NeedToConjugate,false>(
r, r,
&(actualLhs.const_cast_derived().coeffRef(endBlock,startBlock)), actualLhs.stride(), &(actualLhs.const_cast_derived().coeffRef(endBlock,startBlock)), actualLhs.outerStride(),
other.segment(startBlock, actualPanelWidth), other.segment(startBlock, actualPanelWidth),
&(other.coeffRef(endBlock, 0)), &(other.coeffRef(endBlock, 0)),
Scalar(-1)); Scalar(-1));
@ -183,7 +183,7 @@ struct ei_triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,StorageOrder,
const ActualLhsType actualLhs = LhsProductTraits::extract(lhs); const ActualLhsType actualLhs = LhsProductTraits::extract(lhs);
ei_triangular_solve_matrix<Scalar,Side,Mode,LhsProductTraits::NeedToConjugate,StorageOrder, ei_triangular_solve_matrix<Scalar,Side,Mode,LhsProductTraits::NeedToConjugate,StorageOrder,
(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor> (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>
::run(lhs.rows(), Side==OnTheLeft? rhs.cols() : rhs.rows(), &actualLhs.coeff(0,0), actualLhs.stride(), &rhs.coeffRef(0,0), rhs.stride()); ::run(lhs.rows(), Side==OnTheLeft? rhs.cols() : rhs.rows(), &actualLhs.coeff(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.outerStride());
} }
}; };
Eigen/src/Core/Stride.h (new file, 113 lines)
@ -0,0 +1,113 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.
#ifndef EIGEN_STRIDE_H
#define EIGEN_STRIDE_H
/** \class Stride
*
* \brief Holds stride information for Map
*
* This class holds the stride information needed to map strided arrays with class Map.
*
* It holds two values: the inner stride and the outer stride.
*
* The inner stride is the pointer increment between two consecutive entries within a given row of a
* row-major matrix or within a given column of a column-major matrix.
*
* The outer stride is the pointer increment between two consecutive rows of a row-major matrix or
* between two consecutive columns of a column-major matrix.
*
* These two values can be passed either at compile-time as template parameters, or at runtime as
* arguments to the constructor.
*
* To that end, this class takes two template parameters:
* \param _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime.
* \param _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime.
*
* \include Map_general_stride.cpp
* Output: \verbinclude Map_general_stride.out
*
* \sa class InnerStride, class OuterStride
*/
template<int _OuterStrideAtCompileTime, int _InnerStrideAtCompileTime>
class Stride
{
public:
enum {
InnerStrideAtCompileTime = _InnerStrideAtCompileTime,
OuterStrideAtCompileTime = _OuterStrideAtCompileTime
};
/** Default constructor, for use when strides are fixed at compile time */
Stride()
: m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime)
{
ei_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic);
}
/** Constructor allowing the strides to be passed at runtime */
Stride(int outerStride, int innerStride)
: m_outer(outerStride), m_inner(innerStride)
{
ei_assert(innerStride>=0 && outerStride>=0);
}
/** Copy constructor */
Stride(const Stride& other)
: m_outer(other.outer()), m_inner(other.inner())
{}
/** \returns the outer stride */
inline int outer() const { return m_outer.value(); }
/** \returns the inner stride */
inline int inner() const { return m_inner.value(); }
protected:
ei_int_if_dynamic<OuterStrideAtCompileTime> m_outer;
ei_int_if_dynamic<InnerStrideAtCompileTime> m_inner;
};
/** \brief Convenience specialization of Stride to specify only an inner stride */
template<int Value>
class InnerStride : public Stride<0, Value>
{
typedef Stride<0, Value> Base;
public:
InnerStride() : Base() {}
InnerStride(int v) : Base(0, v) {}
};
/** \brief Convenience specialization of Stride to specify only an outer stride */
template<int Value>
class OuterStride : public Stride<Value, 0>
{
typedef Stride<Value, 0> Base;
public:
OuterStride() : Base() {}
OuterStride(int v) : Base(v,0) {}
};
#endif // EIGEN_STRIDE_H
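A hypothetical usage sketch for the new class, mirroring the Map_general_stride example referenced in the documentation above (the Map-with-Stride constructor is assumed to take the dimensions followed by a Stride object):

    #include <Eigen/Dense>
    using namespace Eigen;

    void strided_view()
    {
      int data[1024];
      for (int i = 0; i < 1024; ++i) data[i] = i;
      // View data as a 3x3 column-major matrix whose columns start 8 ints
      // apart (outer stride) and whose entries within a column are 2 ints
      // apart (inner stride):
      Map<MatrixXi, 0, Stride<Dynamic, 2> > m(data, 3, 3, Stride<Dynamic, 2>(8, 2));
      // m(i, j) reads data[2*i + 8*j]
    }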
@ -47,7 +47,8 @@ template<typename ExpressionType> class SwapWrapper
inline int rows() const { return m_expression.rows(); } inline int rows() const { return m_expression.rows(); }
inline int cols() const { return m_expression.cols(); } inline int cols() const { return m_expression.cols(); }
inline int stride() const { return m_expression.stride(); } inline int outerStride() const { return m_expression.outerStride(); }
inline int innerStride() const { return m_expression.innerStride(); }
inline Scalar& coeffRef(int row, int col) inline Scalar& coeffRef(int row, int col)
{ {
@ -60,7 +61,7 @@ template<typename ExpressionType> class SwapWrapper
} }
template<typename OtherDerived> template<typename OtherDerived>
void copyCoeff(int row, int col, const MatrixBase<OtherDerived>& other) void copyCoeff(int row, int col, const DenseBase<OtherDerived>& other)
{ {
OtherDerived& _other = other.const_cast_derived(); OtherDerived& _other = other.const_cast_derived();
ei_internal_assert(row >= 0 && row < rows() ei_internal_assert(row >= 0 && row < rows()
@ -71,7 +72,7 @@ template<typename ExpressionType> class SwapWrapper
} }
template<typename OtherDerived> template<typename OtherDerived>
void copyCoeff(int index, const MatrixBase<OtherDerived>& other) void copyCoeff(int index, const DenseBase<OtherDerived>& other)
{ {
OtherDerived& _other = other.const_cast_derived(); OtherDerived& _other = other.const_cast_derived();
ei_internal_assert(index >= 0 && index < m_expression.size()); ei_internal_assert(index >= 0 && index < m_expression.size());
@ -81,7 +82,7 @@ template<typename ExpressionType> class SwapWrapper
} }
template<typename OtherDerived, int StoreMode, int LoadMode> template<typename OtherDerived, int StoreMode, int LoadMode>
void copyPacket(int row, int col, const MatrixBase<OtherDerived>& other) void copyPacket(int row, int col, const DenseBase<OtherDerived>& other)
{ {
OtherDerived& _other = other.const_cast_derived(); OtherDerived& _other = other.const_cast_derived();
ei_internal_assert(row >= 0 && row < rows() ei_internal_assert(row >= 0 && row < rows()
@ -94,7 +95,7 @@ template<typename ExpressionType> class SwapWrapper
} }
template<typename OtherDerived, int StoreMode, int LoadMode> template<typename OtherDerived, int StoreMode, int LoadMode>
void copyPacket(int index, const MatrixBase<OtherDerived>& other) void copyPacket(int index, const DenseBase<OtherDerived>& other)
{ {
OtherDerived& _other = other.const_cast_derived(); OtherDerived& _other = other.const_cast_derived();
ei_internal_assert(index >= 0 && index < m_expression.size()); ei_internal_assert(index >= 0 && index < m_expression.size());
@ -93,7 +93,8 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
typedef typename MatrixType::template MakeBase<Transpose<MatrixType> >::Type Base; typedef typename MatrixType::template MakeBase<Transpose<MatrixType> >::Type Base;
EIGEN_DENSE_PUBLIC_INTERFACE(Transpose<MatrixType>) EIGEN_DENSE_PUBLIC_INTERFACE(Transpose<MatrixType>)
inline int stride() const { return derived().nestedExpression().stride(); } inline int innerStride() const { return derived().nestedExpression().innerStride(); }
inline int outerStride() const { return derived().nestedExpression().outerStride(); }
inline Scalar* data() { return derived().nestedExpression().data(); } inline Scalar* data() { return derived().nestedExpression().data(); }
inline const Scalar* data() const { return derived().nestedExpression().data(); } inline const Scalar* data() const { return derived().nestedExpression().data(); }
@ -295,25 +296,6 @@ struct ei_blas_traits<SelfCwiseBinaryOp<BinOp,NestedXpr> >
static inline const XprType extract(const XprType& x) { return x; } static inline const XprType extract(const XprType& x) { return x; }
}; };
template<typename T, int Access=ei_blas_traits<T>::ActualAccess>
struct ei_extract_data_selector {
static typename T::Scalar* run(const T& m)
{
return &ei_blas_traits<T>::extract(m).const_cast_derived().coeffRef(0,0);
}
};
template<typename T>
struct ei_extract_data_selector<T,NoDirectAccess> {
static typename T::Scalar* run(const T&) { return 0; }
};
template<typename T> typename T::Scalar* ei_extract_data(const T& m)
{
return ei_extract_data_selector<T>::run(m);
}
template<typename Scalar, bool DestIsTranposed, typename OtherDerived> template<typename Scalar, bool DestIsTranposed, typename OtherDerived>
struct ei_check_transpose_aliasing_selector struct ei_check_transpose_aliasing_selector
{ {
@ -50,7 +50,8 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
inline int rows() const { return derived().rows(); } inline int rows() const { return derived().rows(); }
inline int cols() const { return derived().cols(); } inline int cols() const { return derived().cols(); }
inline int stride() const { return derived().stride(); } inline int outerStride() const { return derived().outerStride(); }
inline int innerStride() const { return derived().innerStride(); }
inline Scalar coeff(int row, int col) const { return derived().coeff(row,col); } inline Scalar coeff(int row, int col) const { return derived().coeff(row,col); }
inline Scalar& coeffRef(int row, int col) { return derived().coeffRef(row,col); } inline Scalar& coeffRef(int row, int col) { return derived().coeffRef(row,col); }
@ -165,7 +166,8 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
inline int rows() const { return m_matrix.rows(); } inline int rows() const { return m_matrix.rows(); }
inline int cols() const { return m_matrix.cols(); } inline int cols() const { return m_matrix.cols(); }
inline int stride() const { return m_matrix.stride(); } inline int outerStride() const { return m_matrix.outerStride(); }
inline int innerStride() const { return m_matrix.innerStride(); }
/** \sa MatrixBase::operator+=() */ /** \sa MatrixBase::operator+=() */
template<typename Other> TriangularView& operator+=(const Other& other) { return *this = m_matrix + other; } template<typename Other> TriangularView& operator+=(const Other& other) { return *this = m_matrix + other; }
@ -86,7 +86,6 @@ template<typename VectorType, int Size> class VectorBlock
IsColVector ? start : 0, IsColVector ? 0 : start, IsColVector ? start : 0, IsColVector ? 0 : start,
IsColVector ? size : 1, IsColVector ? 1 : size) IsColVector ? size : 1, IsColVector ? 1 : size)
{ {
EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock); EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock);
} }
@ -29,34 +29,79 @@
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
#endif #endif
typedef __vector float v4f; #ifndef EIGEN_HAS_FUSE_CJMADD
typedef __vector int v4i; #define EIGEN_HAS_FUSE_CJMADD 1
typedef __vector unsigned int v4ui; #endif
typedef __vector __bool int v4bi;
#ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE
#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE 8*128*128
#endif
// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif
typedef __vector float Packet4f;
typedef __vector int Packet4i;
typedef __vector unsigned int Packet4ui;
typedef __vector __bool int Packet4bi;
typedef __vector short int Packet8i;
typedef __vector unsigned char Packet16uc;
// We don't want to write the same code all the time, but we need to reuse the constants // We don't want to write the same code all the time, but we need to reuse the constants
// and it doesn't really work to declare them global, so we define macros instead // and it doesn't really work to declare them global, so we define macros instead
#define USE_CONST_v0i const v4i v0i = vec_splat_s32(0) #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
#define USE_CONST_v1i const v4i v1i = vec_splat_s32(1) Packet4f ei_p4f_##NAME = (Packet4f) vec_splat_s32(X)
#define USE_CONST_v16i_ const v4i v16i_ = vec_splat_s32(-16)
#define USE_CONST_v0f USE_CONST_v0i; const v4f v0f = (v4f) v0i #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
#define USE_CONST_v1f USE_CONST_v1i; const v4f v1f = vec_ctf(v1i, 0) Packet4i ei_p4i_##NAME = vec_splat_s32(X)
#define USE_CONST_v1i_ const v4ui v1i_ = vec_splat_u32(-1)
#define USE_CONST_v0f_ USE_CONST_v1i_; const v4f v0f_ = (v4f) vec_sl(v1i_, v1i_) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
Packet4f ei_p4f_##NAME = ei_pset1<float>(X)
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
Packet4f ei_p4f_##NAME = (Packet4f) ei_pset1<int>(X)
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
Packet4i ei_p4i_##NAME = ei_pset1<int>(X)
// Define global static constants:
static Packet4f ei_p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 };
static Packet4i ei_p4i_COUNTDOWN = { 3, 2, 1, 0 };
static Packet16uc ei_p16uc_REVERSE = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1);
static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16);
static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1);
static Packet4f ei_p4f_ONE = vec_ctf(ei_p4i_ONE, 0);
static Packet4f ei_p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)ei_p4i_MINUS1, (Packet4ui)ei_p4i_MINUS1);
template<> struct ei_packet_traits<float> : ei_default_packet_traits template<> struct ei_packet_traits<float> : ei_default_packet_traits
{ typedef v4f type; enum {size=4}; }; {
typedef Packet4f type; enum {size=4};
enum {
HasSin = 0,
HasCos = 0,
HasLog = 0,
HasExp = 0,
HasSqrt = 0
};
};
template<> struct ei_packet_traits<int> : ei_default_packet_traits template<> struct ei_packet_traits<int> : ei_default_packet_traits
{ typedef v4i type; enum {size=4}; }; { typedef Packet4i type; enum {size=4}; };
template<> struct ei_unpacket_traits<v4f> { typedef float type; enum {size=4}; }; template<> struct ei_unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
template<> struct ei_unpacket_traits<v4i> { typedef int type; enum {size=4}; }; template<> struct ei_unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
/*
inline std::ostream & operator <<(std::ostream & s, const v4f & v) inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
{ {
union { union {
v4f v; Packet4f v;
float n[4]; float n[4];
} vt; } vt;
vt.v = v; vt.v = v;
@ -64,10 +109,10 @@ inline std::ostream & operator <<(std::ostream & s, const v4f & v)
return s; return s;
} }
inline std::ostream & operator <<(std::ostream & s, const v4i & v) inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
{ {
union { union {
v4i v; Packet4i v;
int n[4]; int n[4];
} vt; } vt;
vt.v = v; vt.v = v;
@ -75,10 +120,10 @@ inline std::ostream & operator <<(std::ostream & s, const v4i & v)
return s; return s;
} }
inline std::ostream & operator <<(std::ostream & s, const v4ui & v) inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
{ {
union { union {
v4ui v; Packet4ui v;
unsigned int n[4]; unsigned int n[4];
} vt; } vt;
vt.v = v; vt.v = v;
@ -86,65 +131,73 @@ inline std::ostream & operator <<(std::ostream & s, const v4ui & v)
return s; return s;
} }
inline std::ostream & operator <<(std::ostream & s, const v4bi & v) inline std::ostream & operator <<(std::ostream & s, const Packet4bi & v)
{ {
union { union {
__vector __bool int v; Packet4bi v;
unsigned int n[4]; unsigned int n[4];
} vt; } vt;
vt.v = v; vt.v = v;
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
return s; return s;
} }
*/
template<> inline v4f ei_padd(const v4f& a, const v4f& b) { return vec_add(a,b); } template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) {
template<> inline v4i ei_padd(const v4i& a, const v4i& b) { return vec_add(a,b); } // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
float EIGEN_ALIGN16 af[4];
template<> inline v4f ei_psub(const v4f& a, const v4f& b) { return vec_sub(a,b); } af[0] = from;
template<> inline v4i ei_psub(const v4i& a, const v4i& b) { return vec_sub(a,b); } Packet4f vc = vec_ld(0, af);
vc = vec_splat(vc, 0);
template<> EIGEN_STRONG_INLINE v4f ei_pnegate(const v4f& a) return vc;
{
v4i mask = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
return vec_xor(a,(v4f) mask);
} }
template<> EIGEN_STRONG_INLINE v4i ei_pnegate(const v4i& a) template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) {
{ int EIGEN_ALIGN16 ai[4];
USE_CONST_v0i; ai[0] = from;
return ei_psub(v0i, a); Packet4i vc = vec_ld(0, ai);
vc = vec_splat(vc, 0);
return vc;
} }
template<> inline v4f ei_pmul(const v4f& a, const v4f& b) { USE_CONST_v0f; return vec_madd(a,b, v0f); } template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a) { return vec_add(ei_pset1(a), ei_p4f_COUNTDOWN); }
template<> inline v4i ei_pmul(const v4i& a, const v4i& b) template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a) { return vec_add(ei_pset1(a), ei_p4i_COUNTDOWN); }
template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i ei_padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f ei_psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i ei_psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f ei_pnegate(const Packet4f& a) { return ei_psub<Packet4f>(ei_p4f_ZERO, a); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pnegate(const Packet4i& a) { return ei_psub<Packet4i>(ei_p4i_ZERO, a); }
template<> EIGEN_STRONG_INLINE Packet4f ei_pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,ei_p4f_ZERO); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
{ {
// Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
//Set up constants, variables //Set up constants, variables
v4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel; Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
USE_CONST_v0i;
USE_CONST_v1i;
USE_CONST_v16i_;
// Get the absolute values // Get the absolute values
a1 = vec_abs(a); a1 = vec_abs(a);
b1 = vec_abs(b); b1 = vec_abs(b);
// Get the signs using xor // Get the signs using xor
v4bi sgn = (v4bi) vec_cmplt(vec_xor(a, b), v0i); Packet4bi sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), ei_p4i_ZERO);
// Do the multiplication for the absolute values. // Do the multiplication for the absolute values.
bswap = (v4i) vec_rl((v4ui) b1, (v4ui) v16i_ ); bswap = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) ei_p4i_MINUS16 );
low_prod = vec_mulo((__vector short)a1, (__vector short)b1); low_prod = vec_mulo((Packet8i) a1, (Packet8i)b1);
high_prod = vec_msum((__vector short)a1, (__vector short)bswap, v0i); high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, ei_p4i_ZERO);
high_prod = (v4i) vec_sl((v4ui) high_prod, (v4ui) v16i_); high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) ei_p4i_MINUS16);
prod = vec_add( low_prod, high_prod ); prod = vec_add( low_prod, high_prod );
// NOR the product and select only the negative elements according to the sign mask // NOR the product and select only the negative elements according to the sign mask
prod_ = vec_nor(prod, prod); prod_ = vec_nor(prod, prod);
prod_ = vec_sel(v0i, prod_, sgn); prod_ = vec_sel(ei_p4i_ZERO, prod_, sgn);
// Add 1 to the result to get the negative numbers // Add 1 to the result to get the negative numbers
v1sel = vec_sel(v0i, v1i, sgn); v1sel = vec_sel(ei_p4i_ZERO, ei_p4i_ONE, sgn);
prod_ = vec_add(prod_, v1sel); prod_ = vec_add(prod_, v1sel);
// Merge the results back to the final vector. // Merge the results back to the final vector.
@ -152,282 +205,266 @@ template<> inline v4i ei_pmul(const v4i& a, const v4i& b)
return prod; return prod;
} }
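The halfword trick above builds a full 32-bit product (modulo 2^32) out of 16-bit multiplies: the product of the low halves, plus the two cross products shifted up by 16 bits; the kernel works on absolute values and patches the sign back afterwards. A scalar model of the unsigned core (not part of this commit):

    #include <stdint.h>

    // a * b modulo 2^32 from 16-bit partial products: what vec_mulo (low*low)
    // and vec_msum on the halfword-rotated operand (cross terms) compute
    // lane-wise above. The high*high term would only affect bits >= 32.
    uint32_t mul_via_halfwords(uint32_t a, uint32_t b)
    {
      uint32_t lo_lo = (a & 0xFFFFu) * (b & 0xFFFFu);
      uint32_t cross = (a >> 16) * (b & 0xFFFFu) + (a & 0xFFFFu) * (b >> 16);
      return lo_lo + (cross << 16);
    }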
template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
template<> inline v4f ei_pdiv(const v4f& a, const v4f& b) { {
v4f t, y_0, y_1, res; Packet4f t, y_0, y_1, res;
USE_CONST_v0f;
USE_CONST_v1f;
// AltiVec does not offer a divide instruction, so we have to use a reciprocal approximation // AltiVec does not offer a divide instruction, so we have to use a reciprocal approximation
y_0 = vec_re(b); y_0 = vec_re(b);
// Do one Newton-Raphson iteration to get the needed accuracy // Do one Newton-Raphson iteration to get the needed accuracy
t = vec_nmsub(y_0, b, v1f); t = vec_nmsub(y_0, b, ei_p4f_ONE);
y_1 = vec_madd(y_0, t, y_0); y_1 = vec_madd(y_0, t, y_0);
res = vec_madd(a, y_1, v0f); res = vec_madd(a, y_1, ei_p4f_ZERO);
return res; return res;
} }
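The division above refines vec_re's reciprocal estimate with one Newton-Raphson step, y1 = y0 + y0*(1 - b*y0), which roughly doubles the number of correct bits of the estimate. A scalar model of the same sequence (hypothetical helper, not part of this commit):

    float div_via_reciprocal(float a, float b, float y0 /* hardware estimate of 1/b */)
    {
      float t  = 1.0f - b * y0;   // vec_nmsub(y_0, b, ei_p4f_ONE)
      float y1 = y0 + y0 * t;     // vec_madd(y_0, t, y_0)
      return a * y1;              // vec_madd(a, y_1, ei_p4f_ZERO)
    }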
template<> inline v4f ei_pmadd(const v4f& a, const v4f& b, const v4f& c) { return vec_madd(a, b, c); } template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
{ ei_assert(false && "packet integer division is not supported by AltiVec");
return ei_pset1<int>(0);
}
template<> inline v4f ei_pmin(const v4f& a, const v4f& b) { return vec_min(a,b); } // for some weird reason, it has to be overloaded for packets of integers
template<> inline v4i ei_pmin(const v4i& a, const v4i& b) { return vec_min(a,b); } template<> EIGEN_STRONG_INLINE Packet4f ei_pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return ei_padd(ei_pmul(a,b), c); }
template<> inline v4f ei_pmax(const v4f& a, const v4f& b) { return vec_max(a,b); } template<> EIGEN_STRONG_INLINE Packet4f ei_pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
template<> inline v4i ei_pmax(const v4i& a, const v4i& b) { return vec_max(a,b); } template<> EIGEN_STRONG_INLINE Packet4i ei_pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE v4f ei_pabs(const v4f& a) { return vec_abs(a); } template<> EIGEN_STRONG_INLINE Packet4f ei_pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE v4i ei_pabs(const v4i& a) { return vec_abs(a); } template<> EIGEN_STRONG_INLINE Packet4i ei_pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
template<> inline v4f ei_pload(const float* from) { return vec_ld(0, from); } // Logical operations (and, or, xor, andnot)
template<> inline v4i ei_pload(const int* from) { return vec_ld(0, from); } template<> EIGEN_STRONG_INLINE Packet4f ei_pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
template<> inline v4f ei_ploadu(const float* from) template<> EIGEN_STRONG_INLINE Packet4f ei_por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
template<> EIGEN_STRONG_INLINE Packet4i ei_por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f ei_pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from)
{ {
EIGEN_DEBUG_ALIGNED_LOAD
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
__vector unsigned char MSQ, LSQ; Packet16uc MSQ, LSQ;
__vector unsigned char mask; Packet16uc mask;
MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
mask = vec_lvsl(0, from); // create the permute mask mask = vec_lvsl(0, from); // create the permute mask
return (v4f) vec_perm(MSQ, LSQ, mask); // align the data return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data
}
template<> inline v4i ei_ploadu(const int* from) }
template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from)
{ {
EIGEN_DEBUG_ALIGNED_LOAD
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
__vector unsigned char MSQ, LSQ; Packet16uc MSQ, LSQ;
__vector unsigned char mask; Packet16uc mask;
MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
mask = vec_lvsl(0, from); // create the permute mask mask = vec_lvsl(0, from); // create the permute mask
return (v4i) vec_perm(MSQ, LSQ, mask); // align the data return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data
} }
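Both ploadu overloads use the classic AltiVec recipe: load the two aligned quadwords straddling the address, then glue them together with a permute whose shift is the pointer's misalignment (that is what vec_lvsl encodes). A byte-level scalar model (not AltiVec code):

    #include <stdint.h>

    // Emulates the MSQ/LSQ + permute sequence: only the two aligned 16-byte
    // blocks around 'from' are touched, and 'off' plays the role of the
    // vec_lvsl permute mask.
    void unaligned_load16(const unsigned char* from, unsigned char out[16])
    {
      const unsigned char* base = (const unsigned char*)((uintptr_t)from & ~(uintptr_t)15);
      unsigned off = (unsigned)((uintptr_t)from & 15);
      for (int i = 0; i < 16; ++i)
        out[i] = base[off + i];   // stays within base[0..15] (MSQ) and base[16..31] (LSQ)
    }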
template<> inline v4f ei_pset1(const float& from) template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
{ template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
float __attribute__(aligned(16)) af[4];
af[0] = from;
v4f vc = vec_ld(0, af);
vc = vec_splat(vc, 0);
return vc;
}
template<> inline v4i ei_pset1(const int& from) template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float* to, const Packet4f& from)
{
int __attribute__(aligned(16)) ai[4];
ai[0] = from;
v4i vc = vec_ld(0, ai);
vc = vec_splat(vc, 0);
return vc;
}
template<> inline void ei_pstore(float* to, const v4f& from) { vec_st(from, 0, to); }
template<> inline void ei_pstore(int* to, const v4i& from) { vec_st(from, 0, to); }
template<> inline void ei_pstoreu(float* to, const v4f& from)
{ {
EIGEN_DEBUG_UNALIGNED_STORE
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
// Warning: not thread safe! // Warning: not thread safe!
__vector unsigned char MSQ, LSQ, edges; Packet16uc MSQ, LSQ, edges;
__vector unsigned char edgeAlign, align; Packet16uc edgeAlign, align;
MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword
LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword
edgeAlign = vec_lvsl(0, to); // permute map to extract edges edgeAlign = vec_lvsl(0, to); // permute map to extract edges
edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges
align = vec_lvsr( 0, to ); // permute map to misalign data align = vec_lvsr( 0, to ); // permute map to misalign data
MSQ = vec_perm(edges,(__vector unsigned char)from,align); // misalign the data (MSQ) MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ)
LSQ = vec_perm((__vector unsigned char)from,edges,align); // misalign the data (LSQ) LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ)
vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
} }
template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int* to, const Packet4i& from)
template<> inline void ei_pstoreu(int* to , const v4i& from )
{ {
EIGEN_DEBUG_UNALIGNED_STORE
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
// Warning: not thread safe! // Warning: not thread safe!
__vector unsigned char MSQ, LSQ, edges; Packet16uc MSQ, LSQ, edges;
__vector unsigned char edgeAlign, align; Packet16uc edgeAlign, align;
MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword
LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword
edgeAlign = vec_lvsl(0, to); // permute map to extract edges edgeAlign = vec_lvsl(0, to); // permute map to extract edges
edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges
align = vec_lvsr( 0, to ); // permute map to misalign data align = vec_lvsr( 0, to ); // permute map to misalign data
MSQ = vec_perm(edges,(__vector unsigned char)from,align); // misalign the data (MSQ) MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ)
LSQ = vec_perm((__vector unsigned char)from,edges,align); // misalign the data (LSQ) LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ)
vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
} }
template<> inline float ei_pfirst(const v4f& a) template<> EIGEN_STRONG_INLINE float ei_pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
template<> EIGEN_STRONG_INLINE int ei_pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
template<> EIGEN_STRONG_INLINE Packet4f ei_preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, ei_p16uc_REVERSE); }
template<> EIGEN_STRONG_INLINE Packet4i ei_preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, ei_p16uc_REVERSE); }
template<> EIGEN_STRONG_INLINE Packet4f ei_pabs(const Packet4f& a) { return vec_abs(a); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a) { return vec_abs(a); }
template<> EIGEN_STRONG_INLINE float ei_predux<Packet4f>(const Packet4f& a)
{ {
float EIGEN_ALIGN16 af[4]; Packet4f b, sum;
vec_st(a, 0, af); b = (Packet4f) vec_sld(a, a, 8);
return af[0];
}
template<> inline int ei_pfirst(const v4i& a)
{
int EIGEN_ALIGN16 ai[4];
vec_st(a, 0, ai);
return ai[0];
}
template<> EIGEN_STRONG_INLINE v4f ei_preverse(const v4f& a)
{
static const __vector unsigned char reverse_mask =
{12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
return (v4f)vec_perm((__vector unsigned char)a,(__vector unsigned char)a,reverse_mask);
}
template<> EIGEN_STRONG_INLINE v4i ei_preverse(const v4i& a)
{
static const __vector unsigned char __attribute__(aligned(16)) reverse_mask =
{12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
return (v4i)vec_perm((__vector unsigned char)a,(__vector unsigned char)a,reverse_mask);
}
inline v4f ei_preduxp(const v4f* vecs)
{
v4f v[4], sum[4];
// It's easier and faster to transpose then add as columns
// Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
// Do the transpose, first set of moves
v[0] = vec_mergeh(vecs[0], vecs[2]);
v[1] = vec_mergel(vecs[0], vecs[2]);
v[2] = vec_mergeh(vecs[1], vecs[3]);
v[3] = vec_mergel(vecs[1], vecs[3]);
// Get the resulting vectors
sum[0] = vec_mergeh(v[0], v[2]);
sum[1] = vec_mergel(v[0], v[2]);
sum[2] = vec_mergeh(v[1], v[3]);
sum[3] = vec_mergel(v[1], v[3]);
// Now do the summation:
// Lines 0+1
sum[0] = vec_add(sum[0], sum[1]);
// Lines 2+3
sum[1] = vec_add(sum[2], sum[3]);
// Add the results
sum[0] = vec_add(sum[0], sum[1]);
return sum[0];
}
inline v4i ei_preduxp(const v4i* vecs)
{
v4i v[4], sum[4];
// It's easier and faster to transpose then add as columns
// Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
// Do the transpose, first set of moves
v[0] = vec_mergeh(vecs[0], vecs[2]);
v[1] = vec_mergel(vecs[0], vecs[2]);
v[2] = vec_mergeh(vecs[1], vecs[3]);
v[3] = vec_mergel(vecs[1], vecs[3]);
// Get the resulting vectors
sum[0] = vec_mergeh(v[0], v[2]);
sum[1] = vec_mergel(v[0], v[2]);
sum[2] = vec_mergeh(v[1], v[3]);
sum[3] = vec_mergel(v[1], v[3]);
// Now do the summation:
// Lines 0+1
sum[0] = vec_add(sum[0], sum[1]);
// Lines 2+3
sum[1] = vec_add(sum[2], sum[3]);
// Add the results
sum[0] = vec_add(sum[0], sum[1]);
return sum[0];
}
inline float ei_predux(const v4f& a)
{
v4f b, sum;
b = (v4f)vec_sld(a, a, 8);
sum = vec_add(a, b); sum = vec_add(a, b);
b = (v4f)vec_sld(sum, sum, 4); b = (Packet4f) vec_sld(sum, sum, 4);
sum = vec_add(sum, b); sum = vec_add(sum, b);
return ei_pfirst(sum); return ei_pfirst(sum);
} }
inline int ei_predux(const v4i& a) template<> EIGEN_STRONG_INLINE Packet4f ei_preduxp<Packet4f>(const Packet4f* vecs)
{ {
USE_CONST_v0i; Packet4f v[4], sum[4];
v4i sum;
sum = vec_sums(a, v0i); // It's easier and faster to transpose then add as columns
sum = vec_sld(sum, v0i, 12); // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
// Do the transpose, first set of moves
v[0] = vec_mergeh(vecs[0], vecs[2]);
v[1] = vec_mergel(vecs[0], vecs[2]);
v[2] = vec_mergeh(vecs[1], vecs[3]);
v[3] = vec_mergel(vecs[1], vecs[3]);
// Get the resulting vectors
sum[0] = vec_mergeh(v[0], v[2]);
sum[1] = vec_mergel(v[0], v[2]);
sum[2] = vec_mergeh(v[1], v[3]);
sum[3] = vec_mergel(v[1], v[3]);
// Now do the summation:
// Lines 0+1
sum[0] = vec_add(sum[0], sum[1]);
// Lines 2+3
sum[1] = vec_add(sum[2], sum[3]);
// Add the results
sum[0] = vec_add(sum[0], sum[1]);
return sum[0];
}
template<> EIGEN_STRONG_INLINE int ei_predux<Packet4i>(const Packet4i& a)
{
Packet4i sum;
sum = vec_sums(a, ei_p4i_ZERO);
sum = vec_sld(sum, ei_p4i_ZERO, 12);
return ei_pfirst(sum); return ei_pfirst(sum);
} }
// implement other reductions operators template<> EIGEN_STRONG_INLINE Packet4i ei_preduxp<Packet4i>(const Packet4i* vecs)
inline float ei_predux_mul(const v4f& a)
{ {
v4f prod; Packet4i v[4], sum[4];
prod = ei_pmul(a, (v4f)vec_sld(a, a, 8));
return ei_pfirst(ei_pmul(prod, (v4f)vec_sld(prod, prod, 4))); // It's easier and faster to transpose then add as columns
// Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
// Do the transpose, first set of moves
v[0] = vec_mergeh(vecs[0], vecs[2]);
v[1] = vec_mergel(vecs[0], vecs[2]);
v[2] = vec_mergeh(vecs[1], vecs[3]);
v[3] = vec_mergel(vecs[1], vecs[3]);
// Get the resulting vectors
sum[0] = vec_mergeh(v[0], v[2]);
sum[1] = vec_mergel(v[0], v[2]);
sum[2] = vec_mergeh(v[1], v[3]);
sum[3] = vec_mergel(v[1], v[3]);
// Now do the summation:
// Lines 0+1
sum[0] = vec_add(sum[0], sum[1]);
// Lines 2+3
sum[1] = vec_add(sum[2], sum[3]);
// Add the results
sum[0] = vec_add(sum[0], sum[1]);
return sum[0];
} }
inline int ei_predux_mul(const v4i& a) // Other reduction functions:
// mul
template<> EIGEN_STRONG_INLINE float ei_predux_mul<Packet4f>(const Packet4f& a)
{
Packet4f prod;
prod = ei_pmul(a, (Packet4f)vec_sld(a, a, 8));
return ei_pfirst(ei_pmul(prod, (Packet4f)vec_sld(prod, prod, 4)));
}
template<> EIGEN_STRONG_INLINE int ei_predux_mul<Packet4i>(const Packet4i& a)
{ {
EIGEN_ALIGN16 int aux[4]; EIGEN_ALIGN16 int aux[4];
ei_pstore(aux, a); ei_pstore(aux, a);
return aux[0] * aux[1] * aux[2] * aux[3]; return aux[0] * aux[1] * aux[2] * aux[3];
} }
inline float ei_predux_min(const v4f& a) // min
template<> EIGEN_STRONG_INLINE float ei_predux_min<Packet4f>(const Packet4f& a)
{ {
v4f b, res; Packet4f b, res;
b = vec_min(a, vec_sld(a, a, 8)); b = vec_min(a, vec_sld(a, a, 8));
res = vec_min(b, vec_sld(b, b, 4)); res = vec_min(b, vec_sld(b, b, 4));
return ei_pfirst(res); return ei_pfirst(res);
} }
inline int ei_predux_min(const v4i& a) template<> EIGEN_STRONG_INLINE int ei_predux_min<Packet4i>(const Packet4i& a)
{ {
v4i b, res; Packet4i b, res;
b = vec_min(a, vec_sld(a, a, 8)); b = vec_min(a, vec_sld(a, a, 8));
res = vec_min(b, vec_sld(b, b, 4)); res = vec_min(b, vec_sld(b, b, 4));
return ei_pfirst(res); return ei_pfirst(res);
} }
inline float ei_predux_max(const v4f& a) // max
template<> EIGEN_STRONG_INLINE float ei_predux_max<Packet4f>(const Packet4f& a)
{ {
v4f b, res; Packet4f b, res;
b = vec_max(a, vec_sld(a, a, 8)); b = vec_max(a, vec_sld(a, a, 8));
res = vec_max(b, vec_sld(b, b, 4)); res = vec_max(b, vec_sld(b, b, 4));
return ei_pfirst(res); return ei_pfirst(res);
} }
inline int ei_predux_max(const v4i& a) template<> EIGEN_STRONG_INLINE int ei_predux_max<Packet4i>(const Packet4i& a)
{ {
v4i b, res; Packet4i b, res;
b = vec_max(a, vec_sld(a, a, 8)); b = vec_max(a, vec_sld(a, a, 8));
res = vec_max(b, vec_sld(b, b, 4)); res = vec_max(b, vec_sld(b, b, 4));
return ei_pfirst(res); return ei_pfirst(res);
} }
template<int Offset> template<int Offset>
struct ei_palign_impl<Offset, v4f> struct ei_palign_impl<Offset,Packet4f>
{ {
inline static void run(v4f& first, const v4f& second) EIGEN_STRONG_INLINE static void run(Packet4f& first, const Packet4f& second)
{ {
if (Offset!=0)
first = vec_sld(first, second, Offset*4); first = vec_sld(first, second, Offset*4);
} }
}; };
template<int Offset> template<int Offset>
struct ei_palign_impl<Offset, v4i> struct ei_palign_impl<Offset,Packet4i>
{ {
inline static void run(v4i& first, const v4i& second) EIGEN_STRONG_INLINE static void run(Packet4i& first, const Packet4i& second)
{ {
if (Offset!=0)
first = vec_sld(first, second, Offset*4); first = vec_sld(first, second, Offset*4);
} }
}; };
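vec_sld concatenates its two register operands and extracts 16 contiguous bytes starting at byte Offset*4, so palign shifts the head of 'second' into the tail of 'first' (the new Offset!=0 guard avoids emitting the degenerate shift). A scalar model (not part of this commit):

    // first becomes { first[Offset], ..., first[3], second[0], ..., second[Offset-1] }
    template <int Offset>
    void palign_model(float first[4], const float second[4])
    {
      float cat[8];
      for (int i = 0; i < 4; ++i) { cat[i] = first[i]; cat[4 + i] = second[i]; }
      for (int i = 0; i < 4; ++i) first[i] = cat[Offset + i];
    }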
#endif // EIGEN_PACKET_MATH_ALTIVEC_H #endif // EIGEN_PACKET_MATH_ALTIVEC_H
@ -1,2 +1,3 @@
ADD_SUBDIRECTORY(SSE) ADD_SUBDIRECTORY(SSE)
ADD_SUBDIRECTORY(AltiVec) ADD_SUBDIRECTORY(AltiVec)
ADD_SUBDIRECTORY(NEON)
@ -0,0 +1,78 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2010 Gael Guennebaud <g.gael@free.fr>
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.
/* All the parameters defined in this file can be specialized in the
* architecture specific files, and/or by the user.
* More to come... */
#ifndef EIGEN_DEFAULT_SETTINGS_H
#define EIGEN_DEFAULT_SETTINGS_H
/** Defines the maximal loop size to enable meta unrolling of loops.
* Note that the value here is expressed in Eigen's own notion of "number of FLOPs";
* it does not correspond to the number of iterations or to the number of instructions.
*/
#ifndef EIGEN_UNROLLING_LIMIT
#define EIGEN_UNROLLING_LIMIT 100
#endif
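Since the macro is wrapped in #ifndef, a user can override it before including any Eigen header; a hypothetical tuning sketch (the value 50 is an arbitrary assumption):
// User code, compiled before the first Eigen include: trade meta-unrolling
// for smaller generated code.
#define EIGEN_UNROLLING_LIMIT 50
#include <Eigen/Core>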
/** Defines the threshold between a "small" and a "large" matrix.
* This threshold is mainly used to select the proper product implementation.
*/
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
/** Defines the maximal size in Bytes of blocks fitting in CPU cache.
* The current value is set to generate blocks of 256x256 for float
*
* Typically, for a single-threaded application, you would set this to 25% of the size of your CPU caches, in bytes.
*/
#ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE
#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (sizeof(float)*256*256)
#endif
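Following the 25% guideline above, a machine with a 2 MB L2 cache would be tuned roughly as follows (a sketch; the cache size is an assumed example):
// Hypothetical user code for a CPU with a 2 MB L2 cache:
// 25% of 2 MB = 512 KB, expressed in bytes as the macro expects.
#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (512*1024)
#include <Eigen/Core>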
/** Defines the maximal width of the blocks used in the triangular product and solver
* for vectors (level 2 blas xTRMV and xTRSV). The default is 8.
*/
#ifndef EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH
#define EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH 8
#endif
/** Defines the default number of registers available for that architecture.
* Currently it must be 8 or 16. Other values will fail.
*/
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#if (defined __i386__)
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
#else
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif
#endif
#endif // EIGEN_DEFAULT_SETTINGS_H

View File

@ -0,0 +1,6 @@
FILE(GLOB Eigen_Core_arch_NEON_SRCS "*.h")
INSTALL(FILES
${Eigen_Core_arch_NEON_SRCS}
DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/NEON COMPONENT Devel
)

View File

@ -0,0 +1,378 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
// Copyright (C) 2010 Konstantinos Margaritis <markos@codex.gr>
// Heavily based on Gael's SSE version.
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.
#ifndef EIGEN_PACKET_MATH_NEON_H
#define EIGEN_PACKET_MATH_NEON_H
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
#ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE
#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (4*96*96)
#endif
// FIXME NEON has 16 quad registers, but since the current register allocator
// is so bad, it is much better to reduce it to 8
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
#endif
typedef float32x4_t Packet4f;
typedef int32x4_t Packet4i;
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
const Packet4f ei_p4f_##NAME = ei_pset1<float>(X)
#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
const Packet4f ei_p4f_##NAME = vreinterpretq_f32_u32(ei_pset1<int>(X))
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
const Packet4i ei_p4i_##NAME = ei_pset1<int>(X)
template<> struct ei_packet_traits<float> : ei_default_packet_traits
{
typedef Packet4f type; enum {size=4};
enum {
HasSin = 0,
HasCos = 0,
HasLog = 0,
HasExp = 0,
HasSqrt = 0
};
};
template<> struct ei_packet_traits<int> : ei_default_packet_traits
{ typedef Packet4i type; enum {size=4}; };
template<> struct ei_unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
template<> struct ei_unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float& from) { return vdupq_n_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pset1<int>(const int& from) { return vdupq_n_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4f ei_plset<float>(const float& a)
{
Packet4f countdown = { 3, 2, 1, 0 };
return vaddq_f32(ei_pset1(a), countdown);
}
template<> EIGEN_STRONG_INLINE Packet4i ei_plset<int>(const int& a)
{
Packet4i countdown = { 3, 2, 1, 0 };
return vaddq_s32(ei_pset1(a), countdown);
}
template<> EIGEN_STRONG_INLINE Packet4f ei_padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i ei_padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f ei_psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i ei_psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f ei_pnegate(const Packet4f& a) { return vnegq_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pnegate(const Packet4i& a) { return vnegq_s32(a); }
template<> EIGEN_STRONG_INLINE Packet4f ei_pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
{
Packet4f inv, restep, div;
// NEON does not offer a divide instruction, so we have to use a reciprocal approximation.
// However, NEON, in contrast to other SIMD engines (AltiVec/SSE), offers
// a reciprocal estimate AND a reciprocal step, which saves a few instructions.
// vrecpeq_f32() returns an estimate of 1/b, which we then fine-tune with
// Newton-Raphson and vrecpsq_f32()
inv = vrecpeq_f32(b);
// This returns a correction factor (2 - b*inv) by which we multiply inv to get
// a better approximation of 1/b.
restep = vrecpsq_f32(b, inv);
inv = vmulq_f32(restep, inv);
// Finally, multiply a by 1/b and get the wanted result of the division.
div = vmulq_f32(a, inv);
return div;
}
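Each vrecpsq_f32 refinement roughly doubles the number of correct bits in the reciprocal, so when one step is not precise enough a second one can be chained; a sketch building on the same intrinsics, not part of this commit (the function name is hypothetical):
inline float32x4_t pdiv_two_steps(const float32x4_t& a, const float32x4_t& b)
{
  float32x4_t inv = vrecpeq_f32(b);           // coarse estimate of 1/b
  inv = vmulq_f32(vrecpsq_f32(b, inv), inv);  // 1st Newton-Raphson step
  inv = vmulq_f32(vrecpsq_f32(b, inv), inv);  // 2nd step, near full single precision
  return vmulq_f32(a, inv);                   // a * (1/b)
}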
template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
{ ei_assert(false && "packet integer division is not supported by NEON");
return ei_pset1<int>(0);
}
// for some weird reasons, it has to be overloaded for packets of integers
template<> EIGEN_STRONG_INLINE Packet4i ei_pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return ei_padd(ei_pmul(a,b), c); }
template<> EIGEN_STRONG_INLINE Packet4f ei_pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f ei_pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
// Logical operations are not supported for float, so we have to do reinterpret casts using NEON intrinsics
template<> EIGEN_STRONG_INLINE Packet4f ei_pand<Packet4f>(const Packet4f& a, const Packet4f& b)
{
return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i ei_pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f ei_por<Packet4f>(const Packet4f& a, const Packet4f& b)
{
return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i ei_por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f ei_pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
{
return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i ei_pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
{
return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
}
template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }
template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
// FIXME only store the first 2 elements?
template<> EIGEN_STRONG_INLINE float ei_pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE int ei_pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE Packet4f ei_preverse(const Packet4f& a) {
float32x2_t a_lo, a_hi;
Packet4f a_r64, a_r128;
a_r64 = vrev64q_f32(a);
a_lo = vget_low_f32(a_r64);
a_hi = vget_high_f32(a_r64);
a_r128 = vcombine_f32(a_hi, a_lo);
return a_r128;
}
template<> EIGEN_STRONG_INLINE Packet4i ei_preverse(const Packet4i& a) {
int32x2_t a_lo, a_hi;
Packet4i a_r64, a_r128;
a_r64 = vrev64q_s32(a);
a_lo = vget_low_s32(a_r64);
a_hi = vget_high_s32(a_r64);
a_r128 = vcombine_s32(a_hi, a_lo);
return a_r128;
}
template<> EIGEN_STRONG_INLINE Packet4f ei_pabs(const Packet4f& a) { return vabsq_f32(a); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a) { return vabsq_s32(a); }
template<> EIGEN_STRONG_INLINE float ei_predux<Packet4f>(const Packet4f& a)
{
float32x2_t a_lo, a_hi, sum;
float s[2];
a_lo = vget_low_f32(a);
a_hi = vget_high_f32(a);
sum = vpadd_f32(a_lo, a_hi);
sum = vpadd_f32(sum, sum);
vst1_f32(s, sum);
return s[0];
}
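The two vpadd_f32 calls form a pairwise tree reduction; a scalar reference with the same association order (a sketch, the name is hypothetical):
// vpadd(a_lo, a_hi) yields {a0+a1, a2+a3}; the second vpadd sums those two lanes.
static inline float predux_ref(const float a[4])
{
  return (a[0] + a[1]) + (a[2] + a[3]);
}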
template<> EIGEN_STRONG_INLINE Packet4f ei_preduxp<Packet4f>(const Packet4f* vecs)
{
float32x4x2_t vtrn1, vtrn2, res1, res2;
Packet4f sum1, sum2, sum;
// NEON zip performs interleaving of the supplied vectors.
// We perform two interleaves in a row to acquire the transposed vectors.
vtrn1 = vzipq_f32(vecs[0], vecs[2]);
vtrn2 = vzipq_f32(vecs[1], vecs[3]);
res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]);
res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]);
// Do the addition of the resulting vectors
sum1 = vaddq_f32(res1.val[0], res1.val[1]);
sum2 = vaddq_f32(res2.val[0], res2.val[1]);
sum = vaddq_f32(sum1, sum2);
return sum;
}
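The two rounds of vzipq_f32 amount to a 4x4 transpose: row r of the zipped results collects lane r of every input, so three vector additions produce all four horizontal sums at once. A scalar reference for what the function must return (a sketch, the name is hypothetical):
// out[i] equals the sum of the four lanes of vecs[i].
static inline void preduxp_ref(const float vecs[4][4], float out[4])
{
  for (int i = 0; i < 4; ++i)
    out[i] = vecs[i][0] + vecs[i][1] + vecs[i][2] + vecs[i][3];
}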
template<> EIGEN_STRONG_INLINE int ei_predux<Packet4i>(const Packet4i& a)
{
int32x2_t a_lo, a_hi, sum;
int32_t s[2];
a_lo = vget_low_s32(a);
a_hi = vget_high_s32(a);
sum = vpadd_s32(a_lo, a_hi);
sum = vpadd_s32(sum, sum);
vst1_s32(s, sum);
return s[0];
}
template<> EIGEN_STRONG_INLINE Packet4i ei_preduxp<Packet4i>(const Packet4i* vecs)
{
int32x4x2_t vtrn1, vtrn2, res1, res2;
Packet4i sum1, sum2, sum;
// NEON zip performs interleaving of the supplied vectors.
// We perform two interleaves in a row to acquire the transposed vectors.
vtrn1 = vzipq_s32(vecs[0], vecs[2]);
vtrn2 = vzipq_s32(vecs[1], vecs[3]);
res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]);
res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]);
// Do the addition of the resulting vectors
sum1 = vaddq_s32(res1.val[0], res1.val[1]);
sum2 = vaddq_s32(res2.val[0], res2.val[1]);
sum = vaddq_s32(sum1, sum2);
return sum;
}
// Other reduction functions:
// mul
template<> EIGEN_STRONG_INLINE float ei_predux_mul<Packet4f>(const Packet4f& a)
{
float32x2_t a_lo, a_hi, prod;
float s[2];
// Get a_lo = |a1|a2| and a_hi = |a3|a4|
a_lo = vget_low_f32(a);
a_hi = vget_high_f32(a);
// Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
prod = vmul_f32(a_lo, a_hi);
// Multiply prod with its swapped value |a2*a4|a1*a3|
prod = vmul_f32(prod, vrev64_f32(prod));
vst1_f32(s, prod);
return s[0];
}
template<> EIGEN_STRONG_INLINE int ei_predux_mul<Packet4i>(const Packet4i& a)
{
int32x2_t a_lo, a_hi, prod;
int32_t s[2];
// Get a_lo = |a1|a2| and a_hi = |a3|a4|
a_lo = vget_low_s32(a);
a_hi = vget_high_s32(a);
// Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
prod = vmul_s32(a_lo, a_hi);
// Multiply prod with its swapped value |a2*a4|a1*a3|
prod = vmul_s32(prod, vrev64_s32(prod));
vst1_s32(s, prod);
return s[0];
}
// min
template<> EIGEN_STRONG_INLINE float ei_predux_min<Packet4f>(const Packet4f& a)
{
float32x2_t a_lo, a_hi, min;
float s[2];
a_lo = vget_low_f32(a);
a_hi = vget_high_f32(a);
min = vpmin_f32(a_lo, a_hi);
min = vpmin_f32(min, min);
vst1_f32(s, min);
return s[0];
}
template<> EIGEN_STRONG_INLINE int ei_predux_min<Packet4i>(const Packet4i& a)
{
int32x2_t a_lo, a_hi, min;
int32_t s[2];
a_lo = vget_low_s32(a);
a_hi = vget_high_s32(a);
min = vpmin_s32(a_lo, a_hi);
min = vpmin_s32(min, min);
vst1_s32(s, min);
return s[0];
}
// max
template<> EIGEN_STRONG_INLINE float ei_predux_max<Packet4f>(const Packet4f& a)
{
float32x2_t a_lo, a_hi, max;
float s[2];
a_lo = vget_low_f32(a);
a_hi = vget_high_f32(a);
max = vpmax_f32(a_lo, a_hi);
max = vpmax_f32(max, max);
vst1_f32(s, max);
return s[0];
}
template<> EIGEN_STRONG_INLINE int ei_predux_max<Packet4i>(const Packet4i& a)
{
int32x2_t a_lo, a_hi, max;
int32_t s[2];
a_lo = vget_low_s32(a);
a_hi = vget_high_s32(a);
max = vpmax_s32(a_lo, a_hi);
max = vpmax_s32(max, max);
vst1_s32(s, max);
return s[0];
}
template<int Offset>
struct ei_palign_impl<Offset,Packet4f>
{
EIGEN_STRONG_INLINE static void run(Packet4f& first, const Packet4f& second)
{
if (Offset!=0)
first = vextq_f32(first, second, Offset);
}
};
template<int Offset>
struct ei_palign_impl<Offset,Packet4i>
{
EIGEN_STRONG_INLINE static void run(Packet4i& first, const Packet4i& second)
{
if (Offset!=0)
first = vextq_s32(first, second, Offset);
}
};
#endif // EIGEN_PACKET_MATH_NEON_H
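vextq concatenates its two operands and extracts four consecutive lanes starting at lane Offset, which is exactly the realignment a load straddling two packets needs. A usage sketch (the helper name and the 8-float buffer are assumptions):
// Returns |x1 x2 x3 x4| built from two aligned loads of |x0..x3| and |x4..x7|.
inline Packet4f load_shifted_by_one(const float* data)
{
  Packet4f lo = ei_pload(data);              // |x0 x1 x2 x3|
  Packet4f hi = ei_pload(data + 4);          // |x4 x5 x6 x7|
  ei_palign_impl<1, Packet4f>::run(lo, hi);  // lo = vextq_f32(lo, hi, 1)
  return lo;
}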

View File

@ -122,7 +122,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pmul<Packet4f>(const Packet4f& a, con
template<> EIGEN_STRONG_INLINE Packet2d ei_pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet2d ei_pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pmul<Packet4i>(const Packet4i& a, const Packet4i& b) template<> EIGEN_STRONG_INLINE Packet4i ei_pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
{ {
#ifdef __SSE4_1__ #ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_mullo_epi32(a,b); return _mm_mullo_epi32(a,b);
#else #else
// this version is slightly faster than 4 scalar products // this version is slightly faster than 4 scalar products
@ -184,7 +184,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) {
template<> EIGEN_STRONG_INLINE Packet2d ei_pload<double>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } template<> EIGEN_STRONG_INLINE Packet2d ei_pload<double>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); } template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
#if (!defined __GNUC__) && (!defined __ICC) #if defined(_MSC_VER)
template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); } template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); }
template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<double>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<double>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); } template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
@ -194,29 +194,30 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { EIGEN_
// because of the strict aliasing rule. The "dummy" stuff are required to enforce // because of the strict aliasing rule. The "dummy" stuff are required to enforce
// a correct instruction dependency. // a correct instruction dependency.
// TODO: do the same for MSVC (ICC is compatible) // TODO: do the same for MSVC (ICC is compatible)
// NOTE: with the code below, MSVC's compiler crashes!
template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from)
{ {
EIGEN_DEBUG_UNALIGNED_LOAD EIGEN_DEBUG_UNALIGNED_LOAD
__m128 res; __m128d res;
asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from), [dummy] "m" (*(from+1)) ); res = _mm_load_sd((const double*)(from)) ;
asm volatile ("movhps %[from2], %[r]" : [r] "+x" (res) : [from2] "m" (*(from+2)), [dummy] "m" (*(from+3)) ); res = _mm_loadh_pd(res, (const double*)(from+2)) ;
return res; return _mm_castpd_ps(res);
} }
template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from) template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from)
{ {
EIGEN_DEBUG_UNALIGNED_LOAD EIGEN_DEBUG_UNALIGNED_LOAD
__m128d res; __m128d res;
asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from) ); res = _mm_load_sd(from) ;
asm volatile ("movhpd %[from1], %[r]" : [r] "+x" (res) : [from1] "m" (*(from+1)) ); res = _mm_loadh_pd(res,from+1);
return res; return res;
} }
template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from)
{ {
EIGEN_DEBUG_UNALIGNED_LOAD EIGEN_DEBUG_UNALIGNED_LOAD
__m128i res; __m128d res;
asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from), [dummy] "m" (*(from+1)) ); res = _mm_load_sd((const double*)(from)) ;
asm volatile ("movhps %[from2], %[r]" : [r] "+x" (res) : [from2] "m" (*(from+2)), [dummy] "m" (*(from+3)) ); res = _mm_loadh_pd(res, (const double*)(from+2)) ;
return res; return _mm_castpd_si128(res);
} }
#endif #endif
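The rewritten gcc/icc ei_ploadu variants express the movsd + movhps pair through intrinsics instead of inline asm, keeping the strict-aliasing workaround described above; per the new note, MSVC stays on the plain _mm_loadu_ps path because this pattern crashes its compiler. The idea in isolation (a sketch, the name is hypothetical):
// Unaligned 16-byte load assembled from two 8-byte halves (movsd + movhpd).
static inline __m128 loadu_ps_sketch(const float* from)
{
  __m128d lo   = _mm_load_sd(reinterpret_cast<const double*>(from));          // bytes 0..7
  __m128d both = _mm_loadh_pd(lo, reinterpret_cast<const double*>(from + 2)); // bytes 8..15
  return _mm_castpd_ps(both);
}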
@ -269,7 +270,7 @@ template<> EIGEN_STRONG_INLINE Packet2d ei_pabs(const Packet2d& a)
} }
template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a) template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a)
{ {
#ifdef __SSSE3__ #ifdef EIGEN_VECTORIZE_SSSE3
return _mm_abs_epi32(a); return _mm_abs_epi32(a);
#else #else
Packet4i aux = _mm_srai_epi32(a,31); Packet4i aux = _mm_srai_epi32(a,31);
@ -277,8 +278,15 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a)
#endif #endif
} }
EIGEN_STRONG_INLINE void ei_punpackp(Packet4f* vecs)
{
vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
}
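ei_punpackp broadcasts each lane of vecs[0] into its own packet; the _mm_shuffle_epi32 masks 0x00, 0x55, 0xAA and 0xFF replicate lanes 0 to 3 respectively. Lane-level sketch, assuming vecs[0] = |x0 x1 x2 x3| on entry:
// vecs[1] = |x1 x1 x1 x1|   (mask 0x55 selects lane 1 four times)
// vecs[2] = |x2 x2 x2 x2|   (mask 0xAA selects lane 2)
// vecs[3] = |x3 x3 x3 x3|   (mask 0xFF selects lane 3)
// vecs[0] = |x0 x0 x0 x0|   (mask 0x00, written last so the source survives)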
#ifdef __SSE3__ #ifdef EIGEN_VECTORIZE_SSE3
// TODO implement SSE2 versions as well as integer versions // TODO implement SSE2 versions as well as integer versions
template<> EIGEN_STRONG_INLINE Packet4f ei_preduxp<Packet4f>(const Packet4f* vecs) template<> EIGEN_STRONG_INLINE Packet4f ei_preduxp<Packet4f>(const Packet4f* vecs)
{ {
@ -439,7 +447,7 @@ template<> EIGEN_STRONG_INLINE int ei_predux_max<Packet4i>(const Packet4i& a)
// } // }
#endif #endif
#ifdef __SSSE3__ #ifdef EIGEN_VECTORIZE_SSSE3
// SSSE3 versions // SSSE3 versions
template<int Offset> template<int Offset>
struct ei_palign_impl<Offset,Packet4f> struct ei_palign_impl<Offset,Packet4f>

View File

@ -305,10 +305,7 @@ struct ei_product_coeff_vectorized_dyn_selector
{ {
EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
{ {
res = ei_dot_impl< res = lhs.row(row).cwiseProduct(rhs.col(col)).sum();
Block<Lhs, 1, ei_traits<Lhs>::ColsAtCompileTime>,
Block<Rhs, ei_traits<Rhs>::RowsAtCompileTime, 1>,
LinearVectorizedTraversal, NoUnrolling>::run(lhs.row(row), rhs.col(col));
} }
}; };
@ -319,10 +316,7 @@ struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
{ {
EIGEN_STRONG_INLINE static void run(int /*row*/, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) EIGEN_STRONG_INLINE static void run(int /*row*/, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
{ {
res = ei_dot_impl< res = lhs.cwiseProduct(rhs.col(col)).sum();
Lhs,
Block<Rhs, ei_traits<Rhs>::RowsAtCompileTime, 1>,
LinearVectorizedTraversal, NoUnrolling>::run(lhs, rhs.col(col));
} }
}; };
@ -331,10 +325,7 @@ struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
{ {
EIGEN_STRONG_INLINE static void run(int row, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) EIGEN_STRONG_INLINE static void run(int row, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
{ {
res = ei_dot_impl< res = lhs.row(row).cwiseProduct(rhs).sum();
Block<Lhs, 1, ei_traits<Lhs>::ColsAtCompileTime>,
Rhs,
LinearVectorizedTraversal, NoUnrolling>::run(lhs.row(row), rhs);
} }
}; };
@ -343,10 +334,7 @@ struct ei_product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
{ {
EIGEN_STRONG_INLINE static void run(int /*row*/, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) EIGEN_STRONG_INLINE static void run(int /*row*/, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
{ {
res = ei_dot_impl< res = lhs.cwiseProduct(rhs).sum();
Lhs,
Rhs,
LinearVectorizedTraversal, NoUnrolling>::run(lhs, rhs);
} }
}; };
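All four specializations now delegate to the same expression-template dot-product pattern instead of instantiating ei_dot_impl by hand. A standalone equivalent of the general case through the public API (a sketch; the transpose aligns the row/column shapes, and the function name is hypothetical):
#include <Eigen/Core>

// Same value as (lhs * rhs)(row, col), computed as one coefficient-wise product and sum.
float product_coeff(const Eigen::MatrixXf& lhs, const Eigen::MatrixXf& rhs, int row, int col)
{
  return lhs.row(row).cwiseProduct(rhs.col(col).transpose()).sum();
}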

View File

@ -30,14 +30,16 @@
#ifdef EIGEN_HAS_FUSE_CJMADD #ifdef EIGEN_HAS_FUSE_CJMADD
#define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C); #define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C);
#else #else
#define CJMADD(A,B,C,T) T = A; T = cj.pmul(T,B); C = ei_padd(C,T); #define CJMADD(A,B,C,T) T = B; T = cj.pmul(A,T); C = ei_padd(C,T);
// #define CJMADD(A,B,C,T) T = A; T = cj.pmul(T,B); C = ei_padd(C,T);
#endif #endif
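The reordered macro deliberately clobbers its scratch argument: the kernel below passes the B register it has just consumed (e.g. CJMADD(A1,B0,C4,B0)), so no dedicated temporary stays live. Expansion sketch for a non-conjugating cj:
// CJMADD(A1, B0, C4, B0) expands to (the initial T = B self-assignment is a no-op here):
//   B0 = cj.pmul(A1, B0);   // the product overwrites the now-dead rhs packet
//   C4 = ei_padd(C4, B0);   // accumulate
// which frees one register per accumulator on register-starved x86 (8/16 xmm registers).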
// optimized GEneral packed Block * packed Panel product kernel // optimized GEneral packed Block * packed Panel product kernel
template<typename Scalar, int mr, int nr, typename Conj> template<typename Scalar, int mr, int nr, typename Conj>
struct ei_gebp_kernel struct ei_gebp_kernel
{ {
void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols, int strideA=-1, int strideB=-1, int offsetA=0, int offsetB=0) void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols,
int strideA=-1, int strideB=-1, int offsetA=0, int offsetB=0, Scalar* unpackedB = 0)
{ {
typedef typename ei_packet_traits<Scalar>::type PacketType; typedef typename ei_packet_traits<Scalar>::type PacketType;
enum { PacketSize = ei_packet_traits<Scalar>::size }; enum { PacketSize = ei_packet_traits<Scalar>::size };
@ -48,9 +50,67 @@ struct ei_gebp_kernel
const int peeled_mc = (rows/mr)*mr; const int peeled_mc = (rows/mr)*mr;
const int peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0); const int peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0);
const int peeled_kc = (depth/4)*4; const int peeled_kc = (depth/4)*4;
if(unpackedB==0)
unpackedB = const_cast<Scalar*>(blockB - strideB * nr * PacketSize);
// loops on each micro vertical panel of rhs (depth x nr) // loops on each micro vertical panel of rhs (depth x nr)
for(int j2=0; j2<packet_cols; j2+=nr) for(int j2=0; j2<packet_cols; j2+=nr)
{ {
// unpack B
{
const Scalar* blB = &blockB[j2*strideB+offsetB*nr];
int n = depth*nr;
for(int k=0; k<n; k++)
ei_pstore(&unpackedB[k*PacketSize], ei_pset1(blB[k]));
/*Scalar* dest = unpackedB;
for(int k=0; k<n; k+=4*PacketSize)
{
#ifdef EIGEN_VECTORIZE_SSE
const int S = 128;
const int G = 16;
_mm_prefetch((const char*)(&blB[S/2+0]), _MM_HINT_T0);
_mm_prefetch((const char*)(&dest[S+0*G]), _MM_HINT_T0);
_mm_prefetch((const char*)(&dest[S+1*G]), _MM_HINT_T0);
_mm_prefetch((const char*)(&dest[S+2*G]), _MM_HINT_T0);
_mm_prefetch((const char*)(&dest[S+3*G]), _MM_HINT_T0);
#endif
PacketType C0[PacketSize], C1[PacketSize], C2[PacketSize], C3[PacketSize];
C0[0] = ei_pload(blB+0*PacketSize);
C1[0] = ei_pload(blB+1*PacketSize);
C2[0] = ei_pload(blB+2*PacketSize);
C3[0] = ei_pload(blB+3*PacketSize);
ei_punpackp(C0);
ei_punpackp(C1);
ei_punpackp(C2);
ei_punpackp(C3);
ei_pstore(dest+ 0*PacketSize, C0[0]);
ei_pstore(dest+ 1*PacketSize, C0[1]);
ei_pstore(dest+ 2*PacketSize, C0[2]);
ei_pstore(dest+ 3*PacketSize, C0[3]);
ei_pstore(dest+ 4*PacketSize, C1[0]);
ei_pstore(dest+ 5*PacketSize, C1[1]);
ei_pstore(dest+ 6*PacketSize, C1[2]);
ei_pstore(dest+ 7*PacketSize, C1[3]);
ei_pstore(dest+ 8*PacketSize, C2[0]);
ei_pstore(dest+ 9*PacketSize, C2[1]);
ei_pstore(dest+10*PacketSize, C2[2]);
ei_pstore(dest+11*PacketSize, C2[3]);
ei_pstore(dest+12*PacketSize, C3[0]);
ei_pstore(dest+13*PacketSize, C3[1]);
ei_pstore(dest+14*PacketSize, C3[2]);
ei_pstore(dest+15*PacketSize, C3[3]);
blB += 4*PacketSize;
dest += 16*PacketSize;
}*/
}
// loops on each micro horizontal panel of lhs (mr x depth) // loops on each micro horizontal panel of lhs (mr x depth)
// => we select a mr x nr micro block of res which is entirely // => we select a mr x nr micro block of res which is entirely
// stored into mr/packet_size x nr registers. // stored into mr/packet_size x nr registers.
@ -65,19 +125,31 @@ struct ei_gebp_kernel
// gets res block as register // gets res block as register
PacketType C0, C1, C2, C3, C4, C5, C6, C7; PacketType C0, C1, C2, C3, C4, C5, C6, C7;
C0 = ei_ploadu(&res[(j2+0)*resStride + i]); C0 = ei_pset1(Scalar(0));
C1 = ei_ploadu(&res[(j2+1)*resStride + i]); C1 = ei_pset1(Scalar(0));
if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]); if(nr==4) C2 = ei_pset1(Scalar(0));
if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]); if(nr==4) C3 = ei_pset1(Scalar(0));
C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]); C4 = ei_pset1(Scalar(0));
C5 = ei_ploadu(&res[(j2+1)*resStride + i + PacketSize]); C5 = ei_pset1(Scalar(0));
if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i + PacketSize]); if(nr==4) C6 = ei_pset1(Scalar(0));
if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i + PacketSize]); if(nr==4) C7 = ei_pset1(Scalar(0));
Scalar* r0 = &res[(j2+0)*resStride + i];
Scalar* r1 = r0 + resStride;
Scalar* r2 = r1 + resStride;
Scalar* r3 = r2 + resStride;
#ifdef EIGEN_VECTORIZE_SSE
_mm_prefetch((const char*)(r0+16), _MM_HINT_T0);
_mm_prefetch((const char*)(r1+16), _MM_HINT_T0);
_mm_prefetch((const char*)(r2+16), _MM_HINT_T0);
_mm_prefetch((const char*)(r3+16), _MM_HINT_T0);
#endif
// performs "inner" product // performs "inner" product
// TODO let's check wether the flowing peeled loop could not be // TODO let's check whether the following peeled loop could not be
// optimized via optimal prefetching from one loop to the other // optimized via optimal prefetching from one loop to the other
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; const Scalar* blB = unpackedB;
for(int k=0; k<peeled_kc; k+=4) for(int k=0; k<peeled_kc; k+=4)
{ {
if(nr==2) if(nr==2)
@ -88,43 +160,42 @@ struct ei_gebp_kernel
A1 = ei_pload(&blA[1*PacketSize]); A1 = ei_pload(&blA[1*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]);
CJMADD(A0,B0,C0,T0); CJMADD(A0,B0,C0,T0);
CJMADD(A1,B0,C4,T0); CJMADD(A1,B0,C4,B0);
B0 = ei_pload(&blB[1*PacketSize]); B0 = ei_pload(&blB[1*PacketSize]);
CJMADD(A0,B0,C1,T0); CJMADD(A0,B0,C1,T0);
CJMADD(A1,B0,C5,T0); CJMADD(A1,B0,C5,B0);
A0 = ei_pload(&blA[2*PacketSize]); A0 = ei_pload(&blA[2*PacketSize]);
A1 = ei_pload(&blA[3*PacketSize]); A1 = ei_pload(&blA[3*PacketSize]);
B0 = ei_pload(&blB[2*PacketSize]); B0 = ei_pload(&blB[2*PacketSize]);
CJMADD(A0,B0,C0,T0); CJMADD(A0,B0,C0,T0);
CJMADD(A1,B0,C4,T0); CJMADD(A1,B0,C4,B0);
B0 = ei_pload(&blB[3*PacketSize]); B0 = ei_pload(&blB[3*PacketSize]);
CJMADD(A0,B0,C1,T0); CJMADD(A0,B0,C1,T0);
CJMADD(A1,B0,C5,T0); CJMADD(A1,B0,C5,B0);
A0 = ei_pload(&blA[4*PacketSize]); A0 = ei_pload(&blA[4*PacketSize]);
A1 = ei_pload(&blA[5*PacketSize]); A1 = ei_pload(&blA[5*PacketSize]);
B0 = ei_pload(&blB[4*PacketSize]); B0 = ei_pload(&blB[4*PacketSize]);
CJMADD(A0,B0,C0,T0); CJMADD(A0,B0,C0,T0);
CJMADD(A1,B0,C4,T0); CJMADD(A1,B0,C4,B0);
B0 = ei_pload(&blB[5*PacketSize]); B0 = ei_pload(&blB[5*PacketSize]);
CJMADD(A0,B0,C1,T0); CJMADD(A0,B0,C1,T0);
CJMADD(A1,B0,C5,T0); CJMADD(A1,B0,C5,B0);
A0 = ei_pload(&blA[6*PacketSize]); A0 = ei_pload(&blA[6*PacketSize]);
A1 = ei_pload(&blA[7*PacketSize]); A1 = ei_pload(&blA[7*PacketSize]);
B0 = ei_pload(&blB[6*PacketSize]); B0 = ei_pload(&blB[6*PacketSize]);
CJMADD(A0,B0,C0,T0); CJMADD(A0,B0,C0,T0);
CJMADD(A1,B0,C4,T0); CJMADD(A1,B0,C4,B0);
B0 = ei_pload(&blB[7*PacketSize]); B0 = ei_pload(&blB[7*PacketSize]);
CJMADD(A0,B0,C1,T0); CJMADD(A0,B0,C1,T0);
CJMADD(A1,B0,C5,T0); CJMADD(A1,B0,C5,B0);
} }
else else
{ {
PacketType B0, B1, B2, B3, A0, A1; PacketType B0, B1, B2, B3, A0, A1;
PacketType T0, T1; PacketType T0;
A0 = ei_pload(&blA[0*PacketSize]); A0 = ei_pload(&blA[0*PacketSize]);
A1 = ei_pload(&blA[1*PacketSize]); A1 = ei_pload(&blA[1*PacketSize]);
@ -132,58 +203,58 @@ struct ei_gebp_kernel
B1 = ei_pload(&blB[1*PacketSize]); B1 = ei_pload(&blB[1*PacketSize]);
CJMADD(A0,B0,C0,T0); CJMADD(A0,B0,C0,T0);
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); B2 = ei_pload(&blB[2*PacketSize]);
CJMADD(A1,B0,C4,T1); CJMADD(A1,B0,C4,B0);
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); B3 = ei_pload(&blB[3*PacketSize]);
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]); B0 = ei_pload(&blB[4*PacketSize]);
CJMADD(A0,B1,C1,T0); CJMADD(A0,B1,C1,T0);
CJMADD(A1,B1,C5,T1); CJMADD(A1,B1,C5,B1);
B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]); B1 = ei_pload(&blB[5*PacketSize]);
if(nr==4) { CJMADD(A0,B2,C2,T0); } CJMADD(A0,B2,C2,T0);
if(nr==4) { CJMADD(A1,B2,C6,T1); } CJMADD(A1,B2,C6,B2);
if(nr==4) B2 = ei_pload(&blB[6*PacketSize]); B2 = ei_pload(&blB[6*PacketSize]);
if(nr==4) { CJMADD(A0,B3,C3,T0); } CJMADD(A0,B3,C3,T0);
A0 = ei_pload(&blA[2*PacketSize]); A0 = ei_pload(&blA[2*PacketSize]);
if(nr==4) { CJMADD(A1,B3,C7,T1); } CJMADD(A1,B3,C7,B3);
A1 = ei_pload(&blA[3*PacketSize]); A1 = ei_pload(&blA[3*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[7*PacketSize]); B3 = ei_pload(&blB[7*PacketSize]);
CJMADD(A0,B0,C0,T0); CJMADD(A0,B0,C0,T0);
CJMADD(A1,B0,C4,T1); CJMADD(A1,B0,C4,B0);
B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]); B0 = ei_pload(&blB[8*PacketSize]);
CJMADD(A0,B1,C1,T0); CJMADD(A0,B1,C1,T0);
CJMADD(A1,B1,C5,T1); CJMADD(A1,B1,C5,B1);
B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]); B1 = ei_pload(&blB[9*PacketSize]);
if(nr==4) { CJMADD(A0,B2,C2,T0); } CJMADD(A0,B2,C2,T0);
if(nr==4) { CJMADD(A1,B2,C6,T1); } CJMADD(A1,B2,C6,B2);
if(nr==4) B2 = ei_pload(&blB[10*PacketSize]); B2 = ei_pload(&blB[10*PacketSize]);
if(nr==4) { CJMADD(A0,B3,C3,T0); } CJMADD(A0,B3,C3,T0);
A0 = ei_pload(&blA[4*PacketSize]); A0 = ei_pload(&blA[4*PacketSize]);
if(nr==4) { CJMADD(A1,B3,C7,T1); } CJMADD(A1,B3,C7,B3);
A1 = ei_pload(&blA[5*PacketSize]); A1 = ei_pload(&blA[5*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[11*PacketSize]); B3 = ei_pload(&blB[11*PacketSize]);
CJMADD(A0,B0,C0,T0); CJMADD(A0,B0,C0,T0);
CJMADD(A1,B0,C4,T1); CJMADD(A1,B0,C4,B0);
B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]); B0 = ei_pload(&blB[12*PacketSize]);
CJMADD(A0,B1,C1,T0); CJMADD(A0,B1,C1,T0);
CJMADD(A1,B1,C5,T1); CJMADD(A1,B1,C5,B1);
B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]); B1 = ei_pload(&blB[13*PacketSize]);
if(nr==4) { CJMADD(A0,B2,C2,T0); } CJMADD(A0,B2,C2,T0);
if(nr==4) { CJMADD(A1,B2,C6,T1); } CJMADD(A1,B2,C6,B2);
if(nr==4) B2 = ei_pload(&blB[14*PacketSize]); B2 = ei_pload(&blB[14*PacketSize]);
if(nr==4) { CJMADD(A0,B3,C3,T0); } CJMADD(A0,B3,C3,T0);
A0 = ei_pload(&blA[6*PacketSize]); A0 = ei_pload(&blA[6*PacketSize]);
if(nr==4) { CJMADD(A1,B3,C7,T1); } CJMADD(A1,B3,C7,B3);
A1 = ei_pload(&blA[7*PacketSize]); A1 = ei_pload(&blA[7*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[15*PacketSize]); B3 = ei_pload(&blB[15*PacketSize]);
CJMADD(A0,B0,C0,T0); CJMADD(A0,B0,C0,T0);
CJMADD(A1,B0,C4,T1); CJMADD(A1,B0,C4,B0);
CJMADD(A0,B1,C1,T0); CJMADD(A0,B1,C1,T0);
CJMADD(A1,B1,C5,T1); CJMADD(A1,B1,C5,B1);
if(nr==4) { CJMADD(A0,B2,C2,T0); } CJMADD(A0,B2,C2,T0);
if(nr==4) { CJMADD(A1,B2,C6,T1); } CJMADD(A1,B2,C6,B2);
if(nr==4) { CJMADD(A0,B3,C3,T0); } CJMADD(A0,B3,C3,T0);
if(nr==4) { CJMADD(A1,B3,C7,T1); } CJMADD(A1,B3,C7,B3);
} }
blB += 4*nr*PacketSize; blB += 4*nr*PacketSize;
@ -200,14 +271,14 @@ struct ei_gebp_kernel
A1 = ei_pload(&blA[1*PacketSize]); A1 = ei_pload(&blA[1*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]);
CJMADD(A0,B0,C0,T0); CJMADD(A0,B0,C0,T0);
CJMADD(A1,B0,C4,T0); CJMADD(A1,B0,C4,B0);
B0 = ei_pload(&blB[1*PacketSize]); B0 = ei_pload(&blB[1*PacketSize]);
CJMADD(A0,B0,C1,T0); CJMADD(A0,B0,C1,T0);
CJMADD(A1,B0,C5,T0); CJMADD(A1,B0,C5,B0);
} }
else else
{ {
PacketType B0, B1, B2, B3, A0, A1, T0, T1; PacketType B0, B1, B2, B3, A0, A1, T0;
A0 = ei_pload(&blA[0*PacketSize]); A0 = ei_pload(&blA[0*PacketSize]);
A1 = ei_pload(&blA[1*PacketSize]); A1 = ei_pload(&blA[1*PacketSize]);
@ -215,30 +286,49 @@ struct ei_gebp_kernel
B1 = ei_pload(&blB[1*PacketSize]); B1 = ei_pload(&blB[1*PacketSize]);
CJMADD(A0,B0,C0,T0); CJMADD(A0,B0,C0,T0);
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); B2 = ei_pload(&blB[2*PacketSize]);
CJMADD(A1,B0,C4,T1); CJMADD(A1,B0,C4,B0);
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); B3 = ei_pload(&blB[3*PacketSize]);
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]);
CJMADD(A0,B1,C1,T0); CJMADD(A0,B1,C1,T0);
CJMADD(A1,B1,C5,T1); CJMADD(A1,B1,C5,B1);
if(nr==4) { CJMADD(A0,B2,C2,T0); } CJMADD(A0,B2,C2,T0);
if(nr==4) { CJMADD(A1,B2,C6,T1); } CJMADD(A1,B2,C6,B2);
if(nr==4) { CJMADD(A0,B3,C3,T0); } CJMADD(A0,B3,C3,T0);
if(nr==4) { CJMADD(A1,B3,C7,T1); } CJMADD(A1,B3,C7,B3);
} }
blB += nr*PacketSize; blB += nr*PacketSize;
blA += mr; blA += mr;
} }
ei_pstoreu(&res[(j2+0)*resStride + i], C0); PacketType R0, R1, R2, R3, R4, R5, R6, R7;
ei_pstoreu(&res[(j2+1)*resStride + i], C1);
if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2); R0 = ei_ploadu(r0);
if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3); R1 = ei_ploadu(r1);
ei_pstoreu(&res[(j2+0)*resStride + i + PacketSize], C4); if(nr==4) R2 = ei_ploadu(r2);
ei_pstoreu(&res[(j2+1)*resStride + i + PacketSize], C5); if(nr==4) R3 = ei_ploadu(r3);
if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i + PacketSize], C6); R4 = ei_ploadu(r0 + PacketSize);
if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i + PacketSize], C7); R5 = ei_ploadu(r1 + PacketSize);
if(nr==4) R6 = ei_ploadu(r2 + PacketSize);
if(nr==4) R7 = ei_ploadu(r3 + PacketSize);
C0 = ei_padd(R0, C0);
C1 = ei_padd(R1, C1);
if(nr==4) C2 = ei_padd(R2, C2);
if(nr==4) C3 = ei_padd(R3, C3);
C4 = ei_padd(R4, C4);
C5 = ei_padd(R5, C5);
if(nr==4) C6 = ei_padd(R6, C6);
if(nr==4) C7 = ei_padd(R7, C7);
ei_pstoreu(r0, C0);
ei_pstoreu(r1, C1);
if(nr==4) ei_pstoreu(r2, C2);
if(nr==4) ei_pstoreu(r3, C3);
ei_pstoreu(r0 + PacketSize, C4);
ei_pstoreu(r1 + PacketSize, C5);
if(nr==4) ei_pstoreu(r2 + PacketSize, C6);
if(nr==4) ei_pstoreu(r3 + PacketSize, C7);
} }
if(rows-peeled_mc>=PacketSize) if(rows-peeled_mc>=PacketSize)
{ {
@ -256,81 +346,77 @@ struct ei_gebp_kernel
if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]); if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]);
// performs "inner" product // performs "inner" product
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; const Scalar* blB = unpackedB;
for(int k=0; k<peeled_kc; k+=4) for(int k=0; k<peeled_kc; k+=4)
{ {
if(nr==2) if(nr==2)
{ {
PacketType B0, T0, A0; PacketType B0, B1, A0;
A0 = ei_pload(&blA[0*PacketSize]); A0 = ei_pload(&blA[0*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]);
CJMADD(A0,B0,C0,T0); B1 = ei_pload(&blB[1*PacketSize]);
B0 = ei_pload(&blB[1*PacketSize]); CJMADD(A0,B0,C0,B0);
CJMADD(A0,B0,C1,T0);
A0 = ei_pload(&blA[1*PacketSize]);
B0 = ei_pload(&blB[2*PacketSize]); B0 = ei_pload(&blB[2*PacketSize]);
CJMADD(A0,B0,C0,T0); CJMADD(A0,B1,C1,B1);
B0 = ei_pload(&blB[3*PacketSize]); A0 = ei_pload(&blA[1*PacketSize]);
CJMADD(A0,B0,C1,T0); B1 = ei_pload(&blB[3*PacketSize]);
CJMADD(A0,B0,C0,B0);
A0 = ei_pload(&blA[2*PacketSize]);
B0 = ei_pload(&blB[4*PacketSize]); B0 = ei_pload(&blB[4*PacketSize]);
CJMADD(A0,B0,C0,T0); CJMADD(A0,B1,C1,B1);
B0 = ei_pload(&blB[5*PacketSize]); A0 = ei_pload(&blA[2*PacketSize]);
CJMADD(A0,B0,C1,T0); B1 = ei_pload(&blB[5*PacketSize]);
CJMADD(A0,B0,C0,B0);
A0 = ei_pload(&blA[3*PacketSize]);
B0 = ei_pload(&blB[6*PacketSize]); B0 = ei_pload(&blB[6*PacketSize]);
CJMADD(A0,B0,C0,T0); CJMADD(A0,B1,C1,B1);
B0 = ei_pload(&blB[7*PacketSize]); A0 = ei_pload(&blA[3*PacketSize]);
CJMADD(A0,B0,C1,T0); B1 = ei_pload(&blB[7*PacketSize]);
CJMADD(A0,B0,C0,B0);
CJMADD(A0,B1,C1,B1);
} }
else else
{ {
PacketType B0, B1, B2, B3, A0; PacketType B0, B1, B2, B3, A0;
PacketType T0, T1;
A0 = ei_pload(&blA[0*PacketSize]); A0 = ei_pload(&blA[0*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]);
B1 = ei_pload(&blB[1*PacketSize]); B1 = ei_pload(&blB[1*PacketSize]);
CJMADD(A0,B0,C0,T0); CJMADD(A0,B0,C0,B0);
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); B2 = ei_pload(&blB[2*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); B3 = ei_pload(&blB[3*PacketSize]);
B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]); B0 = ei_pload(&blB[4*PacketSize]);
CJMADD(A0,B1,C1,T1); CJMADD(A0,B1,C1,B1);
B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]); B1 = ei_pload(&blB[5*PacketSize]);
if(nr==4) { CJMADD(A0,B2,C2,T0); } CJMADD(A0,B2,C2,B2);
if(nr==4) B2 = ei_pload(&blB[6*PacketSize]); B2 = ei_pload(&blB[6*PacketSize]);
if(nr==4) { CJMADD(A0,B3,C3,T1); } CJMADD(A0,B3,C3,B3);
A0 = ei_pload(&blA[1*PacketSize]); A0 = ei_pload(&blA[1*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[7*PacketSize]); B3 = ei_pload(&blB[7*PacketSize]);
CJMADD(A0,B0,C0,T0); CJMADD(A0,B0,C0,B0);
B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]); B0 = ei_pload(&blB[8*PacketSize]);
CJMADD(A0,B1,C1,T1); CJMADD(A0,B1,C1,B1);
B1 = ei_pload(&blB[(nr==4 ? 9 : 5)*PacketSize]); B1 = ei_pload(&blB[9*PacketSize]);
if(nr==4) { CJMADD(A0,B2,C2,T0); } CJMADD(A0,B2,C2,B2);
if(nr==4) B2 = ei_pload(&blB[10*PacketSize]); B2 = ei_pload(&blB[10*PacketSize]);
if(nr==4) { CJMADD(A0,B3,C3,T1); } CJMADD(A0,B3,C3,B3);
A0 = ei_pload(&blA[2*PacketSize]); A0 = ei_pload(&blA[2*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[11*PacketSize]); B3 = ei_pload(&blB[11*PacketSize]);
CJMADD(A0,B0,C0,B0);
B0 = ei_pload(&blB[12*PacketSize]);
CJMADD(A0,B1,C1,B1);
B1 = ei_pload(&blB[13*PacketSize]);
CJMADD(A0,B2,C2,B2);
B2 = ei_pload(&blB[14*PacketSize]);
CJMADD(A0,B3,C3,B3);
CJMADD(A0,B0,C0,T0);
B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]);
CJMADD(A0,B1,C1,T1);
B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]);
if(nr==4) { CJMADD(A0,B2,C2,T0); }
if(nr==4) B2 = ei_pload(&blB[14*PacketSize]);
if(nr==4) { CJMADD(A0,B3,C3,T1); }
A0 = ei_pload(&blA[3*PacketSize]); A0 = ei_pload(&blA[3*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[15*PacketSize]); B3 = ei_pload(&blB[15*PacketSize]);
CJMADD(A0,B0,C0,T0); CJMADD(A0,B0,C0,B0);
CJMADD(A0,B1,C1,T1); CJMADD(A0,B1,C1,B1);
if(nr==4) { CJMADD(A0,B2,C2,T0); } CJMADD(A0,B2,C2,B2);
if(nr==4) { CJMADD(A0,B3,C3,T1); } CJMADD(A0,B3,C3,B3);
} }
blB += 4*nr*PacketSize; blB += 4*nr*PacketSize;
@ -357,13 +443,13 @@ struct ei_gebp_kernel
A0 = ei_pload(&blA[0*PacketSize]); A0 = ei_pload(&blA[0*PacketSize]);
B0 = ei_pload(&blB[0*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]);
B1 = ei_pload(&blB[1*PacketSize]); B1 = ei_pload(&blB[1*PacketSize]);
if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); B2 = ei_pload(&blB[2*PacketSize]);
if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); B3 = ei_pload(&blB[3*PacketSize]);
CJMADD(A0,B0,C0,T0); CJMADD(A0,B0,C0,T0);
CJMADD(A0,B1,C1,T1); CJMADD(A0,B1,C1,T1);
if(nr==4) { CJMADD(A0,B2,C2,T0); } CJMADD(A0,B2,C2,T0);
if(nr==4) { CJMADD(A0,B3,C3,T1); } CJMADD(A0,B3,C3,T1);
} }
blB += nr*PacketSize; blB += nr*PacketSize;
@ -384,14 +470,15 @@ struct ei_gebp_kernel
// gets a 1 x nr res block as registers // gets a 1 x nr res block as registers
Scalar C0(0), C1(0), C2(0), C3(0); Scalar C0(0), C1(0), C2(0), C3(0);
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; // TODO directly use blockB ???
const Scalar* blB = unpackedB;//&blockB[j2*strideB+offsetB*nr];
for(int k=0; k<depth; k++) for(int k=0; k<depth; k++)
{ {
if(nr==2) if(nr==2)
{ {
Scalar B0, T0, A0; Scalar B0, T0, A0;
A0 = blA[0*PacketSize]; A0 = blA[k];
B0 = blB[0*PacketSize]; B0 = blB[0*PacketSize];
CJMADD(A0,B0,C0,T0); CJMADD(A0,B0,C0,T0);
B0 = blB[1*PacketSize]; B0 = blB[1*PacketSize];
@ -405,13 +492,13 @@ struct ei_gebp_kernel
A0 = blA[k]; A0 = blA[k];
B0 = blB[0*PacketSize]; B0 = blB[0*PacketSize];
B1 = blB[1*PacketSize]; B1 = blB[1*PacketSize];
if(nr==4) B2 = blB[2*PacketSize]; B2 = blB[2*PacketSize];
if(nr==4) B3 = blB[3*PacketSize]; B3 = blB[3*PacketSize];
CJMADD(A0,B0,C0,T0); CJMADD(A0,B0,C0,T0);
CJMADD(A0,B1,C1,T1); CJMADD(A0,B1,C1,T1);
if(nr==4) { CJMADD(A0,B2,C2,T0); } CJMADD(A0,B2,C2,T0);
if(nr==4) { CJMADD(A0,B3,C3,T1); } CJMADD(A0,B3,C3,T1);
} }
blB += nr*PacketSize; blB += nr*PacketSize;
@ -427,6 +514,13 @@ struct ei_gebp_kernel
// => do the same but with nr==1 // => do the same but with nr==1
for(int j2=packet_cols; j2<cols; j2++) for(int j2=packet_cols; j2<cols; j2++)
{ {
// unpack B
{
const Scalar* blB = &blockB[j2*strideB+offsetB];
for(int k=0; k<depth; k++)
ei_pstore(&unpackedB[k*PacketSize], ei_pset1(blB[k]));
}
for(int i=0; i<peeled_mc; i+=mr) for(int i=0; i<peeled_mc; i+=mr)
{ {
const Scalar* blA = &blockA[i*strideA+offsetA*mr]; const Scalar* blA = &blockA[i*strideA+offsetA*mr];
@ -436,12 +530,12 @@ struct ei_gebp_kernel
// TODO move the res loads to the stores // TODO move the res loads to the stores
// gets res block as register // get res block as registers
PacketType C0, C4; PacketType C0, C4;
C0 = ei_ploadu(&res[(j2+0)*resStride + i]); C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]); C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]);
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB]; const Scalar* blB = unpackedB;
for(int k=0; k<depth; k++) for(int k=0; k<depth; k++)
{ {
PacketType B0, A0, A1, T0, T1; PacketType B0, A0, A1, T0, T1;
@ -469,7 +563,7 @@ struct ei_gebp_kernel
PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]); PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]);
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB]; const Scalar* blB = unpackedB;
for(int k=0; k<depth; k++) for(int k=0; k<depth; k++)
{ {
C0 = cj.pmadd(ei_pload(blA), ei_pload(blB), C0); C0 = cj.pmadd(ei_pload(blA), ei_pload(blB), C0);
@ -488,7 +582,8 @@ struct ei_gebp_kernel
// gets a 1 x 1 res block as registers // gets a 1 x 1 res block as registers
Scalar C0(0); Scalar C0(0);
const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB]; // FIXME directly use blockB ??
const Scalar* blB = unpackedB;
for(int k=0; k<depth; k++) for(int k=0; k<depth; k++)
C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0); C0 = cj.pmadd(blA[k], blB[k*PacketSize], C0);
res[(j2+0)*resStride + i] += C0; res[(j2+0)*resStride + i] += C0;
@ -552,9 +647,9 @@ struct ei_gemm_pack_lhs
} }
}; };
// copy a complete panel of the rhs while expending each coefficient into a packet form // copy a complete panel of the rhs
// this version is optimized for column major matrices // this version is optimized for column major matrices
// The traversal order is as follow (nr==4): // The traversal order is as follows (nr==4):
// 0 1 2 3 12 13 14 15 24 27 // 0 1 2 3 12 13 14 15 24 27
// 4 5 6 7 16 17 18 19 25 28 // 4 5 6 7 16 17 18 19 25 28
// 8 9 10 11 20 21 22 23 26 29 // 8 9 10 11 20 21 22 23 26 29
@ -574,65 +669,51 @@ struct ei_gemm_pack_rhs<Scalar, nr, ColMajor, PanelMode>
for(int j2=0; j2<packet_cols; j2+=nr) for(int j2=0; j2<packet_cols; j2+=nr)
{ {
// skip what we have before // skip what we have before
if(PanelMode) count += PacketSize * nr * offset; if(PanelMode) count += nr * offset;
const Scalar* b0 = &rhs[(j2+0)*rhsStride]; const Scalar* b0 = &rhs[(j2+0)*rhsStride];
const Scalar* b1 = &rhs[(j2+1)*rhsStride]; const Scalar* b1 = &rhs[(j2+1)*rhsStride];
const Scalar* b2 = &rhs[(j2+2)*rhsStride]; const Scalar* b2 = &rhs[(j2+2)*rhsStride];
const Scalar* b3 = &rhs[(j2+3)*rhsStride]; const Scalar* b3 = &rhs[(j2+3)*rhsStride];
if (hasAlpha) if (hasAlpha)
{
for(int k=0; k<depth; k++) for(int k=0; k<depth; k++)
{ {
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[k])); blockB[count+0] = alpha*b0[k];
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b1[k])); blockB[count+1] = alpha*b1[k];
if (nr==4) if(nr==4) blockB[count+2] = alpha*b2[k];
{ if(nr==4) blockB[count+3] = alpha*b3[k];
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b2[k])); count += nr;
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b3[k]));
}
count += nr*PacketSize;
}
} }
else else
{
for(int k=0; k<depth; k++) for(int k=0; k<depth; k++)
{ {
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[k])); blockB[count+0] = b0[k];
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b1[k])); blockB[count+1] = b1[k];
if (nr==4) if(nr==4) blockB[count+2] = b2[k];
{ if(nr==4) blockB[count+3] = b3[k];
ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b2[k])); count += nr;
ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b3[k]));
}
count += nr*PacketSize;
}
} }
// skip what we have after // skip what we have after
if(PanelMode) count += PacketSize * nr * (stride-offset-depth); if(PanelMode) count += nr * (stride-offset-depth);
} }
// copy the remaining columns one at a time (nr==1) // copy the remaining columns one at a time (nr==1)
for(int j2=packet_cols; j2<cols; ++j2) for(int j2=packet_cols; j2<cols; ++j2)
{ {
if(PanelMode) count += PacketSize * offset; if(PanelMode) count += offset;
const Scalar* b0 = &rhs[(j2+0)*rhsStride]; const Scalar* b0 = &rhs[(j2+0)*rhsStride];
if (hasAlpha) if (hasAlpha)
{
for(int k=0; k<depth; k++) for(int k=0; k<depth; k++)
{ {
ei_pstore(&blockB[count], ei_pset1(alpha*b0[k])); blockB[count] = alpha*b0[k];
count += PacketSize; count += 1;
}
} }
else else
{
for(int k=0; k<depth; k++) for(int k=0; k<depth; k++)
{ {
ei_pstore(&blockB[count], ei_pset1(b0[k])); blockB[count] = b0[k];
count += PacketSize; count += 1;
} }
} if(PanelMode) count += (stride-offset-depth);
if(PanelMode) count += PacketSize * (stride-offset-depth);
} }
} }
}; };
@ -652,17 +733,17 @@ struct ei_gemm_pack_rhs<Scalar, nr, RowMajor, PanelMode>
for(int j2=0; j2<packet_cols; j2+=nr) for(int j2=0; j2<packet_cols; j2+=nr)
{ {
// skip what we have before // skip what we have before
if(PanelMode) count += PacketSize * nr * offset; if(PanelMode) count += nr * offset;
if (hasAlpha) if (hasAlpha)
{ {
for(int k=0; k<depth; k++) for(int k=0; k<depth; k++)
{ {
const Scalar* b0 = &rhs[k*rhsStride + j2]; const Scalar* b0 = &rhs[k*rhsStride + j2];
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*b0[0])); blockB[count+0] = alpha*b0[0];
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*b0[1])); blockB[count+1] = alpha*b0[1];
if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*b0[2])); if(nr==4) blockB[count+2] = alpha*b0[2];
if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*b0[3])); if(nr==4) blockB[count+3] = alpha*b0[3];
count += nr*PacketSize; count += nr;
} }
} }
else else
@ -670,27 +751,27 @@ struct ei_gemm_pack_rhs<Scalar, nr, RowMajor, PanelMode>
for(int k=0; k<depth; k++) for(int k=0; k<depth; k++)
{ {
const Scalar* b0 = &rhs[k*rhsStride + j2]; const Scalar* b0 = &rhs[k*rhsStride + j2];
ei_pstore(&blockB[count+0*PacketSize], ei_pset1(b0[0])); blockB[count+0] = b0[0];
ei_pstore(&blockB[count+1*PacketSize], ei_pset1(b0[1])); blockB[count+1] = b0[1];
if(nr==4) ei_pstore(&blockB[count+2*PacketSize], ei_pset1(b0[2])); if(nr==4) blockB[count+2] = b0[2];
if(nr==4) ei_pstore(&blockB[count+3*PacketSize], ei_pset1(b0[3])); if(nr==4) blockB[count+3] = b0[3];
count += nr*PacketSize; count += nr;
} }
} }
// skip what we have after // skip what we have after
if(PanelMode) count += PacketSize * nr * (stride-offset-depth); if(PanelMode) count += nr * (stride-offset-depth);
} }
// copy the remaining columns one at a time (nr==1) // copy the remaining columns one at a time (nr==1)
for(int j2=packet_cols; j2<cols; ++j2) for(int j2=packet_cols; j2<cols; ++j2)
{ {
if(PanelMode) count += PacketSize * offset; if(PanelMode) count += offset;
const Scalar* b0 = &rhs[j2]; const Scalar* b0 = &rhs[j2];
for(int k=0; k<depth; k++) for(int k=0; k<depth; k++)
{ {
ei_pstore(&blockB[count], ei_pset1(alpha*b0[k*rhsStride])); blockB[count] = alpha*b0[k*rhsStride];
count += PacketSize; count += 1;
} }
if(PanelMode) count += PacketSize * (stride-offset-depth); if(PanelMode) count += stride-offset-depth;
} }
} }
}; };

View File

@ -39,7 +39,8 @@ struct ei_general_matrix_matrix_product<Scalar,LhsStorageOrder,ConjugateLhs,RhsS
const Scalar* lhs, int lhsStride, const Scalar* lhs, int lhsStride,
const Scalar* rhs, int rhsStride, const Scalar* rhs, int rhsStride,
Scalar* res, int resStride, Scalar* res, int resStride,
Scalar alpha) Scalar alpha,
GemmParallelInfo<Scalar>* info = 0)
{ {
// transpose the product such that the result is column major // transpose the product such that the result is column major
ei_general_matrix_matrix_product<Scalar, ei_general_matrix_matrix_product<Scalar,
@ -48,7 +49,7 @@ struct ei_general_matrix_matrix_product<Scalar,LhsStorageOrder,ConjugateLhs,RhsS
LhsStorageOrder==RowMajor ? ColMajor : RowMajor, LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
ConjugateLhs, ConjugateLhs,
ColMajor> ColMajor>
::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha); ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,info);
} }
}; };
@ -64,7 +65,8 @@ static void run(int rows, int cols, int depth,
const Scalar* _lhs, int lhsStride, const Scalar* _lhs, int lhsStride,
const Scalar* _rhs, int rhsStride, const Scalar* _rhs, int rhsStride,
Scalar* res, int resStride, Scalar* res, int resStride,
Scalar alpha) Scalar alpha,
GemmParallelInfo<Scalar>* info = 0)
{ {
ei_const_blas_data_mapper<Scalar, LhsStorageOrder> lhs(_lhs,lhsStride); ei_const_blas_data_mapper<Scalar, LhsStorageOrder> lhs(_lhs,lhsStride);
ei_const_blas_data_mapper<Scalar, RhsStorageOrder> rhs(_rhs,rhsStride); ei_const_blas_data_mapper<Scalar, RhsStorageOrder> rhs(_rhs,rhsStride);
@ -78,8 +80,91 @@ static void run(int rows, int cols, int depth,
int kc = std::min<int>(Blocking::Max_kc,depth); // cache block size along the K direction int kc = std::min<int>(Blocking::Max_kc,depth); // cache block size along the K direction
int mc = std::min<int>(Blocking::Max_mc,rows); // cache block size along the M direction int mc = std::min<int>(Blocking::Max_mc,rows); // cache block size along the M direction
ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder> pack_rhs;
ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder> pack_lhs;
ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp;
#ifdef EIGEN_HAS_OPENMP
if(info)
{
// this is the parallel version!
int tid = omp_get_thread_num();
int threads = omp_get_num_threads();
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); std::size_t sizeW = kc*Blocking::PacketSize*Blocking::nr*8;
Scalar* w = ei_aligned_stack_new(Scalar, sizeW);
Scalar* blockB = (Scalar*)info[tid].blockB;
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
// (==GEMM_VAR1)
for(int k=0; k<depth; k+=kc)
{
const int actual_kc = std::min(k+kc,depth)-k; // => rows of B' and cols of A'
// In order to reduce the chance that a thread has to wait for the other,
// let's start by packing A'.
pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc);
// Pack B_k to B' in parallel fashion:
// each thread packs the sub block B_k,j to B'_j where j is the thread id.
// However, before copying to B'_j, we have to make sure that no other thread is still using it,
// i.e., we test that info[tid].users equals 0.
// Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
while(info[tid].users!=0) {}
info[tid].users += threads;
pack_rhs(blockB+info[tid].rhs_start*kc, &rhs(k,info[tid].rhs_start), rhsStride, alpha, actual_kc, info[tid].rhs_length);
// Notify the other threads that the part B'_j is ready to go.
info[tid].sync = k;
// Computes C_i += A' * B' per B'_j
for(int shift=0; shift<threads; ++shift)
{
int j = (tid+shift)%threads;
// At this point we have to make sure that B'_j has been updated by the thread j,
// we use testAndSetOrdered to mimic a volatile access.
// However, no need to wait for the B' part which has been updated by the current thread!
if(shift>0)
while(info[j].sync!=k) {}
gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*kc, mc, actual_kc, info[j].rhs_length, -1,-1,0,0, w);
}
// Then keep going as usual with the remaining A'
for(int i=mc; i<rows; i+=mc)
{
const int actual_mc = std::min(i+mc,rows)-i;
// pack A_i,k to A'
pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc);
// C_i += A' * B'
gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, -1,-1,0,0, w);
}
// Release all the sub blocks B'_j of B' for the current thread,
// i.e., we simply decrement the number of users by 1
for(int j=0; j<threads; ++j)
#pragma omp atomic
--(info[j].users);
}
ei_aligned_stack_delete(Scalar, blockA, kc*mc);
ei_aligned_stack_delete(Scalar, w, sizeW);
}
else
#endif // EIGEN_HAS_OPENMP
{
(void)info; // info is not used
// this is the sequential version!
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;
      // For each horizontal panel of the rhs, and corresponding panel of the lhs...
      // (==GEMM_VAR1)
@@ -91,7 +176,8 @@ static void run(int rows, int cols, int depth,
        // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
        // Note that this panel will be read as many times as the number of blocks in the lhs's
        // vertical panel which is, in practice, a very low number.
-       ei_gemm_pack_rhs<Scalar, Blocking::nr, RhsStorageOrder>()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);
+       pack_rhs(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);

        // For each mc x kc block of the lhs's vertical panel...
        // (==GEPP_VAR1)
@@ -102,16 +188,17 @@ static void run(int rows, int cols, int depth,
          // We pack the lhs's block into a sequential chunk of memory (L1 caching)
          // Note that this block will be read a very high number of times, which is equal to the number of
          // micro vertical panels of the large rhs's panel (e.g., cols/4 times).
-         ei_gemm_pack_lhs<Scalar, Blocking::mr, LhsStorageOrder>()(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
+         pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);

          // Everything is packed, we can now call the block * panel kernel:
-         ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> >()
-           (res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
+         gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
        }
      }
      ei_aligned_stack_delete(Scalar, blockA, kc*mc);
-     ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
+     ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
+   }
  }
};
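For orientation, here is a toy, unoptimized model of the blocking structure used by the sequential path above. This sketch is not the commit's code: plain loops stand in for the packed gebp kernel, and the small kc/mc defaults are arbitrary.

// Toy model of the GEMM blocking above: slice the k dimension into kc-deep
// panels (the role of pack_rhs), then slice the rows into mc-high blocks
// (the role of pack_lhs), and multiply each block against the panel.
#include <algorithm>
#include <vector>

void toy_gemm(int rows, int cols, int depth,
              const std::vector<double>& A,   // rows x depth, row-major (assumption)
              const std::vector<double>& B,   // depth x cols, row-major (assumption)
              std::vector<double>& C,         // rows x cols, row-major (assumption)
              int kc = 8, int mc = 4)
{
  for (int k = 0; k < depth; k += kc)
  {
    const int actual_kc = std::min(k + kc, depth) - k;   // panel depth
    for (int i = 0; i < rows; i += mc)
    {
      const int actual_mc = std::min(i + mc, rows) - i;  // block height
      // update C from the (actual_mc x actual_kc) block times (actual_kc x cols) panel
      for (int ii = i; ii < i + actual_mc; ++ii)
        for (int j = 0; j < cols; ++j)
          for (int kk = k; kk < k + actual_kc; ++kk)
            C[ii*cols + j] += A[ii*depth + kk] * B[kk*cols + j];
    }
  }
}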
@@ -128,6 +215,40 @@ struct ei_traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
  : ei_traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> >
{};
template<typename Scalar, typename Gemm, typename Lhs, typename Rhs, typename Dest>
struct ei_gemm_functor
{
typedef typename Rhs::Scalar BlockBScalar;
ei_gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, Scalar actualAlpha)
: m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha)
{}
void operator() (int row, int rows, int col=0, int cols=-1, GemmParallelInfo<BlockBScalar>* info=0) const
{
if(cols==-1)
cols = m_rhs.cols();
Gemm::run(rows, cols, m_lhs.cols(),
(const Scalar*)&(m_lhs.const_cast_derived().coeffRef(row,0)), m_lhs.outerStride(),
(const Scalar*)&(m_rhs.const_cast_derived().coeffRef(0,col)), m_rhs.outerStride(),
(Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
m_actualAlpha,
info);
}
int sharedBlockBSize() const
{
return std::min<int>(ei_product_blocking_traits<Scalar>::Max_kc,m_rhs.rows()) * m_rhs.cols();
}
protected:
const Lhs& m_lhs;
const Rhs& m_rhs;
mutable Dest& m_dest;
Scalar m_actualAlpha;
};
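As a rough illustration of the functor protocol introduced here (hypothetical names, not the commit's code): the parallelizer invokes the functor once per thread with a different row range, so a toy analogue only needs an operator()(row, rows) that updates its slice of the destination.

// Toy stand-in for ei_gemm_functor: computes rows [row, row+rows) of
// C += alpha * A * B. All names and the row-major layout are assumptions.
#include <vector>

struct ToyGemmFunctor
{
  ToyGemmFunctor(const std::vector<double>& A, const std::vector<double>& B,
                 std::vector<double>& C, int n, int k, double alpha)
    : A(A), B(B), C(C), n(n), k(k), alpha(alpha) {}

  // Each thread calls this with its own [row, row+rows) slice.
  void operator()(int row, int rows) const
  {
    for (int i = row; i < row + rows; ++i)
      for (int j = 0; j < n; ++j)
      {
        double acc = 0;
        for (int p = 0; p < k; ++p)
          acc += A[i*k + p] * B[p*n + j];
        C[i*n + j] += alpha * acc;
      }
  }

  const std::vector<double>& A;
  const std::vector<double>& B;
  std::vector<double>& C;
  int n, k;
  double alpha;
};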
template<typename Lhs, typename Rhs>
class GeneralProduct<Lhs, Rhs, GemmProduct>
  : public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs>
@@ -151,17 +272,18 @@ class GeneralProduct<Lhs, Rhs, GemmProduct>
    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
                               * RhsBlasTraits::extractScalarFactor(m_rhs);

+   typedef ei_gemm_functor<
+     Scalar,
      ei_general_matrix_matrix_product<
        Scalar,
        (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
        (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
-       (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>
-     ::run(
-       this->rows(), this->cols(), lhs.cols(),
-       (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(),
-       (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(),
-       (Scalar*)&(dst.coeffRef(0,0)), dst.stride(),
-       actualAlpha);
+       (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
+     _ActualLhsType,
+     _ActualRhsType,
+     Dest> GemmFunctor;
+
+   ei_parallelize_gemm<(Dest::MaxRowsAtCompileTime>32)>(GemmFunctor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols());
  }
};
@@ -0,0 +1,97 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2010 Gael Guennebaud <g.gael@free.fr>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.
#ifndef EIGEN_PARALLELIZER_H
#define EIGEN_PARALLELIZER_H
template<typename BlockBScalar> struct GemmParallelInfo
{
GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0), blockB(0) {}
int volatile sync;
int volatile users;
int rhs_start;
int rhs_length;
BlockBScalar* blockB;
};
template<bool Condition,typename Functor>
void ei_parallelize_gemm(const Functor& func, int rows, int cols)
{
#ifndef EIGEN_HAS_OPENMP
func(0,rows, 0,cols);
#else
// Dynamically check whether we should enable or disable OpenMP.
// The conditions are:
// - the max number of threads we can create is greater than 1
// - we are not already in a parallel code
// - the sizes are large enough
// 1- are we already in a parallel session?
// FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
if((!Condition) || (omp_get_num_threads()>1))
return func(0,rows, 0,cols);
// 2- compute the maximal number of threads from the size of the product:
// FIXME this has to be fine tuned
int max_threads = std::max(1,rows / 32);
// 3 - compute the number of threads we are going to use
int threads = std::min(omp_get_max_threads(), max_threads);
if(threads==1)
return func(0,rows, 0,cols);
int blockCols = (cols / threads) & ~0x3;
int blockRows = (rows / threads) & ~0x7;
typedef typename Functor::BlockBScalar BlockBScalar;
BlockBScalar* sharedBlockB = new BlockBScalar[func.sharedBlockBSize()];
GemmParallelInfo<BlockBScalar>* info = new GemmParallelInfo<BlockBScalar>[threads];
#pragma omp parallel for schedule(static,1) num_threads(threads)
for(int i=0; i<threads; ++i)
{
int r0 = i*blockRows;
int actualBlockRows = (i+1==threads) ? rows-r0 : blockRows;
int c0 = i*blockCols;
int actualBlockCols = (i+1==threads) ? cols-c0 : blockCols;
info[i].rhs_start = c0;
info[i].rhs_length = actualBlockCols;
info[i].blockB = sharedBlockB;
func(r0, actualBlockRows, 0,cols, info);
}
delete[] sharedBlockB;
delete[] info;
#endif
}
#endif // EIGEN_PARALLELIZER_H
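A standalone sketch (hypothetical, not the commit's code) of the work split computed above: it prints which row slice each thread computes and which column slice of the rhs it packs, with the last thread absorbing the rounding remainder.

// Mirrors ei_parallelize_gemm's partitioning: slices are rounded down to
// multiples of 8 rows and 4 columns, exactly as in the patch.
#include <cstdio>

int main()
{
  const int rows = 1000, cols = 1000, threads = 4;
  int blockRows = (rows / threads) & ~0x7;
  int blockCols = (cols / threads) & ~0x3;
  for (int i = 0; i < threads; ++i)
  {
    int r0 = i * blockRows;
    int actualBlockRows = (i + 1 == threads) ? rows - r0 : blockRows;
    int c0 = i * blockCols;
    int actualBlockCols = (i + 1 == threads) ? cols - c0 : blockCols;
    std::printf("thread %d: computes rows [%d,%d), packs rhs cols [%d,%d)\n",
                i, r0, r0 + actualBlockRows, c0, c0 + actualBlockCols);
  }
  return 0;
}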
@@ -43,7 +43,10 @@ struct ei_symm_pack_lhs
      {
        for(int w=0; w<h; w++)
          blockA[count++] = ei_conj(lhs(k, i+w)); // transposed
-       for(int w=h; w<BlockRows; w++)
+       blockA[count++] = ei_real(lhs(k,k)); // real (diagonal)
+       for(int w=h+1; w<BlockRows; w++)
          blockA[count++] = lhs(i+w, k); // normal
        ++h;
      }
@@ -71,8 +74,11 @@ struct ei_symm_pack_lhs
    // do the same with mr==1
    for(int i=peeled_mc; i<rows; i++)
    {
-     for(int k=0; k<=i; k++)
+     for(int k=0; k<i; k++)
        blockA[count++] = lhs(i, k); // normal
+     blockA[count++] = ei_real(lhs(i, i)); // real (diagonal)
      for(int k=i+1; k<cols; k++)
        blockA[count++] = ei_conj(lhs(k, i)); // transposed
    }
@@ -95,14 +101,14 @@ struct ei_symm_pack_rhs
    {
      for(int k=k2; k<end_k; k++)
      {
-       ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*rhs(k,j2+0)));
-       ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*rhs(k,j2+1)));
+       blockB[count+0] = alpha*rhs(k,j2+0);
+       blockB[count+1] = alpha*rhs(k,j2+1);
        if (nr==4)
        {
-         ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*rhs(k,j2+2)));
-         ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*rhs(k,j2+3)));
+         blockB[count+2] = alpha*rhs(k,j2+2);
+         blockB[count+3] = alpha*rhs(k,j2+3);
        }
-       count += nr*PacketSize;
+       count += nr;
      }
    }
@@ -113,14 +119,14 @@ struct ei_symm_pack_rhs
      // transpose
      for(int k=k2; k<j2; k++)
      {
-       ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+0,k))));
-       ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+1,k))));
+       blockB[count+0] = alpha*ei_conj(rhs(j2+0,k));
+       blockB[count+1] = alpha*ei_conj(rhs(j2+1,k));
        if (nr==4)
        {
-         ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+2,k))));
-         ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+3,k))));
+         blockB[count+2] = alpha*ei_conj(rhs(j2+2,k));
+         blockB[count+3] = alpha*ei_conj(rhs(j2+3,k));
        }
-       count += nr*PacketSize;
+       count += nr;
      }
      // symmetric
      int h = 0;
@@ -128,24 +134,27 @@ struct ei_symm_pack_rhs
      {
        // normal
        for (int w=0 ; w<h; ++w)
-         ei_pstore(&blockB[count+w*PacketSize], ei_pset1(alpha*rhs(k,j2+w)));
+         blockB[count+w] = alpha*rhs(k,j2+w);
+       blockB[count+h] = alpha*rhs(k,k);
        // transpose
-       for (int w=h ; w<nr; ++w)
-         ei_pstore(&blockB[count+w*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+w,k))));
-       count += nr*PacketSize;
+       for (int w=h+1 ; w<nr; ++w)
+         blockB[count+w] = alpha*ei_conj(rhs(j2+w,k));
+       count += nr;
        ++h;
      }
      // normal
      for(int k=j2+nr; k<end_k; k++)
      {
-       ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*rhs(k,j2+0)));
-       ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*rhs(k,j2+1)));
+       blockB[count+0] = alpha*rhs(k,j2+0);
+       blockB[count+1] = alpha*rhs(k,j2+1);
        if (nr==4)
        {
-         ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*rhs(k,j2+2)));
-         ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*rhs(k,j2+3)));
+         blockB[count+2] = alpha*rhs(k,j2+2);
+         blockB[count+3] = alpha*rhs(k,j2+3);
        }
-       count += nr*PacketSize;
+       count += nr;
      }
    }
@@ -154,14 +163,14 @@ struct ei_symm_pack_rhs
    {
      for(int k=k2; k<end_k; k++)
      {
-       ei_pstore(&blockB[count+0*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+0,k))));
-       ei_pstore(&blockB[count+1*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+1,k))));
+       blockB[count+0] = alpha*ei_conj(rhs(j2+0,k));
+       blockB[count+1] = alpha*ei_conj(rhs(j2+1,k));
        if (nr==4)
        {
-         ei_pstore(&blockB[count+2*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+2,k))));
-         ei_pstore(&blockB[count+3*PacketSize], ei_pset1(alpha*ei_conj(rhs(j2+3,k))));
+         blockB[count+2] = alpha*ei_conj(rhs(j2+2,k));
+         blockB[count+3] = alpha*ei_conj(rhs(j2+3,k));
        }
-       count += nr*PacketSize;
+       count += nr;
      }
    }
@@ -172,14 +181,23 @@ struct ei_symm_pack_rhs
      int half = std::min(end_k,j2);
      for(int k=k2; k<half; k++)
      {
-       ei_pstore(&blockB[count], ei_pset1(alpha*ei_conj(rhs(j2,k))));
-       count += PacketSize;
+       blockB[count] = alpha*ei_conj(rhs(j2,k));
+       count += 1;
      }
-     // normal
-     for(int k=half; k<k2+rows; k++)
+     if(half==j2 && half<k2+rows)
      {
-       ei_pstore(&blockB[count], ei_pset1(alpha*rhs(k,j2)));
-       count += PacketSize;
+       blockB[count] = alpha*ei_real(rhs(j2,j2));
+       count += 1;
+     }
+     else
+       half--;
+     // normal
+     for(int k=half+1; k<k2+rows; k++)
+     {
+       blockB[count] = alpha*rhs(k,j2);
+       count += 1;
      }
    }
  }
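The net effect of this hunk is that B' now stores one scalar per coefficient instead of a PacketSize-wide pre-expanded packet, which is why every count += nr*PacketSize becomes count += nr. A rough illustration of the memory saving, with made-up sizes:

// Illustration only; kc, cols and PacketSize are invented numbers, not
// values taken from the commit.
#include <cstdio>

int main()
{
  const int kc = 256, cols = 1024, PacketSize = 4;
  long oldSize = (long)kc * cols * PacketSize; // each coeff expanded to a packet
  long newSize = (long)kc * cols;              // one scalar per coeff
  std::printf("old blockB: %ld scalars, new blockB: %ld scalars (%dx smaller)\n",
              oldSize, newSize, PacketSize);
  return 0;
}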
@@ -244,9 +262,14 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R
    int mc = std::min<int>(Blocking::Max_mc,rows);  // cache block size along the M direction

    Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
-   Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
+   std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+   Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
+   Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;

    ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel;
+   ei_symm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder> pack_lhs;
+   ei_gemm_pack_rhs<Scalar,Blocking::nr,RhsStorageOrder> pack_rhs;
+   ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;

    for(int k2=0; k2<size; k2+=kc)
    {
@@ -255,8 +278,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R
      // we have selected one row panel of rhs and one column panel of lhs
      // pack rhs's panel into a sequential chunk of memory
      // and expand each coeff to a constant packet for further reuse
-     ei_gemm_pack_rhs<Scalar,Blocking::nr,RhsStorageOrder>()
-       (blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);
+     pack_rhs(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols);

      // the selected lhs's panel has to be split in three different parts:
      // 1 - the transposed panel above the diagonal block => transposed packed copy
@@ -266,8 +288,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R
      {
        const int actual_mc = std::min(i2+mc,k2)-i2;
        // transposed packed copy
-       ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder==RowMajor?ColMajor:RowMajor, true>()
-         (blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);
+       pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);
        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
      }
@@ -275,8 +296,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R
      {
        const int actual_mc = std::min(k2+kc,size)-k2;
        // symmetric packed copy
-       ei_symm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>()
-         (blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
+       pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
        gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
      }
@@ -292,7 +312,7 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,true,ConjugateLhs, R
    }
    ei_aligned_stack_delete(Scalar, blockA, kc*mc);
-   ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
+   ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
  }
};
@@ -323,30 +343,32 @@ struct ei_product_selfadjoint_matrix<Scalar,LhsStorageOrder,false,ConjugateLhs,
    int mc = std::min<int>(Blocking::Max_mc,rows);  // cache block size along the M direction

    Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
-   Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
+   std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+   Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
+   Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;

    ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<ConjugateLhs,ConjugateRhs> > gebp_kernel;
+   ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder> pack_lhs;
+   ei_symm_pack_rhs<Scalar,Blocking::nr,RhsStorageOrder> pack_rhs;

    for(int k2=0; k2<size; k2+=kc)
    {
      const int actual_kc = std::min(k2+kc,size)-k2;

-     ei_symm_pack_rhs<Scalar,Blocking::nr,RhsStorageOrder>()
-       (blockB, _rhs, rhsStride, alpha, actual_kc, cols, k2);
+     pack_rhs(blockB, _rhs, rhsStride, alpha, actual_kc, cols, k2);

      // => GEPP
      for(int i2=0; i2<rows; i2+=mc)
      {
        const int actual_mc = std::min(i2+mc,rows)-i2;
-       ei_gemm_pack_lhs<Scalar,Blocking::mr,LhsStorageOrder>()
-         (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
+       pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols);
      }
    }
    ei_aligned_stack_delete(Scalar, blockA, kc*mc);
-   ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
+   ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
  }
};
@@ -394,9 +416,9 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
        ei_traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor>
      ::run(
        lhs.rows(), rhs.cols(),                  // sizes
-       &lhs.coeff(0,0), lhs.stride(),           // lhs info
-       &rhs.coeff(0,0), rhs.stride(),           // rhs info
-       &dst.coeffRef(0,0), dst.stride(),        // result info
+       &lhs.coeff(0,0), lhs.outerStride(),      // lhs info
+       &rhs.coeff(0,0), rhs.outerStride(),      // rhs info
+       &dst.coeffRef(0,0), dst.outerStride(),   // result info
        actualAlpha                              // alpha
      );
  }
@@ -185,12 +185,12 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
                               * RhsBlasTraits::extractScalarFactor(m_rhs);

-   ei_assert(dst.stride()==1 && "not implemented yet");
+   ei_assert(dst.innerStride()==1 && "not implemented yet");
    ei_product_selfadjoint_vector<Scalar, (ei_traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>
      (
        lhs.rows(),                          // size
-       &lhs.coeff(0,0), lhs.stride(),       // lhs info
-       &rhs.coeff(0), rhs.stride(),         // rhs info
+       &lhs.coeff(0,0), lhs.innerStride(),  // lhs info
+       &rhs.coeff(0), rhs.innerStride(),    // rhs info
        &dst.coeffRef(0),                    // result info
        actualAlpha                          // scale factor
      );
@@ -74,47 +74,51 @@ struct ei_selfadjoint_product<Scalar,MatStorageOrder, ColMajor, AAT, UpLo>
    int mc = std::min<int>(Blocking::Max_mc,size);  // cache block size along the M direction

    Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
-   Scalar* blockB = ei_aligned_stack_new(Scalar, kc*size*Blocking::PacketSize);
+   std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*size;
+   Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
+   Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;

    // note that the actual rhs is the transpose/adjoint of mat
    typedef ei_conj_helper<NumTraits<Scalar>::IsComplex && !AAT, NumTraits<Scalar>::IsComplex && AAT> Conj;

    ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, Conj> gebp_kernel;
+   ei_gemm_pack_rhs<Scalar,Blocking::nr,MatStorageOrder==RowMajor ? ColMajor : RowMajor> pack_rhs;
+   ei_gemm_pack_lhs<Scalar,Blocking::mr,MatStorageOrder, false> pack_lhs;
+   ei_sybb_kernel<Scalar, Blocking::mr, Blocking::nr, Conj, UpLo> sybb;

    for(int k2=0; k2<depth; k2+=kc)
    {
      const int actual_kc = std::min(k2+kc,depth)-k2;

      // note that the actual rhs is the transpose/adjoint of mat
-     ei_gemm_pack_rhs<Scalar,Blocking::nr,MatStorageOrder==RowMajor ? ColMajor : RowMajor>()
-       (blockB, &mat(0,k2), matStride, alpha, actual_kc, size);
+     pack_rhs(blockB, &mat(0,k2), matStride, alpha, actual_kc, size);

      for(int i2=0; i2<size; i2+=mc)
      {
        const int actual_mc = std::min(i2+mc,size)-i2;
-       ei_gemm_pack_lhs<Scalar,Blocking::mr,MatStorageOrder, false>()
-         (blockA, &mat(i2, k2), matStride, actual_kc, actual_mc);
+       pack_lhs(blockA, &mat(i2, k2), matStride, actual_kc, actual_mc);

        // the selected actual_mc * size panel of res is split into three different parts:
        //  1 - before the diagonal => processed with gebp or skipped
        //  2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
        //  3 - after the diagonal => processed with gebp or skipped
        if (UpLo==Lower)
-         gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2));
+         gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2),
+                     -1, -1, 0, 0, allocatedBlockB);

-       ei_sybb_kernel<Scalar, Blocking::mr, Blocking::nr, Conj, UpLo>()
-         (res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*Blocking::PacketSize*i2, actual_mc, actual_kc);
+       sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, allocatedBlockB);

        if (UpLo==Upper)
        {
          int j2 = i2+actual_mc;
-         gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*Blocking::PacketSize*j2, actual_mc, actual_kc, std::max(0,size-j2));
+         gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, std::max(0,size-j2),
+                     -1, -1, 0, 0, allocatedBlockB);
        }
      }
    }
    ei_aligned_stack_delete(Scalar, blockA, kc*mc);
-   ei_aligned_stack_delete(Scalar, blockB, kc*size*Blocking::PacketSize);
+   ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
  }
};
@@ -138,8 +142,8 @@ SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
      _ActualUType::Flags&RowMajorBit ? RowMajor : ColMajor,
      ei_traits<MatrixType>::Flags&RowMajorBit ? RowMajor : ColMajor,
      !UBlasTraits::NeedToConjugate, UpLo>
-   ::run(_expression().cols(), actualU.cols(), &actualU.coeff(0,0), actualU.stride(),
-         const_cast<Scalar*>(_expression().data()), _expression().stride(), actualAlpha);
+   ::run(_expression().cols(), actualU.cols(), &actualU.coeff(0,0), actualU.outerStride(),
+         const_cast<Scalar*>(_expression().data()), _expression().outerStride(), actualAlpha);
  return *this;
}
@@ -161,7 +165,7 @@ struct ei_sybb_kernel
    PacketSize = ei_packet_traits<Scalar>::size,
    BlockSize  = EIGEN_ENUM_MAX(mr,nr)
  };
- void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int size, int depth)
+ void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int size, int depth, Scalar* workspace)
  {
    ei_gebp_kernel<Scalar, mr, nr, Conj> gebp_kernel;
    Matrix<Scalar,BlockSize,BlockSize,ColMajor> buffer;
@@ -171,7 +175,7 @@ struct ei_sybb_kernel
    for (int j=0; j<size; j+=BlockSize)
    {
      int actualBlockSize = std::min<int>(BlockSize,size - j);
-     const Scalar* actual_b = blockB+j*depth*PacketSize;
+     const Scalar* actual_b = blockB+j*depth;
      if(UpLo==Upper)
        gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize);
@@ -181,7 +185,8 @@ struct ei_sybb_kernel
      int i = j;
      buffer.setZero();
      // 1 - apply the kernel on the temporary buffer
-     gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize);
+     gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize,
+                 -1, -1, 0, 0, workspace);
      // 2 - triangular accumulation
      for(int j1=0; j1<actualBlockSize; ++j1)
      {
@@ -195,7 +200,8 @@ struct ei_sybb_kernel
      if(UpLo==Lower)
      {
        int i = j+actualBlockSize;
-       gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize);
+       gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize,
+                   -1, -1, 0, 0, workspace);
      }
    }
  }
@@ -88,7 +88,7 @@ SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
      typename ei_cleantype<typename ei_conj_expr_if<IsRowMajor ^ UBlasTraits::NeedToConjugate,_ActualUType>::ret>::type,
      typename ei_cleantype<typename ei_conj_expr_if<IsRowMajor ^ VBlasTraits::NeedToConjugate,_ActualVType>::ret>::type,
      (IsRowMajor ? int(UpLo==Upper ? Lower : Upper) : UpLo)>
- ::run(const_cast<Scalar*>(_expression().data()),_expression().stride(),actualU,actualV,actualAlpha);
+ ::run(const_cast<Scalar*>(_expression().data()),_expression().outerStride(),actualU,actualV,actualAlpha);
  return *this;
}
@@ -120,7 +120,10 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
    int mc = std::min<int>(Blocking::Max_mc,rows);  // cache block size along the M direction

    Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
-   Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
+   std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+   Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
+   // Scalar* allocatedBlockB = new Scalar[sizeB];
+   Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;

    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
    triangularBuffer.setZero();
@@ -155,7 +158,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
          // => GEBP with the micro triangular block
          // The trick is to pack this micro block while filling the opposite triangular part with zeros.
-         // To this end we do an extra triangular copy to small temporary buffer
+         // To this end we do an extra triangular copy to a small temporary buffer
          for (int k=0;k<actualPanelWidth;++k)
          {
            if (!(Mode&UnitDiag))
@@ -163,10 +166,10 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
            for (int i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i)
              triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k);
          }
-         pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.stride(), actualPanelWidth, actualPanelWidth);
+         pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);

          gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols,
-                     actualPanelWidth, actual_kc, 0, blockBOffset*Blocking::PacketSize);
+                     actualPanelWidth, actual_kc, 0, blockBOffset);

          // GEBP with remaining micro panel
          if (lengthTarget>0)
@@ -176,7 +179,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
            pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);

            gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols,
-                       actualPanelWidth, actual_kc, 0, blockBOffset*Blocking::PacketSize);
+                       actualPanelWidth, actual_kc, 0, blockBOffset);
          }
        }
      }
@@ -196,7 +199,8 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,true,
    }
    ei_aligned_stack_delete(Scalar, blockA, kc*mc);
-   ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
+   ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
+   // delete[] allocatedBlockB;
  }
};
@@ -234,7 +238,9 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
    int mc = std::min<int>(Blocking::Max_mc,rows);  // cache block size along the M direction

    Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
-   Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
+   std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+   Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar,sizeB);
+   Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;

    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
    triangularBuffer.setZero();
@@ -252,7 +258,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
      const int actual_kc = std::min(IsLower ? size-k2 : k2, kc);
      int actual_k2 = IsLower ? k2 : k2-actual_kc;
      int rs = IsLower ? actual_k2 : size - k2;
-     Scalar* geb = blockB+actual_kc*actual_kc*Blocking::PacketSize;
+     Scalar* geb = blockB+actual_kc*actual_kc;

      pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, alpha, actual_kc, rs);
@@ -265,7 +271,7 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
        int panelOffset = IsLower ? j2+actualPanelWidth : 0;
        int panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
        // general part
-       pack_rhs_panel(blockB+j2*actual_kc*Blocking::PacketSize,
+       pack_rhs_panel(blockB+j2*actual_kc,
                       &rhs(actual_k2+panelOffset, actual_j2), rhsStride, alpha,
                       panelLength, actualPanelWidth,
                       actual_kc, panelOffset);
@@ -279,8 +285,8 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
            triangularBuffer.coeffRef(k,j) = rhs(actual_j2+k,actual_j2+j);
        }
-       pack_rhs_panel(blockB+j2*actual_kc*Blocking::PacketSize,
-                      triangularBuffer.data(), triangularBuffer.stride(), alpha,
+       pack_rhs_panel(blockB+j2*actual_kc,
+                      triangularBuffer.data(), triangularBuffer.outerStride(), alpha,
                       actualPanelWidth, actualPanelWidth,
                       actual_kc, j2);
      }
@@ -300,19 +306,21 @@ struct ei_product_triangular_matrix_matrix<Scalar,Mode,false,
          int blockOffset = IsLower ? j2 : 0;

          gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride,
-                     blockA, blockB+j2*actual_kc*Blocking::PacketSize,
+                     blockA, blockB+j2*actual_kc,
                      actual_mc, panelLength, actualPanelWidth,
                      actual_kc, actual_kc,     // strides
-                     blockOffset, blockOffset*Blocking::PacketSize); // offsets
+                     blockOffset, blockOffset, // offsets
+                     allocatedBlockB);         // workspace
        }
      }
      gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
-                 blockA, geb, actual_mc, actual_kc, rs);
+                 blockA, geb, actual_mc, actual_kc, rs,
+                 -1, -1, 0, 0, allocatedBlockB);
    }
  }
  ei_aligned_stack_delete(Scalar, blockA, kc*mc);
- ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
+ ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
}
};
@@ -348,9 +356,9 @@ struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
      (ei_traits<Dest >::Flags&RowMajorBit) ? RowMajor : ColMajor>
    ::run(
      lhs.rows(), LhsIsTriangular ? rhs.cols() : lhs.rows(),  // sizes
-     &lhs.coeff(0,0), lhs.stride(),          // lhs info
-     &rhs.coeff(0,0), rhs.stride(),          // rhs info
-     &dst.coeffRef(0,0), dst.stride(),       // result info
+     &lhs.coeff(0,0), lhs.outerStride(),     // lhs info
+     &rhs.coeff(0,0), rhs.outerStride(),     // rhs info
+     &dst.coeffRef(0,0), dst.outerStride(),  // result info
      actualAlpha                             // alpha
    );
  }
@@ -63,7 +63,7 @@ struct ei_product_triangular_vector_selector<Lhs,Rhs,Result,Mode,ConjLhs,ConjRhs
        int s = IsLower ? pi+actualPanelWidth : 0;
        ei_cache_friendly_product_colmajor_times_vector<ConjLhs,ConjRhs>(
          r,
-         &(lhs.const_cast_derived().coeffRef(s,pi)), lhs.stride(),
+         &(lhs.const_cast_derived().coeffRef(s,pi)), lhs.outerStride(),
          rhs.segment(pi, actualPanelWidth),
          &(res.coeffRef(s)),
          alpha);
@@ -105,7 +105,7 @@ struct ei_product_triangular_vector_selector<Lhs,Rhs,Result,Mode,ConjLhs,ConjRhs
        int s = IsLower ? 0 : pi + actualPanelWidth;
        Block<Result,Dynamic,1> target(res,pi,0,actualPanelWidth,1);
        ei_cache_friendly_product_rowmajor_times_vector<ConjLhs,ConjRhs>(
-         &(lhs.const_cast_derived().coeffRef(pi,s)), lhs.stride(),
+         &(lhs.const_cast_derived().coeffRef(pi,s)), lhs.outerStride(),
          &(rhs.const_cast_derived().coeffRef(s)), r,
          target, alpha);
      }
@@ -67,11 +67,14 @@ struct ei_triangular_solve_matrix<Scalar,OnTheLeft,Mode,Conjugate,TriStorageOrde
    int mc = std::min<int>(Blocking::Max_mc,size);  // cache block size along the M direction

    Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
-   Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize);
+   std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols;
+   Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
+   Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;

    ei_conj_if<Conjugate> conj;
    ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<Conjugate,false> > gebp_kernel;
    ei_gemm_pack_lhs<Scalar,Blocking::mr,TriStorageOrder> pack_lhs;
+   ei_gemm_pack_rhs<Scalar, Blocking::nr, ColMajor, true> pack_rhs;

    for(int k2=IsLower ? 0 : size;
        IsLower ? k2<size : k2>0;
@@ -135,8 +138,7 @@ struct ei_triangular_solve_matrix<Scalar,OnTheLeft,Mode,Conjugate,TriStorageOrde
          int blockBOffset = IsLower ? k1 : lengthTarget;

          // update the respective rows of B from other
-         ei_gemm_pack_rhs<Scalar, Blocking::nr, ColMajor, true>()
-           (blockB, _other+startBlock, otherStride, -1, actualPanelWidth, cols, actual_kc, blockBOffset);
+         pack_rhs(blockB, _other+startBlock, otherStride, -1, actualPanelWidth, cols, actual_kc, blockBOffset);

          // GEBP
          if (lengthTarget>0)
@@ -146,7 +148,7 @@ struct ei_triangular_solve_matrix<Scalar,OnTheLeft,Mode,Conjugate,TriStorageOrde
            pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);

            gebp_kernel(_other+startTarget, otherStride, blockA, blockB, lengthTarget, actualPanelWidth, cols,
-                       actualPanelWidth, actual_kc, 0, blockBOffset*Blocking::PacketSize);
+                       actualPanelWidth, actual_kc, 0, blockBOffset);
          }
        }
      }
@@ -169,7 +171,7 @@ struct ei_triangular_solve_matrix<Scalar,OnTheLeft,Mode,Conjugate,TriStorageOrde
    }
    ei_aligned_stack_delete(Scalar, blockA, kc*mc);
-   ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize);
+   ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
  }
};
@@ -198,7 +200,9 @@ struct ei_triangular_solve_matrix<Scalar,OnTheRight,Mode,Conjugate,TriStorageOrd
    int mc = std::min<int>(Blocking::Max_mc,size);  // cache block size along the M direction

    Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc);
-   Scalar* blockB = ei_aligned_stack_new(Scalar, kc*size*Blocking::PacketSize);
+   std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*size;
+   Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB);
+   Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr;

    ei_conj_if<Conjugate> conj;
    ei_gebp_kernel<Scalar, Blocking::mr, Blocking::nr, ei_conj_helper<false,Conjugate> > gebp_kernel;
@@ -215,7 +219,7 @@ struct ei_triangular_solve_matrix<Scalar,OnTheRight,Mode,Conjugate,TriStorageOrd
      int startPanel = IsLower ? 0 : k2+actual_kc;
      int rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
-     Scalar* geb = blockB+actual_kc*actual_kc*Blocking::PacketSize;
+     Scalar* geb = blockB+actual_kc*actual_kc;

      if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, -1, actual_kc, rs);
@@ -230,7 +234,7 @@ struct ei_triangular_solve_matrix<Scalar,OnTheRight,Mode,Conjugate,TriStorageOrd
        int panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;

        if (panelLength>0)
-         pack_rhs_panel(blockB+j2*actual_kc*Blocking::PacketSize,
+         pack_rhs_panel(blockB+j2*actual_kc,
                         &rhs(actual_k2+panelOffset, actual_j2), triStride, -1,
                         panelLength, actualPanelWidth,
                         actual_kc, panelOffset);
@@ -260,10 +264,11 @@ struct ei_triangular_solve_matrix<Scalar,OnTheRight,Mode,Conjugate,TriStorageOrd
          if(panelLength>0)
          {
            gebp_kernel(&lhs(i2,absolute_j2), otherStride,
-                       blockA, blockB+j2*actual_kc*Blocking::PacketSize,
+                       blockA, blockB+j2*actual_kc,
                        actual_mc, panelLength, actualPanelWidth,
                        actual_kc, actual_kc,     // strides
-                       panelOffset, panelOffset*Blocking::PacketSize); // offsets
+                       panelOffset, panelOffset, // offsets
+                       allocatedBlockB);         // workspace
          }

          // unblocked triangular solve
@@ -293,12 +298,13 @@ struct ei_triangular_solve_matrix<Scalar,OnTheRight,Mode,Conjugate,TriStorageOrd
        if (rs>0)
          gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
-                     actual_mc, actual_kc, rs);
+                     actual_mc, actual_kc, rs,
+                     -1, -1, 0, 0, allocatedBlockB);
      }
    }
    ei_aligned_stack_delete(Scalar, blockA, kc*mc);
-   ei_aligned_stack_delete(Scalar, blockB, kc*size*Blocking::PacketSize);
+   ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB);
  }
};
@@ -130,14 +130,10 @@ struct ei_product_blocking_traits
  typedef typename ei_packet_traits<Scalar>::type PacketType;
  enum {
    PacketSize = sizeof(PacketType)/sizeof(Scalar),
-   #if (defined __i386__)
-   HalfRegisterCount = 4,
-   #else
-   HalfRegisterCount = 8,
-   #endif
+   NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,

    // register block size along the N direction (must be either 2 or 4)
-   nr = HalfRegisterCount/2,
+   nr = NumberOfRegisters/4,

    // register block size along the M direction (currently, this one cannot be modified)
    mr = 2 * PacketSize,
@@ -236,4 +232,22 @@ struct ei_blas_traits<Transpose<NestedXpr> >
  static inline Scalar extractScalarFactor(const XprType& x) { return Base::extractScalarFactor(x.nestedExpression()); }
};
template<typename T, int Access=ei_blas_traits<T>::ActualAccess>
struct ei_extract_data_selector {
static const typename T::Scalar* run(const T& m)
{
return &ei_blas_traits<T>::extract(m).const_cast_derived().coeffRef(0,0); // FIXME this should be .data()
}
};
template<typename T>
struct ei_extract_data_selector<T,NoDirectAccess> {
static typename T::Scalar* run(const T&) { return 0; }
};
template<typename T> const typename T::Scalar* ei_extract_data(const T& m)
{
return ei_extract_data_selector<T>::run(m);
}
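The new ei_extract_data helper dispatches on a traits flag to return either a raw data pointer or null. Here is a self-contained sketch of the same compile-time dispatch pattern, with hypothetical names standing in for the Eigen traits:

// Hypothetical names throughout; only the selector pattern mirrors the code above.
#include <cstdio>
#include <vector>

struct DenseVec { std::vector<double> v;
                  const double* data() const { return v.empty() ? 0 : &v[0]; } };
struct OnTheFly { double coeff(int i) const { return i * 0.5; } }; // no storage

template<typename T> struct has_direct_access { enum { value = false }; };
template<> struct has_direct_access<DenseVec> { enum { value = true }; };

template<typename T, bool Access = has_direct_access<T>::value>
struct extract_data_selector {
  static const double* run(const T& m) { return m.data(); }
};
template<typename T>
struct extract_data_selector<T, false> {
  static const double* run(const T&) { return 0; } // no direct access => null
};

template<typename T> const double* extract_data(const T& m)
{ return extract_data_selector<T>::run(m); }

int main()
{
  DenseVec d; d.v.assign(3, 1.0);
  OnTheFly f;
  std::printf("%p %p\n", (const void*)extract_data(d), (const void*)extract_data(f));
  return 0;
}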
#endif // EIGEN_BLASUTIL_H
@@ -86,11 +86,11 @@ const unsigned int EvalBeforeAssigningBit = 0x4;
  * Long version: means that the coefficients can be handled by packets
  * and start at a memory location whose alignment meets the requirements
  * of the present CPU architecture for optimized packet access. In the fixed-size
- * case, there is the additional condition that the total size of the coefficients
- * array is a multiple of the packet size, so that it is possible to access all the
- * coefficients by packets. In the dynamic-size case, there is no such condition
- * on the total size, so it might not be possible to access the few last coeffs
- * by packets.
+ * case, there is the additional condition that it be possible to access all the
+ * coefficients by packets (this implies the requirement that the size be a multiple of 16 bytes,
+ * and that any nontrivial strides don't break the alignment). In the dynamic-size case,
+ * there is no such condition on the total size and strides, so it might not be possible to access
+ * all coeffs by packets.
  *
  * \note This bit can be set regardless of whether vectorization is actually enabled.
  *       To check for actual vectorizability, see \a ActualPacketAccessBit.
@@ -140,7 +140,7 @@ const unsigned int LinearAccessBit = 0x10;
  * Means that the underlying array of coefficients can be directly accessed. This means two things.
  * First, references to the coefficients must be available through coeffRef(int, int). This rules out read-only
  * expressions whose coefficients are computed on demand by coeff(int, int). Second, the memory layout of the
- * array of coefficients must be exactly the natural one suggested by rows(), cols(), stride(), and the RowMajorBit.
+ * array of coefficients must be exactly the natural one suggested by rows(), cols(), outerStride(), innerStride(), and the RowMajorBit.
  * This rules out expressions such as Diagonal, whose coefficients, though referencable, do not have
  * such a regular memory layout.
  */
@@ -1,7 +1,8 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2007-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
 //
 // Eigen is free software; you can redistribute it and/or
 // modify it under the terms of the GNU Lesser General Public
@@ -40,8 +41,10 @@ template<typename ExpressionType> class NestByValue;
 template<typename ExpressionType> class ForceAlignedAccess;
 template<typename ExpressionType> class SwapWrapper;
 template<typename MatrixType> class Minor;
+// MSVC will not compile when the expression ei_traits<MatrixType>::Flags&DirectAccessBit
+// is put into brackets like (ei_traits<MatrixType>::Flags&DirectAccessBit)!
 template<typename MatrixType, int BlockRows=Dynamic, int BlockCols=Dynamic,
-         int _DirectAccessStatus = (ei_traits<MatrixType>::Flags&DirectAccessBit) ? HasDirectAccess : NoDirectAccess> class Block;
+         int _DirectAccessStatus = ei_traits<MatrixType>::Flags&DirectAccessBit ? HasDirectAccess : NoDirectAccess> class Block;
 template<typename MatrixType, int Size=Dynamic> class VectorBlock;
 template<typename MatrixType> class Transpose;
 template<typename MatrixType> class Conjugate;
@@ -60,7 +63,9 @@ template<typename _Scalar, int SizeAtCompileTime, int MaxSizeAtCompileTime=SizeA
 template<typename MatrixType, typename DiagonalType, int ProductOrder> class DiagonalProduct;
 template<typename MatrixType, int Index> class Diagonal;

-template<typename MatrixType, int Options=Unaligned> class Map;
+template<int InnerStrideAtCompileTime, int OuterStrideAtCompileTime> class Stride;
+template<typename MatrixType, int Options=Unaligned, typename StrideType = Stride<0,0> > class Map;
 template<typename Derived> class TriangularBase;
 template<typename MatrixType, unsigned int Mode> class TriangularView;
 template<typename MatrixType, unsigned int Mode> class SelfAdjointView;
@@ -26,8 +26,6 @@
 #ifndef EIGEN_MACROS_H
 #define EIGEN_MACROS_H

-#undef minor
-
 #define EIGEN_WORLD_VERSION 2
 #define EIGEN_MAJOR_VERSION 91
 #define EIGEN_MINOR_VERSION 0
@@ -35,14 +33,10 @@
 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                       (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
                                        EIGEN_MINOR_VERSION>=z))))

-// 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable 16 byte alignment on all
-// platforms where vectorization might be enabled. In theory we could always enable alignment, but it can be a cause of problems
-// on some platforms, so we just disable it in certain common platform (compiler+architecture combinations) to avoid these problems.
-#if defined(__GNUC__) && !(defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__ppc__) || defined(__ia64__))
-  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_ALIGNMENT 1
+#ifdef __GNUC__
+  #define EIGEN_GNUC_AT_LEAST(x,y) ((__GNUC__>=x && __GNUC_MINOR__>=y) || __GNUC__>x)
 #else
-  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_ALIGNMENT 0
+  #define EIGEN_GNUC_AT_LEAST(x,y) 0
 #endif

 #if defined(__GNUC__) && (__GNUC__ <= 3)
@@ -51,22 +45,44 @@
 #define EIGEN_GCC3_OR_OLDER 0
 #endif

-// FIXME vectorization + alignment is completely disabled with sun studio
-#if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_ALIGNMENT && !EIGEN_GCC3_OR_OLDER && !defined(__SUNPRO_CC)
-  #define EIGEN_ARCH_WANTS_ALIGNMENT 1
+// 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
+// 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
+// enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
+// certain common platform (compiler+architecture combinations) to avoid these problems.
+// Only static alignment is really problematic (relies on nonstandard compiler extensions that don't
+// work everywhere, for example don't work on GCC/ARM), try to keep heap alignment even
+// when we have to disable static alignment.
+#if defined(__GNUC__) && !(defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__ppc__) || defined(__ia64__))
+#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
 #else
-  #define EIGEN_ARCH_WANTS_ALIGNMENT 0
+#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
 #endif

-// EIGEN_ALIGN is the true test whether we want to align or not. It takes into account both the user choice to explicitly disable
-// alignment (EIGEN_DONT_ALIGN) and the architecture config (EIGEN_ARCH_WANTS_ALIGNMENT). Henceforth, only EIGEN_ALIGN should be used.
-#if EIGEN_ARCH_WANTS_ALIGNMENT && !defined(EIGEN_DONT_ALIGN)
-  #define EIGEN_ALIGN 1
+// static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
+#if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
+ && !EIGEN_GCC3_OR_OLDER \
+ && !defined(__SUNPRO_CC) \
+ && !defined(__QNXNTO__)
+  #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
 #else
-  #define EIGEN_ALIGN 0
-  #ifdef EIGEN_VECTORIZE
-    #error "Vectorization enabled, but our platform checks say that we don't do 16 byte alignment on this platform. If you added vectorization for another architecture, you also need to edit this platform check."
-  #endif
+  #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
 #endif

+#ifdef EIGEN_DONT_ALIGN
+  #ifndef EIGEN_DONT_ALIGN_STATICALLY
+    #define EIGEN_DONT_ALIGN_STATICALLY
+  #endif
+  #define EIGEN_ALIGN 0
+#else
+  #define EIGEN_ALIGN 1
+#endif
+
+// EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable
+// alignment (EIGEN_DONT_ALIGN_STATICALLY) and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). Henceforth, only EIGEN_ALIGN_STATICALLY should be used.
+#if EIGEN_ARCH_WANTS_STACK_ALIGNMENT && !defined(EIGEN_DONT_ALIGN_STATICALLY)
+  #define EIGEN_ALIGN_STATICALLY 1
+#else
+  #define EIGEN_ALIGN_STATICALLY 0
   #ifndef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
     #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
   #endif
#endif
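For orientation (not part of the patch), here is how the cascade above resolves under one concrete assumption set, namely a GCC/x86_64 build with no user overrides; the TOY_* names are stand-ins for the EIGEN_* macros:

// Assumptions: platform check passed, TOY_DONT_ALIGN / TOY_DONT_ALIGN_STATICALLY
// are not defined by the user. Prints 1 for both heap and stack alignment.
#include <cstdio>

#define TOY_ARCH_WANTS_STACK_ALIGNMENT 1   // stands for EIGEN_ARCH_WANTS_STACK_ALIGNMENT
#define TOY_ALIGN 1                        // stands for EIGEN_ALIGN (heap alignment stays on)
#if TOY_ARCH_WANTS_STACK_ALIGNMENT && !defined(TOY_DONT_ALIGN_STATICALLY)
  #define TOY_ALIGN_STATICALLY 1           // stands for EIGEN_ALIGN_STATICALLY
#else
  #define TOY_ALIGN_STATICALLY 0
#endif

int main()
{
  std::printf("heap alignment:  %d\n", TOY_ALIGN);
  std::printf("stack alignment: %d\n", TOY_ALIGN_STATICALLY);
  return 0;
}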
@@ -78,30 +94,6 @@
 #define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ColMajor
 #endif

-/** Defines the maximal loop size to enable meta unrolling of loops.
-  * Note that the value here is expressed in Eigen's own notion of "number of FLOPS",
-  * it does not correspond to the number of iterations or the number of instructions
-  */
-#ifndef EIGEN_UNROLLING_LIMIT
-#define EIGEN_UNROLLING_LIMIT 100
-#endif
-
-/** Defines the maximal size in Bytes of blocks fitting in CPU cache.
-  * The current value is set to generate blocks of 256x256 for float
-  *
-  * Typically for a single-threaded application you would set that to 25% of the size of your CPU caches in bytes
-  */
-#ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE
-#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (sizeof(float)*256*256)
-#endif
-
-/** Defines the maximal width of the blocks used in the triangular product and solver
-  * for vectors (level 2 blas xTRMV and xTRSV). The default is 8.
-  */
-#ifndef EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH
-#define EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH 8
-#endif
-
 /** Allows to disable some optimizations which might affect the accuracy of the result.
   * Such optimizations are enabled by default; set EIGEN_FAST_MATH to 0 to disable them.
   * They currently include:
@@ -209,9 +201,9 @@ using Eigen::ei_cos;
   * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
   * vectorized and non-vectorized code.
   */
-#if !EIGEN_ALIGN
+#if !EIGEN_ALIGN_STATICALLY
   #define EIGEN_ALIGN_TO_BOUNDARY(n)
-#elif (defined __GNUC__)
+#elif (defined __GNUC__) || (defined __PGI)
   #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
 #elif (defined _MSC_VER)
   #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
@@ -232,7 +224,7 @@
 #endif

 #ifndef EIGEN_STACK_ALLOCATION_LIMIT
-#define EIGEN_STACK_ALLOCATION_LIMIT 1000000
+#define EIGEN_STACK_ALLOCATION_LIMIT 20000
 #endif

 #ifndef EIGEN_DEFAULT_IO_FORMAT
@ -4,6 +4,7 @@
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr> // Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
// Copyright (C) 2008-2009 Benoit Jacob <jacob.benoit.1@gmail.com> // Copyright (C) 2008-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2009 Kenneth Riddile <kfriddile@yahoo.com> // Copyright (C) 2009 Kenneth Riddile <kfriddile@yahoo.com>
// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
// //
// Eigen is free software; you can redistribute it and/or // Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public // modify it under the terms of the GNU Lesser General Public
@ -24,9 +25,28 @@
// License and a copy of the GNU General Public License along with // License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>. // Eigen. If not, see <http://www.gnu.org/licenses/>.
/*****************************************************************************
*** Platform checks for aligned malloc functions ***
*****************************************************************************/
#ifndef EIGEN_MEMORY_H #ifndef EIGEN_MEMORY_H
#define EIGEN_MEMORY_H #define EIGEN_MEMORY_H
// On 64-bit systems, glibc's malloc returns 16-byte-aligned pointers, see:
// http://www.gnu.org/s/libc/manual/html_node/Aligned-Memory-Blocks.html
// This is true at least since glibc 2.8.
// This leaves the question how to detect 64-bit. According to this document,
// http://gcc.fyxm.net/summit/2003/Porting%20to%2064%20bit.pdf
// page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed
// quite safe, at least within the context of glibc, to equate 64-bit with LP64.
#if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \
&& defined(__LP64__)
#define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1
#else
#define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0
#endif
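The detection above is purely a compile-time heuristic; a throwaway program such as the following (nothing Eigen-specific, hosted platform assumed) confirms the 16-byte claim empirically:

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>
    int main()
    {
      for(int i = 0; i < 8; ++i)
      {
        void* p = std::malloc(1 + 13*i); // a few odd sizes
        std::printf("%p -> 16-byte aligned: %d\n", p, int((reinterpret_cast<size_t>(p) & 15) == 0));
        std::free(p);
      }
      return 0;
    }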
// FreeBSD 6 seems to have 16-byte aligned malloc // FreeBSD 6 seems to have 16-byte aligned malloc
// See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup // See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup
// FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures // FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures
@ -37,13 +57,17 @@
#define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0 #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0
#endif #endif
#if defined(__APPLE__) || defined(_WIN64) || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED #if defined(__APPLE__) \
|| defined(_WIN64) \
|| EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \
|| EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED
#define EIGEN_MALLOC_ALREADY_ALIGNED 1 #define EIGEN_MALLOC_ALREADY_ALIGNED 1
#else #else
#define EIGEN_MALLOC_ALREADY_ALIGNED 0 #define EIGEN_MALLOC_ALREADY_ALIGNED 0
#endif #endif
#if ((defined _GNU_SOURCE) || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) && (defined _POSIX_ADVISORY_INFO) && (_POSIX_ADVISORY_INFO > 0) #if ((defined __QNXNTO__) || (defined _GNU_SOURCE) || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) \
&& (defined _POSIX_ADVISORY_INFO) && (_POSIX_ADVISORY_INFO > 0)
#define EIGEN_HAS_POSIX_MEMALIGN 1 #define EIGEN_HAS_POSIX_MEMALIGN 1
#else #else
#define EIGEN_HAS_POSIX_MEMALIGN 0 #define EIGEN_HAS_POSIX_MEMALIGN 0
@ -55,26 +79,90 @@
#define EIGEN_HAS_MM_MALLOC 0 #define EIGEN_HAS_MM_MALLOC 0
#endif #endif
/** \internal like malloc, but the returned pointer is guaranteed to be 16-byte aligned. /*****************************************************************************
* Fast, but wastes 16 additional bytes of memory. *** Implementation of handmade aligned functions ***
* Does not throw any exception. *****************************************************************************/
/* ----- Hand made implementations of aligned malloc/free and realloc ----- */
/** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned.
* Fast, but wastes 16 additional bytes of memory. Does not throw any exception.
*/ */
inline void* ei_handmade_aligned_malloc(size_t size) inline void* ei_handmade_aligned_malloc(size_t size)
{ {
void *original = std::malloc(size+16); void *original = std::malloc(size+16);
if (original == 0) return 0;
void *aligned = reinterpret_cast<void*>((reinterpret_cast<size_t>(original) & ~(size_t(15))) + 16); void *aligned = reinterpret_cast<void*>((reinterpret_cast<size_t>(original) & ~(size_t(15))) + 16);
*(reinterpret_cast<void**>(aligned) - 1) = original; *(reinterpret_cast<void**>(aligned) - 1) = original;
return aligned; return aligned;
} }
/** \internal frees memory allocated with ei_handmade_aligned_malloc */ /** \internal Frees memory allocated with ei_handmade_aligned_malloc */
inline void ei_handmade_aligned_free(void *ptr) inline void ei_handmade_aligned_free(void *ptr)
{ {
if(ptr) if (ptr) std::free(*(reinterpret_cast<void**>(ptr) - 1));
std::free(*(reinterpret_cast<void**>(ptr) - 1));
} }
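A worked example of the bookkeeping above: if std::malloc returns original = 0x1008, then aligned = (0x1008 & ~15) + 16 = 0x1010, and the original pointer is stashed in the word just below the returned address, at 0x1010 - sizeof(void*) = 0x1008; ei_handmade_aligned_free simply reads it back from there. Since malloc in practice returns at least pointer-aligned memory, the gap between original and aligned always has room for that word.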
/** \internal allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment. /** \internal
* \brief Reallocates aligned memory.
* Since we know that our handmade version is based on std::malloc
* we can use std::realloc to implement efficient reallocation.
*/
inline void* ei_handmade_aligned_realloc(void* ptr, size_t size, size_t = 0)
{
if (ptr == 0) return ei_handmade_aligned_malloc(size);
void *original = *(reinterpret_cast<void**>(ptr) - 1);
original = std::realloc(original,size+16);
if (original == 0) return 0;
void *aligned = reinterpret_cast<void*>((reinterpret_cast<size_t>(original) & ~(size_t(15))) + 16);
*(reinterpret_cast<void**>(aligned) - 1) = original;
return aligned;
}
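One subtlety worth noting (an observation, not part of the patch): std::realloc preserves the payload relative to the start of the underlying block, so if the reallocated block's misalignment differs from the old block's, the data no longer sits at the freshly recomputed aligned pointer. On platforms where malloc/realloc results are always 16-byte aligned the offset is constant (and this handmade path is not even selected there); a fully defensive version would memmove the payload by the offset difference.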
/*****************************************************************************
*** Implementation of generic aligned realloc (when no realloc can be used)***
*****************************************************************************/
void* ei_aligned_malloc(size_t size);
void ei_aligned_free(void *ptr);
/** \internal
* \brief Reallocates aligned memory.
* Allows reallocation with aligned ptr types. This implementation will
* always create a new memory chunk and copy the old data.
*/
inline void* ei_generic_aligned_realloc(void* ptr, size_t size, size_t old_size)
{
if (ptr==0)
return ei_aligned_malloc(size);
if (size==0)
{
ei_aligned_free(ptr);
return 0;
}
void* newptr = ei_aligned_malloc(size);
if (newptr == 0)
{
errno = ENOMEM; // according to the standard
return 0;
}
if (ptr != 0)
{
std::memcpy(newptr, ptr, std::min(size,old_size));
ei_aligned_free(ptr);
}
return newptr;
}
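A minimal usage sketch of the realloc-style contract implemented above (buffer and sizes illustrative):

    double* buf = static_cast<double*>(ei_aligned_malloc(16 * sizeof(double)));
    // ... fill the first 16 entries ...
    buf = static_cast<double*>(ei_generic_aligned_realloc(buf, 32 * sizeof(double),
                                                          16 * sizeof(double)));
    // old contents were copied over; the extra 16 entries are uninitialized
    ei_aligned_free(buf);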
/*****************************************************************************
*** Implementation of portable aligned versions of malloc/free/realloc ***
*****************************************************************************/
/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment.
* On allocation error, the returned pointer is null, and if exceptions are enabled then a std::bad_alloc is thrown. * On allocation error, the returned pointer is null, and if exceptions are enabled then a std::bad_alloc is thrown.
*/ */
inline void* ei_aligned_malloc(size_t size) inline void* ei_aligned_malloc(size_t size)
@ -85,9 +173,9 @@ inline void* ei_aligned_malloc(size_t size)
void *result; void *result;
#if !EIGEN_ALIGN #if !EIGEN_ALIGN
result = malloc(size); result = std::malloc(size);
#elif EIGEN_MALLOC_ALREADY_ALIGNED #elif EIGEN_MALLOC_ALREADY_ALIGNED
result = malloc(size); result = std::malloc(size);
#elif EIGEN_HAS_POSIX_MEMALIGN #elif EIGEN_HAS_POSIX_MEMALIGN
if(posix_memalign(&result, 16, size)) result = 0; if(posix_memalign(&result, 16, size)) result = 0;
#elif EIGEN_HAS_MM_MALLOC #elif EIGEN_HAS_MM_MALLOC
@ -105,7 +193,67 @@ inline void* ei_aligned_malloc(size_t size)
return result; return result;
} }
/** allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned. /** \internal Frees memory allocated with ei_aligned_malloc. */
inline void ei_aligned_free(void *ptr)
{
#if !EIGEN_ALIGN
std::free(ptr);
#elif EIGEN_MALLOC_ALREADY_ALIGNED
std::free(ptr);
#elif EIGEN_HAS_POSIX_MEMALIGN
std::free(ptr);
#elif EIGEN_HAS_MM_MALLOC
_mm_free(ptr);
#elif defined(_MSC_VER)
_aligned_free(ptr);
#else
ei_handmade_aligned_free(ptr);
#endif
}
/**
* \internal
* \brief Reallocates an aligned block of memory.
* \throws std::bad_alloc if EIGEN_EXCEPTIONS is defined.
**/
inline void* ei_aligned_realloc(void *ptr, size_t new_size, size_t old_size)
{
(void)old_size; // Suppress 'unused variable' warning. Seen in Boost.Test.
void *result;
#if !EIGEN_ALIGN
result = std::realloc(ptr,new_size);
#elif EIGEN_MALLOC_ALREADY_ALIGNED
result = std::realloc(ptr,new_size);
#elif EIGEN_HAS_POSIX_MEMALIGN
result = ei_generic_aligned_realloc(ptr,new_size,old_size);
#elif EIGEN_HAS_MM_MALLOC
// The defined(_mm_free) is just here to verify that this MSVC version
// implements _mm_malloc/_mm_free based on the corresponding _aligned_
// functions. This may not always be the case and we just try to be safe.
#if defined(_MSC_VER) && defined(_mm_free)
result = _aligned_realloc(ptr,new_size,16);
#else
result = ei_generic_aligned_realloc(ptr,new_size,old_size);
#endif
#elif defined(_MSC_VER)
result = _aligned_realloc(ptr,new_size,16);
#else
result = ei_handmade_aligned_realloc(ptr,new_size,old_size);
#endif
#ifdef EIGEN_EXCEPTIONS
if (result==0 && new_size!=0)
throw std::bad_alloc();
#endif
return result;
}
/*****************************************************************************
*** Implementation of conditionally aligned functions ***
*****************************************************************************/
/** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned.
* On allocation error, the returned pointer is null, and if exceptions are enabled then a std::bad_alloc is thrown. * On allocation error, the returned pointer is null, and if exceptions are enabled then a std::bad_alloc is thrown.
*/ */
template<bool Align> inline void* ei_conditional_aligned_malloc(size_t size) template<bool Align> inline void* ei_conditional_aligned_malloc(size_t size)
@ -126,7 +274,32 @@ template<> inline void* ei_conditional_aligned_malloc<false>(size_t size)
return result; return result;
} }
/** \internal construct the elements of an array. /** \internal Frees memory allocated with ei_conditional_aligned_malloc */
template<bool Align> inline void ei_conditional_aligned_free(void *ptr)
{
ei_aligned_free(ptr);
}
template<> inline void ei_conditional_aligned_free<false>(void *ptr)
{
std::free(ptr);
}
template<bool Align> inline void* ei_conditional_aligned_realloc(void* ptr, size_t new_size, size_t old_size)
{
return ei_aligned_realloc(ptr, new_size, old_size);
}
template<> inline void* ei_conditional_aligned_realloc<false>(void* ptr, size_t new_size, size_t)
{
return std::realloc(ptr, new_size);
}
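Since Align is a template parameter, the dispatch above is resolved entirely at compile time. A usage sketch (sizes illustrative); Align=false simply falls through to plain std::malloc/std::realloc/std::free:

    float* p = static_cast<float*>(ei_conditional_aligned_malloc<true>(64 * sizeof(float)));
    p = static_cast<float*>(ei_conditional_aligned_realloc<true>(p, 128 * sizeof(float),
                                                                 64 * sizeof(float)));
    ei_conditional_aligned_free<true>(p);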
/*****************************************************************************
*** Construction/destruction of array elements ***
*****************************************************************************/
/** \internal Constructs the elements of an array.
* The \a size parameter tells on how many objects to call the constructor of T. * The \a size parameter tells on how many objects to call the constructor of T.
*/ */
template<typename T> inline T* ei_construct_elements_of_array(T *ptr, size_t size) template<typename T> inline T* ei_construct_elements_of_array(T *ptr, size_t size)
@ -135,7 +308,20 @@ template<typename T> inline T* ei_construct_elements_of_array(T *ptr, size_t siz
return ptr; return ptr;
} }
/** allocates \a size objects of type T. The returned pointer is guaranteed to have 16 bytes alignment. /** \internal Destructs the elements of an array.
* The \a size parameter tells on how many objects to call the destructor of T.
*/
template<typename T> inline void ei_destruct_elements_of_array(T *ptr, size_t size)
{
// always destruct an array starting from the end.
while(size) ptr[--size].~T();
}
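These two helpers bracket the raw allocators: construction (its body is elided in this hunk) default-constructs each element in place, and destruction runs in reverse order, mirroring C++ array semantics. A sketch of the pairing for a hypothetical default-constructible type T:

    T* arr = static_cast<T*>(ei_aligned_malloc(n * sizeof(T)));
    ei_construct_elements_of_array(arr, n);  // placement-construct arr[0..n-1]
    // ... use arr ...
    ei_destruct_elements_of_array(arr, n);   // destroy arr[n-1] down to arr[0]
    ei_aligned_free(arr);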
/*****************************************************************************
*** Implementation of aligned new/delete-like functions ***
*****************************************************************************/
/** \internal Allocates \a size objects of type T. The returned pointer is guaranteed to have 16 bytes alignment.
* On allocation error, the returned pointer is undefined, but if exceptions are enabled then a std::bad_alloc is thrown. * On allocation error, the returned pointer is undefined, but if exceptions are enabled then a std::bad_alloc is thrown.
* The default constructor of T is called. * The default constructor of T is called.
*/ */
@ -151,47 +337,7 @@ template<typename T, bool Align> inline T* ei_conditional_aligned_new(size_t siz
return ei_construct_elements_of_array(result, size); return ei_construct_elements_of_array(result, size);
} }
/** \internal free memory allocated with ei_aligned_malloc /** \internal Deletes objects constructed with ei_aligned_new
*/
inline void ei_aligned_free(void *ptr)
{
#if !EIGEN_ALIGN
free(ptr);
#elif EIGEN_MALLOC_ALREADY_ALIGNED
free(ptr);
#elif EIGEN_HAS_POSIX_MEMALIGN
free(ptr);
#elif EIGEN_HAS_MM_MALLOC
_mm_free(ptr);
#elif defined(_MSC_VER)
_aligned_free(ptr);
#else
ei_handmade_aligned_free(ptr);
#endif
}
/** \internal free memory allocated with ei_conditional_aligned_malloc
*/
template<bool Align> inline void ei_conditional_aligned_free(void *ptr)
{
ei_aligned_free(ptr);
}
template<> inline void ei_conditional_aligned_free<false>(void *ptr)
{
std::free(ptr);
}
/** \internal destruct the elements of an array.
* The \a size parameter tells on how many objects to call the destructor of T.
*/
template<typename T> inline void ei_destruct_elements_of_array(T *ptr, size_t size)
{
// always destruct an array starting from the end.
while(size) ptr[--size].~T();
}
/** \internal delete objects constructed with ei_aligned_new
* The \a size parameter tells on how many objects to call the destructor of T. * The \a size parameter tells on how many objects to call the destructor of T.
*/ */
template<typename T> inline void ei_aligned_delete(T *ptr, size_t size) template<typename T> inline void ei_aligned_delete(T *ptr, size_t size)
@ -200,7 +346,7 @@ template<typename T> inline void ei_aligned_delete(T *ptr, size_t size)
ei_aligned_free(ptr); ei_aligned_free(ptr);
} }
/** \internal delete objects constructed with ei_conditional_aligned_new /** \internal Deletes objects constructed with ei_conditional_aligned_new
* The \a size parameter tells on how many objects to call the destructor of T. * The \a size parameter tells on how many objects to call the destructor of T.
*/ */
template<typename T, bool Align> inline void ei_conditional_aligned_delete(T *ptr, size_t size) template<typename T, bool Align> inline void ei_conditional_aligned_delete(T *ptr, size_t size)
@ -209,7 +355,17 @@ template<typename T, bool Align> inline void ei_conditional_aligned_delete(T *pt
ei_conditional_aligned_free<Align>(ptr); ei_conditional_aligned_free<Align>(ptr);
} }
/** \internal \returns the index of the first element of the array that is well aligned for vectorization. template<typename T, bool Align> inline T* ei_conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size)
{
T *result = reinterpret_cast<T*>(ei_conditional_aligned_realloc<Align>(reinterpret_cast<void*>(pts), sizeof(T)*new_size, sizeof(T)*old_size));
if (new_size > old_size)
ei_construct_elements_of_array(result+old_size, new_size-old_size);
return result;
}
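Note, as an observation rather than something stated by the patch: ei_conditional_aligned_realloc_new default-constructs only the newly appended tail when growing; when shrinking, the discarded elements' destructors are not invoked here, so callers are presumably expected to destruct them beforehand. And because the underlying realloc may move the block as raw bytes, the helper is only safe for types that tolerate being bit-copied.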
/****************************************************************************/
/** \internal Returns the index of the first element of the array that is well aligned for vectorization.
* *
* \param array the address of the start of the array * \param array the address of the start of the array
* \param size the size of the array * \param size the size of the array
@ -236,7 +392,7 @@ inline static Integer ei_first_aligned(const Scalar* array, Integer size)
if(PacketSize==1) if(PacketSize==1)
{ {
// Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements // Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements
// of the array have the same aligment. // of the array have the same alignment.
return 0; return 0;
} }
else if(size_t(array) & (sizeof(Scalar)-1)) else if(size_t(array) & (sizeof(Scalar)-1))
@ -252,19 +408,23 @@ inline static Integer ei_first_aligned(const Scalar* array, Integer size)
} }
} }
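A typical consumer of ei_first_aligned splits a loop into an unaligned scalar prologue, an aligned packet body, and a scalar tail, roughly like this sketch (PacketSize and the loop bodies are placeholders):

    int first = ei_first_aligned(array, size);
    int i = 0;
    for(; i < first; ++i)                          { /* scalar path, unaligned head */ }
    for(; i + PacketSize <= size; i += PacketSize) { /* aligned packet loads/stores */ }
    for(; i < size; ++i)                           { /* scalar path, tail */ }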
/*****************************************************************************
*** Implementation of runtime stack allocation (falling back to malloc) ***
*****************************************************************************/
/** \internal /** \internal
* ei_aligned_stack_alloc(SIZE) allocates an aligned buffer of SIZE bytes * Allocates an aligned buffer of SIZE bytes on the stack if SIZE is smaller than
* on the stack if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and * EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
* if stack allocation is supported by the platform (currently, this is linux only). * (currently, this is Linux only). Otherwise the memory is allocated on the heap.
* Otherwise the memory is allocated on the heap. * Data allocated with ei_aligned_stack_alloc \b must be freed by calling
* Data allocated with ei_aligned_stack_alloc \b must be freed by calling ei_aligned_stack_free(PTR,SIZE). * ei_aligned_stack_free(PTR,SIZE).
* \code * \code
* float * data = ei_aligned_stack_alloc(float,array.size()); * float * data = ei_aligned_stack_alloc(float,array.size());
* // ... * // ...
* ei_aligned_stack_free(data,float,array.size()); * ei_aligned_stack_free(data,float,array.size());
* \endcode * \endcode
*/ */
#ifdef __linux__ #if (defined __linux__)
#define ei_aligned_stack_alloc(SIZE) (SIZE<=EIGEN_STACK_ALLOCATION_LIMIT) \ #define ei_aligned_stack_alloc(SIZE) (SIZE<=EIGEN_STACK_ALLOCATION_LIMIT) \
? alloca(SIZE) \ ? alloca(SIZE) \
: ei_aligned_malloc(SIZE) : ei_aligned_malloc(SIZE)
@ -279,6 +439,10 @@ inline static Integer ei_first_aligned(const Scalar* array, Integer size)
ei_aligned_stack_free(PTR,sizeof(TYPE)*SIZE);} while(0) ei_aligned_stack_free(PTR,sizeof(TYPE)*SIZE);} while(0)
/*****************************************************************************
*** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF] ***
*****************************************************************************/
#if EIGEN_ALIGN #if EIGEN_ALIGN
#ifdef EIGEN_EXCEPTIONS #ifdef EIGEN_EXCEPTIONS
#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
@ -322,10 +486,11 @@ inline static Integer ei_first_aligned(const Scalar* array, Integer size)
#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \ #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%16==0)) EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%16==0))
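For context, the user-facing side of these macros is the documented Eigen idiom: a class holding a fixed-size vectorizable member opts in so that dynamic allocation of the class returns 16-byte-aligned storage (type names illustrative):

    struct Body
    {
      Eigen::Vector4f velocity;        // fixed-size vectorizable member
      EIGEN_MAKE_ALIGNED_OPERATOR_NEW  // overloads operator new/delete accordingly
    };

    Body* b = new Body;                // now guaranteed 16-byte aligned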
/****************************************************************************/
/** \class aligned_allocator /** \class aligned_allocator
* *
* \brief stl compatible allocator to use with 16 byte aligned types * \brief STL compatible allocator to use with 16 byte aligned types
* *
* Example: * Example:
* \code * \code
@ -2,7 +2,7 @@
// for linear algebra. // for linear algebra.
// //
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr> // Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com> // Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
// //
// Eigen is free software; you can redistribute it and/or // Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public // modify it under the terms of the GNU Lesser General Public
@ -46,7 +46,7 @@
// if native static_assert is enabled, let's use it // if native static_assert is enabled, let's use it
#define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG); #define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG);
#else // CXX0X #else // not CXX0X
template<bool condition> template<bool condition>
struct ei_static_assert {}; struct ei_static_assert {};
@ -81,7 +81,8 @@
BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER, BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER,
THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX, THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX,
THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE, THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE,
THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES,
YOU_ALREADY_SPECIFIED_THIS_STRIDE
}; };
}; };
@ -50,7 +50,7 @@ template<int Value> class ei_int_if_dynamic
{ {
public: public:
EIGEN_EMPTY_STRUCT_CTOR(ei_int_if_dynamic) EIGEN_EMPTY_STRUCT_CTOR(ei_int_if_dynamic)
explicit ei_int_if_dynamic(int) {} explicit ei_int_if_dynamic(int v) { EIGEN_ONLY_USED_FOR_DEBUG(v); ei_assert(v == Value); }
static int value() { return Value; } static int value() { return Value; }
void setValue(int) {} void setValue(int) {}
}; };
@ -58,7 +58,7 @@ template<int Value> class ei_int_if_dynamic
template<> class ei_int_if_dynamic<Dynamic> template<> class ei_int_if_dynamic<Dynamic>
{ {
int m_value; int m_value;
ei_int_if_dynamic() {} ei_int_if_dynamic() { ei_assert(false); }
public: public:
explicit ei_int_if_dynamic(int value) : m_value(value) {} explicit ei_int_if_dynamic(int value) : m_value(value) {}
int value() const { return m_value; } int value() const { return m_value; }
@ -87,12 +87,39 @@ class ei_compute_matrix_flags
{ {
enum { enum {
row_major_bit = Options&RowMajor ? RowMajorBit : 0, row_major_bit = Options&RowMajor ? RowMajorBit : 0,
inner_max_size = MaxCols==1 ? MaxRows is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic,
: MaxRows==1 ? MaxCols #if EIGEN_ALIGN_STATICALLY
: row_major_bit ? MaxCols : MaxRows, is_fixed_size_aligned
is_big = inner_max_size == Dynamic, = (!is_dynamic_size_storage) && (((MaxCols*MaxRows) % ei_packet_traits<Scalar>::size) == 0),
is_packet_size_multiple = MaxRows==Dynamic || MaxCols==Dynamic || ((MaxCols*MaxRows) % ei_packet_traits<Scalar>::size) == 0, #else
aligned_bit = (((Options&DontAlign)==0) && (is_big || is_packet_size_multiple)) ? AlignedBit : 0, is_fixed_size_aligned = 0,
#endif
#if EIGEN_ALIGN
is_dynamic_size_aligned = is_dynamic_size_storage,
#else
is_dynamic_size_aligned = 0,
#endif
aligned_bit =
(
((Options&DontAlign)==0)
&& (
#if EIGEN_ALIGN_STATICALLY
((!is_dynamic_size_storage) && (((MaxCols*MaxRows) % ei_packet_traits<Scalar>::size) == 0))
#else
0
#endif
||
#if EIGEN_ALIGN
is_dynamic_size_storage
#else
0
#endif
)
) ? AlignedBit : 0,
packet_access_bit = ei_packet_traits<Scalar>::size > 1 && aligned_bit ? PacketAccessBit : 0 packet_access_bit = ei_packet_traits<Scalar>::size > 1 && aligned_bit ? PacketAccessBit : 0
}; };
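In words, the rewritten condition says: AlignedBit is set iff the user did not pass DontAlign, and either the storage is fixed-size with a total coefficient count that is a multiple of the packet size (only when static alignment is enabled), or the storage is dynamic-size (only when heap alignment, EIGEN_ALIGN, is enabled). The is_fixed_size_aligned and is_dynamic_size_aligned enums computed just above encode the same two cases.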
@ -214,7 +241,7 @@ struct ei_is_reference<T&>
}; };
/** /**
* The reference selector for template expressions. The idea is that we don't * \internal The reference selector for template expressions. The idea is that we don't
* need to use references for expressions since they are light weight proxy * need to use references for expressions since they are light weight proxy
* objects which should generate no copying overhead. * objects which should generate no copying overhead.
**/ **/
@ -154,6 +154,14 @@ void ComplexSchur<MatrixType>::compute(const MatrixType& matrix, bool skipU)
m_matT = hess.matrixH(); m_matT = hess.matrixH();
if(!skipU) m_matU = hess.matrixQ(); if(!skipU) m_matU = hess.matrixQ();
// Reduce the Hessenberg matrix m_matT to triangular form by QR iteration.
// The matrix m_matT is divided into three parts.
// Rows 0,...,il-1 are decoupled from the rest because m_matT(il,il-1) is zero.
// Rows il,...,iu form the part we are working on (the active submatrix).
// Rows iu+1,...,end have already been brought into triangular form.
int iu = m_matT.cols() - 1; int iu = m_matT.cols() - 1;
int il; int il;
RealScalar d,sd,sf; RealScalar d,sd,sf;
@ -164,7 +172,7 @@ void ComplexSchur<MatrixType>::compute(const MatrixType& matrix, bool skipU)
int iter = 0; int iter = 0;
while(true) while(true)
{ {
//locate the range in which to iterate // find iu, the bottom row of the active submatrix
while(iu > 0) while(iu > 0)
{ {
d = ei_norm1(m_matT.coeff(iu,iu)) + ei_norm1(m_matT.coeff(iu-1,iu-1)); d = ei_norm1(m_matT.coeff(iu,iu)) + ei_norm1(m_matT.coeff(iu-1,iu-1));
@ -183,10 +191,11 @@ void ComplexSchur<MatrixType>::compute(const MatrixType& matrix, bool skipU)
if(iter >= 30) if(iter >= 30)
{ {
// FIXME : what to do when iter==MAXITER ?? // FIXME : what to do when iter==MAXITER ??
std::cerr << "MAXITER" << std::endl; //std::cerr << "MAXITER" << std::endl;
return; return;
} }
// find il, the top row of the active submatrix
il = iu-1; il = iu-1;
while(il > 0) while(il > 0)
{ {
@ -202,16 +211,19 @@ void ComplexSchur<MatrixType>::compute(const MatrixType& matrix, bool skipU)
if( il != 0 ) m_matT.coeffRef(il,il-1) = Complex(0); if( il != 0 ) m_matT.coeffRef(il,il-1) = Complex(0);
// compute the shift (the normalization by sf is to avoid under/overflow) // compute the shift kappa as one of the eigenvalues of the 2x2
// diagonal block on the bottom of the active submatrix
Matrix<Scalar,2,2> t = m_matT.template block<2,2>(iu-1,iu-1); Matrix<Scalar,2,2> t = m_matT.template block<2,2>(iu-1,iu-1);
sf = t.cwiseAbs().sum(); sf = t.cwiseAbs().sum();
t /= sf; t /= sf; // the normalization by sf is to avoid under/overflow
c = t.determinant(); b = t.coeff(0,1) * t.coeff(1,0);
b = t.diagonal().sum(); c = t.coeff(0,0) - t.coeff(1,1);
disc = ei_sqrt(c*c + RealScalar(4)*b);
disc = ei_sqrt(b*b - RealScalar(4)*c);
c = t.coeff(0,0) * t.coeff(1,1) - b;
b = t.coeff(0,0) + t.coeff(1,1);
r1 = (b+disc)/RealScalar(2); r1 = (b+disc)/RealScalar(2);
r2 = (b-disc)/RealScalar(2); r2 = (b-disc)/RealScalar(2);
@ -225,6 +237,12 @@ void ComplexSchur<MatrixType>::compute(const MatrixType& matrix, bool skipU)
else else
kappa = sf * r2; kappa = sf * r2;
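For reference, r1 and r2 computed above are the two eigenvalues of the normalized 2x2 block t, namely (trace(t) ± sqrt((t(0,0)-t(1,1))^2 + 4 t(0,1) t(1,0)))/2, and kappa picks whichever lies closer to the bottom-right entry, rescaled by sf: the usual Wilkinson-style shift.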
if (iter == 10 || iter == 20)
{
// exceptional shift, taken from http://www.netlib.org/eispack/comqr.f
kappa = ei_abs(ei_real(m_matT.coeff(iu,iu-1))) + ei_abs(ei_real(m_matT.coeff(iu-1,iu-2)));
}
// perform the QR step using Givens rotations // perform the QR step using Givens rotations
PlanarRotation<Complex> rot; PlanarRotation<Complex> rot;
rot.makeGivens(m_matT.coeff(il,il) - kappa, m_matT.coeff(il+1,il)); rot.makeGivens(m_matT.coeff(il,il) - kappa, m_matT.coeff(il+1,il));
@ -246,18 +264,6 @@ void ComplexSchur<MatrixType>::compute(const MatrixType& matrix, bool skipU)
} }
} }
// FIXME : is it necessary ?
/*
for(int i=0 ; i<n ; i++)
for(int j=0 ; j<n ; j++)
{
if(ei_abs(ei_real(m_matT.coeff(i,j))) < eps)
ei_real_ref(m_matT.coeffRef(i,j)) = 0;
if(ei_imag(ei_abs(m_matT.coeff(i,j))) < eps)
ei_imag_ref(m_matT.coeffRef(i,j)) = 0;
}
*/
m_isInitialized = true; m_isInitialized = true;
m_matUisUptodate = !skipU; m_matUisUptodate = !skipU;
} }
@ -72,8 +72,8 @@ public:
inline Transform<Scalar,Dim> operator* (const Translation<Scalar,Dim>& t) const; inline Transform<Scalar,Dim> operator* (const Translation<Scalar,Dim>& t) const;
/** Concatenates a uniform scaling and an affine transformation */ /** Concatenates a uniform scaling and an affine transformation */
template<int Dim> template<int Dim, int Mode>
inline Transform<Scalar,Dim> operator* (const Transform<Scalar,Dim>& t) const; inline Transform<Scalar,Dim,Mode> operator* (const Transform<Scalar,Dim, Mode>& t) const;
/** Concatenates a uniform scaling and a linear transformation matrix */ /** Concatenates a uniform scaling and a linear transformation matrix */
// TODO returns an expression // TODO returns an expression
@ -156,4 +156,27 @@ typedef DiagonalMatrix<float, 3> AlignedScaling3f;
typedef DiagonalMatrix<double,3> AlignedScaling3d; typedef DiagonalMatrix<double,3> AlignedScaling3d;
//@} //@}
template<typename Scalar>
template<int Dim>
inline Transform<Scalar,Dim>
UniformScaling<Scalar>::operator* (const Translation<Scalar,Dim>& t) const
{
Transform<Scalar,Dim> res;
res.matrix().setZero();
res.linear().diagonal().fill(factor());
res.translation() = factor() * t.vector();
res(Dim,Dim) = Scalar(1);
return res;
}
template<typename Scalar>
template<int Dim,int Mode>
inline Transform<Scalar,Dim,Mode>
UniformScaling<Scalar>::operator* (const Transform<Scalar,Dim, Mode>& t) const
{
Transform<Scalar,Dim> res = t;
res.prescale(factor());
return res;
}
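A usage sketch of the two operators just defined (values illustrative; the Transform mode is left at its default, as in the return types above):

    Eigen::UniformScaling<float> S(2.0f);
    Eigen::Translation<float,3>  shift(Eigen::Vector3f(1,0,0));
    Eigen::Transform<float,3>    T1 = S * shift; // applies the translation, then the scaling
    Eigen::Transform<float,3>    T2 = S * T1;    // prescales an existing transform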
#endif // EIGEN_SCALING_H #endif // EIGEN_SCALING_H
@ -69,7 +69,7 @@ template<typename Derived> struct ei_determinant_impl<Derived, 2>
template<typename Derived> struct ei_determinant_impl<Derived, 3> template<typename Derived> struct ei_determinant_impl<Derived, 3>
{ {
static typename ei_traits<Derived>::Scalar run(const Derived& m) static inline typename ei_traits<Derived>::Scalar run(const Derived& m)
{ {
return ei_bruteforce_det3_helper(m,0,1,2) return ei_bruteforce_det3_helper(m,0,1,2)
- ei_bruteforce_det3_helper(m,1,0,2) - ei_bruteforce_det3_helper(m,1,0,2)
@ -100,8 +100,7 @@ inline typename ei_traits<Derived>::Scalar MatrixBase<Derived>::determinant() co
{ {
assert(rows() == cols()); assert(rows() == cols());
typedef typename ei_nested<Derived,Base::RowsAtCompileTime>::type Nested; typedef typename ei_nested<Derived,Base::RowsAtCompileTime>::type Nested;
Nested nested(derived()); return ei_determinant_impl<typename ei_cleantype<Nested>::type>::run(derived());
return ei_determinant_impl<typename ei_cleantype<Nested>::type>::run(nested);
} }
#endif // EIGEN_DETERMINANT_H #endif // EIGEN_DETERMINANT_H
@ -361,6 +361,8 @@ template<typename _MatrixType> class FullPivLU
(*this, MatrixType::Identity(m_lu.rows(), m_lu.cols())); (*this, MatrixType::Identity(m_lu.rows(), m_lu.cols()));
} }
MatrixType reconstructedMatrix() const;
inline int rows() const { return m_lu.rows(); } inline int rows() const { return m_lu.rows(); }
inline int cols() const { return m_lu.cols(); } inline int cols() const { return m_lu.cols(); }
@ -404,6 +406,7 @@ FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case) m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case)
m_maxpivot = RealScalar(0); m_maxpivot = RealScalar(0);
RealScalar cutoff(0);
for(int k = 0; k < size; ++k) for(int k = 0; k < size; ++k)
{ {
@ -418,8 +421,14 @@ FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
row_of_biggest_in_corner += k; // correct the values! since they were computed in the corner, row_of_biggest_in_corner += k; // correct the values! since they were computed in the corner,
col_of_biggest_in_corner += k; // need to add k to them. col_of_biggest_in_corner += k; // need to add k to them.
// if the pivot (hence the corner) is exactly zero, terminate to avoid generating nan/inf values // when k==0, biggest_in_corner is the biggest coeff absolute value in the original matrix
if(biggest_in_corner == RealScalar(0)) if(k == 0) cutoff = biggest_in_corner * NumTraits<Scalar>::epsilon();
// if the pivot (hence the corner) is "zero", terminate to avoid generating nan/inf values.
// Notice that using an exact comparison (biggest_in_corner==0) here, as Golub-van Loan do in
// their pseudo-code, results in numerical instability! The cutoff here has been validated
// by running the unit test 'lu' with many repetitions.
if(biggest_in_corner < cutoff)
{ {
// before exiting, make sure to initialize the still uninitialized transpositions // before exiting, make sure to initialize the still uninitialized transpositions
// in a sane state without destroying what we already have. // in a sane state without destroying what we already have.
@ -480,6 +489,31 @@ typename ei_traits<MatrixType>::Scalar FullPivLU<MatrixType>::determinant() cons
return Scalar(m_det_pq) * Scalar(m_lu.diagonal().prod()); return Scalar(m_det_pq) * Scalar(m_lu.diagonal().prod());
} }
/** \returns the matrix represented by the decomposition,
* i.e., it returns the product: P^{-1} L U Q^{-1}.
* This function is provided for debugging purposes. */
template<typename MatrixType>
MatrixType FullPivLU<MatrixType>::reconstructedMatrix() const
{
ei_assert(m_isInitialized && "LU is not initialized.");
const int smalldim = std::min(m_lu.rows(), m_lu.cols());
// LU
MatrixType res(m_lu.rows(),m_lu.cols());
// FIXME the .toDenseMatrix() should not be needed...
res = m_lu.corner(TopLeft,m_lu.rows(),smalldim)
.template triangularView<UnitLower>().toDenseMatrix()
* m_lu.corner(TopLeft,smalldim,m_lu.cols())
.template triangularView<Upper>().toDenseMatrix();
// P^{-1}(LU)
res = m_p.inverse() * res;
// (P^{-1}LU)Q^{-1}
res = res * m_q.inverse();
return res;
}
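The typical use is a round-trip check in a test (matrix A and tolerance are illustrative):

    Eigen::FullPivLU<Eigen::MatrixXd> lu(A);
    double error = (lu.reconstructedMatrix() - A).norm() / A.norm();
    ei_assert(error < 1e-12 && "P^{-1} L U Q^{-1} should reproduce A");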
/********* Implementation of kernel() **************************************************/ /********* Implementation of kernel() **************************************************/
template<typename _MatrixType> template<typename _MatrixType>
@ -123,7 +123,7 @@ struct ei_compute_inverse_and_det_with_check<MatrixType, ResultType, 2>
****************************/ ****************************/
template<typename MatrixType, typename ResultType> template<typename MatrixType, typename ResultType>
void ei_compute_inverse_size3_helper( inline void ei_compute_inverse_size3_helper(
const MatrixType& matrix, const MatrixType& matrix,
const typename ResultType::Scalar& invdet, const typename ResultType::Scalar& invdet,
const Matrix<typename ResultType::Scalar,3,1>& cofactors_col0, const Matrix<typename ResultType::Scalar,3,1>& cofactors_col0,
@ -165,6 +165,8 @@ template<typename _MatrixType> class PartialPivLU
*/ */
typename ei_traits<MatrixType>::Scalar determinant() const; typename ei_traits<MatrixType>::Scalar determinant() const;
MatrixType reconstructedMatrix() const;
inline int rows() const { return m_lu.rows(); } inline int rows() const { return m_lu.rows(); }
inline int cols() const { return m_lu.cols(); } inline int cols() const { return m_lu.cols(); }
@ -194,7 +196,7 @@ PartialPivLU<MatrixType>::PartialPivLU(const MatrixType& matrix)
compute(matrix); compute(matrix);
} }
/** This is the blocked version of ei_fullpivlu_unblocked() */ /** \internal This is the blocked version of ei_fullpivlu_unblocked() */
template<typename Scalar, int StorageOrder> template<typename Scalar, int StorageOrder>
struct ei_partial_lu_impl struct ei_partial_lu_impl
{ {
@ -368,7 +370,7 @@ void ei_partial_lu_inplace(MatrixType& lu, IntVector& row_transpositions, int& n
ei_partial_lu_impl ei_partial_lu_impl
<typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor> <typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor>
::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.stride(), &row_transpositions.coeffRef(0), nb_transpositions); ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.outerStride(), &row_transpositions.coeffRef(0), nb_transpositions);
} }
template<typename MatrixType> template<typename MatrixType>
@ -400,6 +402,23 @@ typename ei_traits<MatrixType>::Scalar PartialPivLU<MatrixType>::determinant() c
return Scalar(m_det_p) * m_lu.diagonal().prod(); return Scalar(m_det_p) * m_lu.diagonal().prod();
} }
/** \returns the matrix represented by the decomposition,
* i.e., it returns the product: P^{-1} L U.
* This function is provided for debugging purposes. */
template<typename MatrixType>
MatrixType PartialPivLU<MatrixType>::reconstructedMatrix() const
{
ei_assert(m_isInitialized && "LU is not initialized.");
// LU
MatrixType res = m_lu.template triangularView<UnitLower>().toDenseMatrix()
* m_lu.template triangularView<Upper>();
// P^{-1}(LU)
res = m_p.inverse() * res;
return res;
}
/***** Implementation of solve() *****************************************************/ /***** Implementation of solve() *****************************************************/
template<typename _MatrixType, typename Rhs> template<typename _MatrixType, typename Rhs>
@ -99,7 +99,7 @@ cholmod_dense ei_cholmod_map_eigen_to_dense(MatrixBase<Derived>& mat)
res.nrow = mat.rows(); res.nrow = mat.rows();
res.ncol = mat.cols(); res.ncol = mat.cols();
res.nzmax = res.nrow * res.ncol; res.nzmax = res.nrow * res.ncol;
res.d = Derived::IsVectorAtCompileTime ? mat.derived().size() : mat.derived().stride(); res.d = Derived::IsVectorAtCompileTime ? mat.derived().size() : mat.derived().outerStride();
res.x = mat.derived().data(); res.x = mat.derived().data();
res.z = 0; res.z = 0;
@ -233,7 +233,7 @@ bool SparseLLT<MatrixType,Cholmod>::solveInPlace(MatrixBase<Derived> &b) const
cholmod_dense* x = cholmod_solve(CHOLMOD_A, m_cholmodFactor, &cdb, &m_cholmod); cholmod_dense* x = cholmod_solve(CHOLMOD_A, m_cholmodFactor, &cdb, &m_cholmod);
if(!x) if(!x)
{ {
std::cerr << "Eigen: cholmod_solve failed\n"; //std::cerr << "Eigen: cholmod_solve failed\n";
return false; return false;
} }
b = Matrix<typename Base::Scalar,Dynamic,1>::Map(reinterpret_cast<typename Base::Scalar*>(x->x),b.rows()); b = Matrix<typename Base::Scalar,Dynamic,1>::Map(reinterpret_cast<typename Base::Scalar*>(x->x),b.rows());
@ -236,7 +236,7 @@ class DynamicSparseMatrix
{ {
// remove all coefficients with innerCoord>=innerSize // remove all coefficients with innerCoord>=innerSize
// TODO // TODO
std::cerr << "not implemented yet\n"; //std::cerr << "not implemented yet\n";
exit(2); exit(2);
} }
if (m_data.size() != outerSize) if (m_data.size() != outerSize)
@ -161,7 +161,7 @@ struct SluMatrix : SuperMatrix
res.nrow = mat.rows(); res.nrow = mat.rows();
res.ncol = mat.cols(); res.ncol = mat.cols();
res.storage.lda = MatrixType::IsVectorAtCompileTime ? mat.size() : mat.stride(); res.storage.lda = MatrixType::IsVectorAtCompileTime ? mat.size() : mat.outerStride();
res.storage.values = mat.data(); res.storage.values = mat.data();
return res; return res;
} }
@ -217,7 +217,7 @@ struct SluMatrixMapHelper<Matrix<Scalar,Rows,Cols,Options,MRows,MCols> >
res.nrow = mat.rows(); res.nrow = mat.rows();
res.ncol = mat.cols(); res.ncol = mat.cols();
res.storage.lda = mat.stride(); res.storage.lda = mat.outerStride();
res.storage.values = mat.data(); res.storage.values = mat.data();
} }
}; };
@ -397,7 +397,7 @@ void SparseLU<MatrixType,SuperLU>::compute(const MatrixType& a)
case MinimumDegree_ATA : m_sluOptions.ColPerm = MMD_ATA; break; case MinimumDegree_ATA : m_sluOptions.ColPerm = MMD_ATA; break;
case ColApproxMinimumDegree : m_sluOptions.ColPerm = COLAMD; break; case ColApproxMinimumDegree : m_sluOptions.ColPerm = COLAMD; break;
default: default:
std::cerr << "Eigen: ordering method \"" << Base::orderingMethod() << "\" not supported by the SuperLU backend\n"; //std::cerr << "Eigen: ordering method \"" << Base::orderingMethod() << "\" not supported by the SuperLU backend\n";
m_sluOptions.ColPerm = NATURAL; m_sluOptions.ColPerm = NATURAL;
}; };
@ -448,7 +448,7 @@ void SparseLU<MatrixType,SuperLU>::compute(const MatrixType& a)
&recip_pivot_gross, &rcond, &recip_pivot_gross, &rcond,
&m_sluStat, &info, Scalar()); &m_sluStat, &info, Scalar());
#else #else
std::cerr << "Incomplete factorization is only available in SuperLU v4\n"; //std::cerr << "Incomplete factorization is only available in SuperLU v4\n";
Base::m_succeeded = false; Base::m_succeeded = false;
return; return;
#endif #endif
@ -486,7 +486,7 @@ bool SparseLU<MatrixType,SuperLU>::solve(const MatrixBase<BDerived> &b,
case SvTranspose : m_sluOptions.Trans = TRANS; break; case SvTranspose : m_sluOptions.Trans = TRANS; break;
case SvAdjoint : m_sluOptions.Trans = CONJ; break; case SvAdjoint : m_sluOptions.Trans = CONJ; break;
default: default:
std::cerr << "Eigen: transposition option \"" << transposed << "\" not supported by the SuperLU backend\n"; //std::cerr << "Eigen: transposition option \"" << transposed << "\" not supported by the SuperLU backend\n";
m_sluOptions.Trans = NOTRANS; m_sluOptions.Trans = NOTRANS;
} }
@ -513,7 +513,7 @@ bool SparseLU<MatrixType,SuperLU>::solve(const MatrixBase<BDerived> &b,
&recip_pivot_gross, &rcond, &recip_pivot_gross, &rcond,
&m_sluStat, &info, Scalar()); &m_sluStat, &info, Scalar());
#else #else
std::cerr << "Incomplete factorization is only available in SuperLU v4\n"; //std::cerr << "Incomplete factorization is only available in SuperLU v4\n";
return false; return false;
#endif #endif
} }
@ -1,7 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library // This file is part of Eigen, a lightweight C++ template library
// for linear algebra. // for linear algebra.
// //
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr> // Copyright (C) 2008-2010 Gael Guennebaud <g.gael@free.fr>
// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com> // Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
// //
// Eigen is free software; you can redistribute it and/or // Eigen is free software; you can redistribute it and/or
@ -27,18 +27,20 @@
#define EIGEN_BENCH_TIMERR_H #define EIGEN_BENCH_TIMERR_H
#if defined(_WIN32) || defined(__CYGWIN__) #if defined(_WIN32) || defined(__CYGWIN__)
# ifndef NOMINMAX
# define NOMINMAX # define NOMINMAX
# define EIGEN_BT_UNDEF_NOMINMAX
# endif
# ifndef WIN32_LEAN_AND_MEAN
# define WIN32_LEAN_AND_MEAN # define WIN32_LEAN_AND_MEAN
# define EIGEN_BT_UNDEF_WIN32_LEAN_AND_MEAN
# endif
# include <windows.h> # include <windows.h>
#else #else
#include <sys/time.h>
#include <time.h>
# include <unistd.h> # include <unistd.h>
#endif #endif
#include <cmath> #include <Eigen/Core>
#include <cstdlib>
#include <numeric>
namespace Eigen namespace Eigen
{ {
@ -130,10 +132,9 @@ public:
GetSystemTime(&st); GetSystemTime(&st);
return (double)st.wSecond + 1.e-3 * (double)st.wMilliseconds; return (double)st.wSecond + 1.e-3 * (double)st.wMilliseconds;
#else #else
struct timeval tv; timespec ts;
struct timezone tz; clock_gettime(CLOCK_REALTIME, &ts);
gettimeofday(&tv, &tz); return double(ts.tv_sec) + 1e-9 * double(ts.tv_nsec);
return (double)tv.tv_sec + 1.e-6 * (double)tv.tv_usec;
#endif #endif
} }
@ -161,4 +162,15 @@ protected:
} }
// clean #defined tokens
#ifdef EIGEN_BT_UNDEF_NOMINMAX
# undef EIGEN_BT_UNDEF_NOMINMAX
# undef NOMINMAX
#endif
#ifdef EIGEN_BT_UNDEF_WIN32_LEAN_AND_MEAN
# undef EIGEN_BT_UNDEF_WIN32_LEAN_AND_MEAN
# undef WIN32_LEAN_AND_MEAN
#endif
#endif // EIGEN_BENCH_TIMERR_H #endif // EIGEN_BENCH_TIMERR_H
bench/bench_gemm.cpp (new file, 125 lines)
@ -0,0 +1,125 @@
// g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2 ./a.out
// icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out
#include <Eigen/Core>
#include <iostream>
#include <bench/BenchTimer.h>
using namespace std;
using namespace Eigen;
#ifndef SCALAR
#define SCALAR float
#endif
typedef SCALAR Scalar;
typedef Matrix<Scalar,Dynamic,Dynamic> M;
#ifdef HAVE_BLAS
extern "C" {
#include <bench/btl/libs/C_BLAS/blas.h>
}
static float fone = 1;
static float fzero = 0;
static double done = 1;
static double szero = 0;
static char notrans = 'N';
static char trans = 'T';
static char nonunit = 'N';
static char lower = 'L';
static char right = 'R';
static int intone = 1;
void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c)
{
int M = c.rows(); int N = c.cols(); int K = a.cols();
int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
sgemm_(&notrans,&notrans,&M,&N,&K,&fone,
const_cast<float*>(a.data()),&lda,
const_cast<float*>(b.data()),&ldb,&fone,
c.data(),&ldc);
}
void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c)
{
int M = c.rows(); int N = c.cols(); int K = a.cols();
int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();
dgemm_(&notrans,&notrans,&M,&N,&K,&done,
const_cast<double*>(a.data()),&lda,
const_cast<double*>(b.data()),&ldb,&done,
c.data(),&ldc);
}
#endif
void gemm(const M& a, const M& b, M& c)
{
c.noalias() += a * b;
}
int main(int argc, char ** argv)
{
int rep = 1; // number of repetitions per try
int tries = 5; // number of tries, we keep the best
int s = argc==2 ? std::atoi(argv[1]) : 2048;
std::cout << "Matrix size = " << s << "\n";
int m = s;
int n = s;
int p = s;
M a(m,n); a.setRandom();
M b(n,p); b.setRandom();
M c(m,p); c.setOnes();
M r = c;
// check the parallel product is correct
#ifdef EIGEN_HAS_OPENMP
int procs = omp_get_max_threads();
if(procs>1)
{
#ifdef HAVE_BLAS
blas_gemm(a,b,r);
#else
omp_set_num_threads(1);
r.noalias() += a * b;
omp_set_num_threads(procs);
#endif
c.noalias() += a * b;
if(!r.isApprox(c)) std::cerr << "Warning, your parallel product is crap!\n\n";
}
#endif
#ifdef HAVE_BLAS
BenchTimer tblas;
BENCH(tblas, tries, rep, blas_gemm(a,b,c));
std::cout << "blas cpu " << tblas.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tblas.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tblas.total(CPU_TIMER) << "s)\n";
std::cout << "blas real " << tblas.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tblas.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tblas.total(REAL_TIMER) << "s)\n";
#endif
BenchTimer tmt;
BENCH(tmt, tries, rep, gemm(a,b,c));
std::cout << "eigen cpu " << tmt.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(CPU_TIMER) << "s)\n";
std::cout << "eigen real " << tmt.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n";
#ifdef EIGEN_HAS_OPENMP
if(procs>1)
{
BenchTimer tmono;
omp_set_num_threads(1);
BENCH(tmono, tries, rep, gemm(a,b,c));
std::cout << "eigen mono cpu " << tmono.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmono.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmono.total(CPU_TIMER) << "s)\n";
std::cout << "eigen mono real " << tmono.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmono.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmono.total(REAL_TIMER) << "s)\n";
std::cout << "mt speed up x" << tmono.best(CPU_TIMER) / tmt.best(REAL_TIMER) << " => " << (100.0*tmono.best(CPU_TIMER) / tmt.best(REAL_TIMER))/procs << "%\n";
}
#endif
return 0;
}
@ -70,7 +70,7 @@ MACRO(BTL_ADD_BENCH targetname)
IF(BUILD_${targetname}) IF(BUILD_${targetname})
ADD_EXECUTABLE(${targetname} ${_sources}) ADD_EXECUTABLE(${targetname} ${_sources})
ADD_TEST(${targetname} "${targetname}") ADD_TEST(${targetname} "${targetname}")
target_link_libraries(${targetname} ${DEFAULT_LIBRARIES}) target_link_libraries(${targetname} ${DEFAULT_LIBRARIES} rt)
ENDIF(BUILD_${targetname}) ENDIF(BUILD_${targetname})
ENDMACRO(BTL_ADD_BENCH) ENDMACRO(BTL_ADD_BENCH)
@ -76,7 +76,7 @@ public :
static inline std::string name( void ) static inline std::string name( void )
{ {
return "lu_decomp_"+Interface::name(); return "complete_lu_decomp_"+Interface::name();
} }
double nb_op_base( void ){ double nb_op_base( void ){
@ -8,7 +8,7 @@ matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:3000
trisolve ; "{/*1.5 triangular solver (X = inv(L) X)}" ; "size" ; 4:3000 trisolve ; "{/*1.5 triangular solver (X = inv(L) X)}" ; "size" ; 4:3000
matrix_trisolve ; "{/*1.5 matrix triangular solver (M = inv(L) M)}" ; "size" ; 4:3000 matrix_trisolve ; "{/*1.5 matrix triangular solver (M = inv(L) M)}" ; "size" ; 4:3000
cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:3000 cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:3000
lu_decomp ; "{/*1.5 Complete LU decomposition}" ; "matrix size" ; 4:3000 complete_lu_decomp ; "{/*1.5 Complete LU decomposition}" ; "matrix size" ; 4:3000
partial_lu_decomp ; "{/*1.5 Partial LU decomposition}" ; "matrix size" ; 4:3000 partial_lu_decomp ; "{/*1.5 Partial LU decomposition}" ; "matrix size" ; 4:3000
tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:3000 tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:3000
hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:3000 hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:3000
@ -41,7 +41,7 @@ source mk_mean_script.sh ata $1 11 100 300 1000 $mode $prefix
source mk_mean_script.sh trisolve $1 11 100 300 1000 $mode $prefix source mk_mean_script.sh trisolve $1 11 100 300 1000 $mode $prefix
source mk_mean_script.sh matrix_trisolve $1 11 100 300 1000 $mode $prefix source mk_mean_script.sh matrix_trisolve $1 11 100 300 1000 $mode $prefix
source mk_mean_script.sh cholesky $1 11 100 300 1000 $mode $prefix source mk_mean_script.sh cholesky $1 11 100 300 1000 $mode $prefix
source mk_mean_script.sh lu_decomp $1 11 100 300 1000 $mode $prefix source mk_mean_script.sh complete_lu_decomp $1 11 100 300 1000 $mode $prefix
source mk_mean_script.sh partial_lu_decomp $1 11 100 300 1000 $mode $prefix source mk_mean_script.sh partial_lu_decomp $1 11 100 300 1000 $mode $prefix
source mk_mean_script.sh tridiagonalization $1 11 100 300 1000 $mode $prefix source mk_mean_script.sh tridiagonalization $1 11 100 300 1000 $mode $prefix
source mk_mean_script.sh hessenberg $1 11 100 300 1000 $mode $prefix source mk_mean_script.sh hessenberg $1 11 100 300 1000 $mode $prefix
@ -1,6 +1,6 @@
eigen2 ; with lines lw 4 lt 1 lc rgbcolor "black" eigen3 ; with lines lw 4 lt 1 lc rgbcolor "black"
eigen2_novec ; with lines lw 2 lt 1 lc rgbcolor "#999999" eigen3_novec ; with lines lw 2 lt 1 lc rgbcolor "#999999"
eigen2_nogccvec ; with lines lw 2 lt 2 lc rgbcolor "#991010" eigen3_nogccvec ; with lines lw 2 lt 2 lc rgbcolor "#991010"
INTEL_MKL ; with lines lw 3 lt 2 lc rgbcolor "#00b7ff" INTEL_MKL ; with lines lw 3 lt 2 lc rgbcolor "#00b7ff"
ATLAS ; with lines lw 3 lt 1 lc rgbcolor "#52e657" ATLAS ; with lines lw 3 lt 1 lc rgbcolor "#52e657"
gmm ; with lines lw 3 lt 1 lc rgbcolor "#0000ff" gmm ; with lines lw 3 lt 1 lc rgbcolor "#0000ff"
@ -23,7 +23,7 @@
// scalar type used in the benchmarks
#define REAL_TYPE float #define REAL_TYPE float
// minimal time for each measurement // minimal time for each measurement
#define MIN_TIME 0.5 #define MIN_TIME 0.2
// nb of point on bench curves // nb of point on bench curves
#define NB_POINT 100 #define NB_POINT 100
// min vector size for axpy bench // min vector size for axpy bench
@ -48,6 +48,6 @@
#define DEFAULT_NB_SAMPLE 1000 #define DEFAULT_NB_SAMPLE 1000
// how many times we run a single bench (keep the best perf) // how many times we run a single bench (keep the best perf)
#define NB_TRIES 5 #define DEFAULT_NB_TRIES 3
#endif #endif
@ -169,7 +169,7 @@ class BtlConfig
{ {
public: public:
BtlConfig() BtlConfig()
: overwriteResults(false), checkResults(true) : overwriteResults(false), checkResults(true), realclock(false), tries(DEFAULT_NB_TRIES)
{ {
char * _config; char * _config;
_config = getenv ("BTL_CONFIG"); _config = getenv ("BTL_CONFIG");
@ -189,6 +189,17 @@ public:
i += 1; i += 1;
} }
else if (config[i].beginsWith("-t"))
{
if (i+1==config.size())
{
std::cerr << "error processing option: " << config[i] << "\n";
exit(2);
}
Instance.tries = atoi(config[i+1].c_str());
i += 1;
}
else if (config[i].beginsWith("--overwrite")) else if (config[i].beginsWith("--overwrite"))
{ {
Instance.overwriteResults = true; Instance.overwriteResults = true;
@ -197,6 +208,10 @@ public:
{ {
Instance.checkResults = false; Instance.checkResults = false;
} }
else if (config[i].beginsWith("--real"))
{
Instance.realclock = true;
}
} }
} }
@ -219,6 +234,8 @@ public:
static BtlConfig Instance; static BtlConfig Instance;
bool overwriteResults; bool overwriteResults;
bool checkResults; bool checkResults;
bool realclock;
int tries;
protected: protected:
std::vector<BtlString> m_selectedActionNames; std::vector<BtlString> m_selectedActionNames;
@ -27,14 +27,14 @@
template <class Action> template <class Action>
class Portable_Perf_Analyzer{ class Portable_Perf_Analyzer{
public: public:
Portable_Perf_Analyzer( void ):_nb_calc(1),_chronos(){ Portable_Perf_Analyzer( ):_nb_calc(0), m_time_action(0), _chronos(){
MESSAGE("Portable_Perf_Analyzer Ctor"); MESSAGE("Portable_Perf_Analyzer Ctor");
}; };
Portable_Perf_Analyzer( const Portable_Perf_Analyzer & ){ Portable_Perf_Analyzer( const Portable_Perf_Analyzer & ){
INFOS("Copy Ctor not implemented"); INFOS("Copy Ctor not implemented");
exit(0); exit(0);
}; };
~Portable_Perf_Analyzer( void ){ ~Portable_Perf_Analyzer(){
MESSAGE("Portable_Perf_Analyzer Dtor"); MESSAGE("Portable_Perf_Analyzer Dtor");
}; };
@ -42,26 +42,26 @@ public:
{ {
Action action(size); Action action(size);
double time_action = 0; // action.initialize();
action.initialize(); // time_action = time_calculate(action);
time_action = time_calculate(action); while (m_time_action < MIN_TIME)
while (time_action < MIN_TIME)
{ {
_nb_calc *= 2; if(_nb_calc==0) _nb_calc = 1;
else _nb_calc *= 2;
action.initialize(); action.initialize();
time_action = time_calculate(action); m_time_action = time_calculate(action);
} }
// optimize // optimize
for (int i=1; i<NB_TRIES; ++i) for (int i=1; i<BtlConfig::Instance.tries; ++i)
{ {
Action _action(size); Action _action(size);
std::cout << " " << _action.nb_op_base()*_nb_calc/(time_action*1e6) << " "; std::cout << " " << _action.nb_op_base()*_nb_calc/(m_time_action*1e6) << " ";
_action.initialize(); _action.initialize();
time_action = std::min(time_action, time_calculate(_action)); m_time_action = std::min(m_time_action, time_calculate(_action));
} }
time_action = time_action / (double(_nb_calc)); double time_action = m_time_action / (double(_nb_calc));
// check // check
if (BtlConfig::Instance.checkResults && size<128) if (BtlConfig::Instance.checkResults && size<128)
@ -70,7 +70,7 @@ public:
action.calculate(); action.calculate();
action.check_result(); action.check_result();
} }
return action.nb_op_base()/(time_action*1000000.0); return action.nb_op_base()/(time_action*1e6);
} }
BTL_DONT_INLINE double time_calculate(Action & action) BTL_DONT_INLINE double time_calculate(Action & action)
@ -86,7 +86,7 @@ public:
return _chronos.user_time(); return _chronos.user_time();
} }
unsigned long long get_nb_calc( void ) unsigned long long get_nb_calc()
{ {
return _nb_calc; return _nb_calc;
} }
@ -94,6 +94,7 @@ public:
private: private:
unsigned long long _nb_calc; unsigned long long _nb_calc;
double m_time_action;
Portable_Timer _chronos; Portable_Timer _chronos;
}; };
@@ -98,65 +98,45 @@ class Portable_Timer
 {
 public:

-  Portable_Timer( void ):_utime_sec_start(-1),
-                         _utime_usec_start(-1),
-                         _utime_sec_stop(-1),
-                         _utime_usec_stop(-1)/*,
-                         m_prev_cs(-1)*/
+  Portable_Timer()
   {
+    m_clkid = BtlConfig::Instance.realclock ? CLOCK_REALTIME : CLOCK_PROCESS_CPUTIME_ID;
   }

+  Portable_Timer(int clkid) : m_clkid(clkid)
+  {}

   void start()
   {
-    int status=getrusage(RUSAGE_SELF, &resourcesUsage) ;
-    // _start_time = std::clock();
-    _utime_sec_start  = resourcesUsage.ru_utime.tv_sec ;
-    _utime_usec_start = resourcesUsage.ru_utime.tv_usec ;
-    // m_prev_cs = resourcesUsage.ru_nivcsw;
+    timespec ts;
+    clock_gettime(m_clkid, &ts);
+    m_start_time = double(ts.tv_sec) + 1e-9 * double(ts.tv_nsec);
   }

   void stop()
   {
-    int status=getrusage(RUSAGE_SELF, &resourcesUsage) ;
-    // _stop_time = std::clock();
-    _utime_sec_stop  = resourcesUsage.ru_utime.tv_sec ;
-    _utime_usec_stop = resourcesUsage.ru_utime.tv_usec ;
-    // m_prev_cs = resourcesUsage.ru_nivcsw - m_prev_cs;
-    // std::cerr << resourcesUsage.ru_nvcsw << " + " << resourcesUsage.ru_nivcsw << "\n";
+    timespec ts;
+    clock_gettime(m_clkid, &ts);
+    m_stop_time = double(ts.tv_sec) + 1e-9 * double(ts.tv_nsec);
   }

   double elapsed()
   {
-    return user_time();//double(_stop_time - _start_time) / CLOCKS_PER_SEC;
+    return user_time();
   }

   double user_time()
   {
-    // std::cout << m_prev_cs << "\n";
-    long tot_utime_sec=_utime_sec_stop-_utime_sec_start;
-    long tot_utime_usec=_utime_usec_stop-_utime_usec_start;
-    return double(tot_utime_sec)+ double(tot_utime_usec)/double(USEC_IN_SEC) ;
+    return m_stop_time - m_start_time;
   }

 private:

-  struct rusage resourcesUsage ;
-  long _utime_sec_start ;
-  long _utime_usec_start ;
-  long _utime_sec_stop ;
-  long _utime_usec_stop ;
-  // long m_prev_cs;
-  std::clock_t _start_time;
-  std::clock_t _stop_time;
+  int m_clkid;
+  double m_stop_time, m_start_time;

 }; // Portable_Timer
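getrusage() only reports user time with microsecond granularity and cannot give wall-clock figures, which is why it is replaced above by clock_gettime(): nanosecond resolution and a selectable clock (CLOCK_PROCESS_CPUTIME_ID for per-process CPU time, CLOCK_REALTIME for wall-clock time, chosen via BtlConfig::Instance.realclock). A self-contained usage sketch of the same call (POSIX; older glibc needs -lrt):

    #include <time.h>
    #include <cstdio>

    static double now(clockid_t clk)
    {
      timespec ts;
      clock_gettime(clk, &ts);
      return double(ts.tv_sec) + 1e-9 * double(ts.tv_nsec);
    }

    int main()
    {
      double t0 = now(CLOCK_PROCESS_CPUTIME_ID);
      volatile double acc = 0;
      for (int i = 0; i < 10000000; ++i) acc += i * 1e-9;  // work to time
      double t1 = now(CLOCK_PROCESS_CPUTIME_ID);
      std::printf("cpu time: %g s\n", t1 - t0);
      return 0;
    }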

View File

@@ -1,57 +1,57 @@
 # find_package(MKL)

 find_package(Eigen2)
 if (EIGEN2_FOUND)

   include_directories(${EIGEN2_INCLUDE_DIR})

-  btl_add_bench(btl_eigen2_linear main_linear.cpp)
-  btl_add_bench(btl_eigen2_vecmat main_vecmat.cpp)
-  btl_add_bench(btl_eigen2_matmat main_matmat.cpp)
-  btl_add_bench(btl_eigen2_adv main_adv.cpp )
+  btl_add_bench(btl_eigen3_linear main_linear.cpp)
+  btl_add_bench(btl_eigen3_vecmat main_vecmat.cpp)
+  btl_add_bench(btl_eigen3_matmat main_matmat.cpp)
+  btl_add_bench(btl_eigen3_adv main_adv.cpp )

-  btl_add_target_property(btl_eigen2_linear COMPILE_FLAGS "-DBTL_PREFIX=eigen2")
-  btl_add_target_property(btl_eigen2_vecmat COMPILE_FLAGS "-DBTL_PREFIX=eigen2")
-  btl_add_target_property(btl_eigen2_matmat COMPILE_FLAGS "-DBTL_PREFIX=eigen2")
-  btl_add_target_property(btl_eigen2_adv COMPILE_FLAGS "-DBTL_PREFIX=eigen2")
+  btl_add_target_property(btl_eigen3_linear COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=eigen3")
+  btl_add_target_property(btl_eigen3_vecmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=eigen3")
+  btl_add_target_property(btl_eigen3_matmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=eigen3")
+  btl_add_target_property(btl_eigen3_adv COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=eigen3")

   option(BTL_BENCH_NOGCCVEC "also bench Eigen explicit vec without GCC's auto vec" OFF)
   if(CMAKE_COMPILER_IS_GNUCXX AND BTL_BENCH_NOGCCVEC)
-    btl_add_bench(btl_eigen2_nogccvec_linear main_linear.cpp)
-    btl_add_bench(btl_eigen2_nogccvec_vecmat main_vecmat.cpp)
-    btl_add_bench(btl_eigen2_nogccvec_matmat main_matmat.cpp)
-    btl_add_bench(btl_eigen2_nogccvec_adv main_adv.cpp )
+    btl_add_bench(btl_eigen3_nogccvec_linear main_linear.cpp)
+    btl_add_bench(btl_eigen3_nogccvec_vecmat main_vecmat.cpp)
+    btl_add_bench(btl_eigen3_nogccvec_matmat main_matmat.cpp)
+    btl_add_bench(btl_eigen3_nogccvec_adv main_adv.cpp )

-    btl_add_target_property(btl_eigen2_nogccvec_linear COMPILE_FLAGS "-fno-tree-vectorize -DBTL_PREFIX=eigen2_nogccvec")
-    btl_add_target_property(btl_eigen2_nogccvec_vecmat COMPILE_FLAGS "-fno-tree-vectorize -DBTL_PREFIX=eigen2_nogccvec")
-    btl_add_target_property(btl_eigen2_nogccvec_matmat COMPILE_FLAGS "-fno-tree-vectorize -DBTL_PREFIX=eigen2_nogccvec")
-    btl_add_target_property(btl_eigen2_nogccvec_adv COMPILE_FLAGS "-fno-tree-vectorize -DBTL_PREFIX=eigen2_nogccvec")
+    btl_add_target_property(btl_eigen3_nogccvec_linear COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=eigen3_nogccvec")
+    btl_add_target_property(btl_eigen3_nogccvec_vecmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=eigen3_nogccvec")
+    btl_add_target_property(btl_eigen3_nogccvec_matmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=eigen3_nogccvec")
+    btl_add_target_property(btl_eigen3_nogccvec_adv COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=eigen3_nogccvec")
   endif()

   if(NOT BTL_NOVEC)
-    btl_add_bench(btl_eigen2_novec_linear main_linear.cpp)
-    btl_add_bench(btl_eigen2_novec_vecmat main_vecmat.cpp)
-    btl_add_bench(btl_eigen2_novec_matmat main_matmat.cpp)
-    btl_add_bench(btl_eigen2_novec_adv main_adv.cpp )
+    btl_add_bench(btl_eigen3_novec_linear main_linear.cpp)
+    btl_add_bench(btl_eigen3_novec_vecmat main_vecmat.cpp)
+    btl_add_bench(btl_eigen3_novec_matmat main_matmat.cpp)
+    btl_add_bench(btl_eigen3_novec_adv main_adv.cpp )

-    btl_add_target_property(btl_eigen2_novec_linear COMPILE_FLAGS "-DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=eigen2_novec")
-    btl_add_target_property(btl_eigen2_novec_vecmat COMPILE_FLAGS "-DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=eigen2_novec")
-    btl_add_target_property(btl_eigen2_novec_matmat COMPILE_FLAGS "-DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=eigen2_novec")
-    btl_add_target_property(btl_eigen2_novec_adv COMPILE_FLAGS "-DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=eigen2_novec")
+    btl_add_target_property(btl_eigen3_novec_linear COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=eigen3_novec")
+    btl_add_target_property(btl_eigen3_novec_vecmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=eigen3_novec")
+    btl_add_target_property(btl_eigen3_novec_matmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=eigen3_novec")
+    btl_add_target_property(btl_eigen3_novec_adv COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=eigen3_novec")

-    # if(BUILD_btl_eigen2_adv)
-    #   target_link_libraries(btl_eigen2_adv ${MKL_LIBRARIES})
-    # endif(BUILD_btl_eigen2_adv)
+    # if(BUILD_btl_eigen3_adv)
+    #   target_link_libraries(btl_eigen3_adv ${MKL_LIBRARIES})
+    # endif(BUILD_btl_eigen3_adv)
   endif(NOT BTL_NOVEC)

-  btl_add_bench(btl_tiny_eigen2 btl_tiny_eigen2.cpp OFF)
+  btl_add_bench(btl_tiny_eigen3 btl_tiny_eigen3.cpp OFF)

   if(NOT BTL_NOVEC)
-    btl_add_bench(btl_tiny_eigen2_novec btl_tiny_eigen2.cpp OFF)
-    btl_add_target_property(btl_tiny_eigen2_novec COMPILE_FLAGS "-DBTL_PREFIX=eigen2_tiny")
+    btl_add_bench(btl_tiny_eigen3_novec btl_tiny_eigen3.cpp OFF)
+    btl_add_target_property(btl_tiny_eigen3_novec COMPILE_FLAGS "-DBTL_PREFIX=eigen3_tiny")

-    if(BUILD_btl_tiny_eigen2_novec)
-      btl_add_target_property(btl_tiny_eigen2_novec COMPILE_FLAGS "-DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=eigen2_tiny_novec")
-    endif(BUILD_btl_tiny_eigen2_novec)
+    if(BUILD_btl_tiny_eigen3_novec)
+      btl_add_target_property(btl_tiny_eigen3_novec COMPILE_FLAGS "-DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=eigen3_tiny_novec")
+    endif(BUILD_btl_tiny_eigen3_novec)
   endif(NOT BTL_NOVEC)

 endif (EIGEN2_FOUND)

View File

@@ -17,11 +17,8 @@
 //
 #ifndef EIGEN2_INTERFACE_HH
 #define EIGEN2_INTERFACE_HH
-// #include <cblas.h>
-#include <Eigen/Array>
-#include <Eigen/Cholesky>
-#include <Eigen/LU>
-#include <Eigen/QR>
+#include <Eigen/Eigen>
 #include <vector>
 #include "btl.hh"
@@ -88,27 +85,27 @@ public :
   }

   static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
-    X = (A*B).lazy();
+    X.noalias() = A*B;
   }

   static inline void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
-    X = (A.transpose()*B.transpose()).lazy();
+    X.noalias() = A.transpose()*B.transpose();
   }

   static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N){
-    X = (A.transpose()*A).lazy();
+    X.noalias() = A.transpose()*A;
   }

   static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N){
-    X = (A*A.transpose()).lazy();
+    X.noalias() = A*A.transpose();
   }

   static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){
-    X = (A*B).lazy();
+    X.noalias() = A*B;
   }

   static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){
-    X = (A.template selfadjointView<LowerTriangular>() * B)/*.lazy()*/;
+    X.noalias() = (A.template selfadjointView<Lower>() * B);
     // ei_product_selfadjoint_vector<real,0,LowerTriangularBit,false,false>(N,A.data(),N, B.data(), 1, X.data(), 1);
   }
@@ -169,11 +166,11 @@ public :
   }

   static EIGEN_DONT_INLINE void rot(gene_vector & A, gene_vector & B, real c, real s, int N){
-    ei_apply_rotation_in_the_plane(A, B, c, s);
+    ei_apply_rotation_in_the_plane(A, B, PlanarRotation<real>(c,s));
   }

   static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
-    X = (A.transpose()*B).lazy();
+    X.noalias() = (A.transpose()*B);
   }

   static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int N){
@@ -193,16 +190,16 @@ public :
   }

   static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector& X, int N){
-    X = L.template triangularView<LowerTriangular>().solve(B);
+    X = L.template triangularView<Lower>().solve(B);
   }

   static inline void trisolve_lower_matrix(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int N){
-    X = L.template triangularView<LowerTriangular>().solve(B);
+    X = L.template triangularView<Lower>().solve(B);
   }

   static inline void cholesky(const gene_matrix & X, gene_matrix & C, int N){
     C = X;
-    ei_llt_inplace<LowerTriangular>::blocked(C);
+    ei_llt_inplace<Lower>::blocked(C);
     //C = X.llt().matrixL();
     // C = X;
     // Cholesky<gene_matrix>::computeInPlace(C);
@@ -210,15 +207,15 @@ public :
   }

   static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int N){
+    C = X.fullPivLu().matrixLU();
+  }
+
+  static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){
     RowVectorXi piv(N);
     int nb;
     C = X;
     ei_partial_lu_inplace(C,piv,nb);
-    //C = X.lu().matrixLU();
-  }
-
-  static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){
-    C = X.partialPivLu().matrixLU();
+    // C = X.partialPivLu().matrixLU();
   }

   static inline void tridiagonalization(const gene_matrix & X, gene_matrix & C, int N){
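Most of this hunk is mechanical Eigen2-to-Eigen3 porting: X = (A*B).lazy() becomes X.noalias() = A*B, and the triangular selectors lose their Triangular suffix (LowerTriangular -> Lower). The new spellings in one place (illustration only, not part of the patch):

    #include <Eigen/Dense>
    using namespace Eigen;

    void eigen3_idioms(const MatrixXf& A, const MatrixXf& L,
                       MatrixXf& X, const VectorXf& v, VectorXf& y)
    {
      X.noalias() = A * A;              // Eigen2: X = (A*A).lazy();
      X.noalias() = A.transpose() * A;  // Eigen2: X = (A.transpose()*A).lazy();
      y.noalias() = A * v;              // same rule for matrix*vector
      // noalias() promises the destination does not overlap the operands,
      // so the product is written directly instead of through a temporary.
      y = L.triangularView<Lower>().solve(v);  // was triangularView<LowerTriangular>
    }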

bench/product_threshold.cpp (new file, 143 lines)
View File

@@ -0,0 +1,143 @@
#include <iostream>
#include <Eigen/Core>
#include <bench/BenchTimer.h>
using namespace Eigen;
using namespace std;
#define END 9
template<int S> struct map_size { enum { ret = S }; };
template<> struct map_size<10> { enum { ret = 20 }; };
template<> struct map_size<11> { enum { ret = 50 }; };
template<> struct map_size<12> { enum { ret = 100 }; };
template<> struct map_size<13> { enum { ret = 300 }; };
template<int M, int N,int K> struct alt_prod
{
enum {
ret = M==1 && N==1 ? InnerProduct
: K==1 ? OuterProduct
: M==1 ? GemvProduct
: N==1 ? GemvProduct
: GemmProduct
};
};
void print_mode(int mode)
{
if(mode==InnerProduct) std::cout << "i";
if(mode==OuterProduct) std::cout << "o";
if(mode==CoeffBasedProductMode) std::cout << "c";
if(mode==LazyCoeffBasedProductMode) std::cout << "l";
if(mode==GemvProduct) std::cout << "v";
if(mode==GemmProduct) std::cout << "m";
}
template<int Mode, typename Lhs, typename Rhs, typename Res>
EIGEN_DONT_INLINE void prod(const Lhs& a, const Rhs& b, Res& c)
{
c.noalias() += typename ProductReturnType<Lhs,Rhs,Mode>::Type(a,b);
}
template<int M, int N, int K, typename Scalar, int Mode>
EIGEN_DONT_INLINE void bench_prod()
{
typedef Matrix<Scalar,M,K> Lhs; Lhs a; a.setRandom();
typedef Matrix<Scalar,K,N> Rhs; Rhs b; b.setRandom();
typedef Matrix<Scalar,M,N> Res; Res c; c.setRandom();
BenchTimer t;
double n = 2.*double(M)*double(N)*double(K);
int rep = 100000./n;
rep /= 2;
if(rep<1) rep = 1;
do {
rep *= 2;
t.reset();
BENCH(t,1,rep,prod<CoeffBasedProductMode>(a,b,c));
} while(t.best()<0.1);
t.reset();
BENCH(t,5,rep,prod<Mode>(a,b,c));
print_mode(Mode);
std::cout << int(1e-6*n*rep/t.best()) << "\t";
}
template<int N> struct print_n;
template<int M, int N, int K> struct loop_on_m;
template<int M, int N, int K, typename Scalar, int Mode> struct loop_on_n;
template<int M, int N, int K>
struct loop_on_k
{
static void run()
{
std::cout << "K=" << K << "\t";
print_n<N>::run();
std::cout << "\n";
loop_on_m<M,N,K>::run();
std::cout << "\n\n";
loop_on_k<M,N,K+1>::run();
}
};
template<int M, int N>
struct loop_on_k<M,N,END> { static void run(){} };
template<int M, int N, int K>
struct loop_on_m
{
static void run()
{
std::cout << M << "f\t";
loop_on_n<M,N,K,float,CoeffBasedProductMode>::run();
std::cout << "\n";
std::cout << M << "f\t";
loop_on_n<M,N,K,float,-1>::run();
std::cout << "\n";
loop_on_m<M+1,N,K>::run();
}
};
template<int N, int K>
struct loop_on_m<END,N,K> { static void run(){} };
template<int M, int N, int K, typename Scalar, int Mode>
struct loop_on_n
{
static void run()
{
bench_prod<M,N,K,Scalar,Mode==-1? alt_prod<M,N,K>::ret : Mode>();
loop_on_n<M,N+1,K,Scalar,Mode>::run();
}
};
template<int M, int K, typename Scalar, int Mode>
struct loop_on_n<M,END,K,Scalar,Mode> { static void run(){} };
template<int N> struct print_n
{
static void run()
{
std::cout << map_size<N>::ret << "\t";
print_n<N+1>::run();
}
};
template<> struct print_n<END> { static void run(){} };
int main()
{
loop_on_k<1,1,1>::run();
return 0;
}
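The benchmark explores a compile-time grid of product shapes: loop_on_k, loop_on_m and loop_on_n recursively instantiate bench_prod for every (M,N,K) index below END, with map_size stretching indices 10-13 to sizes 20/50/100/300. Each cell is timed twice, once forcing the coefficient-based small-product path and once with whatever kernel alt_prod selects, so the printed MFlops table shows where the heavyweight kernels start to win. The selection rule in alt_prod, written as a plain run-time function (sketch; the enums are Eigen's internal product tags already used by this file):

    #include <Eigen/Core>
    using namespace Eigen;

    int product_kind(int m, int n, int k)
    {
      if (m == 1 && n == 1) return InnerProduct;  // row * column -> scalar
      if (k == 1)           return OuterProduct;  // column * row -> rank-1 update
      if (m == 1 || n == 1) return GemvProduct;   // matrix * vector
      return GemmProduct;                         // general matrix * matrix
    }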

View File

@@ -2,9 +2,10 @@ project(EigenBlas)

 add_custom_target(blas)

-set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp)
+set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp xerbla.cpp)

-add_library(eigen_blas SHARED ${EigenBlas_SRCS})
+add_library(eigen_blas ${EigenBlas_SRCS})
+# add_library(eigen_blas SHARED ${EigenBlas_SRCS})
 add_dependencies(blas eigen_blas)

 install(TARGETS eigen_blas

View File

@@ -25,6 +25,8 @@
 #ifndef EIGEN_BLAS_COMMON_H
 #define EIGEN_BLAS_COMMON_H

+#include <iostream>
+
 #ifndef SCALAR
 #error the token SCALAR must be defined to compile this file
 #endif
@@ -34,13 +36,12 @@ extern "C"
 {
 #endif

-#include <blas.h>
+#include "../bench/btl/libs/C_BLAS/blas.h"

 #ifdef __cplusplus
 }
 #endif

 #define NOTR  0
 #define TR    1
 #define ADJ   2
@@ -75,27 +76,6 @@ extern "C"
 #include <Eigen/Jacobi>
 using namespace Eigen;

-template<typename T>
-Block<Map<Matrix<T,Dynamic,Dynamic> >, Dynamic, Dynamic>
-matrix(T* data, int rows, int cols, int stride)
-{
-  return Map<Matrix<T,Dynamic,Dynamic> >(data, stride, cols).block(0,0,rows,cols);
-}
-
-template<typename T>
-Block<Map<Matrix<T,Dynamic,Dynamic,RowMajor> >, Dynamic, 1>
-vector(T* data, int size, int incr)
-{
-  return Map<Matrix<T,Dynamic,Dynamic,RowMajor> >(data, size, incr).col(0);
-}
-
-template<typename T>
-Map<Matrix<T,Dynamic,1> >
-vector(T* data, int size)
-{
-  return Map<Matrix<T,Dynamic,1> >(data, size);
-}
-
 typedef SCALAR Scalar;
 typedef NumTraits<Scalar>::Real RealScalar;
 typedef std::complex<RealScalar> Complex;
@@ -106,10 +86,29 @@
   Conj = IsComplex
 };

-typedef Block<Map<Matrix<Scalar,Dynamic,Dynamic> >, Dynamic, Dynamic> MatrixType;
-typedef Block<Map<Matrix<Scalar,Dynamic,Dynamic, RowMajor> >, Dynamic, 1> StridedVectorType;
+typedef Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<Dynamic> > MatrixType;
+typedef Map<Matrix<Scalar,Dynamic,1>, 0, InnerStride<Dynamic> > StridedVectorType;
 typedef Map<Matrix<Scalar,Dynamic,1> > CompactVectorType;

+template<typename T>
+Map<Matrix<T,Dynamic,Dynamic>, 0, OuterStride<Dynamic> >
+matrix(T* data, int rows, int cols, int stride)
+{
+  return Map<Matrix<T,Dynamic,Dynamic>, 0, OuterStride<Dynamic> >(data, rows, cols, OuterStride<Dynamic>(stride));
+}
+
+template<typename T>
+Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> > vector(T* data, int size, int incr)
+{
+  return Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> >(data, size, InnerStride<Dynamic>(incr));
+}
+
+template<typename T>
+Map<Matrix<T,Dynamic,1> > vector(T* data, int size)
+{
+  return Map<Matrix<T,Dynamic,1> >(data, size);
+}
+
 #define EIGEN_BLAS_FUNC(X) EIGEN_CAT(SCALAR_SUFFIX,X##_)

 #endif // EIGEN_BLAS_COMMON_H
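The deleted helpers emulated strides by mapping an oversized matrix and taking a block of it; the new ones map the raw BLAS pointers directly, with OuterStride carrying the leading dimension (lda) and InnerStride the element increment (incx), which is exactly the BLAS memory model. A sketch of such views in use (Eigen3 API, illustrative function name):

    #include <Eigen/Dense>
    using namespace Eigen;

    void blas_style_views(double* pa, int m, int n, int lda, double* px, int incx)
    {
      // column-major m-by-n matrix whose columns are lda elements apart
      Map<MatrixXd, 0, OuterStride<Dynamic> > A(pa, m, n, OuterStride<Dynamic>(lda));
      // n entries spaced incx elements apart
      Map<VectorXd, 0, InnerStride<Dynamic> > x(px, n, InnerStride<Dynamic>(incx));
      VectorXd y = A * x;  // mapped views behave like ordinary Eigen objects
      (void)y;
    }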

View File

@@ -23,7 +23,8 @@
 // Eigen. If not, see <http://www.gnu.org/licenses/>.

 #define SCALAR             std::complex<double>
-#define SCALAR_SUFFIX      c
+#define SCALAR_SUFFIX      z
+#define REAL_SCALAR_SUFFIX d
 #define ISCOMPLEX          1

 #include "level1_impl.h"

View File

@@ -23,7 +23,8 @@
 // Eigen. If not, see <http://www.gnu.org/licenses/>.

 #define SCALAR             std::complex<float>
-#define SCALAR_SUFFIX      z
+#define SCALAR_SUFFIX      c
+#define REAL_SCALAR_SUFFIX s
 #define ISCOMPLEX          1

 #include "level1_impl.h"

View File

@@ -30,52 +30,111 @@ int EIGEN_BLAS_FUNC(axpy)(int *n, RealScalar *palpha, RealScalar *px, int *incx,
   Scalar* y = reinterpret_cast<Scalar*>(py);
   Scalar alpha  = *reinterpret_cast<Scalar*>(palpha);

-  if(*incx==1 && *incy==1)
-    vector(y,*n) += alpha * vector(x,*n);
-  else
-    vector(y,*n,*incy) += alpha * vector(x,*n,*incx);
-  return 1;
+  // std::cerr << "axpy " << *n << " " << alpha << " " << *incx << " " << *incy << "\n";
+
+  if(*incx==1 && *incy==1)    vector(y,*n) += alpha * vector(x,*n);
+  else if(*incx>0 && *incy>0) vector(y,*n,*incy) += alpha * vector(x,*n,*incx);
+  else if(*incx>0 && *incy<0) vector(y,*n,-*incy).reverse() += alpha * vector(x,*n,*incx);
+  else if(*incx<0 && *incy>0) vector(y,*n,*incy) += alpha * vector(x,*n,-*incx).reverse();
+  else if(*incx<0 && *incy<0) vector(y,*n,-*incy).reverse() += alpha * vector(x,*n,-*incx).reverse();
+  return 0;
 }

+#if !ISCOMPLEX
 // computes the sum of magnitudes of all vector elements or, for a complex vector x, the sum
 // res = |Rex1| + |Imx1| + |Rex2| + |Imx2| + ... + |Rexn| + |Imxn|, where x is a vector of order n
 RealScalar EIGEN_BLAS_FUNC(asum)(int *n, RealScalar *px, int *incx)
 {
-  int size = IsComplex ? 2* *n : *n;
-  if(*incx==1)
-    return vector(px,size).cwise().abs().sum();
-  else
-    return vector(px,size,*incx).cwise().abs().sum();
-  return 1;
+  // std::cerr << "_asum " << *n << " " << *incx << "\n";
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+
+  if(*n<=0) return 0;
+
+  if(*incx==1)  return vector(x,*n).cwiseAbs().sum();
+  else          return vector(x,*n,std::abs(*incx)).cwiseAbs().sum();
 }
+#else
+
+struct ei_scalar_norm1_op {
+  typedef RealScalar result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_norm1_op)
+  inline RealScalar operator() (const Scalar& a) const { return ei_norm1(a); }
+};
+namespace Eigen {
+template<> struct ei_functor_traits<ei_scalar_norm1_op >
+{
+  enum { Cost = 3 * NumTraits<Scalar>::AddCost, PacketAccess = 0 };
+};
+}
+
+RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),asum_)(int *n, RealScalar *px, int *incx)
+{
+  // std::cerr << "__asum " << *n << " " << *incx << "\n";
+  Complex* x = reinterpret_cast<Complex*>(px);
+
+  if(*n<=0) return 0;
+
+  if(*incx==1)  return vector(x,*n).unaryExpr<ei_scalar_norm1_op>().sum();
+  else          return vector(x,*n,std::abs(*incx)).unaryExpr<ei_scalar_norm1_op>().sum();
+}
+#endif

 int EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy)
 {
-  int size = IsComplex ? 2* *n : *n;
-  if(*incx==1 && *incy==1)
-    vector(py,size) = vector(px,size);
-  else
-    vector(py,size,*incy) = vector(px,size,*incx);
-  return 1;
+  // std::cerr << "_copy " << *n << " " << *incx << " " << *incy << "\n";
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+
+  if(*incx==1 && *incy==1)    vector(y,*n) = vector(x,*n);
+  else if(*incx>0 && *incy>0) vector(y,*n,*incy) = vector(x,*n,*incx);
+  else if(*incx>0 && *incy<0) vector(y,*n,-*incy).reverse() = vector(x,*n,*incx);
+  else if(*incx<0 && *incy>0) vector(y,*n,*incy) = vector(x,*n,-*incx).reverse();
+  else if(*incx<0 && *incy<0) vector(y,*n,-*incy).reverse() = vector(x,*n,-*incx).reverse();
+  return 0;
 }

 // computes a vector-vector dot product.
 Scalar EIGEN_BLAS_FUNC(dot)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy)
 {
+  // std::cerr << "_dot " << *n << " " << *incx << " " << *incy << "\n";
+
+  if(*n<=0)
+    return 0;
+
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);

-  if(*incx==1 && *incy==1)
-    return (vector(x,*n).cwise()*vector(y,*n)).sum();
-
-  return (vector(x,*n,*incx).cwise()*vector(y,*n,*incy)).sum();
+  if(*incx==1 && *incy==1)    return (vector(x,*n).cwiseProduct(vector(y,*n))).sum();
+  else if(*incx>0 && *incy>0) return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum();
+  else if(*incx<0 && *incy>0) return (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,*incy))).sum();
+  else if(*incx>0 && *incy<0) return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,-*incy).reverse())).sum();
+  else if(*incx<0 && *incy<0) return (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,-*incy).reverse())).sum();
+  else return 0;
 }

+int EIGEN_CAT(EIGEN_CAT(i,SCALAR_SUFFIX),amax_)(int *n, RealScalar *px, int *incx)
+{
+  // std::cerr << "i_amax " << *n << " " << *incx << "\n";
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  if(*n<=0)
+    return 0;
+
+  int ret;
+  if(*incx==1)  vector(x,*n).cwiseAbs().maxCoeff(&ret);
+  else          vector(x,*n,std::abs(*incx)).cwiseAbs().maxCoeff(&ret);
+  return ret+1;
+}
+
 /*
 // computes a vector-vector dot product with extended precision.
@@ -96,53 +155,100 @@ Scalar EIGEN_BLAS_FUNC(sdot)(int *n, RealScalar *px, int *incx, RealScalar *py,
 #if ISCOMPLEX

 // computes a dot product of a conjugated vector with another vector.
-Scalar EIGEN_BLAS_FUNC(dotc)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy)
+void EIGEN_BLAS_FUNC(dotc)(RealScalar* dot, int *n, RealScalar *px, int *incx, RealScalar *py, int *incy)
 {
+  std::cerr << "Eigen BLAS: _dotc is not implemented yet\n";
+  return;
+
+  // TODO: find how to return a complex to fortran
+  // std::cerr << "_dotc " << *n << " " << *incx << " " << *incy << "\n";
+
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);

   if(*incx==1 && *incy==1)
-    return vector(x,*n).dot(vector(y,*n));
+    *reinterpret_cast<Scalar*>(dot) = vector(x,*n).dot(vector(y,*n));
   else
-    return vector(x,*n,*incx).dot(vector(y,*n,*incy));
+    *reinterpret_cast<Scalar*>(dot) = vector(x,*n,*incx).dot(vector(y,*n,*incy));
 }

 // computes a vector-vector dot product without complex conjugation.
-Scalar EIGEN_BLAS_FUNC(dotu)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy)
+void EIGEN_BLAS_FUNC(dotu)(RealScalar* dot, int *n, RealScalar *px, int *incx, RealScalar *py, int *incy)
 {
+  std::cerr << "Eigen BLAS: _dotu is not implemented yet\n";
+  return;
+
+  // TODO: find how to return a complex to fortran
+  // std::cerr << "_dotu " << *n << " " << *incx << " " << *incy << "\n";
+
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);

   if(*incx==1 && *incy==1)
-    return (vector(x,*n).cwise()*vector(y,*n)).sum();
+    *reinterpret_cast<Scalar*>(dot) = (vector(x,*n).cwiseProduct(vector(y,*n))).sum();
   else
-    return (vector(x,*n,*incx).cwise()*vector(y,*n,*incy)).sum();
+    *reinterpret_cast<Scalar*>(dot) = (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum();
 }

 #endif // ISCOMPLEX

+#if !ISCOMPLEX
 // computes the Euclidean norm of a vector.
 Scalar EIGEN_BLAS_FUNC(nrm2)(int *n, RealScalar *px, int *incx)
 {
+  // std::cerr << "_nrm2 " << *n << " " << *incx << "\n";
   Scalar* x = reinterpret_cast<Scalar*>(px);

+  if(*n<=0)
+    return 0;
+
+  if(*incx==1)  return vector(x,*n).norm();
+  else          return vector(x,*n,std::abs(*incx)).norm();
+}
+#else
+RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),nrm2_)(int *n, RealScalar *px, int *incx)
+{
+  // std::cerr << "__nrm2 " << *n << " " << *incx << "\n";
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+
+  if(*n<=0)
+    return 0;
+
   if(*incx==1)
     return vector(x,*n).norm();

   return vector(x,*n,*incx).norm();
 }
+#endif

 int EIGEN_BLAS_FUNC(rot)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps)
 {
+  // std::cerr << "_rot " << *n << " " << *incx << " " << *incy << "\n";
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);
   Scalar c = *reinterpret_cast<Scalar*>(pc);
   Scalar s = *reinterpret_cast<Scalar*>(ps);

-  StridedVectorType vx(vector(x,*n,*incx));
-  StridedVectorType vy(vector(y,*n,*incy));
-  ei_apply_rotation_in_the_plane(vx, vy, PlanarRotation<Scalar>(c,s));
-  return 1;
+  if(*n<=0)
+    return 0;
+
+  StridedVectorType vx(vector(x,*n,std::abs(*incx)));
+  StridedVectorType vy(vector(y,*n,std::abs(*incy)));
+
+  Reverse<StridedVectorType> rvx(vx);
+  Reverse<StridedVectorType> rvy(vy);
+
+  if(*incx<0 && *incy>0)      ei_apply_rotation_in_the_plane(rvx, vy, PlanarRotation<Scalar>(c,s));
+  else if(*incx>0 && *incy<0) ei_apply_rotation_in_the_plane(vx, rvy, PlanarRotation<Scalar>(c,s));
+  else                        ei_apply_rotation_in_the_plane(vx, vy, PlanarRotation<Scalar>(c,s));
+  return 0;
 }

 int EIGEN_BLAS_FUNC(rotg)(RealScalar *pa, RealScalar *pb, RealScalar *pc, RealScalar *ps)
@@ -157,7 +263,7 @@ int EIGEN_BLAS_FUNC(rotg)(RealScalar *pa, RealScalar *pb, RealScalar *pc, RealSc
   *c = r.c();
   *s = r.s();

-  return 1;
+  return 0;
 }

 #if !ISCOMPLEX
@@ -183,43 +289,56 @@ int EIGEN_BLAS_FUNC(rotmg)(RealScalar *d1, RealScalar *d2, RealScalar *x1, RealS
 */
 #endif // !ISCOMPLEX

-int EIGEN_BLAS_FUNC(scal)(int *n, RealScalar *px, int *incx, RealScalar *palpha)
+int EIGEN_BLAS_FUNC(scal)(int *n, RealScalar *palpha, RealScalar *px, int *incx)
 {
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar alpha = *reinterpret_cast<Scalar*>(palpha);

-  if(*incx==1)
-    vector(x,*n) *= alpha;
-
-  vector(x,*n,*incx) *= alpha;
-
-  return 1;
+  // std::cerr << "_scal " << *n << " " << alpha << " " << *incx << "\n";
+
+  if(*n<=0)
+    return 0;
+
+  if(*incx==1)  vector(x,*n) *= alpha;
+  else          vector(x,*n,std::abs(*incx)) *= alpha;
+
+  return 0;
 }

+#if ISCOMPLEX
+int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),scal_)(int *n, RealScalar *palpha, RealScalar *px, int *incx)
+{
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  RealScalar alpha = *palpha;
+
+  // std::cerr << "__scal " << *n << " " << alpha << " " << *incx << "\n";
+
+  if(*n<=0)
+    return 0;
+
+  if(*incx==1)  vector(x,*n) *= alpha;
+  else          vector(x,*n,std::abs(*incx)) *= alpha;
+
+  return 0;
+}
+#endif // ISCOMPLEX

 int EIGEN_BLAS_FUNC(swap)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy)
 {
-  int size = IsComplex ? 2* *n : *n;
-  if(*incx==1 && *incy==1)
-    vector(py,size).swap(vector(px,size));
-  else
-    vector(py,size,*incy).swap(vector(px,size,*incx));
+  // std::cerr << "_swap " << *n << " " << *incx << " " << *incy << "\n";
+  Scalar* x = reinterpret_cast<Scalar*>(px);
+  Scalar* y = reinterpret_cast<Scalar*>(py);
+
+  if(*n<=0)
+    return 0;
+
+  if(*incx==1 && *incy==1)    vector(y,*n).swap(vector(x,*n));
+  else if(*incx>0 && *incy>0) vector(y,*n,*incy).swap(vector(x,*n,*incx));
+  else if(*incx>0 && *incy<0) vector(y,*n,-*incy).reverse().swap(vector(x,*n,*incx));
+  else if(*incx<0 && *incy>0) vector(y,*n,*incy).swap(vector(x,*n,-*incx).reverse());
+  else if(*incx<0 && *incy<0) vector(y,*n,-*incy).reverse().swap(vector(x,*n,-*incx).reverse());

   return 1;
 }

-#if !ISCOMPLEX
-RealScalar EIGEN_BLAS_FUNC(casum)(int *n, RealScalar *px, int *incx)
-{
-  Complex* x = reinterpret_cast<Complex*>(px);
-
-  if(*incx==1)
-    return vector(x,*n).cwise().abs().sum();
-  else
-    return vector(x,*n,*incx).cwise().abs().sum();
-  return 1;
-}
-#endif // ISCOMPLEX
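Much of the Level 1 rework above is about BLAS's negative-increment convention: when incx < 0, the logical vector starts at the last stored element and walks backwards, which the new code expresses by taking a view with stride |incx| and reversing it. The idiom in isolation (sketch with local helper names, double precision only):

    #include <Eigen/Dense>
    using namespace Eigen;

    typedef Map<VectorXd, 0, InnerStride<Dynamic> > StridedVec;

    static StridedVec view(double* p, int n, int incr)  // incr > 0
    {
      return StridedVec(p, n, InnerStride<Dynamic>(incr));
    }

    // y += alpha*x with arbitrary increments, as in the patched axpy
    void daxpy_like(int n, double alpha, double* x, int incx, double* y, int incy)
    {
      if      (incx > 0 && incy > 0) view(y,n, incy)           += alpha * view(x,n, incx);
      else if (incx < 0 && incy > 0) view(y,n, incy)           += alpha * view(x,n,-incx).reverse();
      else if (incx > 0 && incy < 0) view(y,n,-incy).reverse() += alpha * view(x,n, incx);
      else if (incx < 0 && incy < 0) view(y,n,-incy).reverse() += alpha * view(x,n,-incx).reverse();
    }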

View File

@@ -56,9 +56,11 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca
   return 1;
 }

+/*
 int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pa, int *lda, RealScalar *pb, int *incb)
 {
+  return 0;

   typedef void (*functype)(int, const Scalar *, int, Scalar *, int);
   functype func[16];
@@ -95,13 +97,14 @@ int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar
     return 0;

   func[code](*n, a, *lda, b, *incb);
-  return 1;
+  return 0;
 }
+*/

+/*
 int EIGEN_BLAS_FUNC(trmv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pa, int *lda, RealScalar *pb, int *incb)
 {
+  return 0;
   // TODO

   typedef void (*functype)(int, const Scalar *, int, const Scalar *, int, Scalar *, int);
@@ -140,13 +143,21 @@ int EIGEN_BLAS_FUNC(trmv)(char *uplo, char *opa, char *diag, int *n, RealScalar
     return 0;

   func[code](*n, a, *lda, b, *incb, b, *incb);
-  return 1;
+  return 0;
+}
+
+// y = alpha*A*x + beta*y
+int EIGEN_BLAS_FUNC(ssymv) (char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy)
+{
+  return 0;
+  // TODO
 }
+*/

+/*
 int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *inca, RealScalar *pc, int *ldc)
 {
+  return 0;
   // TODO
   typedef void (*functype)(int, const Scalar *, int, Scalar *, int, Scalar);
   functype func[2];
@@ -174,11 +185,13 @@ int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa,
   func[code](*n, a, *inca, c, *ldc, alpha);
   return 1;
 }
+*/

+/*
 int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *inca, RealScalar *pb, int *incb, RealScalar *pc, int *ldc)
 {
+  return 0;
   // TODO
   typedef void (*functype)(int, const Scalar *, int, const Scalar *, int, Scalar *, int, Scalar);
   functype func[2];
@@ -207,7 +220,7 @@ int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa
   func[code](*n, a, *inca, b, *incb, c, *ldc, alpha);
   return 1;
 }
+*/

 #if ISCOMPLEX
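The Level 2 entry points above are parked inside /* */ blocks (or stubbed with an early return 0) until their kernels are ready. The operation trsv would expose is already available through Eigen3's high-level API; a sketch of the equivalent computation (not the BLAS entry point itself):

    #include <Eigen/Dense>
    using namespace Eigen;

    void lower_trisolve_inplace(const MatrixXd& L, VectorXd& b)
    {
      // solves L*x = b in place, reading only the lower triangle of L
      L.triangularView<Lower>().solveInPlace(b);
    }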

View File

@@ -26,8 +26,9 @@
 int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc)
 {
+  // std::cerr << "in gemm " << *opa << " " << *opb << " " << *m << " " << *n << " " << *k << " " << *lda << " " << *ldb << " " << *ldc << " " << *palpha << " " << *pbeta << "\n";
   typedef void (*functype)(int, int, int, const Scalar *, int, const Scalar *, int, Scalar *, int, Scalar);
-  functype func[12];
+  static functype func[12];

   static bool init = false;
   if(!init)
@@ -52,21 +53,29 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal
   Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
   Scalar beta  = *reinterpret_cast<Scalar*>(pbeta);

+  int code = OP(*opa) | (OP(*opb) << 2);
+  if(code>=12 || func[code]==0 || (*m<0) || (*n<0) || (*k<0))
+  {
+    int info = 1;
+    xerbla_("GEMM", &info, 4);
+    return 0;
+  }
+
   if(beta!=Scalar(1))
+    if(beta==Scalar(0))
+      matrix(c, *m, *n, *ldc).setZero();
+    else
       matrix(c, *m, *n, *ldc) *= beta;

-  int code = OP(*opa) | (OP(*opb) << 2);
-  if(code>=12 || func[code]==0)
-    return 0;
-
   func[code](*m, *n, *k, a, *lda, b, *ldb, c, *ldc, alpha);
-  return 1;
+  return 0;
 }
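gemm, trsm, trmm and syrk all share this dispatch pattern: the option characters are condensed by the OP/SIDE/UPLO/DIAG helpers from common.h into a small integer, which indexes a table of specialized kernel instantiations; out-of-range codes or empty slots are reported through xerbla_. Making the tables static is essential, since they are filled only on the first call (guarded by the static init flag). The encoding skeleton for the four-option case (placeholder function; gemm itself packs two OP fields instead):

    int encode(int op, int side, int uplo, int diag)
    {
      // 2 bits for op (NOTR/TR/ADJ), then 1 bit each for side, uplo, diag
      return op | (side << 2) | (uplo << 3) | (diag << 4);   // 0..31
    }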
 int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb)
 {
+  // std::cerr << "in trsm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << "," << *n << " " << *palpha << " " << *lda << " " << *ldb << "\n";
   typedef void (*functype)(int, int, const Scalar *, int, Scalar *, int);
-  functype func[32];
+  static functype func[32];

   static bool init = false;
   if(!init)
@@ -74,38 +83,38 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m,
     for(int k=0; k<32; ++k)
       func[k] = 0;

-    func[NOTR | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, UpperTriangular|0, false,ColMajor,ColMajor>::run);
-    func[TR   | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, UpperTriangular|0, false,RowMajor,ColMajor>::run);
-    func[ADJ  | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, UpperTriangular|0, Conj, RowMajor,ColMajor>::run);
-    func[NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,UpperTriangular|0, false,ColMajor,ColMajor>::run);
-    func[TR   | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,UpperTriangular|0, false,RowMajor,ColMajor>::run);
-    func[ADJ  | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,UpperTriangular|0, Conj, RowMajor,ColMajor>::run);
-    func[NOTR | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, LowerTriangular|0, false,ColMajor,ColMajor>::run);
-    func[TR   | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, LowerTriangular|0, false,RowMajor,ColMajor>::run);
-    func[ADJ  | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, LowerTriangular|0, Conj, RowMajor,ColMajor>::run);
-    func[NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,LowerTriangular|0, false,ColMajor,ColMajor>::run);
-    func[TR   | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,LowerTriangular|0, false,RowMajor,ColMajor>::run);
-    func[ADJ  | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,LowerTriangular|0, Conj, RowMajor,ColMajor>::run);
-    func[NOTR | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, UpperTriangular|UnitDiagBit,false,ColMajor,ColMajor>::run);
-    func[TR   | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, UpperTriangular|UnitDiagBit,false,RowMajor,ColMajor>::run);
-    func[ADJ  | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, UpperTriangular|UnitDiagBit,Conj, RowMajor,ColMajor>::run);
-    func[NOTR | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,UpperTriangular|UnitDiagBit,false,ColMajor,ColMajor>::run);
-    func[TR   | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,UpperTriangular|UnitDiagBit,false,RowMajor,ColMajor>::run);
-    func[ADJ  | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,UpperTriangular|UnitDiagBit,Conj, RowMajor,ColMajor>::run);
-    func[NOTR | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, LowerTriangular|UnitDiagBit,false,ColMajor,ColMajor>::run);
-    func[TR   | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, LowerTriangular|UnitDiagBit,false,RowMajor,ColMajor>::run);
-    func[ADJ  | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, LowerTriangular|UnitDiagBit,Conj, RowMajor,ColMajor>::run);
-    func[NOTR | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,LowerTriangular|UnitDiagBit,false,ColMajor,ColMajor>::run);
-    func[TR   | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,LowerTriangular|UnitDiagBit,false,RowMajor,ColMajor>::run);
-    func[ADJ  | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,LowerTriangular|UnitDiagBit,Conj, RowMajor,ColMajor>::run);
+    func[NOTR | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, Upper|0, false,ColMajor,ColMajor>::run);
+    func[TR   | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, Lower|0, false,RowMajor,ColMajor>::run);
+    func[ADJ  | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, Lower|0, Conj, RowMajor,ColMajor>::run);
+    func[NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,Upper|0, false,ColMajor,ColMajor>::run);
+    func[TR   | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,Lower|0, false,RowMajor,ColMajor>::run);
+    func[ADJ  | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,Lower|0, Conj, RowMajor,ColMajor>::run);
+    func[NOTR | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, Lower|0, false,ColMajor,ColMajor>::run);
+    func[TR   | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, Upper|0, false,RowMajor,ColMajor>::run);
+    func[ADJ  | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, Upper|0, Conj, RowMajor,ColMajor>::run);
+    func[NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,Lower|0, false,ColMajor,ColMajor>::run);
+    func[TR   | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,Upper|0, false,RowMajor,ColMajor>::run);
+    func[ADJ  | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,Upper|0, Conj, RowMajor,ColMajor>::run);
+    func[NOTR | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, Upper|UnitDiag,false,ColMajor,ColMajor>::run);
+    func[TR   | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, Lower|UnitDiag,false,RowMajor,ColMajor>::run);
+    func[ADJ  | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, Lower|UnitDiag,Conj, RowMajor,ColMajor>::run);
+    func[NOTR | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,Upper|UnitDiag,false,ColMajor,ColMajor>::run);
+    func[TR   | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,Lower|UnitDiag,false,RowMajor,ColMajor>::run);
+    func[ADJ  | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,Lower|UnitDiag,Conj, RowMajor,ColMajor>::run);
+    func[NOTR | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, Lower|UnitDiag,false,ColMajor,ColMajor>::run);
+    func[TR   | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, Upper|UnitDiag,false,RowMajor,ColMajor>::run);
+    func[ADJ  | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheLeft, Upper|UnitDiag,Conj, RowMajor,ColMajor>::run);
+    func[NOTR | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,Lower|UnitDiag,false,ColMajor,ColMajor>::run);
+    func[TR   | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,Upper|UnitDiag,false,RowMajor,ColMajor>::run);
+    func[ADJ  | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)] = (ei_triangular_solve_matrix<Scalar,OnTheRight,Upper|UnitDiag,Conj, RowMajor,ColMajor>::run);

     init = true;
   }
@@ -114,14 +123,23 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m,
   Scalar* b = reinterpret_cast<Scalar*>(pb);
   Scalar  alpha = *reinterpret_cast<Scalar*>(palpha);

-  // TODO handle alpha
-
   int code = OP(*opa) | (SIDE(*side) << 2) | (UPLO(*uplo) << 3) | (DIAG(*diag) << 4);
-  if(code>=32 || func[code]==0)
+  if(code>=32 || func[code]==0 || *m<0 || *n <0)
+  {
+    int info=1;
+    xerbla_("TRSM",&info,4);
     return 0;
+  }

+  if(SIDE(*side)==LEFT)
     func[code](*m, *n, a, *lda, b, *ldb);
-  return 1;
+  else
+    func[code](*n, *m, a, *lda, b, *ldb);
+
+  if(alpha!=Scalar(1))
+    matrix(b,*m,*n,*ldb) *= alpha;
+
+  return 0;
 }
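Scaling B once after the solve is enough to honour alpha (previously dropped outright, see the removed '// TODO handle alpha'), because TRSM computes

    X = alpha * op(A)^{-1} * B          (Side = 'L')

and the triangular solve is linear in its right-hand side, so

    op(A)^{-1} * (alpha*B) = alpha * (op(A)^{-1} * B).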
@@ -129,46 +147,46 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m,
 // b = alpha*b*op(a)  for side = 'R'or'r'
 int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb)
 {
+  // std::cerr << "in trmm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << " " << *n << " " << *lda << " " << *ldb << " " << *palpha << "\n";
   typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int, Scalar *, int, Scalar);
-  functype func[32];
+  static functype func[32];

   static bool init = false;
   if(!init)
   {
     for(int k=0; k<32; ++k)
       func[k] = 0;

-    func[NOTR | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,UpperTriangular|0, true, ColMajor,false,ColMajor,false,ColMajor>::run);
-    func[TR   | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,UpperTriangular|0, true, RowMajor,false,ColMajor,false,ColMajor>::run);
-    func[ADJ  | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,UpperTriangular|0, true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
-    func[NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,UpperTriangular|0, false,ColMajor,false,ColMajor,false,ColMajor>::run);
-    func[TR   | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,UpperTriangular|0, false,ColMajor,false,RowMajor,false,ColMajor>::run);
-    func[ADJ  | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,UpperTriangular|0, false,ColMajor,false,RowMajor,Conj, ColMajor>::run);
-    func[NOTR | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,LowerTriangular|0, true, ColMajor,false,ColMajor,false,ColMajor>::run);
-    func[TR   | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,LowerTriangular|0, true, RowMajor,false,ColMajor,false,ColMajor>::run);
-    func[ADJ  | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,LowerTriangular|0, true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
-    func[NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,LowerTriangular|0, false,ColMajor,false,ColMajor,false,ColMajor>::run);
-    func[TR   | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,LowerTriangular|0, false,ColMajor,false,RowMajor,false,ColMajor>::run);
-    func[ADJ  | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,LowerTriangular|0, false,ColMajor,false,RowMajor,Conj, ColMajor>::run);
-    func[NOTR | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,UpperTriangular|UnitDiagBit,true, ColMajor,false,ColMajor,false,ColMajor>::run);
-    func[TR   | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,UpperTriangular|UnitDiagBit,true, RowMajor,false,ColMajor,false,ColMajor>::run);
-    func[ADJ  | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,UpperTriangular|UnitDiagBit,true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
-    func[NOTR | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,UpperTriangular|UnitDiagBit,false,ColMajor,false,ColMajor,false,ColMajor>::run);
-    func[TR   | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,UpperTriangular|UnitDiagBit,false,ColMajor,false,RowMajor,false,ColMajor>::run);
-    func[ADJ  | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,UpperTriangular|UnitDiagBit,false,ColMajor,false,RowMajor,Conj, ColMajor>::run);
-    func[NOTR | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,LowerTriangular|UnitDiagBit,true, ColMajor,false,ColMajor,false,ColMajor>::run);
-    func[TR   | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,LowerTriangular|UnitDiagBit,true, RowMajor,false,ColMajor,false,ColMajor>::run);
-    func[ADJ  | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,LowerTriangular|UnitDiagBit,true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
-    func[NOTR | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,LowerTriangular|UnitDiagBit,false,ColMajor,false,ColMajor,false,ColMajor>::run);
-    func[TR   | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,LowerTriangular|UnitDiagBit,false,ColMajor,false,RowMajor,false,ColMajor>::run);
-    func[ADJ  | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,LowerTriangular|UnitDiagBit,false,ColMajor,false,RowMajor,Conj, ColMajor>::run);
+    func[NOTR | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Upper|0, true, ColMajor,false,ColMajor,false,ColMajor>::run);
+    func[TR   | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Lower|0, true, RowMajor,false,ColMajor,false,ColMajor>::run);
+    func[ADJ  | (LEFT  << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Lower|0, true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
+    func[NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Upper|0, false,ColMajor,false,ColMajor,false,ColMajor>::run);
+    func[TR   | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Lower|0, false,ColMajor,false,RowMajor,false,ColMajor>::run);
+    func[ADJ  | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Lower|0, false,ColMajor,false,RowMajor,Conj, ColMajor>::run);
+    func[NOTR | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Lower|0, true, ColMajor,false,ColMajor,false,ColMajor>::run);
+    func[TR   | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Upper|0, true, RowMajor,false,ColMajor,false,ColMajor>::run);
+    func[ADJ  | (LEFT  << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Upper|0, true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
+    func[NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Lower|0, false,ColMajor,false,ColMajor,false,ColMajor>::run);
+    func[TR   | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Upper|0, false,ColMajor,false,RowMajor,false,ColMajor>::run);
+    func[ADJ  | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Upper|0, false,ColMajor,false,RowMajor,Conj, ColMajor>::run);
+    func[NOTR | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Upper|UnitDiag,true, ColMajor,false,ColMajor,false,ColMajor>::run);
+    func[TR   | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Lower|UnitDiag,true, RowMajor,false,ColMajor,false,ColMajor>::run);
+    func[ADJ  | (LEFT  << 2) | (UP << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Lower|UnitDiag,true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
+    func[NOTR | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Upper|UnitDiag,false,ColMajor,false,ColMajor,false,ColMajor>::run);
+    func[TR   | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Lower|UnitDiag,false,ColMajor,false,RowMajor,false,ColMajor>::run);
+    func[ADJ  | (RIGHT << 2) | (UP << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Lower|UnitDiag,false,ColMajor,false,RowMajor,Conj, ColMajor>::run);
+    func[NOTR | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Lower|UnitDiag,true, ColMajor,false,ColMajor,false,ColMajor>::run);
+    func[TR   | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Upper|UnitDiag,true, RowMajor,false,ColMajor,false,ColMajor>::run);
+    func[ADJ  | (LEFT  << 2) | (LO << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Upper|UnitDiag,true, RowMajor,Conj, ColMajor,false,ColMajor>::run);
+    func[NOTR | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Lower|UnitDiag,false,ColMajor,false,ColMajor,false,ColMajor>::run);
+    func[TR   | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Upper|UnitDiag,false,ColMajor,false,RowMajor,false,ColMajor>::run);
+    func[ADJ  | (RIGHT << 2) | (LO << 3) | (UNIT  << 4)] = (ei_product_triangular_matrix_matrix<Scalar,Upper|UnitDiag,false,ColMajor,false,RowMajor,Conj, ColMajor>::run);

     init = true;
   }
@@ -178,10 +196,21 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m,
   Scalar alpha = *reinterpret_cast<Scalar*>(palpha);

   int code = OP(*opa) | (SIDE(*side) << 2) | (UPLO(*uplo) << 3) | (DIAG(*diag) << 4);
-  if(code>=32 || func[code]==0)
+  if(code>=32 || func[code]==0 || *m<0 || *n <0)
+  {
+    int info=1;
+    xerbla_("TRMM",&info,4);
     return 0;
+  }

-  func[code](*m, *n, a, *lda, b, *ldb, b, *ldb, alpha);
+  // FIXME find a way to avoid this copy
+  Matrix<Scalar,Dynamic,Dynamic> tmp = matrix(b,*m,*n,*ldb);
+  matrix(b,*m,*n,*ldb).setZero();
+
+  if(SIDE(*side)==LEFT)
+    func[code](*m, *n, a, *lda, tmp.data(), tmp.outerStride(), b, *ldb, alpha);
+  else
+    func[code](*n, *m, tmp.data(), tmp.outerStride(), a, *lda, b, *ldb, alpha);
+
   return 1;
 }
@@ -189,41 +218,45 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m,
 // c = alpha*b*a + beta*c  for side = 'R'or'r'
 int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc)
 {
+  // std::cerr << "in symm " << *side << " " << *uplo << " " << *m << "x" << *n << " lda:" << *lda << " ldb:" << *ldb << " ldc:" << *ldc << " alpha:" << *palpha << " beta:" << *pbeta << "\n";
   Scalar* a = reinterpret_cast<Scalar*>(pa);
   Scalar* b = reinterpret_cast<Scalar*>(pb);
   Scalar* c = reinterpret_cast<Scalar*>(pc);
   Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
   Scalar beta  = *reinterpret_cast<Scalar*>(pbeta);

+  if(*m<0 || *n<0)
+  {
+    int info=1;
+    xerbla_("SYMM",&info,4);
+    return 0;
+  }
+
   if(beta!=Scalar(1))
-    matrix(c, *m, *n, *ldc) *= beta;
+    if(beta==Scalar(0)) matrix(c, *m, *n, *ldc).setZero();
+    else                matrix(c, *m, *n, *ldc) *= beta;

   if(SIDE(*side)==LEFT)
-    if(UPLO(*uplo)==UP)
-      ei_product_selfadjoint_matrix<Scalar, RowMajor,true,false, ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
-    else if(UPLO(*uplo)==LO)
-      ei_product_selfadjoint_matrix<Scalar, ColMajor,true,false, ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
-    else
-      return 0;
+    if(UPLO(*uplo)==UP)      ei_product_selfadjoint_matrix<Scalar, RowMajor,true,false, ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
+    else if(UPLO(*uplo)==LO) ei_product_selfadjoint_matrix<Scalar, ColMajor,true,false, ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
+    else                     return 0;
   else if(SIDE(*side)==RIGHT)
-    if(UPLO(*uplo)==UP)
-      ei_product_selfadjoint_matrix<Scalar, ColMajor,false,false, RowMajor,true,false, ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);
-    else if(UPLO(*uplo)==LO)
-      ei_product_selfadjoint_matrix<Scalar, ColMajor,false,false, ColMajor,true,false, ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);
-    else
-      return 0;
+    if(UPLO(*uplo)==UP)      ei_product_selfadjoint_matrix<Scalar, ColMajor,false,false, RowMajor,true,false, ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);
+    else if(UPLO(*uplo)==LO) ei_product_selfadjoint_matrix<Scalar, ColMajor,false,false, ColMajor,true,false, ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);
+    else                     return 0;
   else
     return 0;

-  return 1;
+  return 0;
 }
 // c = alpha*a*a' + beta*c  for op = 'N'or'n'
 // c = alpha*a'*a + beta*c  for op = 'T'or't','C'or'c'
 int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pbeta, RealScalar *pc, int *ldc)
 {
+//   std::cerr << "in syrk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n";
   typedef void (*functype)(int, int, const Scalar *, int, Scalar *, int, Scalar);
-  functype func[8];
+  static functype func[8];
 
   static bool init = false;
   if(!init)
@@ -231,13 +264,13 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
     for(int k=0; k<8; ++k)
       func[k] = 0;
 
-    func[NOTR | (UP << 2)] = (ei_selfadjoint_product<Scalar,ColMajor,ColMajor,true, UpperTriangular>::run);
-    func[TR   | (UP << 2)] = (ei_selfadjoint_product<Scalar,RowMajor,ColMajor,false,UpperTriangular>::run);
-    func[ADJ  | (UP << 2)] = (ei_selfadjoint_product<Scalar,RowMajor,ColMajor,false,UpperTriangular>::run);
-    func[NOTR | (LO << 2)] = (ei_selfadjoint_product<Scalar,ColMajor,ColMajor,true, LowerTriangular>::run);
-    func[TR   | (LO << 2)] = (ei_selfadjoint_product<Scalar,RowMajor,ColMajor,false,LowerTriangular>::run);
-    func[ADJ  | (LO << 2)] = (ei_selfadjoint_product<Scalar,RowMajor,ColMajor,false,LowerTriangular>::run);
+    func[NOTR | (UP << 2)] = (ei_selfadjoint_product<Scalar,ColMajor,ColMajor,true, Upper>::run);
+    func[TR   | (UP << 2)] = (ei_selfadjoint_product<Scalar,RowMajor,ColMajor,false,Upper>::run);
+    func[ADJ  | (UP << 2)] = (ei_selfadjoint_product<Scalar,RowMajor,ColMajor,false,Upper>::run);
+    func[NOTR | (LO << 2)] = (ei_selfadjoint_product<Scalar,ColMajor,ColMajor,true, Lower>::run);
+    func[TR   | (LO << 2)] = (ei_selfadjoint_product<Scalar,RowMajor,ColMajor,false,Lower>::run);
+    func[ADJ  | (LO << 2)] = (ei_selfadjoint_product<Scalar,RowMajor,ColMajor,false,Lower>::run);
 
     init = true;
   }
@@ -248,14 +281,20 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
   Scalar beta  = *reinterpret_cast<Scalar*>(pbeta);
 
   int code = OP(*op) | (UPLO(*uplo) << 2);
-  if(code>=8 || func[code]==0)
+  if(code>=8 || func[code]==0 || *n<0 || *k<0)
+  {
+    int info=1;
+    xerbla_("SYRK",&info,4);
     return 0;
+  }
 
   if(beta!=Scalar(1))
-    matrix(c, *n, *n, *ldc) *= beta;
+    if(UPLO(*uplo)==UP) matrix(c, *n, *n, *ldc).triangularView<Upper>() *= beta;
+    else                matrix(c, *n, *n, *ldc).triangularView<Lower>() *= beta;
 
   func[code](*n, *k, a, *lda, c, *ldc, alpha);
 
-  return 1;
+  return 0;
 }
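
Aside on the dispatch scheme used by syrk (and herk below): OP(*op) maps 'N'/'T'/'C' to a small integer and UPLO(*uplo) maps 'U'/'L' to one bit, so code = op | (uplo << 2) indexes the 8-entry table initialised above; invalid characters produce out-of-range codes or land on null entries and are rejected. A sketch with assumed token values (the real definitions live elsewhere in blas/):

    enum { NOTR = 0, TR = 1, ADJ = 2 };  // hypothetical values, for illustration
    enum { UP = 0, LO = 1 };
    inline int dispatch_code(int op, int uplo) { return op | (uplo << 2); }
    // e.g. dispatch_code(ADJ, LO) == 6, selecting func[ADJ | (LO << 2)] above;
    // codes 3 and 7 are never produced, so those two slots stay null.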
 // c = alpha*a*b' + alpha*b*a' + beta*c  for op = 'N'or'n'
@@ -269,6 +308,7 @@ int EIGEN_BLAS_FUNC(syr2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal
   Scalar beta  = *reinterpret_cast<Scalar*>(pbeta);
 
   // TODO
+  std::cerr << "Eigen BLAS: _syr2k is not implemented yet\n";
   return 0;
 }
@@ -286,27 +326,30 @@ int EIGEN_BLAS_FUNC(hemm)(char *side, char *uplo, int *m, int *n, RealScalar *pa
   Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
   Scalar beta  = *reinterpret_cast<Scalar*>(pbeta);
 
+//   std::cerr << "in hemm " << *side << " " << *uplo << " " << *m << " " << *n << " " << alpha << " " << *lda << " " << beta << " " << *ldc << "\n";
+
+  if(*m<0 || *n<0)
+  {
+    return 0;
+  }
+
   if(beta!=Scalar(1))
     matrix(c, *m, *n, *ldc) *= beta;
 
   if(SIDE(*side)==LEFT)
-    if(UPLO(*uplo)==UP)
-      ei_product_selfadjoint_matrix<Scalar, RowMajor,true,Conj,  ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
-    else if(UPLO(*uplo)==LO)
-      ei_product_selfadjoint_matrix<Scalar, ColMajor,true,false, ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
-    else
-      return 0;
+    if(UPLO(*uplo)==UP)       ei_product_selfadjoint_matrix<Scalar, RowMajor,true,Conj,  ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
+    else if(UPLO(*uplo)==LO)  ei_product_selfadjoint_matrix<Scalar, ColMajor,true,false, ColMajor,false,false, ColMajor>::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha);
+    else                      return 0;
   else if(SIDE(*side)==RIGHT)
-    if(UPLO(*uplo)==UP)
-      ei_product_selfadjoint_matrix<Scalar, ColMajor,false,false, RowMajor,true,Conj,  ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);
-    else if(UPLO(*uplo)==LO)
-      ei_product_selfadjoint_matrix<Scalar, ColMajor,false,false, ColMajor,true,false, ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);
-    else
-      return 0;
+    if(UPLO(*uplo)==UP)       ei_product_selfadjoint_matrix<Scalar, ColMajor,false,false, RowMajor,true,Conj,  ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);
+    else if(UPLO(*uplo)==LO)  ei_product_selfadjoint_matrix<Scalar, ColMajor,false,false, ColMajor,true,false, ColMajor>::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha);
+    else                      return 0;
   else
+  {
     return 0;
+  }
 
-  return 1;
+  return 0;
 }
 // c = alpha*a*conj(a') + beta*c  for op = 'N'or'n'
@@ -314,7 +357,7 @@ int EIGEN_BLAS_FUNC(hemm)(char *side, char *uplo, int *m, int *n, RealScalar *pa
 int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pbeta, RealScalar *pc, int *ldc)
 {
   typedef void (*functype)(int, int, const Scalar *, int, Scalar *, int, Scalar);
-  functype func[8];
+  static functype func[8];
 
   static bool init = false;
   if(!init)
@@ -322,29 +365,46 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp
     for(int k=0; k<8; ++k)
       func[k] = 0;
 
-    func[NOTR | (UP << 2)] = (ei_selfadjoint_product<Scalar,ColMajor,ColMajor,true, UpperTriangular>::run);
-    func[ADJ  | (UP << 2)] = (ei_selfadjoint_product<Scalar,RowMajor,ColMajor,false,UpperTriangular>::run);
-    func[NOTR | (LO << 2)] = (ei_selfadjoint_product<Scalar,ColMajor,ColMajor,true, LowerTriangular>::run);
-    func[ADJ  | (LO << 2)] = (ei_selfadjoint_product<Scalar,RowMajor,ColMajor,false,LowerTriangular>::run);
+    func[NOTR | (UP << 2)] = (ei_selfadjoint_product<Scalar,ColMajor,ColMajor,true, Upper>::run);
+    func[ADJ  | (UP << 2)] = (ei_selfadjoint_product<Scalar,RowMajor,ColMajor,false,Upper>::run);
+    func[NOTR | (LO << 2)] = (ei_selfadjoint_product<Scalar,ColMajor,ColMajor,true, Lower>::run);
+    func[ADJ  | (LO << 2)] = (ei_selfadjoint_product<Scalar,RowMajor,ColMajor,false,Lower>::run);
 
     init = true;
   }
 
   Scalar* a = reinterpret_cast<Scalar*>(pa);
   Scalar* c = reinterpret_cast<Scalar*>(pc);
-  Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
-  Scalar beta  = *reinterpret_cast<Scalar*>(pbeta);
+  RealScalar alpha = *palpha;
+  RealScalar beta  = *pbeta;
+
+//   std::cerr << "in herk " << *uplo << " " << *op << " " << *n << " " << *k << " " << alpha << " " << *lda << " " << beta << " " << *ldc << "\n";
+
+  if(*n<0 || *k<0)
+  {
+    return 0;
+  }
 
   int code = OP(*op) | (UPLO(*uplo) << 2);
   if(code>=8 || func[code]==0)
     return 0;
 
-  if(beta!=Scalar(1))
-    matrix(c, *n, *n, *ldc) *= beta;
+  if(beta!=RealScalar(1))
+  {
+    if(UPLO(*uplo)==UP) matrix(c, *n, *n, *ldc).triangularView<StrictlyUpper>() *= beta;
+    else                matrix(c, *n, *n, *ldc).triangularView<StrictlyLower>() *= beta;
+
+    matrix(c, *n, *n, *ldc).diagonal().real() *= beta;
+    matrix(c, *n, *n, *ldc).diagonal().imag().setZero();
+  }
 
-  func[code](*n, *k, a, *lda, c, *ldc, alpha);
-  return 1;
+  if(*k>0 && alpha!=RealScalar(0))
+  {
+    func[code](*n, *k, a, *lda, c, *ldc, alpha);
+    matrix(c, *n, *n, *ldc).diagonal().imag().setZero();
+  }
+  return 0;
 }
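
For intuition (an aside, not part of the commit): HERK takes real alpha and beta, and c = beta*c + alpha*a*conj(a') is Hermitian with a real diagonal — which is why the code above scales the two strict triangles and the real part of the diagonal separately, then zeroes the diagonal's imaginary part. A dense-Eigen sketch of the same update:

    #include <Eigen/Dense>
    #include <complex>
    using namespace Eigen;

    int main()
    {
      int n = 3, k = 2;
      std::complex<double> alpha(1.5, 0.0), beta(0.5, 0.0);  // HERK's scalars are real
      MatrixXcd a = MatrixXcd::Random(n, k);
      MatrixXcd c = MatrixXcd::Random(n, n);
      c = (c + c.adjoint()).eval();              // start from a Hermitian c
      c = beta * c + alpha * (a * a.adjoint());  // result stays Hermitian, diagonal real
      return 0;
    }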
 // c = alpha*a*conj(b') + conj(alpha)*b*conj(a') + beta*c,  for op = 'N'or'n'
@@ -357,7 +417,13 @@ int EIGEN_BLAS_FUNC(her2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal
   Scalar alpha = *reinterpret_cast<Scalar*>(palpha);
   Scalar beta  = *reinterpret_cast<Scalar*>(pbeta);
 
+  if(*n<0 || *k<0)
+  {
+    return 0;
+  }
+
   // TODO
+  std::cerr << "Eigen BLAS: _her2k is not implemented yet\n";
   return 0;
 }
blas/xerbla.cpp (new file)
@@ -0,0 +1,17 @@
+#include <iostream>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+int xerbla_(char * msg, int *info, int)
+{
+  std::cerr << "Eigen BLAS ERROR #" << *info << ": " << msg << "\n";
+  return 0;
+}
+
+#ifdef __cplusplus
+}
+#endif
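
Usage note (an aside): the trailing unnamed int mirrors Fortran's hidden string-length argument; this implementation ignores it and relies on msg being NUL-terminated, which holds for the string literals the routines above pass. A hypothetical caller, matching the TRMM/SYRK call sites:

    extern "C" int xerbla_(char *msg, int *info, int len);

    void report_bad_first_argument()  // illustrative only
    {
      char msg[] = "TRMM";
      int info = 1;
      xerbla_(msg, &info, 4);  // prints: Eigen BLAS ERROR #1: TRMM
    }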
@@ -154,6 +154,12 @@ macro(ei_testing_print_summary)
     message("Default order: Column-major")
   endif()
 
+  if(EIGEN_TEST_NO_EXPLICIT_ALIGNMENT)
+    message("Explicit alignment (hence vectorization) disabled")
+  elseif(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION)
+    message("Explicit vectorization disabled (alignment kept enabled)")
+  else()
+
   if(EIGEN_TEST_SSE2)
     message("SSE2: ON")
   else()
@@ -185,17 +191,19 @@ macro(ei_testing_print_summary)
   endif()
 
   if(EIGEN_TEST_ALTIVEC)
-    message("Altivec: Using architecture defaults")
+    message("Altivec: ON")
   else()
     message("Altivec: Using architecture defaults")
   endif()
 
-  if(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION)
-    message("Explicit vec: OFF")
+  if(EIGEN_TEST_NEON)
+    message("ARM NEON: ON")
   else()
-    message("Explicit vec: Using architecture defaults")
+    message("ARM NEON: Using architecture defaults")
   endif()
 
+  endif() # vectorization / alignment options
+
   message("\n${EIGEN_TESTING_SUMMARY}")
   # message("CXX: ${CMAKE_CXX_COMPILER}")
   # if(CMAKE_COMPILER_IS_GNUCXX)
@@ -107,8 +107,11 @@ However there are a few corner cases where these alignment settings get overridd
 Two possibilities:
 <ul>
-  <li>Define EIGEN_DONT_ALIGN. That disables all 128-bit alignment code, and in particular everything vectorization-related. But do note that this in particular breaks ABI compatibility with vectorized code.</li>
-  <li>Or define both EIGEN_DONT_VECTORIZE and EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT. This keeps the 128-bit alignment code and thus preserves ABI compatibility.</li>
+  <li>Define EIGEN_DONT_ALIGN_STATICALLY. That disables all 128-bit static alignment code, while keeping 128-bit heap alignment. This has the effect of
+      disabling vectorization for fixed-size objects (like Matrix4d) while keeping vectorization of dynamic-size objects
+      (like MatrixXd). But do note that this breaks ABI compatibility with the default behavior of 128-bit static alignment.</li>
+  <li>Or define both EIGEN_DONT_VECTORIZE and EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT. This keeps the
+      128-bit alignment code and thus preserves ABI compatibility, but completely disables vectorization.</li>
 </ul>
 For more information, see <a href="http://eigen.tuxfamily.org/index.php?title=FAQ#I_disabled_vectorization.2C_but_I.27m_still_getting_annoyed_about_alignment_issues.21">this FAQ</a>.
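
A minimal illustration of the first option (a sketch, not taken from the documentation page): define the macro before the first Eigen include in each translation unit, or pass it on the compile line (e.g. -DEIGEN_DONT_ALIGN_STATICALLY):

    #define EIGEN_DONT_ALIGN_STATICALLY
    #include <Eigen/Core>

    Eigen::Matrix4d fixed_size;    // static storage: no longer 128-bit aligned, not vectorized
    Eigen::MatrixXd dynamic_size;  // heap storage: stays 128-bit aligned, still vectorized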
@@ -0,0 +1,5 @@
+int array[24];
+for(int i = 0; i < 24; ++i) array[i] = i;
+cout << Map<MatrixXi, 0, Stride<Dynamic,2> >
+         (array, 3, 3, Stride<Dynamic,2>(8, 2))
+     << endl;
@@ -0,0 +1,5 @@
+int array[12];
+for(int i = 0; i < 12; ++i) array[i] = i;
+cout << Map<VectorXi, 0, InnerStride<2> >
+         (array, 6)  // the inner stride has already been passed as template parameter
+     << endl;
@@ -0,0 +1,5 @@
+int array[12];
+for(int i = 0; i < 12; ++i) array[i] = i;
+cout << Map<MatrixXi, 0, OuterStride<Dynamic> >
+         (array, 3, 3, OuterStride<Dynamic>(4))
+     << endl;
@@ -0,0 +1,3 @@
+int array[9];
+for(int i = 0; i < 9; ++i) array[i] = i;
+cout << Map<Matrix3i>(array) << endl;
@@ -104,16 +104,18 @@ ei_add_test(cwiseop)
 ei_add_test(unalignedcount)
 ei_add_test(redux)
 ei_add_test(visitor)
+ei_add_test(block)
 ei_add_test(product_small)
 ei_add_test(product_large)
 ei_add_test(product_extra)
 ei_add_test(diagonalmatrices)
 ei_add_test(adjoint)
-ei_add_test(submatrices)
+ei_add_test(diagonal)
 ei_add_test(miscmatrices)
 ei_add_test(commainitializer)
 ei_add_test(smallvectors)
 ei_add_test(map)
+ei_add_test(mapstride)
 ei_add_test(array)
 ei_add_test(array_for_matrix)
 ei_add_test(array_replicate)
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // Eigen is free software; you can redistribute it and/or
 // modify it under the terms of the GNU Lesser General Public
@@ -51,16 +51,18 @@ template<typename Scalar> struct CheckMinor<Scalar,1,1>
     CheckMinor(MatrixType&, int, int) {}
 };
 
-template<typename MatrixType> void submatrices(const MatrixType& m)
+template<typename MatrixType> void block(const MatrixType& m)
 {
   /* this test covers the following files:
-     Row.h Column.h Block.h Minor.h DiagonalCoeffs.h
+     Row.h Column.h Block.h Minor.h
   */
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
   typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
   typedef Matrix<Scalar, 1, MatrixType::ColsAtCompileTime> RowVectorType;
-  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType;
+  typedef Matrix<Scalar, Dynamic, Dynamic> DynamicMatrixType;
+  typedef Matrix<Scalar, Dynamic, 1> DynamicVectorType;
 
   int rows = m.rows();
   int cols = m.cols();
@@ -69,8 +71,6 @@ template<typename MatrixType> void submatrices(const MatrixType& m)
              m3(rows, cols),
              mzero = MatrixType::Zero(rows, cols),
              ones = MatrixType::Ones(rows, cols);
-  SquareMatrixType identity = SquareMatrixType::Identity(rows, rows),
-                   square = SquareMatrixType::Random(rows, rows);
   VectorType v1 = VectorType::Random(rows),
              v2 = VectorType::Random(rows),
              v3 = VectorType::Random(rows),
@@ -84,20 +84,19 @@ template<typename MatrixType> void submatrices(const MatrixType& m)
   int c2 = ei_random<int>(c1,cols-1);
 
   //check row() and col()
-  VERIFY_IS_APPROX(m1.col(c1).transpose(), m1.transpose().row(c1));
-  // FIXME perhaps we should re-enable that without the .eval()
-  VERIFY_IS_APPROX(m1.col(c1).dot(square.row(r1)), (square * m1.conjugate()).eval()(r1,c1));
+  VERIFY_IS_EQUAL(m1.col(c1).transpose(), m1.transpose().row(c1));
   //check operator(), both constant and non-constant, on row() and col()
   m1.row(r1) += s1 * m1.row(r2);
   m1.col(c1) += s1 * m1.col(c2);
 
   //check block()
   Matrix<Scalar,Dynamic,Dynamic> b1(1,1); b1(0,0) = m1(r1,c1);
   RowVectorType br1(m1.block(r1,0,1,cols));
   VectorType bc1(m1.block(0,c1,rows,1));
-  VERIFY_IS_APPROX(b1, m1.block(r1,c1,1,1));
-  VERIFY_IS_APPROX(m1.row(r1), br1);
-  VERIFY_IS_APPROX(m1.col(c1), bc1);
+  VERIFY_IS_EQUAL(b1, m1.block(r1,c1,1,1));
+  VERIFY_IS_EQUAL(m1.row(r1), br1);
+  VERIFY_IS_EQUAL(m1.col(c1), bc1);
   //check operator(), both constant and non-constant, on block()
   m1.block(r1,c1,r2-r1+1,c2-c1+1) = s1 * m2.block(0, 0, r2-r1+1,c2-c1+1);
   m1.block(r1,c1,r2-r1+1,c2-c1+1)(r2-r1,c2-c1) = m2.block(0, 0, r2-r1+1,c2-c1+1)(0,0);
@@ -105,11 +104,6 @@ template<typename MatrixType> void submatrices(const MatrixType& m)
   //check minor()
   CheckMinor<Scalar, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime> checkminor(m1,r1,c1);
 
-  //check diagonal()
-  VERIFY_IS_APPROX(m1.diagonal(), m1.transpose().diagonal());
-  m2.diagonal() = 2 * m1.diagonal();
-  m2.diagonal()[0] *= 3;
-
   const int BlockRows = EIGEN_ENUM_MIN(MatrixType::RowsAtCompileTime,2);
   const int BlockCols = EIGEN_ENUM_MIN(MatrixType::ColsAtCompileTime,5);
   if (rows>=5 && cols>=8)
@@ -120,45 +114,23 @@ template<typename MatrixType> void submatrices(const MatrixType& m)
     m1.template block<BlockRows,BlockCols>(1,1)(0, 3) = m1.template block<2,5>(1,1)(1,2);
     // check that fixed block() and block() agree
     Matrix<Scalar,Dynamic,Dynamic> b = m1.template block<BlockRows,BlockCols>(3,3);
-    VERIFY_IS_APPROX(b, m1.block(3,3,BlockRows,BlockCols));
+    VERIFY_IS_EQUAL(b, m1.block(3,3,BlockRows,BlockCols));
   }
 
   if (rows>2)
   {
     // test sub vectors
-    VERIFY_IS_APPROX(v1.template head<2>(), v1.block(0,0,2,1));
-    VERIFY_IS_APPROX(v1.template head<2>(), v1.head(2));
-    VERIFY_IS_APPROX(v1.template head<2>(), v1.segment(0,2));
-    VERIFY_IS_APPROX(v1.template head<2>(), v1.template segment<2>(0));
+    VERIFY_IS_EQUAL(v1.template head<2>(), v1.block(0,0,2,1));
+    VERIFY_IS_EQUAL(v1.template head<2>(), v1.head(2));
+    VERIFY_IS_EQUAL(v1.template head<2>(), v1.segment(0,2));
+    VERIFY_IS_EQUAL(v1.template head<2>(), v1.template segment<2>(0));
     int i = rows-2;
-    VERIFY_IS_APPROX(v1.template tail<2>(), v1.block(i,0,2,1));
-    VERIFY_IS_APPROX(v1.template tail<2>(), v1.tail(2));
-    VERIFY_IS_APPROX(v1.template tail<2>(), v1.segment(i,2));
-    VERIFY_IS_APPROX(v1.template tail<2>(), v1.template segment<2>(i));
+    VERIFY_IS_EQUAL(v1.template tail<2>(), v1.block(i,0,2,1));
+    VERIFY_IS_EQUAL(v1.template tail<2>(), v1.tail(2));
+    VERIFY_IS_EQUAL(v1.template tail<2>(), v1.segment(i,2));
+    VERIFY_IS_EQUAL(v1.template tail<2>(), v1.template segment<2>(i));
     i = ei_random(0,rows-2);
-    VERIFY_IS_APPROX(v1.segment(i,2), v1.template segment<2>(i));
-
-    enum {
-      N1 = MatrixType::RowsAtCompileTime>1 ?  1 : 0,
-      N2 = MatrixType::RowsAtCompileTime>2 ? -2 : 0
-    };
-
-    // check sub/super diagonal
-    m2.template diagonal<N1>() = 2 * m1.template diagonal<N1>();
-    m2.template diagonal<N1>()[0] *= 3;
-    VERIFY_IS_APPROX(m2.template diagonal<N1>()[0], static_cast<Scalar>(6) * m1.template diagonal<N1>()[0]);
-    m2.template diagonal<N2>() = 2 * m1.template diagonal<N2>();
-    m2.template diagonal<N2>()[0] *= 3;
-    VERIFY_IS_APPROX(m2.template diagonal<N2>()[0], static_cast<Scalar>(6) * m1.template diagonal<N2>()[0]);
-
-    m2.diagonal(N1) = 2 * m1.diagonal(N1);
-    m2.diagonal(N1)[0] *= 3;
-    VERIFY_IS_APPROX(m2.diagonal(N1)[0], static_cast<Scalar>(6) * m1.diagonal(N1)[0]);
-
-    m2.diagonal(N2) = 2 * m1.diagonal(N2);
-    m2.diagonal(N2)[0] *= 3;
-    VERIFY_IS_APPROX(m2.diagonal(N2)[0], static_cast<Scalar>(6) * m1.diagonal(N2)[0]);
+    VERIFY_IS_EQUAL(v1.segment(i,2), v1.template segment<2>(i));
   }
 
   // stress some basic stuffs with block matrices
@@ -167,6 +139,49 @@ template<typename MatrixType> void submatrices(const MatrixType& m)
   VERIFY(ei_real(ones.col(c1).dot(ones.col(c2))) == RealScalar(rows));
   VERIFY(ei_real(ones.row(r1).dot(ones.row(r2))) == RealScalar(cols));
+
+  // now test some block-inside-of-block.
+  // expressions with direct access
+  VERIFY_IS_EQUAL( (m1.block(r1,c1,rows-r1,cols-c1).block(r2-r1,c2-c1,rows-r2,cols-c2)) , (m1.block(r2,c2,rows-r2,cols-c2)) );
+  VERIFY_IS_EQUAL( (m1.block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , (m1.row(r1).segment(c1,c2-c1+1)) );
+  VERIFY_IS_EQUAL( (m1.block(r1,c1,r2-r1+1,c2-c1+1).col(0)) , (m1.col(c1).segment(r1,r2-r1+1)) );
+  VERIFY_IS_EQUAL( (m1.block(r1,c1,r2-r1+1,c2-c1+1).transpose().col(0)) , (m1.row(r1).segment(c1,c2-c1+1)) );
+  VERIFY_IS_EQUAL( (m1.transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0)) , (m1.row(r1).segment(c1,c2-c1+1)) );
+
+  // expressions without direct access
+  VERIFY_IS_EQUAL( ((m1+m2).block(r1,c1,rows-r1,cols-c1).block(r2-r1,c2-c1,rows-r2,cols-c2)) , ((m1+m2).block(r2,c2,rows-r2,cols-c2)) );
+  VERIFY_IS_EQUAL( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)) );
+  VERIFY_IS_EQUAL( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).col(0)) , ((m1+m2).col(c1).segment(r1,r2-r1+1)) );
+  VERIFY_IS_EQUAL( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).transpose().col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)) );
+  VERIFY_IS_EQUAL( ((m1+m2).transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)) );
+
+  // evaluation into plain matrices from expressions with direct access (stress MapBase)
+  DynamicMatrixType dm;
+  DynamicVectorType dv;
+  dm.setZero();
+  dm = m1.block(r1,c1,rows-r1,cols-c1).block(r2-r1,c2-c1,rows-r2,cols-c2);
+  VERIFY_IS_EQUAL(dm, (m1.block(r2,c2,rows-r2,cols-c2)));
+  dm.setZero();
+  dv.setZero();
+  dm = m1.block(r1,c1,r2-r1+1,c2-c1+1).row(0).transpose();
+  dv = m1.row(r1).segment(c1,c2-c1+1);
+  VERIFY_IS_EQUAL(dv, dm);
+  dm.setZero();
+  dv.setZero();
+  dm = m1.col(c1).segment(r1,r2-r1+1);
+  dv = m1.block(r1,c1,r2-r1+1,c2-c1+1).col(0);
+  VERIFY_IS_EQUAL(dv, dm);
+  dm.setZero();
+  dv.setZero();
+  dm = m1.block(r1,c1,r2-r1+1,c2-c1+1).transpose().col(0);
+  dv = m1.row(r1).segment(c1,c2-c1+1);
+  VERIFY_IS_EQUAL(dv, dm);
+  dm.setZero();
+  dv.setZero();
+  dm = m1.row(r1).segment(c1,c2-c1+1).transpose();
+  dv = m1.transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0);
+  VERIFY_IS_EQUAL(dv, dm);
 }
@@ -176,18 +191,30 @@ void compare_using_data_and_stride(const MatrixType& m)
   int rows = m.rows();
   int cols = m.cols();
   int size = m.size();
-  int stride = m.stride();
+  int innerStride = m.innerStride();
+  int outerStride = m.outerStride();
+  int rowStride = m.rowStride();
+  int colStride = m.colStride();
   const typename MatrixType::Scalar* data = m.data();
 
   for(int j=0;j<cols;++j)
     for(int i=0;i<rows;++i)
-      VERIFY_IS_APPROX(m.coeff(i,j), data[(MatrixType::Flags&RowMajorBit) ? i*stride+j : j*stride + i]);
+      VERIFY(m.coeff(i,j) == data[i*rowStride + j*colStride]);
+
+  if(!MatrixType::IsVectorAtCompileTime)
+  {
+    for(int j=0;j<cols;++j)
+      for(int i=0;i<rows;++i)
+        VERIFY(m.coeff(i,j) == data[(MatrixType::Flags&RowMajorBit)
+                                    ? i*outerStride + j*innerStride
+                                    : j*outerStride + i*innerStride]);
+  }
 
   if(MatrixType::IsVectorAtCompileTime)
   {
-    VERIFY_IS_APPROX(stride, int((&m.coeff(1))-(&m.coeff(0))));
+    VERIFY(innerStride == int((&m.coeff(1))-(&m.coeff(0))));
     for (int i=0;i<size;++i)
-      VERIFY_IS_APPROX(m.coeff(i), data[i*stride]);
+      VERIFY(m.coeff(i) == data[i*innerStride]);
   }
 }
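
Aside (not part of the commit): the rewritten test distinguishes four stride accessors. For a plain dynamic matrix in the default column-major layout they reduce to the following identities — a sketch under that assumption:

    #include <Eigen/Core>
    #include <cassert>
    using namespace Eigen;

    int main()
    {
      MatrixXf m(4, 5);                            // column-major by default
      assert(m.innerStride() == 1);                // step between coefficients in a column
      assert(m.outerStride() == m.rows());         // step between consecutive columns
      assert(m.rowStride()   == m.innerStride());  // moving one row down
      assert(m.colStride()   == m.outerStride());  // moving one column right
      return 0;
    }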
@@ -211,19 +238,21 @@ void data_and_stride(const MatrixType& m)
   compare_using_data_and_stride(m1.col(c1).transpose());
 }
 
-void test_submatrices()
+void test_block()
 {
   for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( submatrices(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST_2( submatrices(Matrix4d()) );
-    CALL_SUBTEST_3( submatrices(MatrixXcf(3, 3)) );
-    CALL_SUBTEST_4( submatrices(MatrixXi(8, 12)) );
-    CALL_SUBTEST_5( submatrices(MatrixXcd(20, 20)) );
-    CALL_SUBTEST_6( submatrices(MatrixXf(20, 20)) );
-    CALL_SUBTEST_8( submatrices(Matrix<float,Dynamic,4>(3, 4)) );
+    CALL_SUBTEST_1( block(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( block(Matrix4d()) );
+    CALL_SUBTEST_3( block(MatrixXcf(3, 3)) );
+    CALL_SUBTEST_4( block(MatrixXi(8, 12)) );
+    CALL_SUBTEST_5( block(MatrixXcd(20, 20)) );
+    CALL_SUBTEST_6( block(MatrixXf(20, 20)) );
+    CALL_SUBTEST_8( block(Matrix<float,Dynamic,4>(3, 4)) );
 
+#ifndef EIGEN_DEFAULT_TO_ROW_MAJOR
     CALL_SUBTEST_6( data_and_stride(MatrixXf(ei_random(5,50), ei_random(5,50))) );
     CALL_SUBTEST_7( data_and_stride(Matrix<int,Dynamic,Dynamic,RowMajor>(ei_random(5,50), ei_random(5,50))) );
+#endif
   }
 }
@@ -95,7 +95,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
   {
     LLT<SquareMatrixType,Lower> chollo(symmLo);
-    VERIFY_IS_APPROX(symm, chollo.matrixL().toDenseMatrix() * chollo.matrixL().adjoint().toDenseMatrix());
+    VERIFY_IS_APPROX(symm, chollo.reconstructedMatrix());
     vecX = chollo.solve(vecB);
     VERIFY_IS_APPROX(symm * vecX, vecB);
     matX = chollo.solve(matB);
@@ -103,7 +103,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
 
     // test the upper mode
     LLT<SquareMatrixType,Upper> cholup(symmUp);
-    VERIFY_IS_APPROX(symm, cholup.matrixL().toDenseMatrix() * cholup.matrixL().adjoint().toDenseMatrix());
+    VERIFY_IS_APPROX(symm, cholup.reconstructedMatrix());
     vecX = cholup.solve(vecB);
     VERIFY_IS_APPROX(symm * vecX, vecB);
     matX = cholup.solve(matB);
@@ -119,8 +119,7 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
   {
     LDLT<SquareMatrixType> ldlt(symm);
-    // TODO(keir): This doesn't make sense now that LDLT pivots.
-    //VERIFY_IS_APPROX(symm, ldlt.matrixL() * ldlt.vectorD().asDiagonal() * ldlt.matrixL().adjoint());
+    VERIFY_IS_APPROX(symm, ldlt.reconstructedMatrix());
     vecX = ldlt.solve(vecB);
     VERIFY_IS_APPROX(symm * vecX, vecB);
     matX = ldlt.solve(matB);
test/diagonal.cpp (new file)
@@ -0,0 +1,81 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#include "main.h"
+
+template<typename MatrixType> void diagonal(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+  typedef Matrix<Scalar, 1, MatrixType::ColsAtCompileTime> RowVectorType;
+
+  int rows = m.rows();
+  int cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols),
+             m2 = MatrixType::Random(rows, cols);
+
+  //check diagonal()
+  VERIFY_IS_APPROX(m1.diagonal(), m1.transpose().diagonal());
+  m2.diagonal() = 2 * m1.diagonal();
+  m2.diagonal()[0] *= 3;
+
+  if (rows>2)
+  {
+    enum {
+      N1 = MatrixType::RowsAtCompileTime>1 ?  1 : 0,
+      N2 = MatrixType::RowsAtCompileTime>2 ? -2 : 0
+    };
+
+    // check sub/super diagonal
+    m2.template diagonal<N1>() = 2 * m1.template diagonal<N1>();
+    m2.template diagonal<N1>()[0] *= 3;
+    VERIFY_IS_APPROX(m2.template diagonal<N1>()[0], static_cast<Scalar>(6) * m1.template diagonal<N1>()[0]);
+    m2.template diagonal<N2>() = 2 * m1.template diagonal<N2>();
+    m2.template diagonal<N2>()[0] *= 3;
+    VERIFY_IS_APPROX(m2.template diagonal<N2>()[0], static_cast<Scalar>(6) * m1.template diagonal<N2>()[0]);
+
+    m2.diagonal(N1) = 2 * m1.diagonal(N1);
+    m2.diagonal(N1)[0] *= 3;
+    VERIFY_IS_APPROX(m2.diagonal(N1)[0], static_cast<Scalar>(6) * m1.diagonal(N1)[0]);
+
+    m2.diagonal(N2) = 2 * m1.diagonal(N2);
+    m2.diagonal(N2)[0] *= 3;
+    VERIFY_IS_APPROX(m2.diagonal(N2)[0], static_cast<Scalar>(6) * m1.diagonal(N2)[0]);
+  }
+}
+
+void test_diagonal()
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( diagonal(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST_2( diagonal(Matrix4d()) );
+    CALL_SUBTEST_2( diagonal(MatrixXcf(3, 3)) );
+    CALL_SUBTEST_2( diagonal(MatrixXi(8, 12)) );
+    CALL_SUBTEST_2( diagonal(MatrixXcd(20, 20)) );
+    CALL_SUBTEST_1( diagonal(MatrixXf(21, 19)) );
+    CALL_SUBTEST_1( diagonal(Matrix<float,Dynamic,4>(3, 4)) );
+  }
+}
Some files were not shown because too many files have changed in this diff.