From b73e22905dd1544830b41463576d112c53fc66d9 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Thu, 18 Feb 2010 20:42:38 -0500 Subject: [PATCH 001/122] miserable half-working state, commiting to a fork just in case, just to perfect my day, my hard disk would die. Will write a more detailed commit message once it's working. --- Eigen/Core | 25 +---- Eigen/src/Core/Assign.h | 75 ++++++++++----- Eigen/src/Core/Block.h | 37 ++++++-- Eigen/src/Core/DenseBase.h | 62 ++++++++---- Eigen/src/Core/DenseStorageBase.h | 8 +- Eigen/src/Core/Map.h | 57 ++++++++--- Eigen/src/Core/MapBase.h | 94 +++++++++++-------- Eigen/src/Core/Matrix.h | 8 +- Eigen/src/Core/Product.h | 4 +- Eigen/src/Core/ReturnByValue.h | 11 +-- Eigen/src/Core/Transpose.h | 6 +- Eigen/src/Core/products/GeneralMatrixMatrix.h | 7 +- Eigen/src/Core/util/ForwardDeclarations.h | 7 +- Eigen/src/Core/util/StaticAssert.h | 7 +- Eigen/src/Core/util/XprHelper.h | 4 +- test/submatrices.cpp | 29 ++++-- 16 files changed, 279 insertions(+), 162 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 18b0fafa9..b5eb91023 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -2,7 +2,7 @@ // for linear algebra. // // Copyright (C) 2008 Gael Guennebaud -// Copyright (C) 2007-2009 Benoit Jacob +// Copyright (C) 2007-2010 Benoit Jacob // // Eigen is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public @@ -188,6 +188,7 @@ struct Dense {}; #include "src/Core/Dot.h" #include "src/Core/StableNorm.h" #include "src/Core/MapBase.h" +#include "src/Core/Stride.h" #include "src/Core/Map.h" #include "src/Core/Block.h" #include "src/Core/VectorBlock.h" @@ -222,28 +223,6 @@ struct Dense {}; #include "src/Core/products/TriangularSolverMatrix.h" #include "src/Core/BandMatrix.h" -/** \defgroup Array_Module Array module - * \ingroup Core_Module - * This module provides several handy features to manipulate matrices as simple array of values. 
- * In addition to listed classes, it defines various methods of the Cwise interface - * (accessible from MatrixBase::cwise()), including: - * - matrix-scalar sum, - * - coeff-wise comparison operators, - * - sin, cos, sqrt, pow, exp, log, square, cube, inverse (reciprocal). - * - * This module also provides various MatrixBase methods, including: - * - boolean reductions: \ref MatrixBase::all() "all", \ref MatrixBase::any() "any", \ref MatrixBase::count() "count", - * - \ref MatrixBase::Random() "random matrix initialization", - * - a \ref MatrixBase::select() "select" function mimicking the trivariate ?: operator, - * - \ref MatrixBase::colwise() "column-wise" and \ref MatrixBase::rowwise() "row-wise" reductions, - * - \ref MatrixBase::reverse() "matrix reverse", - * - \ref MatrixBase::lpNorm() "generic matrix norm". - * - * \code - * #include - * \endcode - */ - #include "src/Array/Functors.h" #include "src/Array/BooleanRedux.h" #include "src/Array/Select.h" diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h index 9440cebf1..174fd0080 100644 --- a/Eigen/src/Core/Assign.h +++ b/Eigen/src/Core/Assign.h @@ -37,19 +37,20 @@ struct ei_assign_traits public: enum { DstIsAligned = Derived::Flags & AlignedBit, + DstHasDirectAccess = Derived::Flags & DirectAccessBit, SrcIsAligned = OtherDerived::Flags & AlignedBit, - SrcAlignment = DstIsAligned && SrcIsAligned ? Aligned : Unaligned + JointAlignment = DstIsAligned && SrcIsAligned ? Aligned : Unaligned }; private: enum { - InnerSize = int(Derived::Flags)&RowMajorBit - ? Derived::ColsAtCompileTime - : Derived::RowsAtCompileTime, - InnerMaxSize = int(Derived::Flags)&RowMajorBit - ? Derived::MaxColsAtCompileTime - : Derived::MaxRowsAtCompileTime, - MaxSizeAtCompileTime = ei_size_at_compile_time::ret, + InnerSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::SizeAtCompileTime) + : int(Derived::Flags)&RowMajorBit ? 
int(Derived::ColsAtCompileTime) + : int(Derived::RowsAtCompileTime), + InnerMaxSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::MaxSizeAtCompileTime) + : int(Derived::Flags)&RowMajorBit ? int(Derived::MaxColsAtCompileTime) + : int(Derived::MaxRowsAtCompileTime), + MaxSizeAtCompileTime = Derived::SizeAtCompileTime, PacketSize = ei_packet_traits::size }; @@ -60,11 +61,11 @@ private: MayInnerVectorize = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0 && int(DstIsAligned) && int(SrcIsAligned), MayLinearize = StorageOrdersAgree && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit), - MayLinearVectorize = MightVectorize && MayLinearize - && (DstIsAligned || MaxSizeAtCompileTime == Dynamic), + MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess + && (DstIsAligned || MaxSizeAtCompileTime == Dynamic), /* If the destination isn't aligned, we have to do runtime checks and we don't unroll, so it's only good for large enough sizes. */ - MaySliceVectorize = MightVectorize && int(InnerMaxSize)>=3*PacketSize + MaySliceVectorize = MightVectorize && DstHasDirectAccess && int(InnerMaxSize)>=3*PacketSize /* slice vectorization can be slow, so we only want it if the slices are big, which is indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block in a fixed-size matrix */ @@ -108,12 +109,13 @@ public: { EIGEN_DEBUG_VAR(DstIsAligned) EIGEN_DEBUG_VAR(SrcIsAligned) - EIGEN_DEBUG_VAR(SrcAlignment) + EIGEN_DEBUG_VAR(JointAlignment) EIGEN_DEBUG_VAR(InnerSize) EIGEN_DEBUG_VAR(InnerMaxSize) EIGEN_DEBUG_VAR(PacketSize) EIGEN_DEBUG_VAR(StorageOrdersAgree) EIGEN_DEBUG_VAR(MightVectorize) + EIGEN_DEBUG_VAR(MayLinearize) EIGEN_DEBUG_VAR(MayInnerVectorize) EIGEN_DEBUG_VAR(MayLinearVectorize) EIGEN_DEBUG_VAR(MaySliceVectorize) @@ -211,12 +213,12 @@ struct ei_assign_innervec_CompleteUnrolling col = int(Derived1::Flags)&RowMajorBit ? 
Index % int(Derived1::ColsAtCompileTime) : Index / Derived1::RowsAtCompileTime, - SrcAlignment = ei_assign_traits::SrcAlignment + JointAlignment = ei_assign_traits::JointAlignment }; EIGEN_STRONG_INLINE static void run(Derived1 &dst, const Derived2 &src) { - dst.template copyPacket(row, col, src); + dst.template copyPacket(row, col, src); ei_assign_innervec_CompleteUnrolling::size, Stop>::run(dst, src); } @@ -265,16 +267,29 @@ struct ei_assign_impl { inline static void run(Derived1 &dst, const Derived2 &src) { - const int innerSize = dst.innerSize(); - const int outerSize = dst.outerSize(); - for(int j = 0; j < outerSize; ++j) - for(int i = 0; i < innerSize; ++i) - { - if(int(Derived1::Flags)&RowMajorBit) - dst.copyCoeff(j, i, src); - else - dst.copyCoeff(i, j, src); - } + if(Derived1::ColsAtCompileTime == 1) + { + for(int i = 0; i < dst.rows(); ++i) + dst.copyCoeff(i, 0, src); + } + else if(Derived1::RowsAtCompileTime == 1) + { + for(int i = 0; i < dst.cols(); ++i) + dst.copyCoeff(0, i, src); + } + else + { + const int innerSize = dst.innerSize(); + const int outerSize = dst.outerSize(); + for(int j = 0; j < outerSize; ++j) + for(int i = 0; i < innerSize; ++i) + { + if(int(Derived1::Flags)&RowMajorBit) + dst.copyCoeff(j, i, src); + else + dst.copyCoeff(i, j, src); + } + } } }; @@ -418,7 +433,7 @@ struct ei_assign_impl::SrcAlignment>(index, src); + dst.template copyPacket::JointAlignment>(index, src); } ei_unaligned_assign_impl<>::run(src,dst,alignedEnd,size); @@ -452,7 +467,7 @@ struct ei_assign_impl const int packetAlignedMask = packetSize - 1; const int innerSize = dst.innerSize(); const int outerSize = dst.outerSize(); - const int alignedStep = (packetSize - dst.stride() % packetSize) & packetAlignedMask; + const int alignedStep = (packetSize - dst.outerStride() % packetSize) & packetAlignedMask; int alignedStart = ei_assign_traits::DstIsAligned ? 
0 : ei_first_aligned(&dst.coeffRef(0,0), innerSize); @@ -504,6 +519,14 @@ EIGEN_STRONG_INLINE Derived& DenseBase EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived,OtherDerived) EIGEN_STATIC_ASSERT((ei_is_same_type::ret), YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + if(Derived::ColsAtCompileTime == 1) + { + ei_assert(OtherDerived::RowsAtCompileTime == 1 || other.cols() == 1); + } + if(Derived::RowsAtCompileTime == 1) + { + ei_assert(OtherDerived::ColsAtCompileTime == 1 || other.rows() == 1); + } #ifdef EIGEN_DEBUG_ASSIGN ei_assign_traits::debug(); #endif diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 3b4234c22..8a7aea91f 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -80,6 +80,20 @@ struct ei_traits > }; }; +template +struct ei_traits > : ei_traits > +{ + enum { + InnerStrideAtCompileTime = + (BlockRows==1 && !(int(MatrixType::Flags)&RowMajorBit)) + || (BlockCols==1 && (int(MatrixType::Flags)&RowMajorBit)) + ? MatrixType::OuterStrideAtCompileTime + : MatrixType::InnerStrideAtCompileTime, + OuterStrideAtCompileTime = + (BlockRows==1||BlockCols==1) ? 0 : MatrixType::OuterStrideAtCompileTime + }; +}; + template class Block : public MatrixType::template MakeBase< Block >::Type { @@ -196,8 +210,8 @@ template && startCol >= 0 && blockCols >= 0 && startCol + blockCols <= matrix.cols()); } - /** \sa MapBase::stride() */ - inline int stride() const + /** \sa MapBase::innerStride() */ + inline int innerStride() const { - return ((!Base::IsVectorAtCompileTime) - || (BlockRows==1 && ((Flags&RowMajorBit)==0)) - || (BlockCols==1 && ((Flags&RowMajorBit)==RowMajorBit))) - ? m_matrix.stride() : 1; + return (RowsAtCompileTime==1 && !(int(MatrixType::Flags)&RowMajorBit)) + || (ColsAtCompileTime==1 && (int(MatrixType::Flags)&RowMajorBit)) + ? 
m_matrix.outerStride() + : m_matrix.innerStride(); + } + + /** \sa MapBase::outerStride() */ + inline int outerStride() const + { + return IsVectorAtCompileTime ? 0 : m_matrix.outerStride(); } #ifndef __SUNPRO_CC // FIXME sunstudio is not friendly with the above friend... + // META-FIXME there is no 'friend' keyword around here. Is this obsolete? protected: #endif diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index 21d792f49..d8f789ae0 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -158,31 +158,60 @@ template class DenseBase * In other words, this function returns * \code rows()==1 || cols()==1 \endcode * \sa rows(), cols(), IsVectorAtCompileTime. */ - inline bool isVector() const { return rows()==1 || cols()==1; } - /** \returns the size of the storage major dimension, - * i.e., the number of columns for a columns major matrix, and the number of rows otherwise */ - int outerSize() const { return (int(Flags)&RowMajorBit) ? this->rows() : this->cols(); } - /** \returns the size of the inner dimension according to the storage order, - * i.e., the number of rows for a columns major matrix, and the number of cols otherwise */ - int innerSize() const { return (int(Flags)&RowMajorBit) ? this->cols() : this->rows(); } - /** Only plain matrices, not expressions may be resized; therefore the only useful resize method is - * Matrix::resize(). The present method only asserts that the new size equals the old size, and does + /** \returns the outer size. + * + * \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension + * with respect to the storage order, i.e., the number of columns for a column-major matrix, + * and the number of rows for a row-major matrix. */ + int outerSize() const + { + return IsVectorAtCompileTime ? 1 + : (int(Flags)&RowMajorBit) ? this->rows() : this->cols(); + } + + /** \returns the inner size. + * + * \note For a vector, this is just the size. 
For a matrix (non-vector), this is the minor dimension + * with respect to the storage order, i.e., the number of rows for a column-major matrix, + * and the number of columns for a row-major matrix. */ + int innerSize() const + { + return IsVectorAtCompileTime ? this->size() + : (int(Flags)&RowMajorBit) ? this->cols() : this->rows(); + } + + /** Only plain matrices/arrays, not expressions, may be resized; therefore the only useful resize methods are + * Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does * nothing else. */ void resize(int size) { ei_assert(size == this->size() - && "MatrixBase::resize() does not actually allow to resize."); + && "DenseBase::resize() does not actually allow to resize."); } - /** Only plain matrices, not expressions may be resized; therefore the only useful resize method is - * Matrix::resize(). The present method only asserts that the new size equals the old size, and does + /** Only plain matrices/arrays, not expressions, may be resized; therefore the only useful resize methods are + * Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does * nothing else. 
*/ void resize(int rows, int cols) { ei_assert(rows == this->rows() && cols == this->cols() - && "MatrixBase::resize() does not actually allow to resize."); + && "DenseBase::resize() does not actually allow to resize."); + } + + int innerStride() const + { + EIGEN_STATIC_ASSERT(int(Flags)&DirectAccessBit, + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES) + return derived().innerStride(); + } + + int outerStride() const + { + EIGEN_STATIC_ASSERT(int(Flags)&DirectAccessBit, + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES) + return derived().outerStride(); } #ifndef EIGEN_PARSED_BY_DOXYGEN @@ -407,13 +436,6 @@ template class DenseBase template void swap(DenseBase EIGEN_REF_TO_TEMPORARY other); - /** \returns number of elements to skip to pass from one row (resp. column) to another - * for a row-major (resp. column-major) matrix. - * Combined with coeffRef() and the \ref flags flags, it allows a direct access to the data - * of the underlying matrix. 
- */ - inline int stride() const { return derived().stride(); } - inline const NestByValue nestByValue() const; inline const ForceAlignedAccess forceAlignedAccess() const; inline ForceAlignedAccess forceAlignedAccess(); diff --git a/Eigen/src/Core/DenseStorageBase.h b/Eigen/src/Core/DenseStorageBase.h index 5c8e48768..04dfb1176 100644 --- a/Eigen/src/Core/DenseStorageBase.h +++ b/Eigen/src/Core/DenseStorageBase.h @@ -253,13 +253,13 @@ class DenseStorageBase : public _Base { if(RowsAtCompileTime == 1) { - ei_assert(other.isVector()); - resize(1, other.size()); + ei_assert(other.rows() == 1); + resize(1, other.cols()); } else if(ColsAtCompileTime == 1) { - ei_assert(other.isVector()); - resize(other.size(), 1); + ei_assert(other.cols() == 1); + resize(other.rows(), 1); } else resize(other.rows(), other.cols()); } diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 83688dbca..432bf1661 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2006-2008 Benoit Jacob +// Copyright (C) 2007-2010 Benoit Jacob // Copyright (C) 2008 Gael Guennebaud // // Eigen is free software; you can redistribute it and/or @@ -48,32 +48,59 @@ * * \sa Matrix::Map() */ -template -struct ei_traits > : public ei_traits +template +struct ei_traits > + : public ei_traits { enum { - Flags = (Options&Aligned)==Aligned ? ei_traits::Flags | AlignedBit - : ei_traits::Flags & ~AlignedBit - }; + Flags0 = ei_traits::Flags, + Flags1 = ((Options&Aligned)==Aligned ? Flags0 | AlignedBit + : Flags0 & ~AlignedBit), + Flags = int(StrideType::InnerStrideAtCompileTime)==1 ? Flags1 : (Flags1 & ~PacketAccessBit), + InnerStrideAtCompileTime = int(StrideType::InnerStrideAtCompileTime) != 0 ? int(StrideType::InnerStrideAtCompileTime) : 1, + OuterStrideAtCompileTime = + int(StrideType::OuterStrideAtCompileTime != 0) ? 
int(StrideType::OuterStrideAtCompileTime) + : int(MatrixType::IsVectorAtCompileTime) ? int(MatrixType::SizeAtCompileTime) + : int(Flags)&RowMajorBit ? int(MatrixType::ColsAtCompileTime) + : int(MatrixType::RowsAtCompileTime) + }; }; -template class Map - : public MapBase, - typename MatrixType::template MakeBase< Map >::Type> +template class Map + : public MapBase, + typename MatrixType::template MakeBase< + Map + >::Type> { public: typedef MapBase::Type> Base; + EIGEN_DENSE_PUBLIC_INTERFACE(Map) - inline int stride() const { return this->innerSize(); } + inline int innerStride() const + { + return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1; + } - inline Map(const Scalar* data) : Base(data) {} + inline int outerStride() const + { + return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() + : IsVectorAtCompileTime ? this->size() + : int(Flags)&RowMajorBit ? this->cols() + : this->rows(); + } - inline Map(const Scalar* data, int size) : Base(data, size) {} + inline Map(const Scalar* data, const StrideType& stride = StrideType()) + : Base(data), m_stride(stride) {} - inline Map(const Scalar* data, int rows, int cols) : Base(data, rows, cols) {} + inline Map(const Scalar* data, int size, const StrideType& stride = StrideType()) + : Base(data, size), m_stride(stride) {} + inline Map(const Scalar* data, int rows, int cols, const StrideType& stride = StrideType()) + : Base(data, rows, cols), m_stride(stride) {} + +/* inline void resize(int rows, int cols) { EIGEN_ONLY_USED_FOR_DEBUG(rows); @@ -88,8 +115,12 @@ template class Map EIGEN_ONLY_USED_FOR_DEBUG(size); ei_assert(size == this->size()); } +*/ EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map) + + protected: + StrideType m_stride; }; template diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index a922d8bb0..6bac2ed4c 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear 
algebra. // -// Copyright (C) 2006-2008 Benoit Jacob +// Copyright (C) 2007-2010 Benoit Jacob // Copyright (C) 2008 Gael Guennebaud // // Eigen is free software; you can redistribute it and/or @@ -37,12 +37,12 @@ template class MapBase { public: -// typedef MatrixBase Base; enum { IsRowMajor = (int(ei_traits::Flags) & RowMajorBit) ? 1 : 0, RowsAtCompileTime = ei_traits::RowsAtCompileTime, ColsAtCompileTime = ei_traits::ColsAtCompileTime, - SizeAtCompileTime = Base::SizeAtCompileTime + SizeAtCompileTime = Base::SizeAtCompileTime, + InnerStrideAtCompileTime = ei_traits::InnerStrideAtCompileTime }; typedef typename ei_traits::Scalar Scalar; @@ -52,90 +52,104 @@ template class MapBase inline int rows() const { return m_rows.value(); } inline int cols() const { return m_cols.value(); } - /** Returns the leading dimension (for matrices) or the increment (for vectors) to be used with data(). + /** \returns the pointer increment between two consecutive elements. * - * More precisely: - * - for a column major matrix it returns the number of elements between two successive columns - * - for a row major matrix it returns the number of elements between two successive rows - * - for a vector it returns the number of elements between two successive coefficients - * This function has to be used together with the MapBase::data() function. + * \note For vectors, the storage order is ignored. For matrices (non-vectors), we're looking + * at the increment between two consecutive elements within a slice in the inner direction. * - * \sa MapBase::data() */ - inline int stride() const { return derived().stride(); } + * \sa outerStride(), data(), rowStride(), colStride() + */ + inline int innerStride() const { return derived().innerStride(); } + + /** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns + * in a column-major matrix). 
+ * + * \note For vectors, the storage order is ignored, there is only one inner slice, and so this method returns 1. + * For matrices (non-vectors), the notion of inner slice depends on the storage order. + * + * \sa innerStride(), data(), rowStride(), colStride() + */ + inline int outerStride() const { return derived().outerStride(); } + + /** \returns the pointer increment between two consecutive rows. + * + * \sa data(), innerStride(), outerStride(), colStride() + */ + inline int rowStride() const + { + return (RowsAtCompileTime==1 || IsRowMajor) ? outerStride() : innerStride(); + } + + /** \returns the pointer increment between two consecutive columns. + * + * \sa data(), innerStride(), outerStride(), rowStride() + */ + inline int colStride() const + { + return (RowsAtCompileTime==1 || IsRowMajor) ? innerStride() : outerStride(); + } /** Returns a pointer to the first coefficient of the matrix or vector. - * This function has to be used together with the stride() function. * - * \sa MapBase::stride() */ + * \note When addressing this data, make sure to honor the strides returned by innerStride() and outerStride(). 
+ * + * \sa innerStride(), outerStride() + */ inline const Scalar* data() const { return m_data; } inline const Scalar& coeff(int row, int col) const { - if(IsRowMajor) - return m_data[col + row * stride()]; - else // column-major - return m_data[row + col * stride()]; + return m_data[col * colStride() + row * rowStride()]; } inline Scalar& coeffRef(int row, int col) { - if(IsRowMajor) - return const_cast(m_data)[col + row * stride()]; - else // column-major - return const_cast(m_data)[row + col * stride()]; + return const_cast(m_data)[col * colStride() + row * rowStride()]; } inline const Scalar& coeff(int index) const { ei_assert(Derived::IsVectorAtCompileTime || (ei_traits::Flags & LinearAccessBit)); - if ( ((RowsAtCompileTime == 1) == IsRowMajor) || !int(Derived::IsVectorAtCompileTime) ) - return m_data[index]; - else - return m_data[index*stride()]; + return m_data[index * innerStride()]; } inline Scalar& coeffRef(int index) { ei_assert(Derived::IsVectorAtCompileTime || (ei_traits::Flags & LinearAccessBit)); - if ( ((RowsAtCompileTime == 1) == IsRowMajor) || !int(Derived::IsVectorAtCompileTime) ) - return const_cast(m_data)[index]; - else - return const_cast(m_data)[index*stride()]; + return const_cast(m_data)[index * innerStride()]; } template inline PacketScalar packet(int row, int col) const { return ei_ploadt - (m_data + (IsRowMajor ? col + row * stride() - : row + col * stride())); + (m_data + (col * colStride() + row * rowStride())); } template inline PacketScalar packet(int index) const { - return ei_ploadt(m_data + index); + return ei_ploadt(m_data + index * innerStride()); } template inline void writePacket(int row, int col, const PacketScalar& x) { ei_pstoret - (const_cast(m_data) + (IsRowMajor ? 
col + row * stride() - : row + col * stride()), x); + (const_cast(m_data) + (col * colStride() + row * rowStride()), x); } template inline void writePacket(int index, const PacketScalar& x) { ei_pstoret - (const_cast(m_data) + index, x); + (const_cast(m_data) + index * innerStride(), x); } inline MapBase(const Scalar* data) : m_data(data), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime) { EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) - checkDataAlignment(); + checkSanity(); } inline MapBase(const Scalar* data, int size) @@ -146,7 +160,7 @@ template class MapBase EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) ei_assert(size >= 0); ei_assert(data == 0 || SizeAtCompileTime == Dynamic || SizeAtCompileTime == size); - checkDataAlignment(); + checkSanity(); } inline MapBase(const Scalar* data, int rows, int cols) @@ -155,7 +169,7 @@ template class MapBase ei_assert( (data == 0) || ( rows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows) && cols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols))); - checkDataAlignment(); + checkSanity(); } Derived& operator=(const MapBase& other) @@ -167,10 +181,12 @@ template class MapBase protected: - void checkDataAlignment() const + void checkSanity() const { ei_assert( ((!(ei_traits::Flags&AlignedBit)) || ((size_t(m_data)&0xf)==0)) && "data is not aligned"); + ei_assert( ((!(ei_traits::Flags&PacketAccessBit)) + || (innerStride()==1)) && "packet access incompatible with inner stride greater than 1"); } const Scalar* EIGEN_RESTRICT m_data; diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index 6f194ffba..e011ae8b9 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -120,7 +120,10 @@ struct ei_traits > MaxRowsAtCompileTime = _MaxRows, MaxColsAtCompileTime = _MaxCols, Flags = ei_compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret, - CoeffReadCost = NumTraits::ReadCost + CoeffReadCost = NumTraits::ReadCost, + InnerStrideAtCompileTime = 1, + 
OuterStrideAtCompileTime = (RowsAtCompileTime==1||ColsAtCompileTime==1) ? 1 + : (int(Flags)&RowMajorBit) ? RowsAtCompileTime : ColsAtCompileTime }; }; @@ -318,6 +321,9 @@ class Matrix void swap(MatrixBase EIGEN_REF_TO_TEMPORARY other) { this->_swap(other.derived()); } + inline int innerStride() const { return 1; } + inline int outerStride() const { return this->innerSize(); } + /////////// Geometry module /////////// template diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index af05773ee..53277169c 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -336,7 +336,7 @@ template<> struct ei_gemv_selector ei_cache_friendly_product_colmajor_times_vector ( dest.size(), - &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.stride(), + &actualLhs.const_cast_derived().coeffRef(0,0), ei_outer_stride_or_outer_size(actualLhs), actualRhs, actualDest, actualAlpha); if (!EvalToDest) @@ -381,7 +381,7 @@ template<> struct ei_gemv_selector ei_cache_friendly_product_rowmajor_times_vector ( - &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.stride(), + &actualLhs.const_cast_derived().coeffRef(0,0), ei_outer_stride_or_outer_size(actualLhs), rhs_data, prod.rhs().size(), dest, actualAlpha); if (!DirectlyUseRhs) ei_aligned_stack_delete(Scalar, rhs_data, prod.rhs().size()); diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h index 920269365..8d45fc31b 100644 --- a/Eigen/src/Core/ReturnByValue.h +++ b/Eigen/src/Core/ReturnByValue.h @@ -34,14 +34,9 @@ struct ei_traits > : public ei_traits::ReturnMatrixType> { enum { - // FIXME had to remove the DirectAccessBit for usage like - // matrix.inverse().block(...) - // because the Block ctor with direct access - // wants to call coeffRef() to get an address, and that fails (infinite recursion) as ReturnByValue - // doesnt implement coeffRef(). 
- // The fact that I had to do that shows that when doing xpr.block() with a non-direct-access xpr, - // even if xpr has the EvalBeforeNestingBit, the block() doesn't use direct access on the evaluated - // xpr. + // We're disabling the DirectAccess because e.g. the constructor of + // the Block-with-DirectAccess expression requires to have a coeffRef method. + // Also, we don't want to have to implement the stride stuff. Flags = (ei_traits::ReturnMatrixType>::Flags | EvalBeforeNestingBit) & ~DirectAccessBit }; diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index bd06d8464..753a67ee0 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -80,6 +80,9 @@ template class Transpose typename ei_cleantype::type& nestedExpression() { return m_matrix.const_cast_derived(); } + enum { InnerStrideAtCompileTime = ei_inner_stride_at_compile_time::ret, + OuterStrideAtCompileTime = ei_outer_stride_at_compile_time::ret }; + protected: const typename MatrixType::Nested m_matrix; }; @@ -93,7 +96,8 @@ template class TransposeImpl typedef typename MatrixType::template MakeBase >::Type Base; EIGEN_DENSE_PUBLIC_INTERFACE(Transpose) - inline int stride() const { return derived().nestedExpression().stride(); } + inline int innerStride() const { return derived().nestedExpression().innerStride(); } + inline int outerStride() const { return derived().nestedExpression().outerStride(); } inline Scalar* data() { return derived().nestedExpression().data(); } inline const Scalar* data() const { return derived().nestedExpression().data(); } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index beec17ee4..03c77cc78 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -147,6 +147,7 @@ class GeneralProduct const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); + 
ei_assert(ei_inner_stride_at_compile_time::ret == 1); Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) * RhsBlasTraits::extractScalarFactor(m_rhs); @@ -158,9 +159,9 @@ class GeneralProduct (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor> ::run( this->rows(), this->cols(), lhs.cols(), - (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(), - (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(), - (Scalar*)&(dst.coeffRef(0,0)), dst.stride(), + (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), ei_outer_stride_or_outer_size(lhs), + (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), ei_outer_stride_or_outer_size(rhs), + (Scalar*)&(dst.coeffRef(0,0)), ei_outer_stride_or_outer_size(dst), actualAlpha); } }; diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index aed0abe6d..c2d45dc30 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -1,7 +1,8 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2007-2009 Benoit Jacob +// Copyright (C) 2007-2010 Benoit Jacob +// Copyright (C) 2008-2009 Gael Guennebaud // // Eigen is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public @@ -60,7 +61,9 @@ template class DiagonalProduct; template class Diagonal; -template class Map; +template class Stride; +template > class Map; + template class TriangularBase; template class TriangularView; template class SelfAdjointView; diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index 619e7664d..5252b28c5 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -2,7 +2,7 @@ // for linear algebra. 
// // Copyright (C) 2008 Gael Guennebaud -// Copyright (C) 2006-2008 Benoit Jacob +// Copyright (C) 2008 Benoit Jacob // // Eigen is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public @@ -46,7 +46,7 @@ // if native static_assert is enabled, let's use it #define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG); - #else // CXX0X + #else // not CXX0X template struct ei_static_assert {}; @@ -81,7 +81,8 @@ BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER, THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX, THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE, - THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES, + YOU_ALREADY_SPECIFIED_THIS_STRIDE }; }; diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index a86e7be89..8ddf4450a 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -50,7 +50,7 @@ template class ei_int_if_dynamic { public: EIGEN_EMPTY_STRUCT_CTOR(ei_int_if_dynamic) - explicit ei_int_if_dynamic(int) {} + explicit ei_int_if_dynamic(int v) { EIGEN_ONLY_USED_FOR_DEBUG(v); ei_assert(v == Value); } static int value() { return Value; } void setValue(int) {} }; @@ -58,7 +58,7 @@ template class ei_int_if_dynamic template<> class ei_int_if_dynamic { int m_value; - ei_int_if_dynamic() {} + ei_int_if_dynamic() { ei_assert(false); } public: explicit ei_int_if_dynamic(int value) : m_value(value) {} int value() const { return m_value; } diff --git a/test/submatrices.cpp b/test/submatrices.cpp index d53fd4b6f..a9dcf8476 100644 --- a/test/submatrices.cpp +++ b/test/submatrices.cpp @@ -93,6 +93,7 @@ template void submatrices(const MatrixType& m) //check block() Matrix b1(1,1); b1(0,0) = m1(r1,c1); + RowVectorType br1(m1.block(r1,0,1,cols)); VectorType bc1(m1.block(0,c1,rows,1)); VERIFY_IS_APPROX(b1, 
m1.block(r1,c1,1,1)); @@ -176,18 +177,30 @@ void compare_using_data_and_stride(const MatrixType& m) int rows = m.rows(); int cols = m.cols(); int size = m.size(); - int stride = m.stride(); + int innerStride = m.innerStride(); + int outerStride = m.outerStride(); + int rowStride = m.rowStride(); + int colStride = m.colStride(); const typename MatrixType::Scalar* data = m.data(); for(int j=0;j(3, 4)) ); +#ifndef EIGEN_DEFAULT_TO_ROW_MAJOR CALL_SUBTEST_6( data_and_stride(MatrixXf(ei_random(5,50), ei_random(5,50))) ); CALL_SUBTEST_7( data_and_stride(Matrix(ei_random(5,50), ei_random(5,50))) ); +#endif } } From 5491531a8119223322110d18056a5a94bbbe413e Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Thu, 18 Feb 2010 20:44:17 -0500 Subject: [PATCH 002/122] add Stride.h --- Eigen/src/Core/Stride.h | 136 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 Eigen/src/Core/Stride.h diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h new file mode 100644 index 000000000..ba0b19de3 --- /dev/null +++ b/Eigen/src/Core/Stride.h @@ -0,0 +1,136 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2010 Benoit Jacob +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_STRIDE_H +#define EIGEN_STRIDE_H + +template +class Stride +{ + public: + + enum { + InnerStrideAtCompileTime = _InnerStrideAtCompileTime, + OuterStrideAtCompileTime = _OuterStrideAtCompileTime + }; + + Stride() + : m_inner(InnerStrideAtCompileTime), m_outer(OuterStrideAtCompileTime) + { + ei_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic); + } + + Stride(int innerStride, int outerStride) + : m_inner(innerStride), m_outer(outerStride) + { + ei_assert(innerStride>=0 && outerStride>=0); + } + + Stride(const Stride& other) + : m_inner(other.inner()), m_outer(other.outer()) + {} + + inline int inner() const { return m_inner.value(); } + inline int outer() const { return m_outer.value(); } + + template + Stride + operator|(const Stride& other) + { + EIGEN_STATIC_ASSERT(!((InnerStrideAtCompileTime && OtherInnerStrideAtCompileTime) + || (OuterStrideAtCompileTime && OtherOuterStrideAtCompileTime)), + YOU_ALREADY_SPECIFIED_THIS_STRIDE) + int result_inner = InnerStrideAtCompileTime ? inner() : other.inner(); + int result_outer = OuterStrideAtCompileTime ? 
outer() : other.outer(); + return Stride + (result_inner, result_outer); + } + protected: + ei_int_if_dynamic m_inner; + ei_int_if_dynamic m_outer; +}; + +template +class InnerStride : public Stride +{ + typedef Stride Base; + public: + InnerStride() : Base() {} + InnerStride(int v) : Base(v,0) {} +}; + +template +class OuterStride : public Stride<0, Value> +{ + typedef Stride<0,Value> Base; + public: + OuterStride() : Base() {} + OuterStride(int v) : Base(0,v) {} +}; + +template::Flags)&DirectAccessBit> +struct ei_outer_stride_or_outer_size_impl +{ + static inline int value(const T& x) { return x.outerStride(); } +}; + +template +struct ei_outer_stride_or_outer_size_impl +{ + static inline int value(const T& x) { return x.outerSize(); } +}; + +template +inline int ei_outer_stride_or_outer_size(const T& x) +{ + return ei_outer_stride_or_outer_size_impl::value(x); +} + +template::type>::Flags)&DirectAccessBit> +struct ei_inner_stride_at_compile_time +{ + enum { ret = ei_traits::type>::InnerStrideAtCompileTime }; +}; + +template +struct ei_inner_stride_at_compile_time +{ + enum { ret = 1 }; +}; + +template::type>::Flags)&DirectAccessBit> +struct ei_outer_stride_at_compile_time +{ + enum { ret = ei_traits::type>::OuterStrideAtCompileTime }; +}; + +template +struct ei_outer_stride_at_compile_time +{ + enum { ret = 1 }; +}; + +#endif // EIGEN_STRIDE_H From 437f40acc1cbd9ce2f2a2a3f413cae3a5b35f8fb Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 22 Feb 2010 09:32:16 +0100 Subject: [PATCH 003/122] fix BTL's eigen interface --- bench/btl/libs/eigen2/eigen2_interface.hh | 27 ++++++++++------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/bench/btl/libs/eigen2/eigen2_interface.hh b/bench/btl/libs/eigen2/eigen2_interface.hh index 1166a37a1..a8b5b884f 100644 --- a/bench/btl/libs/eigen2/eigen2_interface.hh +++ b/bench/btl/libs/eigen2/eigen2_interface.hh @@ -17,11 +17,8 @@ // #ifndef EIGEN2_INTERFACE_HH #define EIGEN2_INTERFACE_HH -// #include 
-#include -#include -#include -#include + +#include #include #include "btl.hh" @@ -88,27 +85,27 @@ public : } static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){ - X = (A*B).lazy(); + X.noalias() = A*B; } static inline void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){ - X = (A.transpose()*B.transpose()).lazy(); + X.noalias() = A.transpose()*B.transpose(); } static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N){ - X = (A.transpose()*A).lazy(); + X.noalias() = A.transpose()*A; } static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N){ - X = (A*A.transpose()).lazy(); + X.noalias() = A*A.transpose(); } static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){ - X = (A*B).lazy(); + X.noalias() = A*B; } static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){ - X = (A.template selfadjointView() * B)/*.lazy()*/; + X.noalias() = (A.template selfadjointView() * B); // ei_product_selfadjoint_vector(N,A.data(),N, B.data(), 1, X.data(), 1); } @@ -173,7 +170,7 @@ public : } static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){ - X = (A.transpose()*B).lazy(); + X.noalias() = (A.transpose()*B); } static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int N){ @@ -193,16 +190,16 @@ public : } static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector& X, int N){ - X = L.template triangularView().solve(B); + X = L.template triangularView().solve(B); } static inline void trisolve_lower_matrix(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int N){ - X = L.template triangularView().solve(B); + X = L.template triangularView().solve(B); } static inline void cholesky(const gene_matrix & X, gene_matrix & C, int N){ C = X; - 
ei_llt_inplace::blocked(C); + ei_llt_inplace::blocked(C); //C = X.llt().matrixL(); // C = X; // Cholesky::computeInPlace(C); From b20935be9b41ece3b022eaea14fb5eac92bbaea0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 22 Feb 2010 09:40:34 +0100 Subject: [PATCH 004/122] add initial openmp support for matrix-matrix products => x1.9 speedup on my core2 duo --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 76 ++++++++++++++++--- 1 file changed, 65 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index beec17ee4..7f449ac23 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -128,6 +128,49 @@ struct ei_traits > : ei_traits, Lhs, Rhs> > {}; +template +void ei_multithreaded_product(const Functor& func, int size) +{ + if(!Prallelize) + return func(0,size); + #ifdef OMP + int threads = omp_get_num_procs(); + #else + int threads = 1; + #endif + int blockSize = size / threads; + #pragma omp parallel for schedule(static,1) + for(int i=0; i struct ei_gemm_callback +{ + ei_gemm_callback(const Lhs& lhs, const Rhs& rhs, Dest& dest, Scalar actualAlpha) + : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha) + {} + + void operator() (int start, int size) const + { + Gemm::run(m_lhs.rows(), size, m_lhs.cols(), + (const Scalar*)&(m_lhs.const_cast_derived().coeffRef(0,0)), m_lhs.stride(), + (const Scalar*)&(m_rhs.const_cast_derived().coeffRef(0,start)), m_rhs.stride(), + (Scalar*)&(m_dest.coeffRef(0,start)), m_dest.stride(), + m_actualAlpha); + } + + protected: + const Lhs& m_lhs; + const Rhs& m_rhs; + mutable Dest& m_dest; + Scalar m_actualAlpha; +}; + template class GeneralProduct : public ProductBase, Lhs, Rhs> @@ -151,17 +194,28 @@ class GeneralProduct Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) * RhsBlasTraits::extractScalarFactor(m_rhs); - ei_general_matrix_matrix_product< - 
Scalar, - (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), - (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), - (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor> - ::run( - this->rows(), this->cols(), lhs.cols(), - (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(), - (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(), - (Scalar*)&(dst.coeffRef(0,0)), dst.stride(), - actualAlpha); + typedef ei_gemm_callback, + _ActualLhsType, _ActualRhsType, Dest> Functor; + + #ifdef OMP + ei_multithreaded_product(Functor(lhs, rhs, dst, actualAlpha), this->cols()); + #else + ei_general_matrix_matrix_product< + Scalar, + (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), + (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), + (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor> + ::run( + this->rows(), this->cols(), lhs.cols(), + (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(), + (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(), + (Scalar*)&(dst.coeffRef(0,0)), dst.stride(), + actualAlpha); + #endif } }; From 3e62fafce8d9c11401e0fb6ebe5cd8bf5ef91eb6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 22 Feb 2010 11:08:37 +0100 Subject: [PATCH 005/122] clean a bit the parallelizer --- Eigen/Core | 9 ++++ Eigen/src/Core/products/GeneralMatrixMatrix.h | 54 +++++-------------- Eigen/src/Core/products/Parallelizer.h | 50 +++++++++++++++++ 3 files changed, 71 insertions(+), 42 deletions(-) create mode 100644 Eigen/src/Core/products/Parallelizer.h diff --git a/Eigen/Core b/Eigen/Core index cbca16640..26195cd35 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -89,6 +89,14 @@ #endif #endif +#ifdef _OPENMP + #define EIGEN_HAS_OPENMP +#endif + +#ifdef EIGEN_HAS_OPENMP +#include +#endif + #include #include #include @@ -209,6 +217,7 
@@ struct Dense {}; #include "src/Core/TriangularMatrix.h" #include "src/Core/SelfAdjointView.h" #include "src/Core/SolveTriangular.h" +#include "src/Core/products/Parallelizer.h" #include "src/Core/products/CoeffBasedProduct.h" #include "src/Core/products/GeneralBlockPanelKernel.h" #include "src/Core/products/GeneralMatrixVector.h" diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 7f449ac23..c13e09eac 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -128,33 +128,13 @@ struct ei_traits > : ei_traits, Lhs, Rhs> > {}; -template -void ei_multithreaded_product(const Functor& func, int size) +template +struct ei_gemm_functor { - if(!Prallelize) - return func(0,size); - #ifdef OMP - int threads = omp_get_num_procs(); - #else - int threads = 1; - #endif - int blockSize = size / threads; - #pragma omp parallel for schedule(static,1) - for(int i=0; i struct ei_gemm_callback -{ - ei_gemm_callback(const Lhs& lhs, const Rhs& rhs, Dest& dest, Scalar actualAlpha) + ei_gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, Scalar actualAlpha) : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha) {} - + void operator() (int start, int size) const { Gemm::run(m_lhs.rows(), size, m_lhs.cols(), @@ -194,28 +174,18 @@ class GeneralProduct Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) * RhsBlasTraits::extractScalarFactor(m_rhs); - typedef ei_gemm_callback, - _ActualLhsType, _ActualRhsType, Dest> Functor; - - #ifdef OMP - ei_multithreaded_product(Functor(lhs, rhs, dst, actualAlpha), this->cols()); - #else + typedef ei_gemm_functor< + Scalar, ei_general_matrix_matrix_product< Scalar, (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), - (Dest::Flags&RowMajorBit) ? 
RowMajor : ColMajor> - ::run( - this->rows(), this->cols(), lhs.cols(), - (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(), - (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(), - (Scalar*)&(dst.coeffRef(0,0)), dst.stride(), - actualAlpha); - #endif + (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, + _ActualLhsType, + _ActualRhsType, + Dest> Functor; + + ei_run_parallel_1d(Functor(lhs, rhs, dst, actualAlpha), this->cols()); } }; diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h new file mode 100644 index 000000000..d555508b2 --- /dev/null +++ b/Eigen/src/Core/products/Parallelizer.h @@ -0,0 +1,50 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2010 Gael Guennebaud +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . 
+ +#ifndef EIGEN_PARALLELIZER_H +#define EIGEN_PARALLELIZER_H + +template +void ei_run_parallel_1d(const Functor& func, int size) +{ +#ifndef EIGEN_HAS_OPENMP + func(0,size); +#else + if(!Parallelize) + return func(0,size); + + int threads = omp_get_num_procs(); + int blockSize = size / threads; + #pragma omp parallel for schedule(static,1) + for(int i=0; i Date: Mon, 22 Feb 2010 11:09:57 +0100 Subject: [PATCH 006/122] add a small benchmark to quickly bench/compare SMP support --- bench/bench_gemm.cpp | 43 +++++++++++++++++++++++++ bench/bench_gemm_blas.cpp | 68 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 bench/bench_gemm.cpp create mode 100644 bench/bench_gemm_blas.cpp diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp new file mode 100644 index 000000000..d110a0fcd --- /dev/null +++ b/bench/bench_gemm.cpp @@ -0,0 +1,43 @@ + +// g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2 ./a.out +// icpc bench_gemm.cpp -I .. 
-O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out + +#include +#include "../../eigen2/bench/BenchTimer.h" + +using namespace std; +using namespace Eigen; + +#ifndef SCALAR +#define SCALAR float +#endif + +typedef SCALAR Scalar; +typedef Matrix M; + +void gemm(const M& a, const M& b, M& c) +{ + c.noalias() += a * b; +} + +int main(int argc, char ** argv) +{ + int rep = 2; + int s = 1024; + int m = s; + int n = s; + int p = s; + M a(m,n); a.setOnes(); + M b(n,p); b.setOnes(); + M c(m,p); c.setOnes(); + + BenchTimer t; + + BENCH(t, 5, rep, gemm(a,b,c)); + + std::cerr << "cpu " << t.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n"; + std::cerr << "real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n"; + + return 0; +} + diff --git a/bench/bench_gemm_blas.cpp b/bench/bench_gemm_blas.cpp new file mode 100644 index 000000000..45e02e59f --- /dev/null +++ b/bench/bench_gemm_blas.cpp @@ -0,0 +1,68 @@ + +#include +#include <../eigen2/bench/BenchTimer.h> + +extern "C" +{ + #include + #include +} + +using namespace std; +using namespace Eigen; + +#ifndef SCALAR +#define SCALAR float +#endif + +typedef SCALAR Scalar; +typedef Matrix M; + +static float fone = 1; +static float fzero = 0; +static double done = 1; +static double szero = 0; +static char notrans = 'N'; +static char trans = 'T'; +static char nonunit = 'N'; +static char lower = 'L'; +static char right = 'R'; +static int intone = 1; + +void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c) +{ +// cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, c.rows(), c.cols(), a.cols(), 1, a.data(), a.rows(), b.data(), b.rows(), 1, c.data(), c.rows()); + int M = c.rows(); + int N = c.cols(); + int K = a.cols(); + + int lda = a.rows(); + int ldb = b.rows(); + int ldc = c.rows(); + + sgemm_(¬rans,¬rans,&M,&N,&K,&fone, + const_cast(a.data()),&lda, + 
const_cast(b.data()),&ldb,&fzero, + c.data(),&ldc); +} + +int main(int argc, char **argv) +{ + int rep = 2; + int s = 1024; + int m = s; + int n = s; + int p = s; + M a(m,n); a.setOnes(); + M b(n,p); b.setOnes(); + M c(m,p); c.setOnes(); + + BenchTimer t; + + BENCH(t, 5, rep, blas_gemm(a,b,c)); + + std::cerr << "cpu " << t.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n"; + std::cerr << "real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n"; + return 0; +} + From 6730fd9f3f83178409d5afbe548a83446d6560f2 Mon Sep 17 00:00:00 2001 From: Hauke Heibel Date: Mon, 22 Feb 2010 11:42:58 +0100 Subject: [PATCH 007/122] Port BenchTimer fix. --- bench/BenchTimer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/BenchTimer.h b/bench/BenchTimer.h index a32495d60..e49afa07f 100644 --- a/bench/BenchTimer.h +++ b/bench/BenchTimer.h @@ -128,7 +128,7 @@ public: #ifdef WIN32 SYSTEMTIME st; GetSystemTime(&st); - return (double)st.wSecond + 1.e-6 * (double)st.wMilliseconds; + return (double)st.wSecond + 1.e-3 * (double)st.wMilliseconds; #else struct timeval tv; struct timezone tz; From e00f1fd125f0cb9939ddb3d2397caef70173dc29 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 22 Feb 2010 15:18:29 +0100 Subject: [PATCH 008/122] implement an even lower level version of the gebp kernel for MSVC (it seems to be faster with gcc as well) --- .../Core/products/GeneralBlockPanelKernel.h | 72 ++++++++++++++++++- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index fe1987bdd..8c29d2218 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -76,6 +76,7 @@ struct ei_gebp_kernel { PacketType B0, B1, B2, B3, A0, A1; + #if 0 A0 
= ei_pload(&blA[0*PacketSize]); A1 = ei_pload(&blA[1*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]); @@ -134,6 +135,73 @@ struct ei_gebp_kernel if(nr==4) C3 = cj.pmadd(A0, B3, C3); if(nr==4) C7 = cj.pmadd(A1, B3, C7); + #else + + PacketType T0, T1; + + #define MADD(A,B,C,T) { T = A; T = ei_pmul(T,B); C = ei_padd(C,T); } + + A0 = ei_pload(&blA[0*PacketSize]); + A1 = ei_pload(&blA[1*PacketSize]); + B0 = ei_pload(&blB[0*PacketSize]); + B1 = ei_pload(&blB[1*PacketSize]); + + MADD(A0,B0,C0,T0); // C0 = cj.pmadd(A0, B0, C0); + if(nr==4) B2 = ei_pload(&blB[2*PacketSize]); + MADD(A1,B0,C4,T1); // C4 = cj.pmadd(A1, B0, C4); + if(nr==4) B3 = ei_pload(&blB[3*PacketSize]); + B0 = ei_pload(&blB[(nr==4 ? 4 : 2)*PacketSize]); + MADD(A0,B1,C1,T0); // C1 = cj.pmadd(A0, B1, C1); + MADD(A1,B1,C5,T1); // C5 = cj.pmadd(A1, B1, C5); + B1 = ei_pload(&blB[(nr==4 ? 5 : 3)*PacketSize]); + if(nr==4) { MADD(A0,B2,C2,T0); }// C2 = cj.pmadd(A0, B2, C2); + if(nr==4) { MADD(A1,B2,C6,T1); }// C6 = cj.pmadd(A1, B2, C6); + if(nr==4) B2 = ei_pload(&blB[6*PacketSize]); + if(nr==4) { MADD(A0,B3,C3,T0); }// C3 = cj.pmadd(A0, B3, C3); + A0 = ei_pload(&blA[2*PacketSize]); + if(nr==4) { MADD(A1,B3,C7,T1); }// C7 = cj.pmadd(A1, B3, C7); + A1 = ei_pload(&blA[3*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[7*PacketSize]); + MADD(A0,B0,C0,T0); // C0 = cj.pmadd(A0, B0, C0); + MADD(A1,B0,C4,T1); // C4 = cj.pmadd(A1, B0, C4); + B0 = ei_pload(&blB[(nr==4 ? 8 : 4)*PacketSize]); + MADD(A0,B1,C1,T0); // C1 = cj.pmadd(A0, B1, C1); + MADD(A1,B1,C5,T1); // C5 = cj.pmadd(A1, B1, C5); + B1 = ei_pload(&blB[(nr==4 ? 
9 : 5)*PacketSize]); + if(nr==4) { MADD(A0,B2,C2,T0); }// C2 = cj.pmadd(A0, B2, C2); + if(nr==4) { MADD(A1,B2,C6,T1); }// C6 = cj.pmadd(A1, B2, C6); + if(nr==4) B2 = ei_pload(&blB[10*PacketSize]); + if(nr==4) { MADD(A0,B3,C3,T0); } // C3 = cj.pmadd(A0, B3, C3); + A0 = ei_pload(&blA[4*PacketSize]); + if(nr==4) { MADD(A1,B3,C7,T1); }// C7 = cj.pmadd(A1, B3, C7); + A1 = ei_pload(&blA[5*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[11*PacketSize]); + + MADD(A0,B0,C0,T0); // C0 = cj.pmadd(A0, B0, C0); + MADD(A1,B0,C4,T1); // C4 = cj.pmadd(A1, B0, C4); + B0 = ei_pload(&blB[(nr==4 ? 12 : 6)*PacketSize]); + MADD(A0,B1,C1,T0); // C1 = cj.pmadd(A0, B1, C1); + MADD(A1,B1,C5,T1); // C5 = cj.pmadd(A1, B1, C5); + B1 = ei_pload(&blB[(nr==4 ? 13 : 7)*PacketSize]); + if(nr==4) { MADD(A0,B2,C2,T0); }// C2 = cj.pmadd(A0, B2, C2); + if(nr==4) { MADD(A1,B2,C6,T1); }// C6 = cj.pmadd(A1, B2, C6); + if(nr==4) B2 = ei_pload(&blB[14*PacketSize]); + if(nr==4) { MADD(A0,B3,C3,T0); } // C3 = cj.pmadd(A0, B3, C3); + A0 = ei_pload(&blA[6*PacketSize]); + if(nr==4) { MADD(A1,B3,C7,T1); } // C7 = cj.pmadd(A1, B3, C7); + A1 = ei_pload(&blA[7*PacketSize]); + if(nr==4) B3 = ei_pload(&blB[15*PacketSize]); + MADD(A0,B0,C0,T0); // C0 = cj.pmadd(A0, B0, C0); + MADD(A1,B0,C4,T1); // C4 = cj.pmadd(A1, B0, C4); + MADD(A0,B1,C1,T0); // C1 = cj.pmadd(A0, B1, C1); + MADD(A1,B1,C5,T1); // C5 = cj.pmadd(A1, B1, C5); + if(nr==4) { MADD(A0,B2,C2,T0); }// C2 = cj.pmadd(A0, B2, C2); + if(nr==4) { MADD(A1,B2,C6,T1); }//C6 = cj.pmadd(A1, B2, C6); + if(nr==4) { MADD(A0,B3,C3,T0); }//C3 = cj.pmadd(A0, B3, C3); + if(nr==4) { MADD(A1,B3,C7,T1); }//C7 = cj.pmadd(A1, B3, C7); + + #endif + blB += 4*nr*PacketSize; blA += 4*mr; } @@ -334,7 +402,7 @@ struct ei_gebp_kernel #endif PacketType C0 = ei_ploadu(&res[(j2+0)*resStride + i]); - + const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB]; for(int k=0; k // skip what we have after if(PanelMode) count += PacketSize * nr * (stride-offset-depth); } - + // copy the remaining 
columns one at a time (nr==1) for(int j2=packet_cols; j2 Date: Mon, 22 Feb 2010 16:35:05 +0100 Subject: [PATCH 009/122] fully adapt the gebp kernel and optimize it for CPU with only 8 registers --- .../Core/products/GeneralBlockPanelKernel.h | 456 ++++++++++-------- 1 file changed, 262 insertions(+), 194 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 8c29d2218..18e913b0e 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -27,6 +27,12 @@ #ifndef EIGEN_EXTERN_INSTANTIATIONS +#ifdef EIGEN_HAS_FUSE_CJMADD +#define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C); +#else +#define CJMADD(A,B,C,T) T = A; T = cj.pmul(T,B); C = ei_padd(C,T); +#endif + // optimized GEneral packed Block * packed Panel product kernel template struct ei_gebp_kernel @@ -74,133 +80,111 @@ struct ei_gebp_kernel const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; for(int k=0; k Date: Mon, 22 Feb 2010 17:57:15 +0100 Subject: [PATCH 010/122] oops --- bench/bench_gemm.cpp | 2 +- bench/bench_gemm_blas.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp index d110a0fcd..e99fc2970 100644 --- a/bench/bench_gemm.cpp +++ b/bench/bench_gemm.cpp @@ -3,7 +3,7 @@ // icpc bench_gemm.cpp -I .. 
-O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out #include -#include "../../eigen2/bench/BenchTimer.h" +#include using namespace std; using namespace Eigen; diff --git a/bench/bench_gemm_blas.cpp b/bench/bench_gemm_blas.cpp index 45e02e59f..a9dfaa66f 100644 --- a/bench/bench_gemm_blas.cpp +++ b/bench/bench_gemm_blas.cpp @@ -1,6 +1,6 @@ #include -#include <../eigen2/bench/BenchTimer.h> +#include extern "C" { From eb905500b6c654860aa9f9d9c77c7c2614e0ad10 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 23 Feb 2010 13:06:49 +0100 Subject: [PATCH 011/122] significant speedup in the matrix-matrix products --- Eigen/src/Core/arch/SSE/PacketMath.h | 39 +- .../Core/products/GeneralBlockPanelKernel.h | 545 ++++++++++-------- Eigen/src/Core/products/GeneralMatrixMatrix.h | 8 +- .../Core/products/SelfadjointMatrixMatrix.h | 66 ++- .../Core/products/TriangularMatrixMatrix.h | 30 +- .../Core/products/TriangularSolverMatrix.h | 22 +- Eigen/src/Core/util/Macros.h | 2 +- bench/bench_gemm.cpp | 4 +- bench/bench_gemm_blas.cpp | 27 +- 9 files changed, 429 insertions(+), 314 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index a5a56f759..de96aaffa 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -184,11 +184,12 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pload(const float* from) { template<> EIGEN_STRONG_INLINE Packet2d ei_pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i ei_pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } -#if (!defined __GNUC__) && (!defined __ICC) -template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); } -template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } -template<> EIGEN_STRONG_INLINE 
Packet4i ei_ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast(from)); } -#else +// #if (!defined __GNUC__) && (!defined __ICC) +// template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); } +// template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } +// template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast(from)); } +// #else + // Fast unaligned loads. Note that here we cannot directly use intrinsics: this would // require pointer casting to incompatible pointer types and leads to invalid code // because of the strict aliasing rule. The "dummy" stuff are required to enforce @@ -197,28 +198,27 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) { EIGEN_ template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD - __m128 res; - asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from), [dummy] "m" (*(from+1)) ); - asm volatile ("movhps %[from2], %[r]" : [r] "+x" (res) : [from2] "m" (*(from+2)), [dummy] "m" (*(from+3)) ); - return res; + __m128d res; + res = _mm_load_sd((const double*)(from)) ; + res = _mm_loadh_pd(res, (const double*)(from+2)) ; + return _mm_castpd_ps(res); } template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD __m128d res; - asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from) ); - asm volatile ("movhpd %[from1], %[r]" : [r] "+x" (res) : [from1] "m" (*(from+1)) ); + res = _mm_load_sd(from) ; + res = _mm_loadh_pd(res,from+1); return res; } template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD - __m128i res; - asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from), [dummy] "m" (*(from+1)) 
); - asm volatile ("movhps %[from2], %[r]" : [r] "+x" (res) : [from2] "m" (*(from+2)), [dummy] "m" (*(from+3)) ); - return res; + __m128d res; + res = _mm_load_sd((const double*)(from)) ; + res = _mm_loadh_pd(res, (const double*)(from+2)) ; + return _mm_castpd_si128(res); } -#endif template<> EIGEN_STRONG_INLINE void ei_pstore(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } template<> EIGEN_STRONG_INLINE void ei_pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } @@ -277,6 +277,13 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a) #endif } +EIGEN_STRONG_INLINE void ei_punpackp(Packet4f* vecs) +{ + vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55)); + vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA)); + vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF)); + vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00)); +} #ifdef __SSE3__ // TODO implement SSE2 versions as well as integer versions diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 18e913b0e..dfc92c346 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -30,7 +30,7 @@ #ifdef EIGEN_HAS_FUSE_CJMADD #define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C); #else -#define CJMADD(A,B,C,T) T = A; T = cj.pmul(T,B); C = ei_padd(C,T); +#define CJMADD(A,B,C,T) T = B; T = cj.pmul(A,T); C = ei_padd(C,T); #endif // optimized GEneral packed Block * packed Panel product kernel @@ -48,9 +48,66 @@ struct ei_gebp_kernel const int peeled_mc = (rows/mr)*mr; const int peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? 
PacketSize : 0); const int peeled_kc = (depth/4)*4; + + Scalar* unpackedB = const_cast(blockB - strideB * nr * PacketSize); + // loops on each micro vertical panel of rhs (depth x nr) for(int j2=0; j2 we select a mr x nr micro block of res which is entirely // stored into mr/packet_size x nr registers. @@ -65,19 +122,31 @@ struct ei_gebp_kernel // gets res block as register PacketType C0, C1, C2, C3, C4, C5, C6, C7; - C0 = ei_ploadu(&res[(j2+0)*resStride + i]); - C1 = ei_ploadu(&res[(j2+1)*resStride + i]); - if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]); - if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]); - C4 = ei_ploadu(&res[(j2+0)*resStride + i + PacketSize]); - C5 = ei_ploadu(&res[(j2+1)*resStride + i + PacketSize]); - if(nr==4) C6 = ei_ploadu(&res[(j2+2)*resStride + i + PacketSize]); - if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i + PacketSize]); + C0 = ei_pset1(Scalar(0)); + C1 = ei_pset1(Scalar(0)); + if(nr==4) C2 = ei_pset1(Scalar(0)); + if(nr==4) C3 = ei_pset1(Scalar(0)); + C4 = ei_pset1(Scalar(0)); + C5 = ei_pset1(Scalar(0)); + if(nr==4) C6 = ei_pset1(Scalar(0)); + if(nr==4) C7 = ei_pset1(Scalar(0)); + + Scalar* r0 = &res[(j2+0)*resStride + i]; + Scalar* r1 = r0 + resStride; + Scalar* r2 = r1 + resStride; + Scalar* r3 = r2 + resStride; + + #ifdef EIGEN_VECTORIZE_SSE + _mm_prefetch((const char*)(r0+16), _MM_HINT_T0); + _mm_prefetch((const char*)(r1+16), _MM_HINT_T0); + _mm_prefetch((const char*)(r2+16), _MM_HINT_T0); + _mm_prefetch((const char*)(r3+16), _MM_HINT_T0); + #endif // performs "inner" product // TODO let's check wether the flowing peeled loop could not be // optimized via optimal prefetching from one loop to the other - const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; + const Scalar* blB = unpackedB; for(int k=0; k=PacketSize) { @@ -256,81 +343,76 @@ struct ei_gebp_kernel if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]); // performs "inner" product - const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; 
+ const Scalar* blB = unpackedB; for(int k=0; k do the same but with nr==1 for(int j2=packet_cols; j2 for(int j2=0; j2 for(int j2=0; j2 for(int k=0; k(Blocking::Max_kc,depth); // cache block size along the K direction int mc = std::min(Blocking::Max_mc,rows); // cache block size along the M direction - Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); + Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc*8); + std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; + Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); + Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; // For each horizontal panel of the rhs, and corresponding panel of the lhs... // (==GEMM_VAR1) @@ -111,7 +113,7 @@ static void run(int rows, int cols, int depth, } ei_aligned_stack_delete(Scalar, blockA, kc*mc); - ei_aligned_stack_delete(Scalar, blockB, kc*cols*Blocking::PacketSize); + ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB); } }; diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 89cbc3ac0..785045db4 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -95,14 +95,14 @@ struct ei_symm_pack_rhs { for(int k=k2; k(Blocking::Max_mc,rows); // cache block size along the M direction Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); + std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; + Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); + Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; ei_gebp_kernel > gebp_kernel; @@ -292,7 +294,7 @@ struct ei_product_selfadjoint_matrix(Blocking::Max_mc,rows); // cache block size along the M direction Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - 
Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); + std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; + Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); + Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; ei_gebp_kernel > gebp_kernel; @@ -346,7 +350,7 @@ struct ei_product_selfadjoint_matrix(Blocking::Max_mc,rows); // cache block size along the M direction Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); + std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; + Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); +// Scalar* allocatedBlockB = new Scalar[sizeB]; + Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; Matrix triangularBuffer; triangularBuffer.setZero(); @@ -155,7 +158,7 @@ struct ei_product_triangular_matrix_matrix GEBP with the micro triangular block // The trick is to pack this micro block while filling the opposite triangular part with zeros. 
- // To this end we do an extra triangular copy to small temporary buffer + // To this end we do an extra triangular copy to a small temporary buffer for (int k=0;k0) @@ -176,7 +179,7 @@ struct ei_product_triangular_matrix_matrix(Blocking::Max_mc,rows); // cache block size along the M direction Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); + std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; + Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar,sizeB); + Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; Matrix triangularBuffer; triangularBuffer.setZero(); @@ -252,7 +258,7 @@ struct ei_product_triangular_matrix_matrix(Blocking::Max_mc,size); // cache block size along the M direction Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - Scalar* blockB = ei_aligned_stack_new(Scalar, kc*cols*Blocking::PacketSize); + std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; + Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); + Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; ei_conj_if conj; ei_gebp_kernel > gebp_kernel; @@ -146,7 +148,7 @@ struct ei_triangular_solve_matrix(Blocking::Max_mc,size); // cache block size along the M direction Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - Scalar* blockB = ei_aligned_stack_new(Scalar, kc*size*Blocking::PacketSize); + std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*size; + Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); + Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; ei_conj_if conj; ei_gebp_kernel > gebp_kernel; @@ -215,7 +219,7 @@ struct ei_triangular_solve_matrix0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, -1, actual_kc, rs); @@ -230,7 +234,7 @@ struct ei_triangular_solve_matrix0) - pack_rhs_panel(blockB+j2*actual_kc*Blocking::PacketSize, + 
pack_rhs_panel(blockB+j2*actual_kc, &rhs(actual_k2+panelOffset, actual_j2), triStride, -1, panelLength, actualPanelWidth, actual_kc, panelOffset); @@ -260,10 +264,10 @@ struct ei_triangular_solve_matrix0) { gebp_kernel(&lhs(i2,absolute_j2), otherStride, - blockA, blockB+j2*actual_kc*Blocking::PacketSize, + blockA, blockB+j2*actual_kc, actual_mc, panelLength, actualPanelWidth, actual_kc, actual_kc, // strides - panelOffset, panelOffset*Blocking::PacketSize); // offsets + panelOffset, panelOffset); // offsets } // unblocked triangular solve @@ -298,7 +302,7 @@ struct ei_triangular_solve_matrix(a.data()),&lda, + const_cast(b.data()),&ldb,&fone, + c.data(),&ldc); +} + +void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c) +{ int M = c.rows(); int N = c.cols(); int K = a.cols(); @@ -40,16 +55,16 @@ void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c) int ldb = b.rows(); int ldc = c.rows(); - sgemm_(¬rans,¬rans,&M,&N,&K,&fone, - const_cast(a.data()),&lda, - const_cast(b.data()),&ldb,&fzero, + dgemm_(¬rans,¬rans,&M,&N,&K,&done, + const_cast(a.data()),&lda, + const_cast(b.data()),&ldb,&done, c.data(),&ldc); } int main(int argc, char **argv) { - int rep = 2; - int s = 1024; + int rep = 1; + int s = 2048; int m = s; int n = s; int p = s; From 801440c5192d36967906a3a9639cf2f3f3a61784 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 22 Feb 2010 09:32:16 +0100 Subject: [PATCH 012/122] fix BTL's eigen interface (transplanted from 437f40acc1cbd9ce2f2a2a3f413cae3a5b35f8fb ) --- bench/btl/libs/eigen2/eigen2_interface.hh | 27 ++++++++++------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/bench/btl/libs/eigen2/eigen2_interface.hh b/bench/btl/libs/eigen2/eigen2_interface.hh index 1166a37a1..a8b5b884f 100644 --- a/bench/btl/libs/eigen2/eigen2_interface.hh +++ b/bench/btl/libs/eigen2/eigen2_interface.hh @@ -17,11 +17,8 @@ // #ifndef EIGEN2_INTERFACE_HH #define EIGEN2_INTERFACE_HH -// #include -#include -#include -#include 
-#include + +#include #include #include "btl.hh" @@ -88,27 +85,27 @@ public : } static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){ - X = (A*B).lazy(); + X.noalias() = A*B; } static inline void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){ - X = (A.transpose()*B.transpose()).lazy(); + X.noalias() = A.transpose()*B.transpose(); } static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N){ - X = (A.transpose()*A).lazy(); + X.noalias() = A.transpose()*A; } static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N){ - X = (A*A.transpose()).lazy(); + X.noalias() = A*A.transpose(); } static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){ - X = (A*B).lazy(); + X.noalias() = A*B; } static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){ - X = (A.template selfadjointView() * B)/*.lazy()*/; + X.noalias() = (A.template selfadjointView() * B); // ei_product_selfadjoint_vector(N,A.data(),N, B.data(), 1, X.data(), 1); } @@ -173,7 +170,7 @@ public : } static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){ - X = (A.transpose()*B).lazy(); + X.noalias() = (A.transpose()*B); } static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int N){ @@ -193,16 +190,16 @@ public : } static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector& X, int N){ - X = L.template triangularView().solve(B); + X = L.template triangularView().solve(B); } static inline void trisolve_lower_matrix(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int N){ - X = L.template triangularView().solve(B); + X = L.template triangularView().solve(B); } static inline void cholesky(const gene_matrix & X, gene_matrix & C, int N){ C = X; - ei_llt_inplace::blocked(C); + 
ei_llt_inplace::blocked(C); //C = X.llt().matrixL(); // C = X; // Cholesky::computeInPlace(C); From 1fd8d7b96a4aac14fe829b214c6dc6d3c8d8d326 Mon Sep 17 00:00:00 2001 From: Hauke Heibel Date: Tue, 23 Feb 2010 11:35:51 +0100 Subject: [PATCH 013/122] Attempt to fix PGI compilation issue. --- Eigen/src/Core/util/Macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index dc1aa150b..37ccef047 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -211,7 +211,7 @@ using Eigen::ei_cos; */ #if !EIGEN_ALIGN #define EIGEN_ALIGN_TO_BOUNDARY(n) -#elif (defined __GNUC__) +#elif (defined __GNUC__) || (defined __PGI) #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) #elif (defined _MSC_VER) #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) From 7dc75380c101b9b4f3882f78fe6a5e9ae8963cac Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 23 Feb 2010 09:04:59 -0500 Subject: [PATCH 014/122] * FullPivLU: replace "remaining==0" termination condition (from Golub) by a fuzzy compare (fixes lu test failures when testing solve()) * LU test: set appropriate threshold and limit the number of times that a specially tricky test is run. (fixes lu test failures when testing rank()). 
* Tests: rename createRandomMatrixOfRank to createRandomProjectionOfRank --- Eigen/src/LU/FullPivLU.h | 6 +++++- test/inverse.cpp | 2 +- test/lu.cpp | 31 ++++++++++++++++++++++++++----- test/main.h | 4 ++-- test/qr_colpivoting.cpp | 4 ++-- test/qr_fullpivoting.cpp | 2 +- 6 files changed, 37 insertions(+), 12 deletions(-) diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 9afc448cc..ec551645b 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -404,6 +404,7 @@ FullPivLU& FullPivLU::compute(const MatrixType& matrix) m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case) m_maxpivot = RealScalar(0); + RealScalar cutoff(0); for(int k = 0; k < size; ++k) { @@ -418,8 +419,11 @@ FullPivLU& FullPivLU::compute(const MatrixType& matrix) row_of_biggest_in_corner += k; // correct the values! since they were computed in the corner, col_of_biggest_in_corner += k; // need to add k to them. + // when k==0, biggest_in_corner is the biggest coeff absolute value in the original matrix + if(k == 0) cutoff = biggest_in_corner * NumTraits::epsilon(); + // if the pivot (hence the corner) is exactly zero, terminate to avoid generating nan/inf values - if(biggest_in_corner == RealScalar(0)) + if(ei_abs(biggest_in_corner) < cutoff) { // before exiting, make sure to initialize the still uninitialized transpositions // in a sane state without destroying what we already have. 
diff --git a/test/inverse.cpp b/test/inverse.cpp index 713caf4a6..3f6138e0c 100644 --- a/test/inverse.cpp +++ b/test/inverse.cpp @@ -42,7 +42,7 @@ template void inverse(const MatrixType& m) m2(rows, cols), mzero = MatrixType::Zero(rows, cols), identity = MatrixType::Identity(rows, rows); - createRandomMatrixOfRank(rows,rows,rows,m1); + createRandomProjectionOfRank(rows,rows,rows,m1); m2 = m1.inverse(); VERIFY_IS_APPROX(m1, m2.inverse() ); diff --git a/test/lu.cpp b/test/lu.cpp index 568db8230..02f6ec805 100644 --- a/test/lu.cpp +++ b/test/lu.cpp @@ -28,7 +28,11 @@ using namespace std; template void lu_non_invertible() { + static int times_called = 0; + times_called++; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; /* this test covers the following files: LU.h */ @@ -64,9 +68,15 @@ template void lu_non_invertible() MatrixType m1(rows, cols), m3(rows, cols2); CMatrixType m2(cols, cols2); - createRandomMatrixOfRank(rank, rows, cols, m1); + createRandomProjectionOfRank(rank, rows, cols, m1); + + FullPivLU lu; + + // The special value 0.01 below works well in tests. Keep in mind that we're only computing the rank of projections. + // So it's not clear at all the epsilon should play any role there. + lu.setThreshold(RealScalar(0.01)); + lu.compute(m1); - FullPivLU lu(m1); // FIXME need better way to construct trapezoid matrices. extend triangularView to support rectangular. DynamicMatrixType u(rows,cols); for(int i = 0; i < rows; i++) @@ -91,9 +101,20 @@ template void lu_non_invertible() VERIFY(!lu.isSurjective()); VERIFY((m1 * m1kernel).isMuchSmallerThan(m1)); VERIFY(m1image.fullPivLu().rank() == rank); - DynamicMatrixType sidebyside(m1.rows(), m1.cols() + m1image.cols()); - sidebyside << m1, m1image; - VERIFY(sidebyside.fullPivLu().rank() == rank); + + // The following test is damn hard to get to succeed over a large number of repetitions. + // We're checking that the image is indeed the image, i.e. 
adding it as new columns doesn't increase the rank. + // Since we've already tested rank() above, the point here is not to test rank(), it is to test image(). + // Since image() is implemented in a very simple way that doesn't leave much room for choice, the occasional + // errors that we get here (one in 1e+4 repetitions roughly) are probably just a sign that it's a really + // hard test, so we just limit how many times it's run. + if(times_called < 100) + { + DynamicMatrixType sidebyside(m1.rows(), m1.cols() + m1image.cols()); + sidebyside << m1, m1image; + VERIFY(sidebyside.fullPivLu().rank() == rank); + } + m2 = CMatrixType::Random(cols,cols2); m3 = m1*m2; m2 = CMatrixType::Random(cols,cols2); diff --git a/test/main.h b/test/main.h index 64f70b394..6d296b2e3 100644 --- a/test/main.h +++ b/test/main.h @@ -148,7 +148,7 @@ namespace Eigen #define EIGEN_INTERNAL_DEBUGGING #define EIGEN_NICE_RANDOM -#include // required for createRandomMatrixOfRank +#include // required for createRandomProjectionOfRank #define VERIFY(a) do { if (!(a)) { \ @@ -343,7 +343,7 @@ inline bool test_isUnitary(const MatrixBase& m) } template -void createRandomMatrixOfRank(int desired_rank, int rows, int cols, MatrixType& m) +void createRandomProjectionOfRank(int desired_rank, int rows, int cols, MatrixType& m) { typedef typename ei_traits::Scalar Scalar; enum { Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime }; diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp index 16eb27c52..abee32184 100644 --- a/test/qr_colpivoting.cpp +++ b/test/qr_colpivoting.cpp @@ -36,7 +36,7 @@ template void qr() typedef Matrix MatrixQType; typedef Matrix VectorType; MatrixType m1; - createRandomMatrixOfRank(rank,rows,cols,m1); + createRandomProjectionOfRank(rank,rows,cols,m1); ColPivHouseholderQR qr(m1); VERIFY_IS_APPROX(rank, qr.rank()); VERIFY(cols - qr.rank() == qr.dimensionOfKernel()); @@ -64,7 +64,7 @@ template void qr_fixedsize() typedef typename MatrixType::Scalar 
Scalar; int rank = ei_random(1, std::min(int(Rows), int(Cols))-1); Matrix m1; - createRandomMatrixOfRank(rank,Rows,Cols,m1); + createRandomProjectionOfRank(rank,Rows,Cols,m1); ColPivHouseholderQR > qr(m1); VERIFY_IS_APPROX(rank, qr.rank()); VERIFY(Cols - qr.rank() == qr.dimensionOfKernel()); diff --git a/test/qr_fullpivoting.cpp b/test/qr_fullpivoting.cpp index c82ba4c7e..60255f94c 100644 --- a/test/qr_fullpivoting.cpp +++ b/test/qr_fullpivoting.cpp @@ -35,7 +35,7 @@ template void qr() typedef Matrix MatrixQType; typedef Matrix VectorType; MatrixType m1; - createRandomMatrixOfRank(rank,rows,cols,m1); + createRandomProjectionOfRank(rank,rows,cols,m1); FullPivHouseholderQR qr(m1); VERIFY_IS_APPROX(rank, qr.rank()); VERIFY(cols - qr.rank() == qr.dimensionOfKernel()); From 68eaefa5d4966d0130c64643b0554f05cec1ecf8 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 23 Feb 2010 18:23:12 +0100 Subject: [PATCH 015/122] update BTL (better timer, eigen2 => eigen3, etc) --- bench/btl/CMakeLists.txt | 2 +- bench/btl/data/perlib_plot_settings.txt | 6 +- bench/btl/generic_bench/bench_parameter.hh | 4 +- .../timers/portable_perf_analyzer.hh | 29 ++++---- .../generic_bench/timers/portable_timer.hh | 60 ++++++++-------- bench/btl/libs/eigen2/CMakeLists.txt | 68 +++++++++---------- 6 files changed, 88 insertions(+), 81 deletions(-) diff --git a/bench/btl/CMakeLists.txt b/bench/btl/CMakeLists.txt index af234624a..5855c0d9e 100644 --- a/bench/btl/CMakeLists.txt +++ b/bench/btl/CMakeLists.txt @@ -70,7 +70,7 @@ MACRO(BTL_ADD_BENCH targetname) IF(BUILD_${targetname}) ADD_EXECUTABLE(${targetname} ${_sources}) ADD_TEST(${targetname} "${targetname}") - target_link_libraries(${targetname} ${DEFAULT_LIBRARIES}) + target_link_libraries(${targetname} ${DEFAULT_LIBRARIES} rt) ENDIF(BUILD_${targetname}) ENDMACRO(BTL_ADD_BENCH) diff --git a/bench/btl/data/perlib_plot_settings.txt b/bench/btl/data/perlib_plot_settings.txt index d45f4ff98..4c253f450 100644 --- 
a/bench/btl/data/perlib_plot_settings.txt +++ b/bench/btl/data/perlib_plot_settings.txt @@ -1,6 +1,6 @@ -eigen2 ; with lines lw 4 lt 1 lc rgbcolor "black" -eigen2_novec ; with lines lw 2 lt 1 lc rgbcolor "#999999" -eigen2_nogccvec ; with lines lw 2 lt 2 lc rgbcolor "#991010" +eigen3 ; with lines lw 4 lt 1 lc rgbcolor "black" +eigen3_novec ; with lines lw 2 lt 1 lc rgbcolor "#999999" +eigen3_nogccvec ; with lines lw 2 lt 2 lc rgbcolor "#991010" INTEL_MKL ; with lines lw 3 lt 2 lc rgbcolor "#00b7ff" ATLAS ; with lines lw 3 lt 1 lc rgbcolor "#52e657" gmm ; with lines lw 3 lt 1 lc rgbcolor "#0000ff" diff --git a/bench/btl/generic_bench/bench_parameter.hh b/bench/btl/generic_bench/bench_parameter.hh index e9603e4fc..d14340037 100644 --- a/bench/btl/generic_bench/bench_parameter.hh +++ b/bench/btl/generic_bench/bench_parameter.hh @@ -23,7 +23,7 @@ // minimal time for each measurement #define REAL_TYPE float // minimal time for each measurement -#define MIN_TIME 0.5 +#define MIN_TIME 0.2 // nb of point on bench curves #define NB_POINT 100 // min vector size for axpy bench @@ -48,6 +48,6 @@ #define DEFAULT_NB_SAMPLE 1000 // how many times we run a single bench (keep the best perf) -#define NB_TRIES 5 +#define NB_TRIES 3 #endif diff --git a/bench/btl/generic_bench/timers/portable_perf_analyzer.hh b/bench/btl/generic_bench/timers/portable_perf_analyzer.hh index 4298e61df..6b1f8e7d7 100644 --- a/bench/btl/generic_bench/timers/portable_perf_analyzer.hh +++ b/bench/btl/generic_bench/timers/portable_perf_analyzer.hh @@ -27,41 +27,41 @@ template class Portable_Perf_Analyzer{ public: - Portable_Perf_Analyzer( void ):_nb_calc(1),_chronos(){ + Portable_Perf_Analyzer( ):_nb_calc(0), m_time_action(0), _chronos(){ MESSAGE("Portable_Perf_Analyzer Ctor"); }; Portable_Perf_Analyzer( const Portable_Perf_Analyzer & ){ INFOS("Copy Ctor not implemented"); exit(0); }; - ~Portable_Perf_Analyzer( void ){ + ~Portable_Perf_Analyzer(){ MESSAGE("Portable_Perf_Analyzer Dtor"); }; - BTL_DONT_INLINE 
double eval_mflops(int size) + BTL_DONT_INLINE double eval_mflops(int size) { Action action(size); - double time_action = 0; - action.initialize(); - time_action = time_calculate(action); - while (time_action < MIN_TIME) +// action.initialize(); +// time_action = time_calculate(action); + while (m_time_action < MIN_TIME) { - _nb_calc *= 2; + if(_nb_calc==0) _nb_calc = 1; + else _nb_calc *= 2; action.initialize(); - time_action = time_calculate(action); + m_time_action = time_calculate(action); } // optimize for (int i=1; i Date: Tue, 23 Feb 2010 18:24:15 +0100 Subject: [PATCH 016/122] fix typo --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 6 +++--- Eigen/src/Core/products/GeneralMatrixMatrix.h | 2 +- Eigen/src/Core/util/Memory.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index dfc92c346..c29e4efc2 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -50,7 +50,7 @@ struct ei_gebp_kernel const int peeled_kc = (depth/4)*4; Scalar* unpackedB = const_cast(blockB - strideB * nr * PacketSize); - + // loops on each micro vertical panel of rhs (depth x nr) for(int j2=0; j2(Blocking::Max_kc,depth); // cache block size along the K direction int mc = std::min(Blocking::Max_mc,rows); // cache block size along the M direction - Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc*8); + Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index d4920d213..c7b95d334 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -232,7 +232,7 @@ inline static Integer ei_first_aligned(const 
Scalar* array, Integer size) enum { PacketSize = ei_packet_traits::size, PacketAlignedMask = PacketSize-1 }; - + if(PacketSize==1) { // Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements From a1e110332829a4bb38ca8e55608a2b048876018e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 23 Feb 2010 21:40:15 +0100 Subject: [PATCH 017/122] add a 2D parallelizer --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 16 +++++--- Eigen/src/Core/products/Parallelizer.h | 40 +++++++++++++++++++ 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index d4f1f1913..84429a0d9 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -109,6 +109,8 @@ static void run(int rows, int cols, int depth, // Everything is packed, we can now call the block * panel kernel: ei_gebp_kernel >() (res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); + +// sgemm_kernel(actual_mc, cols, actual_kc, alpha, blockA, allocatedBlockB, res+i2, resStride); } } @@ -137,12 +139,14 @@ struct ei_gemm_functor : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha) {} - void operator() (int start, int size) const + void operator() (int col, int cols, int row=0, int rows=-1) const { - Gemm::run(m_lhs.rows(), size, m_lhs.cols(), - (const Scalar*)&(m_lhs.const_cast_derived().coeffRef(0,0)), m_lhs.stride(), - (const Scalar*)&(m_rhs.const_cast_derived().coeffRef(0,start)), m_rhs.stride(), - (Scalar*)&(m_dest.coeffRef(0,start)), m_dest.stride(), + if(rows==-1) + rows = m_lhs.rows(); + Gemm::run(rows, cols, m_lhs.cols(), + (const Scalar*)&(m_lhs.const_cast_derived().coeffRef(row,0)), m_lhs.stride(), + (const Scalar*)&(m_rhs.const_cast_derived().coeffRef(0,col)), m_rhs.stride(), + (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.stride(), m_actualAlpha); } @@ -187,7 +191,7 @@ class GeneralProduct 
_ActualRhsType, Dest> Functor; - ei_run_parallel_1d(Functor(lhs, rhs, dst, actualAlpha), this->cols()); + ei_run_parallel_2d(Functor(lhs, rhs, dst, actualAlpha), this->cols(), this->rows()); } }; diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index d555508b2..088e387f9 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -47,4 +47,44 @@ void ei_run_parallel_1d(const Functor& func, int size) #endif } +template +void ei_run_parallel_2d(const Functor& func, int size1, int size2) +{ +#ifndef EIGEN_HAS_OPENMP + func(0,size1, 0,size2); +#else + if(!Parallelize) + return func(0,size1, 0,size2); + + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + static const int divide1[17] = { 0, 1, 2, 3, 2, 5, 3, 7, 4, 3, 5, 11, 4, 13, 7, 5, 4}; + static const int divide2[17] = { 0, 1, 1, 1, 2, 1, 2, 1, 2, 3, 2, 1, 3, 1, 2, 3, 4}; + + int threads = omp_get_num_procs(); + ei_assert(threads<=16 && "too many threads !"); + int blockSize1 = size1 / divide1[threads]; + int blockSize2 = size2 / divide2[threads]; + + Matrix ranges(4,threads); + int k = 0; + for(int i1=0; i1 Date: Tue, 23 Feb 2010 15:40:24 -0500 Subject: [PATCH 018/122] Further LU test improvements. I'm not aware of any test failures anymore, not even with huge numbers of repetitions. Finally the createRandomMatrixOfRank() function is renamed to createRandomPIMatrixOfRank, where PI stands for 'partial isometry', that is, a matrix whose singular values are 0 or 1. 
--- Eigen/src/LU/FullPivLU.h | 7 +++-- test/inverse.cpp | 2 +- test/lu.cpp | 65 +++++++++++++++------------------------- test/main.h | 12 ++++++-- test/qr_colpivoting.cpp | 4 +-- test/qr_fullpivoting.cpp | 2 +- 6 files changed, 42 insertions(+), 50 deletions(-) diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index ec551645b..0a305d52b 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -422,8 +422,11 @@ FullPivLU& FullPivLU::compute(const MatrixType& matrix) // when k==0, biggest_in_corner is the biggest coeff absolute value in the original matrix if(k == 0) cutoff = biggest_in_corner * NumTraits::epsilon(); - // if the pivot (hence the corner) is exactly zero, terminate to avoid generating nan/inf values - if(ei_abs(biggest_in_corner) < cutoff) + // if the pivot (hence the corner) is "zero", terminate to avoid generating nan/inf values. + // Notice that using an exact comparison (biggest_in_corner==0) here, as Golub-van Loan do in + // their pseudo-code, results in numerical instability! The cutoff here has been validated + // by running the unit test 'lu' with many repetitions. + if(biggest_in_corner < cutoff) { // before exiting, make sure to initialize the still uninitialized transpositions // in a sane state without destroying what we already have. 
diff --git a/test/inverse.cpp b/test/inverse.cpp index 3f6138e0c..1e567ad14 100644 --- a/test/inverse.cpp +++ b/test/inverse.cpp @@ -42,7 +42,7 @@ template void inverse(const MatrixType& m) m2(rows, cols), mzero = MatrixType::Zero(rows, cols), identity = MatrixType::Identity(rows, rows); - createRandomProjectionOfRank(rows,rows,rows,m1); + createRandomPIMatrixOfRank(rows,rows,rows,m1); m2 = m1.inverse(); VERIFY_IS_APPROX(m1, m2.inverse() ); diff --git a/test/lu.cpp b/test/lu.cpp index 02f6ec805..442202a33 100644 --- a/test/lu.cpp +++ b/test/lu.cpp @@ -28,9 +28,6 @@ using namespace std; template void lu_non_invertible() { - static int times_called = 0; - times_called++; - typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; /* this test covers the following files: @@ -55,11 +52,16 @@ template void lu_non_invertible() cols2 = cols = MatrixType::ColsAtCompileTime; } + enum { + RowsAtCompileTime = MatrixType::RowsAtCompileTime, + ColsAtCompileTime = MatrixType::ColsAtCompileTime + }; typedef typename ei_kernel_retval_base >::ReturnType KernelMatrixType; typedef typename ei_image_retval_base >::ReturnType ImageMatrixType; - typedef Matrix DynamicMatrixType; - typedef Matrix + typedef Matrix CMatrixType; + typedef Matrix + RMatrixType; int rank = ei_random(1, std::min(rows, cols)-1); @@ -68,26 +70,21 @@ template void lu_non_invertible() MatrixType m1(rows, cols), m3(rows, cols2); CMatrixType m2(cols, cols2); - createRandomProjectionOfRank(rank, rows, cols, m1); + createRandomPIMatrixOfRank(rank, rows, cols, m1); FullPivLU lu; - // The special value 0.01 below works well in tests. Keep in mind that we're only computing the rank of projections. - // So it's not clear at all the epsilon should play any role there. + // The special value 0.01 below works well in tests. Keep in mind that we're only computing the rank + // of singular values are either 0 or 1. + // So it's not clear at all that the epsilon should play any role there. 
lu.setThreshold(RealScalar(0.01)); lu.compute(m1); - // FIXME need better way to construct trapezoid matrices. extend triangularView to support rectangular. - DynamicMatrixType u(rows,cols); - for(int i = 0; i < rows; i++) - for(int j = 0; j < cols; j++) - u(i,j) = i>j ? Scalar(0) : lu.matrixLU()(i,j); - DynamicMatrixType l(rows,rows); - for(int i = 0; i < rows; i++) - for(int j = 0; j < rows; j++) - l(i,j) = (i=cols) ? Scalar(0) - : i==j ? Scalar(1) - : lu.matrixLU()(i,j); + MatrixType u(rows,cols); + u = lu.matrixLU().template triangularView(); + RMatrixType l = RMatrixType::Identity(rows,rows); + l.block(0,0,rows,std::min(rows,cols)).template triangularView() + = lu.matrixLU().block(0,0,rows,std::min(rows,cols)); VERIFY_IS_APPROX(lu.permutationP() * m1 * lu.permutationQ(), l*u); @@ -101,20 +98,8 @@ template void lu_non_invertible() VERIFY(!lu.isSurjective()); VERIFY((m1 * m1kernel).isMuchSmallerThan(m1)); VERIFY(m1image.fullPivLu().rank() == rank); + VERIFY_IS_APPROX(m1 * m1.adjoint() * m1image, m1image); - // The following test is damn hard to get to succeed over a large number of repetitions. - // We're checking that the image is indeed the image, i.e. adding it as new columns doesn't increase the rank. - // Since we've already tested rank() above, the point here is not to test rank(), it is to test image(). - // Since image() is implemented in a very simple way that doesn't leave much room for choice, the occasional - // errors that we get here (one in 1e+4 repetitions roughly) are probably just a sign that it's a really - // hard test, so we just limit how many times it's run. 
- if(times_called < 100) - { - DynamicMatrixType sidebyside(m1.rows(), m1.cols() + m1image.cols()); - sidebyside << m1, m1image; - VERIFY(sidebyside.fullPivLu().rank() == rank); - } - m2 = CMatrixType::Random(cols,cols2); m3 = m1*m2; m2 = CMatrixType::Random(cols,cols2); @@ -128,20 +113,18 @@ template void lu_invertible() /* this test covers the following files: LU.h */ + typedef typename MatrixType::Scalar Scalar; typedef typename NumTraits::Real RealScalar; int size = ei_random(1,200); MatrixType m1(size, size), m2(size, size), m3(size, size); - m1 = MatrixType::Random(size,size); + FullPivLU lu; + lu.setThreshold(0.01); + do { + m1 = MatrixType::Random(size,size); + lu.compute(m1); + } while(!lu.isInvertible()); - if (ei_is_same_type::ret) - { - // let's build a matrix more stable to inverse - MatrixType a = MatrixType::Random(size,size*2); - m1 += a * a.adjoint(); - } - - FullPivLU lu(m1); VERIFY(0 == lu.dimensionOfKernel()); VERIFY(lu.kernel().cols() == 1); // the kernel() should consist of a single (zero) column vector VERIFY(size == lu.rank()); diff --git a/test/main.h b/test/main.h index 6d296b2e3..96324de33 100644 --- a/test/main.h +++ b/test/main.h @@ -148,7 +148,7 @@ namespace Eigen #define EIGEN_INTERNAL_DEBUGGING #define EIGEN_NICE_RANDOM -#include // required for createRandomProjectionOfRank +#include // required for createRandomPIMatrixOfRank #define VERIFY(a) do { if (!(a)) { \ @@ -342,8 +342,13 @@ inline bool test_isUnitary(const MatrixBase& m) return m.isUnitary(test_precision::Scalar>()); } +/** Creates a random Partial Isometry matrix of given rank. + * + * A partial isometry is a matrix all of whose singular values are either 0 or 1. + * This is very useful to test rank-revealing algorithms. 
+ */ template -void createRandomProjectionOfRank(int desired_rank, int rows, int cols, MatrixType& m) +void createRandomPIMatrixOfRank(int desired_rank, int rows, int cols, MatrixType& m) { typedef typename ei_traits::Scalar Scalar; enum { Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime }; @@ -360,7 +365,8 @@ void createRandomProjectionOfRank(int desired_rank, int rows, int cols, MatrixTy if(desired_rank == 1) { - m = VectorType::Random(rows) * VectorType::Random(cols).transpose(); + // here we normalize the vectors to get a partial isometry + m = VectorType::Random(rows).normalized() * VectorType::Random(cols).normalized().transpose(); return; } diff --git a/test/qr_colpivoting.cpp b/test/qr_colpivoting.cpp index abee32184..96cc66316 100644 --- a/test/qr_colpivoting.cpp +++ b/test/qr_colpivoting.cpp @@ -36,7 +36,7 @@ template void qr() typedef Matrix MatrixQType; typedef Matrix VectorType; MatrixType m1; - createRandomProjectionOfRank(rank,rows,cols,m1); + createRandomPIMatrixOfRank(rank,rows,cols,m1); ColPivHouseholderQR qr(m1); VERIFY_IS_APPROX(rank, qr.rank()); VERIFY(cols - qr.rank() == qr.dimensionOfKernel()); @@ -64,7 +64,7 @@ template void qr_fixedsize() typedef typename MatrixType::Scalar Scalar; int rank = ei_random(1, std::min(int(Rows), int(Cols))-1); Matrix m1; - createRandomProjectionOfRank(rank,Rows,Cols,m1); + createRandomPIMatrixOfRank(rank,Rows,Cols,m1); ColPivHouseholderQR > qr(m1); VERIFY_IS_APPROX(rank, qr.rank()); VERIFY(Cols - qr.rank() == qr.dimensionOfKernel()); diff --git a/test/qr_fullpivoting.cpp b/test/qr_fullpivoting.cpp index 60255f94c..7ad3af1fe 100644 --- a/test/qr_fullpivoting.cpp +++ b/test/qr_fullpivoting.cpp @@ -35,7 +35,7 @@ template void qr() typedef Matrix MatrixQType; typedef Matrix VectorType; MatrixType m1; - createRandomProjectionOfRank(rank,rows,cols,m1); + createRandomPIMatrixOfRank(rank,rows,cols,m1); FullPivHouseholderQR qr(m1); VERIFY_IS_APPROX(rank, qr.rank()); VERIFY(cols - qr.rank() == 
qr.dimensionOfKernel()); From 3d066f4bc73fad712061d8b50d147d10988f07ff Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 23 Feb 2010 16:05:37 -0500 Subject: [PATCH 019/122] LDLT: * fix bug thanks to Ben Goodrich: we were terminating at the wrong place, leaving some matrix coefficients with wrong values. * don't use Higham's formula here: we're not trying to be rank-revealing. --- Eigen/src/Cholesky/LDLT.h | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 4d3149d42..708b02375 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -202,11 +202,8 @@ LDLT& LDLT::compute(const MatrixType& a) { // The biggest overall is the point of reference to which further diagonals // are compared; if any diagonal is negligible compared - // to the largest overall, the algorithm bails. This cutoff is suggested - // in "Analysis of the Cholesky Decomposition of a Semi-definite Matrix" by - // Nicholas J. Higham. Also see "Accuracy and Stability of Numerical - // Algorithms" page 217, also by Higham. - cutoff = ei_abs(NumTraits::epsilon() * RealScalar(size) * biggest_in_corner); + // to the largest overall, the algorithm bails. + cutoff = ei_abs(NumTraits::epsilon() * biggest_in_corner); m_sign = ei_real(m_matrix.diagonal().coeff(index_of_biggest_in_corner)) > 0 ? 1 : -1; } @@ -235,13 +232,6 @@ LDLT& LDLT::compute(const MatrixType& a) .dot(m_matrix.col(j).head(j))); m_matrix.coeffRef(j,j) = Djj; - // Finish early if the matrix is not full rank. 
- if(ei_abs(Djj) < cutoff) - { - for(int i = j; i < size; i++) m_transpositions.coeffRef(i) = i; - break; - } - int endSize = size - j - 1; if (endSize > 0) { _temporary.tail(endSize).noalias() = m_matrix.block(j+1,0, endSize, j) @@ -250,6 +240,13 @@ LDLT& LDLT::compute(const MatrixType& a) m_matrix.row(j).tail(endSize) = m_matrix.row(j).tail(endSize).conjugate() - _temporary.tail(endSize).transpose(); + // Finish early if the matrix is not full rank. + if(ei_abs(Djj) < cutoff) + { + for(int i = j; i < size; i++) m_transpositions.coeffRef(i) = i; + break; + } + m_matrix.col(j).tail(endSize) = m_matrix.row(j).tail(endSize) / Djj; } } From 60325b83309df5061fb9230af4d7edb59d0eaf1b Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 23 Feb 2010 16:10:26 -0500 Subject: [PATCH 020/122] actually, this is not even meant to be a termination criterion. so the proper fix is this. --- Eigen/src/Cholesky/LDLT.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 708b02375..7c8e1eb04 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -240,14 +240,10 @@ LDLT& LDLT::compute(const MatrixType& a) m_matrix.row(j).tail(endSize) = m_matrix.row(j).tail(endSize).conjugate() - _temporary.tail(endSize).transpose(); - // Finish early if the matrix is not full rank. 
- if(ei_abs(Djj) < cutoff) + if(ei_abs(Djj) > cutoff) { - for(int i = j; i < size; i++) m_transpositions.coeffRef(i) = i; - break; + m_matrix.col(j).tail(endSize) = m_matrix.row(j).tail(endSize) / Djj; } - - m_matrix.col(j).tail(endSize) = m_matrix.row(j).tail(endSize) / Djj; } } From f7aa9873caab20d49afd622b5ef76ecff8bfef06 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 24 Feb 2010 10:40:16 +0100 Subject: [PATCH 021/122] * fix LDLT's default ctor use * add a reconstructedMatrix() function to LDLT for debug purpose --- Eigen/src/Cholesky/LDLT.h | 55 ++++++++++++++++++++++++++++++++------- Eigen/src/Cholesky/LLT.h | 2 +- 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 7c8e1eb04..8cfc256bb 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -62,14 +62,21 @@ template class LDLT typedef Matrix IntColVectorType; typedef Matrix IntRowVectorType; - /** - * \brief Default Constructor. - * - * The default constructor is useful in cases in which the user intends to - * perform decompositions via LDLT::compute(const MatrixType&). - */ + /** \brief Default Constructor. + * + * The default constructor is useful in cases in which the user intends to + * perform decompositions via LDLT::compute(const MatrixType&). + */ LDLT() : m_matrix(), m_p(), m_transpositions(), m_isInitialized(false) {} + /** \brief Default Constructor with memory preallocation + * + * Like the default constructor but with preallocation of the internal data + * according to the specified problem \a size. 
+ * \sa LDLT() + */ + LDLT(int size) : m_matrix(size,size), m_p(size), m_transpositions(size), m_isInitialized(false) {} + LDLT(const MatrixType& matrix) : m_matrix(matrix.rows(), matrix.cols()), m_p(matrix.rows()), @@ -148,6 +155,8 @@ template class LDLT return m_matrix; } + const MatrixType reconstructedMatrix() const; + inline int rows() const { return m_matrix.rows(); } inline int cols() const { return m_matrix.cols(); } @@ -175,6 +184,10 @@ LDLT& LDLT::compute(const MatrixType& a) m_matrix = a; + m_p.resize(size); + m_transpositions.resize(size); + m_isInitialized = false; + if (size <= 1) { m_p.setZero(); m_transpositions.setZero(); @@ -228,8 +241,7 @@ LDLT& LDLT::compute(const MatrixType& a) continue; } - RealScalar Djj = ei_real(m_matrix.coeff(j,j) - m_matrix.row(j).head(j) - .dot(m_matrix.col(j).head(j))); + RealScalar Djj = ei_real(m_matrix.coeff(j,j) - m_matrix.row(j).head(j).dot(m_matrix.col(j).head(j))); m_matrix.coeffRef(j,j) = Djj; int endSize = size - j - 1; @@ -238,7 +250,7 @@ LDLT& LDLT::compute(const MatrixType& a) * m_matrix.col(j).head(j).conjugate(); m_matrix.row(j).tail(endSize) = m_matrix.row(j).tail(endSize).conjugate() - - _temporary.tail(endSize).transpose(); + - _temporary.tail(endSize).transpose(); if(ei_abs(Djj) > cutoff) { @@ -308,6 +320,31 @@ bool LDLT::solveInPlace(MatrixBase &bAndX) const return true; } +/** \returns the matrix represented by the decomposition, + * i.e., it returns the product: P^T L D L^* P. + * This function is provided for debug purpose. 
*/ +template +const MatrixType LDLT::reconstructedMatrix() const +{ + ei_assert(m_isInitialized && "LDLT is not initialized."); + const int size = m_matrix.rows(); + MatrixType res(size,size); + res.setIdentity(); + + // PI + for(int i = 0; i < size; ++i) res.row(m_transpositions.coeff(i)).swap(res.row(i)); + // L^* P + res = matrixL().adjoint() * res; + // D(L^*P) + res = vectorD().asDiagonal() * res; + // L(DL^*P) + res = matrixL() * res; + // P^T (LDL^*P) + for (int i = size-1; i >= 0; --i) res.row(m_transpositions.coeff(i)).swap(res.row(i)); + + return res; +} + /** \cholesky_module * \returns the Cholesky decomposition with full pivoting without square root of \c *this */ diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 474b82406..96e1e5f73 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -117,7 +117,7 @@ template class LLT && "LLT::solve(): invalid number of rows of the right hand side matrix b"); return ei_solve_retval(*this, b.derived()); } - + template bool solveInPlace(MatrixBase &bAndX) const; From a7e4c0f8250ebcbab8cb26eea0730f12f5e4281d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 24 Feb 2010 11:28:38 +0100 Subject: [PATCH 022/122] make testsuite aware of EIGEN_CTEST_ARGS --- test/testsuite.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/testsuite.cmake b/test/testsuite.cmake index 90edf2853..b68a327a9 100644 --- a/test/testsuite.cmake +++ b/test/testsuite.cmake @@ -147,6 +147,9 @@ endif(NOT EIGEN_NO_UPDATE) # which ctest command to use for running the dashboard SET (CTEST_COMMAND "${EIGEN_CMAKE_DIR}ctest -D ${EIGEN_MODE}") +if($ENV{EIGEN_CTEST_ARGS}) +SET (CTEST_COMMAND "${CTEST_COMMAND} $ENV{EIGEN_CTEST_ARGS}") +endif($ENV{EIGEN_CTEST_ARGS}) # what cmake command to use for configuring this dashboard SET (CTEST_CMAKE_COMMAND "${EIGEN_CMAKE_DIR}cmake -DEIGEN_LEAVE_TEST_IN_ALL_TARGET=ON") From 7c98c04412322e56b3b6f7e235bc7ebb61ab6b43 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud 
Date: Wed, 24 Feb 2010 19:16:10 +0100 Subject: [PATCH 023/122] add reconstructedMatrix() to LLT, and LUs => they show that some improvements have still to be done for permutations, tr*tr, trapezoidal matrices --- Eigen/src/Cholesky/LDLT.h | 4 ++-- Eigen/src/Cholesky/LLT.h | 12 ++++++++++++ Eigen/src/LU/FullPivLU.h | 29 +++++++++++++++++++++++++++++ Eigen/src/LU/PartialPivLU.h | 20 ++++++++++++++++++++ test/cholesky.cpp | 7 +++---- test/lu.cpp | 21 +++++++++++++++++++++ 6 files changed, 87 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index 8cfc256bb..8699fe7e0 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -155,7 +155,7 @@ template class LDLT return m_matrix; } - const MatrixType reconstructedMatrix() const; + MatrixType reconstructedMatrix() const; inline int rows() const { return m_matrix.rows(); } inline int cols() const { return m_matrix.cols(); } @@ -324,7 +324,7 @@ bool LDLT::solveInPlace(MatrixBase &bAndX) const * i.e., it returns the product: P^T L D L^* P. * This function is provided for debug purpose. */ template -const MatrixType LDLT::reconstructedMatrix() const +MatrixType LDLT::reconstructedMatrix() const { ei_assert(m_isInitialized && "LDLT is not initialized."); const int size = m_matrix.rows(); diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 96e1e5f73..2e8df7661 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -133,6 +133,8 @@ template class LLT return m_matrix; } + MatrixType reconstructedMatrix() const; + inline int rows() const { return m_matrix.rows(); } inline int cols() const { return m_matrix.cols(); } @@ -295,6 +297,16 @@ bool LLT::solveInPlace(MatrixBase &bAndX) const return true; } +/** \returns the matrix represented by the decomposition, + * i.e., it returns the product: L L^*. + * This function is provided for debug purpose. 
*/ +template +MatrixType LLT::reconstructedMatrix() const +{ + ei_assert(m_isInitialized && "LLT is not initialized."); + return matrixL() * matrixL().adjoint().toDenseMatrix(); +} + /** \cholesky_module * \returns the LLT decomposition of \c *this */ diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 0a305d52b..cd63b9ec7 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -361,6 +361,8 @@ template class FullPivLU (*this, MatrixType::Identity(m_lu.rows(), m_lu.cols())); } + MatrixType reconstructedMatrix() const; + inline int rows() const { return m_lu.rows(); } inline int cols() const { return m_lu.cols(); } @@ -487,6 +489,33 @@ typename ei_traits::Scalar FullPivLU::determinant() cons return Scalar(m_det_pq) * Scalar(m_lu.diagonal().prod()); } +/** \returns the matrix represented by the decomposition, + * i.e., it returns the product: P^{-1} L U Q^{-1}. + * This function is provided for debug purpose. */ +template +MatrixType FullPivLU::reconstructedMatrix() const +{ + ei_assert(m_isInitialized && "LU is not initialized."); + const int smalldim = std::min(m_lu.rows(), m_lu.cols()); + // LU + MatrixType res(m_lu.rows(),m_lu.cols()); + // FIXME the .toDenseMatrix() should not be needed... 
+ res = m_lu.corner(TopLeft,m_lu.rows(),smalldim) + .template triangularView().toDenseMatrix() + * m_lu.corner(TopLeft,smalldim,m_lu.cols()) + .template triangularView().toDenseMatrix(); + + // P^{-1}(LU) + // FIXME implement inplace permutation + res = (m_p.inverse() * res).eval(); + + // (P^{-1}LU)Q^{-1} + // FIXME implement inplace permutation + res = (res * m_q.inverse()).eval(); + + return res; +} + /********* Implementation of kernel() **************************************************/ template diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index ed2354d78..fcffc2458 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -165,6 +165,8 @@ template class PartialPivLU */ typename ei_traits::Scalar determinant() const; + MatrixType reconstructedMatrix() const; + inline int rows() const { return m_lu.rows(); } inline int cols() const { return m_lu.cols(); } @@ -400,6 +402,24 @@ typename ei_traits::Scalar PartialPivLU::determinant() c return Scalar(m_det_p) * m_lu.diagonal().prod(); } +/** \returns the matrix represented by the decomposition, + * i.e., it returns the product: P^{-1} L U. + * This function is provided for debug purpose. 
*/ +template +MatrixType PartialPivLU::reconstructedMatrix() const +{ + ei_assert(m_isInitialized && "LU is not initialized."); + // LU + MatrixType res = m_lu.template triangularView().toDenseMatrix() + * m_lu.template triangularView(); + + // P^{-1}(LU) + // FIXME implement inplace permutation + res = (m_p.inverse() * res).eval(); + + return res; +} + /***** Implementation of solve() *****************************************************/ template diff --git a/test/cholesky.cpp b/test/cholesky.cpp index 1bb808d20..a446f5d73 100644 --- a/test/cholesky.cpp +++ b/test/cholesky.cpp @@ -95,7 +95,7 @@ template void cholesky(const MatrixType& m) { LLT chollo(symmLo); - VERIFY_IS_APPROX(symm, chollo.matrixL().toDenseMatrix() * chollo.matrixL().adjoint().toDenseMatrix()); + VERIFY_IS_APPROX(symm, chollo.reconstructedMatrix()); vecX = chollo.solve(vecB); VERIFY_IS_APPROX(symm * vecX, vecB); matX = chollo.solve(matB); @@ -103,7 +103,7 @@ template void cholesky(const MatrixType& m) // test the upper mode LLT cholup(symmUp); - VERIFY_IS_APPROX(symm, cholup.matrixL().toDenseMatrix() * cholup.matrixL().adjoint().toDenseMatrix()); + VERIFY_IS_APPROX(symm, cholup.reconstructedMatrix()); vecX = cholup.solve(vecB); VERIFY_IS_APPROX(symm * vecX, vecB); matX = cholup.solve(matB); @@ -119,8 +119,7 @@ template void cholesky(const MatrixType& m) { LDLT ldlt(symm); - // TODO(keir): This doesn't make sense now that LDLT pivots. 
- //VERIFY_IS_APPROX(symm, ldlt.matrixL() * ldlt.vectorD().asDiagonal() * ldlt.matrixL().adjoint()); + VERIFY_IS_APPROX(symm, ldlt.reconstructedMatrix()); vecX = ldlt.solve(vecB); VERIFY_IS_APPROX(symm * vecX, vecB); matX = ldlt.solve(matB); diff --git a/test/lu.cpp b/test/lu.cpp index 442202a33..1ed38cb2b 100644 --- a/test/lu.cpp +++ b/test/lu.cpp @@ -91,6 +91,7 @@ template void lu_non_invertible() KernelMatrixType m1kernel = lu.kernel(); ImageMatrixType m1image = lu.image(m1); + VERIFY_IS_APPROX(m1, lu.reconstructedMatrix()); VERIFY(rank == lu.rank()); VERIFY(cols - lu.rank() == lu.dimensionOfKernel()); VERIFY(!lu.isInjective()); @@ -125,6 +126,7 @@ template void lu_invertible() lu.compute(m1); } while(!lu.isInvertible()); + VERIFY_IS_APPROX(m1, lu.reconstructedMatrix()); VERIFY(0 == lu.dimensionOfKernel()); VERIFY(lu.kernel().cols() == 1); // the kernel() should consist of a single (zero) column vector VERIFY(size == lu.rank()); @@ -138,6 +140,23 @@ template void lu_invertible() VERIFY_IS_APPROX(m2, lu.inverse()*m3); } +template void lu_partial_piv() +{ + /* this test covers the following files: + PartialPivLU.h + */ + typedef typename MatrixType::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + int rows = ei_random(1,4); + int cols = rows; + + MatrixType m1(cols, rows); + m1.setRandom(); + PartialPivLU plu(m1); + + VERIFY_IS_APPROX(m1, plu.reconstructedMatrix()); +} + template void lu_verify_assert() { MatrixType tmp; @@ -180,6 +199,7 @@ void test_lu() CALL_SUBTEST_4( lu_non_invertible() ); CALL_SUBTEST_4( lu_invertible() ); + CALL_SUBTEST_4( lu_partial_piv() ); CALL_SUBTEST_4( lu_verify_assert() ); CALL_SUBTEST_5( lu_non_invertible() ); @@ -188,6 +208,7 @@ void test_lu() CALL_SUBTEST_6( lu_non_invertible() ); CALL_SUBTEST_6( lu_invertible() ); + CALL_SUBTEST_6( lu_partial_piv() ); CALL_SUBTEST_6( lu_verify_assert() ); CALL_SUBTEST_7(( lu_non_invertible >() )); From 0f3d69b65ee17d4ca9393fe1318ff239a411bfad Mon Sep 17 00:00:00 2001 From: Thomas 
Capricelli Date: Wed, 24 Feb 2010 21:43:30 +0100 Subject: [PATCH 024/122] Provide "eigen" defines to decide which instruction set is used (sse3, ssse3 and sse4), independantly from the compiler. Only those defines should be used in other places, and the user can rely on those to know which sets are used. --- Eigen/Core | 35 ++++++++++++++++++++++++---- Eigen/src/Core/arch/SSE/PacketMath.h | 8 +++---- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index cbca16640..0306be3a8 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -61,20 +61,45 @@ #ifndef EIGEN_DONT_VECTORIZE #if defined (EIGEN_SSE2_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER) + + // Defines symbols for compile-time detection of which instructions are + // used. + // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used #define EIGEN_VECTORIZE #define EIGEN_VECTORIZE_SSE - #include - #include + #define EIGEN_VECTORIZE_SSE2 + + // Detect sse3/ssse3/sse4: + // gcc and icc defines __SSE3__, .., + // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you + // want to force the use of those instructions with msvc. 
#ifdef __SSE3__ - #include + #define EIGEN_VECTORIZE_SSE3 #endif #ifdef __SSSE3__ - #include + #define EIGEN_VECTORIZE_SSSE3 #endif #ifdef __SSE4_1__ - #include + #define EIGEN_VECTORIZE_SSE4_1 #endif #ifdef __SSE4_2__ + #define EIGEN_VECTORIZE_SSE4_2 + #endif + + // include files + + #include + #include + #ifdef EIGEN_VECTORIZE_SSE3 + #include + #endif + #ifdef EIGEN_VECTORIZE_SSSE3 + #include + #endif + #ifdef EIGEN_VECTORIZE_SSE4_1 + #include + #endif + #ifdef EIGEN_VECTORIZE_SSE4_2 #include #endif #elif defined __ALTIVEC__ diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index a5a56f759..f78bf0dd3 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -122,7 +122,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pmul(const Packet4f& a, con template<> EIGEN_STRONG_INLINE Packet2d ei_pmul(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i ei_pmul(const Packet4i& a, const Packet4i& b) { -#ifdef __SSE4_1__ +#ifdef EIGEN_VECTORIZE_SSE4_1 return _mm_mullo_epi32(a,b); #else // this version is slightly faster than 4 scalar products @@ -269,7 +269,7 @@ template<> EIGEN_STRONG_INLINE Packet2d ei_pabs(const Packet2d& a) } template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a) { - #ifdef __SSSE3__ + #ifdef EIGEN_VECTORIZE_SSSE3 return _mm_abs_epi32(a); #else Packet4i aux = _mm_srai_epi32(a,31); @@ -278,7 +278,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a) } -#ifdef __SSE3__ +#ifdef EIGEN_VECTORIZE_SSE3 // TODO implement SSE2 versions as well as integer versions template<> EIGEN_STRONG_INLINE Packet4f ei_preduxp(const Packet4f* vecs) { @@ -439,7 +439,7 @@ template<> EIGEN_STRONG_INLINE int ei_predux_max(const Packet4i& a) // } #endif -#ifdef __SSSE3__ +#ifdef EIGEN_VECTORIZE_SSSE3 // SSSE3 versions template struct ei_palign_impl From 00bc535b66641eb89f0608608ea64e0afda07e50 Mon Sep 17 00:00:00 2001 From: 
Thomas Capricelli Date: Wed, 24 Feb 2010 21:52:08 +0100 Subject: [PATCH 025/122] provide a static method to describe which SIMD instructions are used --- Eigen/Core | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Eigen/Core b/Eigen/Core index 0306be3a8..a9003b294 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -146,6 +146,24 @@ namespace Eigen { +inline static const char *SimdInstructionsSetInUse(void) { +#if defined(EIGEN_VECTORIZE_SSE4_2) + return "sse, sse2, sse3, ssse3, sse4.1, sse4.2"; +#elif defined(EIGEN_VECTORIZE_SSE4_2) + return "sse, sse2, sse3, ssse3, sse4.1"; +#elif defined(EIGEN_VECTORIZE_SSSE3) + return "sse, sse2, sse3, ssse3"; +#elif defined(EIGEN_VECTORIZE_SSE3) + return "sse, sse2, sse3"; +#elif defined(EIGEN_VECTORIZE_SSE2) + return "sse, sse2"; +#elif defined(EIGEN_VECTORIZE_ALTIVEC) + return "Altivec"; +#else + return "None"; +#endif +} + // we use size_t frequently and we'll never remember to prepend it with std:: everytime just to // ensure QNX/QCC support using std::size_t; From 50a5ac3c4bfc658e59af3afdb01cd0b46960e7e3 Mon Sep 17 00:00:00 2001 From: Thomas Capricelli Date: Thu, 25 Feb 2010 05:31:22 +0100 Subject: [PATCH 026/122] oops, fix typo --- Eigen/Core | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/Core b/Eigen/Core index a9003b294..41372666b 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -149,7 +149,7 @@ namespace Eigen { inline static const char *SimdInstructionsSetInUse(void) { #if defined(EIGEN_VECTORIZE_SSE4_2) return "sse, sse2, sse3, ssse3, sse4.1, sse4.2"; -#elif defined(EIGEN_VECTORIZE_SSE4_2) +#elif defined(EIGEN_VECTORIZE_SSE4_1) return "sse, sse2, sse3, ssse3, sse4.1"; #elif defined(EIGEN_VECTORIZE_SSSE3) return "sse, sse2, sse3, ssse3"; From 77c922bf051862b240d841c025f6c388c776463e Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Thu, 25 Feb 2010 06:43:45 -0500 Subject: [PATCH 027/122] * move the 's': InstructionsSet ---> InstructionSets * proper capitalization: SSE, AltiVec --- 
Eigen/Core | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 41372666b..5b28e6ba7 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -146,19 +146,19 @@ namespace Eigen { -inline static const char *SimdInstructionsSetInUse(void) { +inline static const char *SimdInstructionSetsInUse(void) { #if defined(EIGEN_VECTORIZE_SSE4_2) - return "sse, sse2, sse3, ssse3, sse4.1, sse4.2"; + return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; #elif defined(EIGEN_VECTORIZE_SSE4_1) - return "sse, sse2, sse3, ssse3, sse4.1"; + return "SSE, SSE2, SSE3, SSSE3, SSE4.1"; #elif defined(EIGEN_VECTORIZE_SSSE3) - return "sse, sse2, sse3, ssse3"; + return "SSE, SSE2, SSE3, SSSE3"; #elif defined(EIGEN_VECTORIZE_SSE3) - return "sse, sse2, sse3"; + return "SSE, SSE2, SSE3"; #elif defined(EIGEN_VECTORIZE_SSE2) - return "sse, sse2"; + return "SSE, SSE2"; #elif defined(EIGEN_VECTORIZE_ALTIVEC) - return "Altivec"; + return "AltiVec"; #else return "None"; #endif From d9ca0c0d3643f4b777de686a2c0cddde075aa063 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 25 Feb 2010 15:31:15 +0100 Subject: [PATCH 028/122] optimize inverse permutations --- Eigen/src/Core/PermutationMatrix.h | 139 +++++++++++++++++++++++++---- test/permutationmatrices.cpp | 4 +- 2 files changed, 122 insertions(+), 21 deletions(-) diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h index fcd2e46cc..c42812ec8 100644 --- a/Eigen/src/Core/PermutationMatrix.h +++ b/Eigen/src/Core/PermutationMatrix.h @@ -47,7 +47,7 @@ * \sa class DiagonalMatrix */ template class PermutationMatrix; -template struct ei_permut_matrix_product_retval; +template struct ei_permut_matrix_product_retval; template struct ei_traits > @@ -132,6 +132,9 @@ class PermutationMatrix : public EigenBase void evalTo(MatrixBase& other) const @@ -213,16 +216,29 @@ class PermutationMatrix : public EigenBase inverse() const + { return *this; } + /** \returns the tranpose permutation 
matrix. + * + * \note \note_try_to_help_rvo + */ + inline Transpose transpose() const + { return *this; } + + /**** multiplication helpers to hopefully get RVO ****/ #ifndef EIGEN_PARSED_BY_DOXYGEN - protected: - enum Inverse_t {Inverse}; - PermutationMatrix(Inverse_t, const PermutationMatrix& other) - : m_indices(other.m_indices.size()) + template + PermutationMatrix(const Transpose >& other) + : m_indices(other.nestedPermutation().size()) { - for (int i=0; i& other) const { return PermutationMatrix(Product, *this, other); } + /** \returns the product of a permutation with another inverse permutation. + * + * \note \note_try_to_help_rvo + */ + template + inline PermutationMatrix operator*(const Transpose >& other) const + { return PermutationMatrix(Product, *this, other.eval()); } + + /** \returns the product of an inverse permutation with another permutation. + * + * \note \note_try_to_help_rvo + */ + template friend + inline PermutationMatrix operator*(const Transpose >& other, const PermutationMatrix& perm) + { return PermutationMatrix(Product, other.eval(), perm); } + protected: IndicesType m_indices; @@ -277,15 +304,15 @@ operator*(const PermutationMatrix &perm (permutation, matrix.derived()); } -template -struct ei_traits > +template +struct ei_traits > { typedef typename MatrixType::PlainObject ReturnType; }; -template +template struct ei_permut_matrix_product_retval - : public ReturnByValue > + : public ReturnByValue > { typedef typename ei_cleantype::type MatrixTypeNestedCleaned; @@ -305,7 +332,7 @@ struct ei_permut_matrix_product_retval Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime - >(dst, Side==OnTheLeft ? m_permutation.indices().coeff(i) : i) + >(dst, ((Side==OnTheLeft) ^ Transposed) ? m_permutation.indices().coeff(i) : i) = @@ -313,7 +340,7 @@ struct ei_permut_matrix_product_retval MatrixTypeNestedCleaned, Side==OnTheLeft ? 1 : MatrixType::RowsAtCompileTime, Side==OnTheRight ? 
1 : MatrixType::ColsAtCompileTime - >(m_matrix, Side==OnTheRight ? m_permutation.indices().coeff(i) : i); + >(m_matrix, ((Side==OnTheRight) ^ Transposed) ? m_permutation.indices().coeff(i) : i); } } @@ -322,4 +349,78 @@ struct ei_permut_matrix_product_retval const typename MatrixType::Nested m_matrix; }; +/* Template partial specialization for transposed/inverse permutations */ + +template +struct ei_traits > > + : ei_traits > +{}; + +template +class Transpose > + : public EigenBase > > +{ + typedef PermutationMatrix PermutationType; + typedef typename PermutationType::IndicesType IndicesType; + public: + + #ifndef EIGEN_PARSED_BY_DOXYGEN + typedef ei_traits Traits; + typedef Matrix + DenseMatrixType; + enum { + Flags = Traits::Flags, + CoeffReadCost = Traits::CoeffReadCost, + RowsAtCompileTime = Traits::RowsAtCompileTime, + ColsAtCompileTime = Traits::ColsAtCompileTime, + MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime, + MaxColsAtCompileTime = Traits::MaxColsAtCompileTime + }; + typedef typename Traits::Scalar Scalar; + #endif + + Transpose(const PermutationType& p) : m_permutation(p) {} + + inline int rows() const { return m_permutation.rows(); } + inline int cols() const { return m_permutation.cols(); } + + #ifndef EIGEN_PARSED_BY_DOXYGEN + template + void evalTo(MatrixBase& other) const + { + other.setZero(); + for (int i=0; i friend + inline const ei_permut_matrix_product_retval + operator*(const MatrixBase& matrix, const Transpose& trPerm) + { + return ei_permut_matrix_product_retval(trPerm.m_permutation, matrix.derived()); + } + + /** \returns the matrix with the inverse permutation applied to the rows. 
+ */ + template + inline const ei_permut_matrix_product_retval + operator*(const MatrixBase& matrix) const + { + return ei_permut_matrix_product_retval(m_permutation, matrix.derived()); + } + + const PermutationType& nestedPermutation() const { return m_permutation; } + + protected: + const PermutationType& m_permutation; +}; + #endif // EIGEN_PERMUTATIONMATRIX_H diff --git a/test/permutationmatrices.cpp b/test/permutationmatrices.cpp index 0ef0a371a..ae1bd8b85 100644 --- a/test/permutationmatrices.cpp +++ b/test/permutationmatrices.cpp @@ -51,7 +51,7 @@ template void permutationmatrices(const MatrixType& m) typedef Matrix LeftPermutationVectorType; typedef PermutationMatrix RightPermutationType; typedef Matrix RightPermutationVectorType; - + int rows = m.rows(); int cols = m.cols(); @@ -72,7 +72,7 @@ template void permutationmatrices(const MatrixType& m) Matrix rm(rp); VERIFY_IS_APPROX(m_permuted, lm*m_original*rm); - + VERIFY_IS_APPROX(lp.inverse()*m_permuted*rp.inverse(), m_original); VERIFY((lp*lp.inverse()).toDenseMatrix().isIdentity()); From 959a1b5d6335833e9ad49a088502705bb6967ff5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 25 Feb 2010 16:30:58 +0100 Subject: [PATCH 029/122] detect and implement inplace permutations --- Eigen/src/Core/PermutationMatrix.h | 49 ++++++++++++++++++++++-------- Eigen/src/Core/Transpose.h | 19 ------------ Eigen/src/Core/util/BlasUtil.h | 18 +++++++++++ Eigen/src/LU/FullPivLU.h | 8 ++--- Eigen/src/LU/PartialPivLU.h | 5 ++- test/permutationmatrices.cpp | 19 +++++++++++- 6 files changed, 78 insertions(+), 40 deletions(-) diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h index c42812ec8..46884dc3f 100644 --- a/Eigen/src/Core/PermutationMatrix.h +++ b/Eigen/src/Core/PermutationMatrix.h @@ -326,21 +326,46 @@ struct ei_permut_matrix_product_retval template inline void evalTo(Dest& dst) const { const int n = Side==OnTheLeft ? 
rows() : cols(); - for(int i = 0; i < n; ++i) + + if(ei_is_same_type::ret && ei_extract_data(dst) == ei_extract_data(m_matrix)) { - Block< - Dest, - Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, - Side==OnTheRight ? 1 : Dest::ColsAtCompileTime - >(dst, ((Side==OnTheLeft) ^ Transposed) ? m_permutation.indices().coeff(i) : i) + // apply the permutation inplace + Matrix mask(m_permutation.size()); + mask.fill(false); + int r = 0; + while(r < m_permutation.size()) + { + // search for the next seed + while(r=m_permutation.size()) + break; + // we got one, let's follow it until we are back to the seed + int k0 = r++; + int kPrev = k0; + mask.coeffRef(k0) = true; + for(int k=m_permutation.indices().coeff(k0); k!=k0; k=m_permutation.indices().coeff(k)) + { + Block(dst, k) + .swap(Block + (dst,((Side==OnTheLeft) ^ Transposed) ? k0 : kPrev)); - = + mask.coeffRef(k) = true; + kPrev = k; + } + } + } + else + { + for(int i = 0; i < n; ++i) + { + Block + (dst, ((Side==OnTheLeft) ^ Transposed) ? m_permutation.indices().coeff(i) : i) - Block< - MatrixTypeNestedCleaned, - Side==OnTheLeft ? 1 : MatrixType::RowsAtCompileTime, - Side==OnTheRight ? 1 : MatrixType::ColsAtCompileTime - >(m_matrix, ((Side==OnTheRight) ^ Transposed) ? m_permutation.indices().coeff(i) : i); + = + + Block + (m_matrix, ((Side==OnTheRight) ^ Transposed) ? 
m_permutation.indices().coeff(i) : i); + } } } diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index bd06d8464..6c0e50de2 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -295,25 +295,6 @@ struct ei_blas_traits > static inline const XprType extract(const XprType& x) { return x; } }; - -template::ActualAccess> -struct ei_extract_data_selector { - static typename T::Scalar* run(const T& m) - { - return &ei_blas_traits::extract(m).const_cast_derived().coeffRef(0,0); - } -}; - -template -struct ei_extract_data_selector { - static typename T::Scalar* run(const T&) { return 0; } -}; - -template typename T::Scalar* ei_extract_data(const T& m) -{ - return ei_extract_data_selector::run(m); -} - template struct ei_check_transpose_aliasing_selector { diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 2ca463d5d..4d216d77a 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -236,4 +236,22 @@ struct ei_blas_traits > static inline Scalar extractScalarFactor(const XprType& x) { return Base::extractScalarFactor(x.nestedExpression()); } }; +template::ActualAccess> +struct ei_extract_data_selector { + static const typename T::Scalar* run(const T& m) + { + return &ei_blas_traits::extract(m).const_cast_derived().coeffRef(0,0); // FIXME this should be .data() + } +}; + +template +struct ei_extract_data_selector { + static typename T::Scalar* run(const T&) { return 0; } +}; + +template const typename T::Scalar* ei_extract_data(const T& m) +{ + return ei_extract_data_selector::run(m); +} + #endif // EIGEN_BLASUTIL_H diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index cd63b9ec7..dea6ec41c 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -119,7 +119,7 @@ template class FullPivLU * diagonal coefficient of U. 
*/ RealScalar maxPivot() const { return m_maxpivot; } - + /** \returns the permutation matrix P * * \sa permutationQ() @@ -506,12 +506,10 @@ MatrixType FullPivLU::reconstructedMatrix() const .template triangularView().toDenseMatrix(); // P^{-1}(LU) - // FIXME implement inplace permutation - res = (m_p.inverse() * res).eval(); + res = m_p.inverse() * res; // (P^{-1}LU)Q^{-1} - // FIXME implement inplace permutation - res = (res * m_q.inverse()).eval(); + res = res * m_q.inverse(); return res; } diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index fcffc2458..ad0d6b810 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -412,10 +412,9 @@ MatrixType PartialPivLU::reconstructedMatrix() const // LU MatrixType res = m_lu.template triangularView().toDenseMatrix() * m_lu.template triangularView(); - + // P^{-1}(LU) - // FIXME implement inplace permutation - res = (m_p.inverse() * res).eval(); + res = m_p.inverse() * res; return res; } diff --git a/test/permutationmatrices.cpp b/test/permutationmatrices.cpp index ae1bd8b85..89142d910 100644 --- a/test/permutationmatrices.cpp +++ b/test/permutationmatrices.cpp @@ -86,6 +86,23 @@ template void permutationmatrices(const MatrixType& m) identityp.setIdentity(rows); VERIFY_IS_APPROX(m_original, identityp*m_original); + // check inplace permutations + m_permuted = m_original; + m_permuted = lp.inverse() * m_permuted; + VERIFY_IS_APPROX(m_permuted, lp.inverse()*m_original); + + m_permuted = m_original; + m_permuted = m_permuted * rp.inverse(); + VERIFY_IS_APPROX(m_permuted, m_original*rp.inverse()); + + m_permuted = m_original; + m_permuted = lp * m_permuted; + VERIFY_IS_APPROX(m_permuted, lp*m_original); + + m_permuted = m_original; + m_permuted = m_permuted * rp; + VERIFY_IS_APPROX(m_permuted, m_original*rp); + if(rows>1 && cols>1) { lp2 = lp; @@ -114,7 +131,7 @@ void test_permutationmatrices() CALL_SUBTEST_2( permutationmatrices(Matrix3f()) ); CALL_SUBTEST_3( 
permutationmatrices(Matrix()) ); CALL_SUBTEST_4( permutationmatrices(Matrix4d()) ); - CALL_SUBTEST_5( permutationmatrices(Matrix()) ); + CALL_SUBTEST_5( permutationmatrices(Matrix()) ); CALL_SUBTEST_6( permutationmatrices(Matrix(20, 30)) ); CALL_SUBTEST_7( permutationmatrices(MatrixXcf(15, 10)) ); } From 53bae6b3f8129bb4e0e2790255911b46bb09c0d5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 25 Feb 2010 21:59:25 +0100 Subject: [PATCH 030/122] update matrix product selection rules for 1xSmallxLarge and the transposed case --- Eigen/src/Core/Product.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index fe6d29c7d..236e4f130 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -95,10 +95,10 @@ template<> struct ei_product_type_selector template<> struct ei_product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; template<> struct ei_product_type_selector<1, Large,Small> { enum { ret = GemvProduct }; }; template<> struct ei_product_type_selector<1, Large,Large> { enum { ret = GemvProduct }; }; -template<> struct ei_product_type_selector<1, Small,Large> { enum { ret = GemvProduct }; }; +template<> struct ei_product_type_selector<1, Small,Large> { enum { ret = CoeffBasedProductMode }; }; template<> struct ei_product_type_selector { enum { ret = GemvProduct }; }; template<> struct ei_product_type_selector { enum { ret = GemvProduct }; }; -template<> struct ei_product_type_selector { enum { ret = GemvProduct }; }; +template<> struct ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; From 90e4a605ef920759a23cdbd24e6e7b69ce549162 Mon Sep 17 00:00:00 2001 From: Jitse Niesen Date: Thu, 25 Feb 2010 22:33:38 +0000 Subject: 
[PATCH 031/122] ComplexSchur: compute shift more stably, introduce exceptional shifts. Both the new computation of the eigenvalues of a 2x2 block and the exceptional shifts are taken from EISPACK routine COMQR. --- Eigen/src/Eigenvalues/ComplexSchur.h | 42 +++++++++++++++------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/Eigen/src/Eigenvalues/ComplexSchur.h b/Eigen/src/Eigenvalues/ComplexSchur.h index 5deac3247..0fad415a2 100644 --- a/Eigen/src/Eigenvalues/ComplexSchur.h +++ b/Eigen/src/Eigenvalues/ComplexSchur.h @@ -154,6 +154,14 @@ void ComplexSchur::compute(const MatrixType& matrix, bool skipU) m_matT = hess.matrixH(); if(!skipU) m_matU = hess.matrixQ(); + + // Reduce the Hessenberg matrix m_matT to triangular form by QR iteration. + + // The matrix m_matT is divided in three parts. + // Rows 0,...,il-1 are decoupled from the rest because m_matT(il,il-1) is zero. + // Rows il,...,iu is the part we are working on (the active submatrix). + // Rows iu+1,...,end are already brought in triangular form. 
+ int iu = m_matT.cols() - 1; int il; RealScalar d,sd,sf; @@ -164,7 +172,7 @@ void ComplexSchur::compute(const MatrixType& matrix, bool skipU) int iter = 0; while(true) { - //locate the range in which to iterate + // find iu, the bottom row of the active submatrix while(iu > 0) { d = ei_norm1(m_matT.coeff(iu,iu)) + ei_norm1(m_matT.coeff(iu-1,iu-1)); @@ -187,6 +195,7 @@ void ComplexSchur::compute(const MatrixType& matrix, bool skipU) return; } + // find il, the top row of the active submatrix il = iu-1; while(il > 0) { @@ -202,15 +211,16 @@ void ComplexSchur::compute(const MatrixType& matrix, bool skipU) if( il != 0 ) m_matT.coeffRef(il,il-1) = Complex(0); - // compute the shift (the normalization by sf is to avoid under/overflow) + // compute the shift kappa as one of the eigenvalues of the 2x2 + // diagonal block on the bottom of the active submatrix + Matrix t = m_matT.template block<2,2>(iu-1,iu-1); sf = t.cwiseAbs().sum(); - t /= sf; + t /= sf; // the normalization by sf is to avoid under/overflow - c = t.determinant(); - b = t.diagonal().sum(); - - disc = ei_sqrt(b*b - RealScalar(4)*c); + b = t.coeff(0,0) + t.coeff(1,1); + c = t.coeff(0,0) - t.coeff(1,1); + disc = ei_sqrt(c*c + RealScalar(4)*t.coeff(0,1)*t.coeff(1,0)); r1 = (b+disc)/RealScalar(2); r2 = (b-disc)/RealScalar(2); @@ -224,6 +234,12 @@ void ComplexSchur::compute(const MatrixType& matrix, bool skipU) kappa = sf * r1; else kappa = sf * r2; + + if (iter == 10 || iter == 20) + { + // exceptional shift, taken from http://www.netlib.org/eispack/comqr.f + kappa = ei_abs(ei_real(m_matT.coeff(iu,iu-1))) + ei_abs(ei_real(m_matT.coeff(iu-1,iu-2))); + } // perform the QR step using Givens rotations PlanarRotation rot; @@ -246,18 +262,6 @@ void ComplexSchur::compute(const MatrixType& matrix, bool skipU) } } - // FIXME : is it necessary ? 
- /* - for(int i=0 ; i Date: Thu, 25 Feb 2010 21:01:52 -0500 Subject: [PATCH 032/122] * Implement the ByOuterInner accessors * use them (big simplification in Assign.h) * axe (Inner|Outer)StrideAtCompileTime that were just introduced * ei_int_if_dynamic now asserts that the size is the expected one: adapt to that in Block.h * add rowStride() / colStride() in DenseBase * implement innerStride() / outerStride() everywhere needed --- Eigen/src/Array/Array.h | 3 + Eigen/src/Array/ArrayWrapper.h | 6 +- Eigen/src/Core/Assign.h | 134 +++++------------- Eigen/src/Core/Block.h | 29 +--- Eigen/src/Core/Coeffs.h | 73 ++++++++++ Eigen/src/Core/DenseBase.h | 68 ++++++++- Eigen/src/Core/DenseStorageBase.h | 25 +--- Eigen/src/Core/Flagged.h | 3 +- Eigen/src/Core/ForceAlignedAccess.h | 3 +- Eigen/src/Core/Map.h | 10 +- Eigen/src/Core/MapBase.h | 45 +----- Eigen/src/Core/Matrix.h | 5 +- Eigen/src/Core/NestByValue.h | 3 +- Eigen/src/Core/Product.h | 4 +- Eigen/src/Core/SelfAdjointView.h | 5 +- Eigen/src/Core/SelfCwiseBinaryOp.h | 3 +- Eigen/src/Core/Stride.h | 56 -------- Eigen/src/Core/Swap.h | 11 +- Eigen/src/Core/Transpose.h | 3 - Eigen/src/Core/TriangularMatrix.h | 6 +- Eigen/src/Core/VectorBlock.h | 1 - Eigen/src/Core/products/GeneralMatrixMatrix.h | 7 +- Eigen/src/Core/util/Constants.h | 2 +- Eigen/src/Core/util/Memory.h | 2 +- Eigen/src/LU/PartialPivLU.h | 2 +- Eigen/src/Sparse/CholmodSupport.h | 2 +- Eigen/src/Sparse/SuperLUSupport.h | 4 +- test/submatrices.cpp | 6 +- 28 files changed, 236 insertions(+), 285 deletions(-) diff --git a/Eigen/src/Array/Array.h b/Eigen/src/Array/Array.h index ceef71afd..533d638a4 100644 --- a/Eigen/src/Array/Array.h +++ b/Eigen/src/Array/Array.h @@ -213,6 +213,9 @@ class Array void swap(ArrayBase EIGEN_REF_TO_TEMPORARY other) { this->_swap(other.derived()); } + inline int innerStride() const { return 1; } + inline int outerStride() const { return this->innerSize(); } + #ifdef EIGEN_ARRAY_PLUGIN #include EIGEN_ARRAY_PLUGIN #endif diff --git 
a/Eigen/src/Array/ArrayWrapper.h b/Eigen/src/Array/ArrayWrapper.h index 75bc33770..0075dd537 100644 --- a/Eigen/src/Array/ArrayWrapper.h +++ b/Eigen/src/Array/ArrayWrapper.h @@ -55,7 +55,8 @@ class ArrayWrapper : public ArrayBase > inline int rows() const { return m_expression.rows(); } inline int cols() const { return m_expression.cols(); } - inline int stride() const { return m_expression.stride(); } + inline int outerStride() const { return m_expression.outerStride(); } + inline int innerStride() const { return m_expression.innerStride(); } inline const CoeffReturnType coeff(int row, int col) const { @@ -139,7 +140,8 @@ class MatrixWrapper : public MatrixBase > inline int rows() const { return m_expression.rows(); } inline int cols() const { return m_expression.cols(); } - inline int stride() const { return m_expression.stride(); } + inline int outerStride() const { return m_expression.outerStride(); } + inline int innerStride() const { return m_expression.innerStride(); } inline const CoeffReturnType coeff(int row, int col) const { diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h index 174fd0080..3133aa03a 100644 --- a/Eigen/src/Core/Assign.h +++ b/Eigen/src/Core/Assign.h @@ -55,7 +55,9 @@ private: }; enum { - StorageOrdersAgree = (int(Derived::Flags)&RowMajorBit)==(int(OtherDerived::Flags)&RowMajorBit), + LhsIsEffectivelyRowMajor = (Derived::RowsAtCompileTime==1) || (int(Derived::Flags)&RowMajorBit), + RhsIsEffectivelyRowMajor = (OtherDerived::RowsAtCompileTime==1) || (int(OtherDerived::Flags)&RowMajorBit), + StorageOrdersAgree = (LhsIsEffectivelyRowMajor == RhsIsEffectivelyRowMajor), MightVectorize = StorageOrdersAgree && (int(Derived::Flags) & int(OtherDerived::Flags) & ActualPacketAccessBit), MayInnerVectorize = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0 @@ -139,17 +141,13 @@ template struct ei_assign_DefaultTraversal_CompleteUnrolling { enum { - row = int(Derived1::Flags)&RowMajorBit - ? 
Index / int(Derived1::ColsAtCompileTime) - : Index % Derived1::RowsAtCompileTime, - col = int(Derived1::Flags)&RowMajorBit - ? Index % int(Derived1::ColsAtCompileTime) - : Index / Derived1::RowsAtCompileTime + outer = Index / Derived1::InnerSizeAtCompileTime, + inner = Index % Derived1::InnerSizeAtCompileTime }; EIGEN_STRONG_INLINE static void run(Derived1 &dst, const Derived2 &src) { - dst.copyCoeff(row, col, src); + dst.copyCoeffByOuterInner(outer, inner, src); ei_assign_DefaultTraversal_CompleteUnrolling::run(dst, src); } }; @@ -163,13 +161,10 @@ struct ei_assign_DefaultTraversal_CompleteUnrolling struct ei_assign_DefaultTraversal_InnerUnrolling { - EIGEN_STRONG_INLINE static void run(Derived1 &dst, const Derived2 &src, int row_or_col) + EIGEN_STRONG_INLINE static void run(Derived1 &dst, const Derived2 &src, int outer) { - const bool rowMajor = int(Derived1::Flags)&RowMajorBit; - const int row = rowMajor ? row_or_col : Index; - const int col = rowMajor ? Index : row_or_col; - dst.copyCoeff(row, col, src); - ei_assign_DefaultTraversal_InnerUnrolling::run(dst, src, row_or_col); + dst.copyCoeffByOuterInner(outer, Index, src); + ei_assign_DefaultTraversal_InnerUnrolling::run(dst, src, outer); } }; @@ -207,18 +202,14 @@ template struct ei_assign_innervec_CompleteUnrolling { enum { - row = int(Derived1::Flags)&RowMajorBit - ? Index / int(Derived1::ColsAtCompileTime) - : Index % Derived1::RowsAtCompileTime, - col = int(Derived1::Flags)&RowMajorBit - ? 
Index % int(Derived1::ColsAtCompileTime) - : Index / Derived1::RowsAtCompileTime, + outer = Index / Derived1::InnerSizeAtCompileTime, + inner = Index % Derived1::InnerSizeAtCompileTime, JointAlignment = ei_assign_traits::JointAlignment }; EIGEN_STRONG_INLINE static void run(Derived1 &dst, const Derived2 &src) { - dst.template copyPacket(row, col, src); + dst.template copyPacketByOuterInner(outer, inner, src); ei_assign_innervec_CompleteUnrolling::size, Stop>::run(dst, src); } @@ -233,13 +224,11 @@ struct ei_assign_innervec_CompleteUnrolling template struct ei_assign_innervec_InnerUnrolling { - EIGEN_STRONG_INLINE static void run(Derived1 &dst, const Derived2 &src, int row_or_col) + EIGEN_STRONG_INLINE static void run(Derived1 &dst, const Derived2 &src, int outer) { - const int row = int(Derived1::Flags)&RowMajorBit ? row_or_col : Index; - const int col = int(Derived1::Flags)&RowMajorBit ? Index : row_or_col; - dst.template copyPacket(row, col, src); + dst.template copyPacketByOuterInner(outer, Index, src); ei_assign_innervec_InnerUnrolling::size, Stop>::run(dst, src, row_or_col); + Index+ei_packet_traits::size, Stop>::run(dst, src, outer); } }; @@ -267,29 +256,11 @@ struct ei_assign_impl { inline static void run(Derived1 &dst, const Derived2 &src) { - if(Derived1::ColsAtCompileTime == 1) - { - for(int i = 0; i < dst.rows(); ++i) - dst.copyCoeff(i, 0, src); - } - else if(Derived1::RowsAtCompileTime == 1) - { - for(int i = 0; i < dst.cols(); ++i) - dst.copyCoeff(0, i, src); - } - else - { - const int innerSize = dst.innerSize(); - const int outerSize = dst.outerSize(); - for(int j = 0; j < outerSize; ++j) - for(int i = 0; i < innerSize; ++i) - { - if(int(Derived1::Flags)&RowMajorBit) - dst.copyCoeff(j, i, src); - else - dst.copyCoeff(i, j, src); - } - } + const int innerSize = dst.innerSize(); + const int outerSize = dst.outerSize(); + for(int outer = 0; outer < outerSize; ++outer) + for(int inner = 0; inner < innerSize; ++inner) + dst.copyCoeffByOuterInner(outer, 
inner, src); } }; @@ -308,12 +279,10 @@ struct ei_assign_impl { EIGEN_STRONG_INLINE static void run(Derived1 &dst, const Derived2 &src) { - const bool rowMajor = int(Derived1::Flags)&RowMajorBit; - const int innerSize = rowMajor ? Derived1::ColsAtCompileTime : Derived1::RowsAtCompileTime; const int outerSize = dst.outerSize(); - for(int j = 0; j < outerSize; ++j) - ei_assign_DefaultTraversal_InnerUnrolling - ::run(dst, src, j); + for(int outer = 0; outer < outerSize; ++outer) + ei_assign_DefaultTraversal_InnerUnrolling + ::run(dst, src, outer); } }; @@ -354,14 +323,9 @@ struct ei_assign_impl const int innerSize = dst.innerSize(); const int outerSize = dst.outerSize(); const int packetSize = ei_packet_traits::size; - for(int j = 0; j < outerSize; ++j) - for(int i = 0; i < innerSize; i+=packetSize) - { - if(int(Derived1::Flags)&RowMajorBit) - dst.template copyPacket(j, i, src); - else - dst.template copyPacket(i, j, src); - } + for(int outer = 0; outer < outerSize; ++outer) + for(int inner = 0; inner < innerSize; inner+=packetSize) + dst.template copyPacketByOuterInner(outer, inner, src); } }; @@ -380,12 +344,10 @@ struct ei_assign_impl - ::run(dst, src, j); + for(int outer = 0; outer < outerSize; ++outer) + ei_assign_innervec_InnerUnrolling + ::run(dst, src, outer); } }; @@ -471,36 +433,20 @@ struct ei_assign_impl int alignedStart = ei_assign_traits::DstIsAligned ? 
0 : ei_first_aligned(&dst.coeffRef(0,0), innerSize); - for(int i = 0; i < outerSize; ++i) + for(int outer = 0; outer < outerSize; ++outer) { const int alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask); - // do the non-vectorizable part of the assignment - for (int index = 0; index(i, index, src); - else - dst.template copyPacket(index, i, src); - } + for(int inner = alignedStart; inner(outer, inner, src); // do the non-vectorizable part of the assignment - for (int index = alignedEnd; index((alignedStart+alignedStep)%packetSize, innerSize); } @@ -519,14 +465,6 @@ EIGEN_STRONG_INLINE Derived& DenseBase EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived,OtherDerived) EIGEN_STATIC_ASSERT((ei_is_same_type::ret), YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) - if(Derived::ColsAtCompileTime == 1) - { - ei_assert(OtherDerived::RowsAtCompileTime == 1 || other.cols() == 1); - } - if(Derived::RowsAtCompileTime == 1) - { - ei_assert(OtherDerived::ColsAtCompileTime == 1 || other.rows() == 1); - } #ifdef EIGEN_DEBUG_ASSIGN ei_assign_traits::debug(); #endif diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 8a7aea91f..d3c4dfa99 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -80,20 +80,6 @@ struct ei_traits > }; }; -template -struct ei_traits > : ei_traits > -{ - enum { - InnerStrideAtCompileTime = - (BlockRows==1 && !(int(MatrixType::Flags)&RowMajorBit)) - || (BlockCols==1 && (int(MatrixType::Flags)&RowMajorBit)) - ? MatrixType::OuterStrideAtCompileTime - : MatrixType::InnerStrideAtCompileTime, - OuterStrideAtCompileTime = - (BlockRows==1||BlockCols==1) ? 
0 : MatrixType::OuterStrideAtCompileTime - }; -}; - template class Block : public MatrixType::template MakeBase< Block >::Type { @@ -114,8 +100,8 @@ template=0) && ( ((BlockRows==1) && (BlockCols==MatrixType::ColsAtCompileTime) && i= 0 && BlockRows >= 1 && startRow + BlockRows <= matrix.rows() @@ -277,16 +263,15 @@ class Block /** \sa MapBase::innerStride() */ inline int innerStride() const { - return (RowsAtCompileTime==1 && !(int(MatrixType::Flags)&RowMajorBit)) - || (ColsAtCompileTime==1 && (int(MatrixType::Flags)&RowMajorBit)) - ? m_matrix.outerStride() - : m_matrix.innerStride(); + return RowsAtCompileTime==1 ? m_matrix.colStride() + : ColsAtCompileTime==1 ? m_matrix.rowStride() + : m_matrix.innerStride(); } /** \sa MapBase::outerStride() */ inline int outerStride() const { - return IsVectorAtCompileTime ? 0 : m_matrix.outerStride(); + return IsVectorAtCompileTime ? this->size() : m_matrix.outerStride(); } #ifndef __SUNPRO_CC diff --git a/Eigen/src/Core/Coeffs.h b/Eigen/src/Core/Coeffs.h index ebfd0c80e..da7b9153f 100644 --- a/Eigen/src/Core/Coeffs.h +++ b/Eigen/src/Core/Coeffs.h @@ -25,6 +25,24 @@ #ifndef EIGEN_COEFFS_H #define EIGEN_COEFFS_H +template +EIGEN_STRONG_INLINE int DenseBase::rowIndexByOuterInner(int outer, int inner) +{ + return int(Derived::RowsAtCompileTime) == 1 ? 0 + : int(Derived::ColsAtCompileTime) == 1 ? inner + : int(Derived::Flags)&RowMajorBit ? outer + : inner; +} + +template +EIGEN_STRONG_INLINE int DenseBase::colIndexByOuterInner(int outer, int inner) +{ + return int(Derived::ColsAtCompileTime) == 1 ? 0 + : int(Derived::RowsAtCompileTime) == 1 ? inner + : int(Derived::Flags)&RowMajorBit ? inner + : outer; +} + /** Short version: don't use this function, use * \link operator()(int,int) const \endlink instead. 
* @@ -48,6 +66,14 @@ EIGEN_STRONG_INLINE const typename DenseBase::CoeffReturnType DenseBase return derived().coeff(row, col); } +template +EIGEN_STRONG_INLINE const typename DenseBase::CoeffReturnType DenseBase + ::coeffByOuterInner(int outer, int inner) const +{ + return coeff(rowIndexByOuterInner(outer, inner), + colIndexByOuterInner(outer, inner)); +} + /** \returns the coefficient at given the given row and column. * * \sa operator()(int,int), operator[](int) const @@ -84,6 +110,14 @@ EIGEN_STRONG_INLINE typename ei_traits::Scalar& DenseBase return derived().coeffRef(row, col); } +template +EIGEN_STRONG_INLINE typename ei_traits::Scalar& DenseBase + ::coeffRefByOuterInner(int outer, int inner) +{ + return coeffRef(rowIndexByOuterInner(outer, inner), + colIndexByOuterInner(outer, inner)); +} + /** \returns a reference to the coefficient at given the given row and column. * * \sa operator()(int,int) const, operator[](int) @@ -261,6 +295,15 @@ DenseBase::packet(int row, int col) const return derived().template packet(row,col); } +template +template +EIGEN_STRONG_INLINE typename ei_packet_traits::Scalar>::type +DenseBase::packetByOuterInner(int outer, int inner) const +{ + return packet(rowIndexByOuterInner(outer, inner), + colIndexByOuterInner(outer, inner)); +} + /** Stores the given packet of coefficients, at the given row and column of this expression. It is your responsibility * to ensure that a packet really starts there. This method is only available on expressions having the * PacketAccessBit. @@ -279,6 +322,16 @@ EIGEN_STRONG_INLINE void DenseBase::writePacket derived().template writePacket(row,col,x); } +template +template +EIGEN_STRONG_INLINE void DenseBase::writePacketByOuterInner +(int outer, int inner, const typename ei_packet_traits::Scalar>::type& x) +{ + writePacket(rowIndexByOuterInner(outer, inner), + colIndexByOuterInner(outer, inner), + x); +} + /** \returns the packet of coefficients starting at the given index. 
It is your responsibility * to ensure that a packet really starts there. This method is only available on expressions having the * PacketAccessBit and the LinearAccessBit. @@ -346,6 +399,16 @@ EIGEN_STRONG_INLINE void DenseBase::copyCoeff(int index, const DenseBas derived().coeffRef(index) = other.derived().coeff(index); } +template +template +EIGEN_STRONG_INLINE void DenseBase::copyCoeffByOuterInner(int outer, int inner, const DenseBase& other) +{ + const int row = Derived::rowIndexByOuterInner(outer,inner); + const int col = Derived::colIndexByOuterInner(outer,inner); + // derived() is important here: copyCoeff() may be reimplemented in Derived! + derived().copyCoeff(row, col, other); +} + /** \internal Copies the packet at position (row,col) of other into *this. * * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code @@ -379,6 +442,16 @@ EIGEN_STRONG_INLINE void DenseBase::copyPacket(int index, const DenseBa other.derived().template packet(index)); } +template +template +EIGEN_STRONG_INLINE void DenseBase::copyPacketByOuterInner(int outer, int inner, const DenseBase& other) +{ + const int row = Derived::rowIndexByOuterInner(outer,inner); + const int col = Derived::colIndexByOuterInner(outer,inner); + // derived() is important here: copyCoeff() may be reimplemented in Derived! + derived().copyPacket(row, col, other); +} + template struct ei_first_aligned_impl { diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index d8f789ae0..2078f023b 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -124,6 +124,11 @@ template class DenseBase * constructed from this one. See the \ref flags "list of flags". */ + IsRowMajor = int(Flags) & RowMajorBit, /**< True if this expression is row major. */ + + InnerSizeAtCompileTime = int(IsVectorAtCompileTime) ? SizeAtCompileTime + : int(Flags)&RowMajorBit ? 
ColsAtCompileTime : RowsAtCompileTime, + CoeffReadCost = ei_traits::CoeffReadCost, /**< This is a rough measure of how expensive it is to read one coefficient from * this expression. @@ -200,20 +205,64 @@ template class DenseBase && "DenseBase::resize() does not actually allow to resize."); } - int innerStride() const + /** \returns the pointer increment between two consecutive elements. + * + * \note For vectors, the storage order is ignored. For matrices (non-vectors), we're looking + * at the increment between two consecutive elements within a slice in the inner direction. + * + * \sa outerStride(), rowStride(), colStride() + */ + inline int innerStride() const { EIGEN_STATIC_ASSERT(int(Flags)&DirectAccessBit, THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES) return derived().innerStride(); } - int outerStride() const + /** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns + * in a column-major matrix). + * + * \note For vectors, the storage order is ignored, there is only one inner slice, and so this method returns 1. + * For matrices (non-vectors), the notion of inner slice depends on the storage order. + * + * \sa innerStride(), rowStride(), colStride() + */ + inline int outerStride() const { EIGEN_STATIC_ASSERT(int(Flags)&DirectAccessBit, THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES) return derived().outerStride(); } + inline int stride() const + { + return IsVectorAtCompileTime ? innerStride() : outerStride(); + } + + /** \returns the pointer increment between two consecutive rows. + * + * \sa innerStride(), outerStride(), colStride() + */ + inline int rowStride() const + { + return ColsAtCompileTime==1 ? innerStride() + : RowsAtCompileTime==1 ? outerStride() + : IsRowMajor ? outerStride() + : innerStride(); + } + + /** \returns the pointer increment between two consecutive columns. 
+ * + * \sa innerStride(), outerStride(), rowStride() + */ + inline int colStride() const + { + return ColsAtCompileTime==1 ? outerStride() + : RowsAtCompileTime==1 ? innerStride() + : IsRowMajor ? innerStride() + : outerStride(); + } + #ifndef EIGEN_PARSED_BY_DOXYGEN /** \internal the return type of coeff() */ @@ -269,9 +318,11 @@ template class DenseBase CommaInitializer operator<< (const DenseBase& other); const CoeffReturnType coeff(int row, int col) const; + const CoeffReturnType coeffByOuterInner(int outer, int inner) const; const CoeffReturnType operator()(int row, int col) const; Scalar& coeffRef(int row, int col); + Scalar& coeffRefByOuterInner(int outer, int inner); Scalar& operator()(int row, int col); const CoeffReturnType coeff(int index) const; @@ -286,17 +337,30 @@ template class DenseBase template void copyCoeff(int row, int col, const DenseBase& other); template + void copyCoeffByOuterInner(int outer, int inner, const DenseBase& other); + template void copyCoeff(int index, const DenseBase& other); template void copyPacket(int row, int col, const DenseBase& other); template + void copyPacketByOuterInner(int outer, int inner, const DenseBase& other); + template void copyPacket(int index, const DenseBase& other); + + private: + static int rowIndexByOuterInner(int outer, int inner); + static int colIndexByOuterInner(int outer, int inner); + public: #endif // not EIGEN_PARSED_BY_DOXYGEN template PacketScalar packet(int row, int col) const; + template + PacketScalar packetByOuterInner(int outer, int inner) const; template void writePacket(int row, int col, const PacketScalar& x); + template + void writePacketByOuterInner(int outer, int inner, const PacketScalar& x); template PacketScalar packet(int index) const; diff --git a/Eigen/src/Core/DenseStorageBase.h b/Eigen/src/Core/DenseStorageBase.h index 04dfb1176..e93e439e6 100644 --- a/Eigen/src/Core/DenseStorageBase.h +++ b/Eigen/src/Core/DenseStorageBase.h @@ -75,23 +75,6 @@ class DenseStorageBase : 
public _Base EIGEN_STRONG_INLINE int rows() const { return m_storage.rows(); } EIGEN_STRONG_INLINE int cols() const { return m_storage.cols(); } - /** Returns the leading dimension (for matrices) or the increment (for vectors) to be used with data(). - * - * More precisely: - * - for a column major matrix it returns the number of elements between two successive columns - * - for a row major matrix it returns the number of elements between two successive rows - * - for a vector it returns the number of elements between two successive coefficients - * This function has to be used together with the MapBase::data() function. - * - * \sa data() */ - EIGEN_STRONG_INLINE int stride() const - { - if(IsVectorAtCompileTime) - return 1; - else - return (Flags & RowMajorBit) ? m_storage.cols() : m_storage.rows(); - } - EIGEN_STRONG_INLINE const Scalar& coeff(int row, int col) const { if(Flags & RowMajorBit) @@ -253,13 +236,13 @@ class DenseStorageBase : public _Base { if(RowsAtCompileTime == 1) { - ei_assert(other.rows() == 1); - resize(1, other.cols()); + ei_assert(other.rows() == 1 || other.cols() == 1); + resize(1, other.size()); } else if(ColsAtCompileTime == 1) { - ei_assert(other.cols() == 1); - resize(other.rows(), 1); + ei_assert(other.rows() == 1 || other.cols() == 1); + resize(other.size(), 1); } else resize(other.rows(), other.cols()); } diff --git a/Eigen/src/Core/Flagged.h b/Eigen/src/Core/Flagged.h index 7f42a1e73..9d14aceaa 100644 --- a/Eigen/src/Core/Flagged.h +++ b/Eigen/src/Core/Flagged.h @@ -60,7 +60,8 @@ template clas inline int rows() const { return m_matrix.rows(); } inline int cols() const { return m_matrix.cols(); } - inline int stride() const { return m_matrix.stride(); } + inline int outerStride() const { return m_matrix.outerStride(); } + inline int innerStride() const { return m_matrix.innerStride(); } inline const Scalar coeff(int row, int col) const { diff --git a/Eigen/src/Core/ForceAlignedAccess.h b/Eigen/src/Core/ForceAlignedAccess.h index 
927f43413..300b22329 100644 --- a/Eigen/src/Core/ForceAlignedAccess.h +++ b/Eigen/src/Core/ForceAlignedAccess.h @@ -52,7 +52,8 @@ template class ForceAlignedAccess inline int rows() const { return m_expression.rows(); } inline int cols() const { return m_expression.cols(); } - inline int stride() const { return m_expression.stride(); } + inline int outerStride() const { return m_expression.outerStride(); } + inline int innerStride() const { return m_expression.innerStride(); } inline const CoeffReturnType coeff(int row, int col) const { diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 432bf1661..d9ccb1b20 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -56,14 +56,8 @@ struct ei_traits > Flags0 = ei_traits::Flags, Flags1 = ((Options&Aligned)==Aligned ? Flags0 | AlignedBit : Flags0 & ~AlignedBit), - Flags = int(StrideType::InnerStrideAtCompileTime)==1 ? Flags1 : (Flags1 & ~PacketAccessBit), - InnerStrideAtCompileTime = int(StrideType::InnerStrideAtCompileTime) != 0 ? int(StrideType::InnerStrideAtCompileTime) : 1, - OuterStrideAtCompileTime = - int(StrideType::OuterStrideAtCompileTime != 0) ? int(StrideType::OuterStrideAtCompileTime) - : int(MatrixType::IsVectorAtCompileTime) ? int(MatrixType::SizeAtCompileTime) - : int(Flags)&RowMajorBit ? int(MatrixType::ColsAtCompileTime) - : int(MatrixType::RowsAtCompileTime) - }; + Flags = int(StrideType::InnerStrideAtCompileTime)==1 ? Flags1 : (Flags1 & ~PacketAccessBit) + }; }; template class Map diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index 6bac2ed4c..d735cfc47 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -38,57 +38,22 @@ template class MapBase public: enum { - IsRowMajor = (int(ei_traits::Flags) & RowMajorBit) ? 
1 : 0, RowsAtCompileTime = ei_traits::RowsAtCompileTime, ColsAtCompileTime = ei_traits::ColsAtCompileTime, - SizeAtCompileTime = Base::SizeAtCompileTime, - InnerStrideAtCompileTime = ei_traits::InnerStrideAtCompileTime + SizeAtCompileTime = Base::SizeAtCompileTime }; typedef typename ei_traits::Scalar Scalar; typedef typename Base::PacketScalar PacketScalar; using Base::derived; + using Base::innerStride; + using Base::outerStride; + using Base::rowStride; + using Base::colStride; inline int rows() const { return m_rows.value(); } inline int cols() const { return m_cols.value(); } - /** \returns the pointer increment between two consecutive elements. - * - * \note For vectors, the storage order is ignored. For matrices (non-vectors), we're looking - * at the increment between two consecutive elements within a slice in the inner direction. - * - * \sa outerStride(), data(), rowStride(), colStride() - */ - inline int innerStride() const { return derived().innerStride(); } - - /** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns - * in a column-major matrix). - * - * \note For vectors, the storage order is ignored, there is only one inner slice, and so this method returns 1. - * For matrices (non-vectors), the notion of inner slice depends on the storage order. - * - * \sa innerStride(), data(), rowStride(), colStride() - */ - inline int outerStride() const { return derived().outerStride(); } - - /** \returns the pointer increment between two consecutive rows. - * - * \sa data(), innerStride(), outerStride(), colStride() - */ - inline int rowStride() const - { - return (RowsAtCompileTime==1 || IsRowMajor) ? outerStride() : innerStride(); - } - - /** \returns the pointer increment between two consecutive columns. - * - * \sa data(), innerStride(), outerStride(), rowStride() - */ - inline int colStride() const - { - return (RowsAtCompileTime==1 || IsRowMajor) ? 
innerStride() : outerStride(); - } - /** Returns a pointer to the first coefficient of the matrix or vector. * * \note When addressing this data, make sure to honor the strides returned by innerStride() and outerStride(). diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index e011ae8b9..b494b2f00 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -120,10 +120,7 @@ struct ei_traits > MaxRowsAtCompileTime = _MaxRows, MaxColsAtCompileTime = _MaxCols, Flags = ei_compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret, - CoeffReadCost = NumTraits::ReadCost, - InnerStrideAtCompileTime = 1, - OuterStrideAtCompileTime = (RowsAtCompileTime==1||ColsAtCompileTime==1) ? 1 - : (int(Flags)&RowMajorBit) ? RowsAtCompileTime : ColsAtCompileTime + CoeffReadCost = NumTraits::ReadCost }; }; diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h index 9f6d1c0c0..497ce828b 100644 --- a/Eigen/src/Core/NestByValue.h +++ b/Eigen/src/Core/NestByValue.h @@ -53,7 +53,8 @@ template class NestByValue inline int rows() const { return m_expression.rows(); } inline int cols() const { return m_expression.cols(); } - inline int stride() const { return m_expression.stride(); } + inline int outerStride() const { return m_expression.outerStride(); } + inline int innerStride() const { return m_expression.innerStride(); } inline const CoeffReturnType coeff(int row, int col) const { diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 53277169c..af05773ee 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -336,7 +336,7 @@ template<> struct ei_gemv_selector ei_cache_friendly_product_colmajor_times_vector ( dest.size(), - &actualLhs.const_cast_derived().coeffRef(0,0), ei_outer_stride_or_outer_size(actualLhs), + &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.stride(), actualRhs, actualDest, actualAlpha); if (!EvalToDest) @@ -381,7 +381,7 @@ template<> struct ei_gemv_selector 
ei_cache_friendly_product_rowmajor_times_vector ( - &actualLhs.const_cast_derived().coeffRef(0,0), ei_outer_stride_or_outer_size(actualLhs), + &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.stride(), rhs_data, prod.rhs().size(), dest, actualAlpha); if (!DirectlyUseRhs) ei_aligned_stack_delete(Scalar, rhs_data, prod.rhs().size()); diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h index 6d01ee495..add5a3afb 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h @@ -75,8 +75,9 @@ template class SelfAdjointView inline int rows() const { return m_matrix.rows(); } inline int cols() const { return m_matrix.cols(); } - inline int stride() const { return m_matrix.stride(); } - + inline int outerStride() const { return m_matrix.outerStride(); } + inline int innerStride() const { return m_matrix.innerStride(); } + /** \sa MatrixBase::coeff() * \warning the coordinates must fit into the referenced triangular part */ diff --git a/Eigen/src/Core/SelfCwiseBinaryOp.h b/Eigen/src/Core/SelfCwiseBinaryOp.h index 7ae2e82a4..58aee182d 100644 --- a/Eigen/src/Core/SelfCwiseBinaryOp.h +++ b/Eigen/src/Core/SelfCwiseBinaryOp.h @@ -57,7 +57,8 @@ template class SelfCwiseBinaryOp inline int rows() const { return m_matrix.rows(); } inline int cols() const { return m_matrix.cols(); } - inline int stride() const { return m_matrix.stride(); } + inline int outerStride() const { return m_matrix.outerStride(); } + inline int innerStride() const { return m_matrix.innerStride(); } inline const Scalar* data() const { return m_matrix.data(); } // note that this function is needed by assign to correctly align loads/stores diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h index ba0b19de3..7982035fd 100644 --- a/Eigen/src/Core/Stride.h +++ b/Eigen/src/Core/Stride.h @@ -54,20 +54,6 @@ class Stride inline int inner() const { return m_inner.value(); } inline int outer() const { return m_outer.value(); } - template - Stride - 
operator|(const Stride& other) - { - EIGEN_STATIC_ASSERT(!((InnerStrideAtCompileTime && OtherInnerStrideAtCompileTime) - || (OuterStrideAtCompileTime && OtherOuterStrideAtCompileTime)), - YOU_ALREADY_SPECIFIED_THIS_STRIDE) - int result_inner = InnerStrideAtCompileTime ? inner() : other.inner(); - int result_outer = OuterStrideAtCompileTime ? outer() : other.outer(); - return Stride - (result_inner, result_outer); - } protected: ei_int_if_dynamic m_inner; ei_int_if_dynamic m_outer; @@ -91,46 +77,4 @@ class OuterStride : public Stride<0, Value> OuterStride(int v) : Base(0,v) {} }; -template::Flags)&DirectAccessBit> -struct ei_outer_stride_or_outer_size_impl -{ - static inline int value(const T& x) { return x.outerStride(); } -}; - -template -struct ei_outer_stride_or_outer_size_impl -{ - static inline int value(const T& x) { return x.outerSize(); } -}; - -template -inline int ei_outer_stride_or_outer_size(const T& x) -{ - return ei_outer_stride_or_outer_size_impl::value(x); -} - -template::type>::Flags)&DirectAccessBit> -struct ei_inner_stride_at_compile_time -{ - enum { ret = ei_traits::type>::InnerStrideAtCompileTime }; -}; - -template -struct ei_inner_stride_at_compile_time -{ - enum { ret = 1 }; -}; - -template::type>::Flags)&DirectAccessBit> -struct ei_outer_stride_at_compile_time -{ - enum { ret = ei_traits::type>::OuterStrideAtCompileTime }; -}; - -template -struct ei_outer_stride_at_compile_time -{ - enum { ret = 1 }; -}; - #endif // EIGEN_STRIDE_H diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h index 186268af0..c3c641097 100644 --- a/Eigen/src/Core/Swap.h +++ b/Eigen/src/Core/Swap.h @@ -47,7 +47,8 @@ template class SwapWrapper inline int rows() const { return m_expression.rows(); } inline int cols() const { return m_expression.cols(); } - inline int stride() const { return m_expression.stride(); } + inline int outerStride() const { return m_expression.outerStride(); } + inline int innerStride() const { return m_expression.innerStride(); } inline 
Scalar& coeffRef(int row, int col) { @@ -60,7 +61,7 @@ template class SwapWrapper } template - void copyCoeff(int row, int col, const MatrixBase& other) + void copyCoeff(int row, int col, const DenseBase& other) { OtherDerived& _other = other.const_cast_derived(); ei_internal_assert(row >= 0 && row < rows() @@ -71,7 +72,7 @@ template class SwapWrapper } template - void copyCoeff(int index, const MatrixBase& other) + void copyCoeff(int index, const DenseBase& other) { OtherDerived& _other = other.const_cast_derived(); ei_internal_assert(index >= 0 && index < m_expression.size()); @@ -81,7 +82,7 @@ template class SwapWrapper } template - void copyPacket(int row, int col, const MatrixBase& other) + void copyPacket(int row, int col, const DenseBase& other) { OtherDerived& _other = other.const_cast_derived(); ei_internal_assert(row >= 0 && row < rows() @@ -94,7 +95,7 @@ template class SwapWrapper } template - void copyPacket(int index, const MatrixBase& other) + void copyPacket(int index, const DenseBase& other) { OtherDerived& _other = other.const_cast_derived(); ei_internal_assert(index >= 0 && index < m_expression.size()); diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index 753a67ee0..47dae5776 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -80,9 +80,6 @@ template class Transpose typename ei_cleantype::type& nestedExpression() { return m_matrix.const_cast_derived(); } - enum { InnerStrideAtCompileTime = ei_inner_stride_at_compile_time::ret, - OuterStrideAtCompileTime = ei_outer_stride_at_compile_time::ret }; - protected: const typename MatrixType::Nested m_matrix; }; diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 8bea0aa68..c61a6d7cc 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -50,7 +50,8 @@ template class TriangularBase : public AnyMatrixBase inline int rows() const { return derived().rows(); } inline int cols() const { return 
derived().cols(); } - inline int stride() const { return derived().stride(); } + inline int outerStride() const { return derived().outerStride(); } + inline int innerStride() const { return derived().innerStride(); } inline Scalar coeff(int row, int col) const { return derived().coeff(row,col); } inline Scalar& coeffRef(int row, int col) { return derived().coeffRef(row,col); } @@ -165,7 +166,8 @@ template class TriangularView inline int rows() const { return m_matrix.rows(); } inline int cols() const { return m_matrix.cols(); } - inline int stride() const { return m_matrix.stride(); } + inline int outerStride() const { return m_matrix.outerStride(); } + inline int innerStride() const { return m_matrix.innerStride(); } /** \sa MatrixBase::operator+=() */ template TriangularView& operator+=(const Other& other) { return *this = m_matrix + other; } diff --git a/Eigen/src/Core/VectorBlock.h b/Eigen/src/Core/VectorBlock.h index cbf97aeb3..5bb7fd35d 100644 --- a/Eigen/src/Core/VectorBlock.h +++ b/Eigen/src/Core/VectorBlock.h @@ -86,7 +86,6 @@ template class VectorBlock IsColVector ? start : 0, IsColVector ? 0 : start, IsColVector ? size : 1, IsColVector ? 1 : size) { - EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock); } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 03c77cc78..beec17ee4 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -147,7 +147,6 @@ class GeneralProduct const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); - ei_assert(ei_inner_stride_at_compile_time::ret == 1); Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) * RhsBlasTraits::extractScalarFactor(m_rhs); @@ -159,9 +158,9 @@ class GeneralProduct (Dest::Flags&RowMajorBit) ? 
RowMajor : ColMajor> ::run( this->rows(), this->cols(), lhs.cols(), - (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), ei_outer_stride_or_outer_size(lhs), - (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), ei_outer_stride_or_outer_size(rhs), - (Scalar*)&(dst.coeffRef(0,0)), ei_outer_stride_or_outer_size(dst), + (const Scalar*)&(lhs.const_cast_derived().coeffRef(0,0)), lhs.stride(), + (const Scalar*)&(rhs.const_cast_derived().coeffRef(0,0)), rhs.stride(), + (Scalar*)&(dst.coeffRef(0,0)), dst.stride(), actualAlpha); } }; diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 51590b03d..c27c979a6 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -140,7 +140,7 @@ const unsigned int LinearAccessBit = 0x10; * Means that the underlying array of coefficients can be directly accessed. This means two things. * First, references to the coefficients must be available through coeffRef(int, int). This rules out read-only * expressions whose coefficients are computed on demand by coeff(int, int). Second, the memory layout of the - * array of coefficients must be exactly the natural one suggested by rows(), cols(), stride(), and the RowMajorBit. + * array of coefficients must be exactly the natural one suggested by rows(), cols(), outerStride(), innerStride(), and the RowMajorBit. * This rules out expressions such as Diagonal, whose coefficients, though referencable, do not have * such a regular memory layout. 
*/ diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index d4920d213..c7b95d334 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -232,7 +232,7 @@ inline static Integer ei_first_aligned(const Scalar* array, Integer size) enum { PacketSize = ei_packet_traits::size, PacketAlignedMask = PacketSize-1 }; - + if(PacketSize==1) { // Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index 809e4aad6..3925ac1b0 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -368,7 +368,7 @@ void ei_partial_lu_inplace(MatrixType& lu, IntVector& row_transpositions, int& n ei_partial_lu_impl - ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.stride(), &row_transpositions.coeffRef(0), nb_transpositions); + ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.outerStride(), &row_transpositions.coeffRef(0), nb_transpositions); } template diff --git a/Eigen/src/Sparse/CholmodSupport.h b/Eigen/src/Sparse/CholmodSupport.h index fd33b1507..248f56533 100644 --- a/Eigen/src/Sparse/CholmodSupport.h +++ b/Eigen/src/Sparse/CholmodSupport.h @@ -99,7 +99,7 @@ cholmod_dense ei_cholmod_map_eigen_to_dense(MatrixBase& mat) res.nrow = mat.rows(); res.ncol = mat.cols(); res.nzmax = res.nrow * res.ncol; - res.d = Derived::IsVectorAtCompileTime ? mat.derived().size() : mat.derived().stride(); + res.d = Derived::IsVectorAtCompileTime ? mat.derived().size() : mat.derived().outerStride(); res.x = mat.derived().data(); res.z = 0; diff --git a/Eigen/src/Sparse/SuperLUSupport.h b/Eigen/src/Sparse/SuperLUSupport.h index 1a765c75b..9a5bec554 100644 --- a/Eigen/src/Sparse/SuperLUSupport.h +++ b/Eigen/src/Sparse/SuperLUSupport.h @@ -161,7 +161,7 @@ struct SluMatrix : SuperMatrix res.nrow = mat.rows(); res.ncol = mat.cols(); - res.storage.lda = MatrixType::IsVectorAtCompileTime ? 
mat.size() : mat.stride(); + res.storage.lda = MatrixType::IsVectorAtCompileTime ? mat.size() : mat.outerStride(); res.storage.values = mat.data(); return res; } @@ -217,7 +217,7 @@ struct SluMatrixMapHelper > res.nrow = mat.rows(); res.ncol = mat.cols(); - res.storage.lda = mat.stride(); + res.storage.lda = mat.outerStride(); res.storage.values = mat.data(); } }; diff --git a/test/submatrices.cpp b/test/submatrices.cpp index a9dcf8476..e71c28ceb 100644 --- a/test/submatrices.cpp +++ b/test/submatrices.cpp @@ -217,11 +217,11 @@ void data_and_stride(const MatrixType& m) MatrixType m1 = MatrixType::Random(rows, cols); compare_using_data_and_stride(m1.block(r1, c1, r2-r1+1, c2-c1+1)); - //compare_using_data_and_stride(m1.transpose().block(c1, r1, c2-c1+1, r2-r1+1)); + compare_using_data_and_stride(m1.transpose().block(c1, r1, c2-c1+1, r2-r1+1)); compare_using_data_and_stride(m1.row(r1)); compare_using_data_and_stride(m1.col(c1)); - //compare_using_data_and_stride(m1.row(r1).transpose()); - //compare_using_data_and_stride(m1.col(c1).transpose()); + compare_using_data_and_stride(m1.row(r1).transpose()); + compare_using_data_and_stride(m1.col(c1).transpose()); } void test_submatrices() From f56ac04c34e3ccefa2313d41b7a93f3f94f9d07e Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Thu, 25 Feb 2010 21:24:42 -0500 Subject: [PATCH 033/122] DenseBase::IsRowMajor now takes the special case of vectors into account. 
--- Eigen/src/Array/Reverse.h | 2 +- Eigen/src/Core/Assign.h | 4 +--- Eigen/src/Core/DenseBase.h | 16 +++++++--------- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/Eigen/src/Array/Reverse.h b/Eigen/src/Array/Reverse.h index a405fbb4b..fe7de53b6 100644 --- a/Eigen/src/Array/Reverse.h +++ b/Eigen/src/Array/Reverse.h @@ -85,7 +85,7 @@ template class Reverse protected: enum { PacketSize = ei_packet_traits::size, - IsRowMajor = Flags & RowMajorBit, + IsRowMajor = MatrixType::IsRowMajor, IsColMajor = !IsRowMajor, ReverseRow = (Direction == Vertical) || (Direction == BothDirections), ReverseCol = (Direction == Horizontal) || (Direction == BothDirections), diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h index 3133aa03a..99d497449 100644 --- a/Eigen/src/Core/Assign.h +++ b/Eigen/src/Core/Assign.h @@ -55,9 +55,7 @@ private: }; enum { - LhsIsEffectivelyRowMajor = (Derived::RowsAtCompileTime==1) || (int(Derived::Flags)&RowMajorBit), - RhsIsEffectivelyRowMajor = (OtherDerived::RowsAtCompileTime==1) || (int(OtherDerived::Flags)&RowMajorBit), - StorageOrdersAgree = (LhsIsEffectivelyRowMajor == RhsIsEffectivelyRowMajor), + StorageOrdersAgree = (int(Derived::IsRowMajor) == int(OtherDerived::IsRowMajor)), MightVectorize = StorageOrdersAgree && (int(Derived::Flags) & int(OtherDerived::Flags) & ActualPacketAccessBit), MayInnerVectorize = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0 diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index 5682d7278..67540bd8c 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -124,7 +124,11 @@ template class DenseBase * constructed from this one. See the \ref flags "list of flags". */ - IsRowMajor = int(Flags) & RowMajorBit, /**< True if this expression is row major. */ + IsRowMajor = RowsAtCompileTime==1 ? 1 + : ColsAtCompileTime==1 ? 0 + : int(Flags) & RowMajorBit, /**< True if this expression has row-major effective addressing. 
+ For non-vectors, it is like reading the RowMajorBit on the Flags. For vectors, this is + overriden by the convention that row-vectors are row-major and column-vectors are column-major. */ InnerSizeAtCompileTime = int(IsVectorAtCompileTime) ? SizeAtCompileTime : int(Flags)&RowMajorBit ? ColsAtCompileTime : RowsAtCompileTime, @@ -245,10 +249,7 @@ template class DenseBase */ inline int rowStride() const { - return ColsAtCompileTime==1 ? innerStride() - : RowsAtCompileTime==1 ? outerStride() - : IsRowMajor ? outerStride() - : innerStride(); + return IsRowMajor ? outerStride() : innerStride(); } /** \returns the pointer increment between two consecutive columns. @@ -257,10 +258,7 @@ template class DenseBase */ inline int colStride() const { - return ColsAtCompileTime==1 ? outerStride() - : RowsAtCompileTime==1 ? innerStride() - : IsRowMajor ? innerStride() - : outerStride(); + return IsRowMajor ? innerStride() : outerStride(); } #ifndef EIGEN_PARSED_BY_DOXYGEN From d86f5339b2033af792fa60176f8060938e9599ec Mon Sep 17 00:00:00 2001 From: Jitse Niesen Date: Fri, 26 Feb 2010 09:47:17 +0000 Subject: [PATCH 034/122] ComplexSchur: fix bug introduced in my previous commit. The value of c is actually used a few lines later. 
--- Eigen/src/Eigenvalues/ComplexSchur.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Eigenvalues/ComplexSchur.h b/Eigen/src/Eigenvalues/ComplexSchur.h index 0fad415a2..531ebf709 100644 --- a/Eigen/src/Eigenvalues/ComplexSchur.h +++ b/Eigen/src/Eigenvalues/ComplexSchur.h @@ -218,10 +218,12 @@ void ComplexSchur::compute(const MatrixType& matrix, bool skipU) sf = t.cwiseAbs().sum(); t /= sf; // the normalization by sf is to avoid under/overflow - b = t.coeff(0,0) + t.coeff(1,1); + b = t.coeff(0,1) * t.coeff(1,0); c = t.coeff(0,0) - t.coeff(1,1); - disc = ei_sqrt(c*c + RealScalar(4)*t.coeff(0,1)*t.coeff(1,0)); + disc = ei_sqrt(c*c + RealScalar(4)*b); + c = t.coeff(0,0) * t.coeff(1,1) - b; + b = t.coeff(0,0) + t.coeff(1,1); r1 = (b+disc)/RealScalar(2); r2 = (b-disc)/RealScalar(2); From 3ac2b96a2f131e8162d39f0976cfb31b1a853237 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 26 Feb 2010 12:32:00 +0100 Subject: [PATCH 035/122] implement a smarter parallelization strategy for gemm avoiding multiple paking of the same data --- .../Core/products/GeneralBlockPanelKernel.h | 10 +- Eigen/src/Core/products/GeneralMatrixMatrix.h | 148 +++++++++++++----- Eigen/src/Core/products/Parallelizer.h | 64 +++++++- bench/bench_gemm.cpp | 79 +++++++++- bench/bench_gemm_blas.cpp | 83 ---------- 5 files changed, 246 insertions(+), 138 deletions(-) delete mode 100644 bench/bench_gemm_blas.cpp diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index c29e4efc2..6836a10de 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -37,7 +37,8 @@ template struct ei_gebp_kernel { - void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int rows, int depth, int cols, int strideA=-1, int strideB=-1, int offsetA=0, int offsetB=0) + void operator()(Scalar* res, int resStride, const Scalar* blockA, 
const Scalar* blockB, int rows, int depth, int cols, + int strideA=-1, int strideB=-1, int offsetA=0, int offsetB=0, Scalar* unpackedB = 0) { typedef typename ei_packet_traits::type PacketType; enum { PacketSize = ei_packet_traits::size }; @@ -45,11 +46,12 @@ struct ei_gebp_kernel if(strideB==-1) strideB = depth; Conj cj; int packet_cols = (cols/nr) * nr; - const int peeled_mc = (rows/mr)*mr; - const int peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0); + const int peeled_mc = (rows/mr)*mr; + const int peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0); const int peeled_kc = (depth/4)*4; - Scalar* unpackedB = const_cast(blockB - strideB * nr * PacketSize); + if(unpackedB==0) + unpackedB = const_cast(blockB - strideB * nr * PacketSize); // loops on each micro vertical panel of rhs (depth x nr) for(int j2=0; j2 - ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha); + ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,info); } }; @@ -64,7 +65,8 @@ static void run(int rows, int cols, int depth, const Scalar* _lhs, int lhsStride, const Scalar* _rhs, int rhsStride, Scalar* res, int resStride, - Scalar alpha) + Scalar alpha, + GemmParallelInfo* info = 0) { ei_const_blas_data_mapper lhs(_lhs,lhsStride); ei_const_blas_data_mapper rhs(_rhs,rhsStride); @@ -75,47 +77,114 @@ static void run(int rows, int cols, int depth, typedef typename ei_packet_traits::type PacketType; typedef ei_product_blocking_traits Blocking; - int kc = std::min(Blocking::Max_kc,depth); // cache block size along the K direction - int mc = std::min(Blocking::Max_mc,rows); // cache block size along the M direction +// int kc = std::min(Blocking::Max_kc,depth); // cache block size along the K direction +// int mc = std::min(Blocking::Max_mc,rows); // cache block size along the M direction - Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; - Scalar* 
allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); - Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; + int kc = std::min(256,depth); // cache block size along the K direction + int mc = std::min(512,rows); // cache block size along the M direction - // For each horizontal panel of the rhs, and corresponding panel of the lhs... - // (==GEMM_VAR1) - for(int k2=0; k2 pack_rhs; + ei_gemm_pack_lhs pack_lhs; + ei_gebp_kernel > gebp; + + if(info) { - const int actual_kc = std::min(k2+kc,depth)-k2; + // this is the parallel version! + int tid = omp_get_thread_num(); - // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs. - // => Pack rhs's panel into a sequential chunk of memory (L2 caching) - // Note that this panel will be read as many times as the number of blocks in the lhs's - // vertical panel which is, in practice, a very low number. - ei_gemm_pack_rhs()(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols); + Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); + std::size_t sizeW = kc*Blocking::PacketSize*Blocking::nr*8; + Scalar* w = ei_aligned_stack_new(Scalar, sizeW); + Scalar* blockB = (Scalar*)info[tid].blockB; - // For each mc x kc block of the lhs's vertical panel... 
- // (==GEPP_VAR1) - for(int i2=0; i2()(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); + // pack B_k to B' in parallel fashion, + // each thread packs the B_k,j sub block to B'_j where j is the thread id + #ifndef USEGOTOROUTINES + pack_rhs(blockB+info[tid].rhs_start*kc, &rhs(k,info[tid].rhs_start), rhsStride, alpha, actual_kc, info[tid].rhs_length); + #else + sgemm_oncopy(actual_kc, info[tid].rhs_length, &rhs(k,info[tid].rhs_start), rhsStride, blockB+info[tid].rhs_start*kc); + #endif - // Everything is packed, we can now call the block * panel kernel: - ei_gebp_kernel >() - (res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); + #pragma omp barrier -// sgemm_kernel(actual_mc, cols, actual_kc, alpha, blockA, allocatedBlockB, res+i2, resStride); + for(int i=0; i Pack rhs's panel into a sequential chunk of memory (L2 caching) + // Note that this panel will be read as many times as the number of blocks in the lhs's + // vertical panel which is, in practice, a very low number. + pack_rhs(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols); + + + // For each mc x kc block of the lhs's vertical panel... 
+ // (==GEPP_VAR1) + for(int i2=0; i2 _ActualRhsType, Dest> Functor; - ei_run_parallel_2d(Functor(lhs, rhs, dst, actualAlpha), this->cols(), this->rows()); +// ei_run_parallel_1d(Functor(lhs, rhs, dst, actualAlpha), this->rows()); +// ei_run_parallel_2d(Functor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols()); + ei_run_parallel_gemm(Functor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols()); } }; diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 088e387f9..e7a940992 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -25,6 +25,13 @@ #ifndef EIGEN_PARALLELIZER_H #define EIGEN_PARALLELIZER_H +struct GemmParallelInfo +{ + int rhs_start; + int rhs_length; + float* blockB; +}; + template void ei_run_parallel_1d(const Functor& func, int size) { @@ -53,18 +60,21 @@ void ei_run_parallel_2d(const Functor& func, int size1, int size2) #ifndef EIGEN_HAS_OPENMP func(0,size1, 0,size2); #else - if(!Parallelize) + + int threads = omp_get_max_threads(); + if((!Parallelize)||(threads==1)) return func(0,size1, 0,size2); // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 - static const int divide1[17] = { 0, 1, 2, 3, 2, 5, 3, 7, 4, 3, 5, 11, 4, 13, 7, 5, 4}; - static const int divide2[17] = { 0, 1, 1, 1, 2, 1, 2, 1, 2, 3, 2, 1, 3, 1, 2, 3, 4}; + static const int divide1[17] = { 0, 1, 2, 3, 2, 5, 3, 7, 4, 3, 5, 1, 4, 1, 7, 5, 4}; + static const int divide2[17] = { 0, 1, 1, 1, 2, 1, 2, 1, 2, 3, 2, 11, 3, 13, 2, 3, 4}; + + - int threads = omp_get_num_procs(); ei_assert(threads<=16 && "too many threads !"); int blockSize1 = size1 / divide1[threads]; int blockSize2 = size2 / divide2[threads]; - + Matrix ranges(4,threads); int k = 0; for(int i1=0; i1 +void ei_run_parallel_gemm(const Functor& func, int rows, int cols) +{ +#ifndef EIGEN_HAS_OPENMP + func(0,size1, 0,size2); +#else + + int threads = omp_get_max_threads(); + if((!Parallelize)||(threads==1)) + return 
func(0,rows, 0,cols); + + + int blockCols = (cols / threads) & ~0x3; + int blockRows = (rows / threads) & ~0x7; + + float* sharedBlockB = new float[2048*2048*4]; + + GemmParallelInfo* info = new GemmParallelInfo[threads]; + + #pragma omp parallel for schedule(static,1) + for(int i=0; i M; +#ifdef HAVE_BLAS + +extern "C" { + #include + + void sgemm_kernel(int actual_mc, int cols, int actual_kc, float alpha, + float* blockA, float* blockB, float* res, int resStride); + void sgemm_oncopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB); + void sgemm_itcopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB); +} + +static float fone = 1; +static float fzero = 0; +static double done = 1; +static double szero = 0; +static char notrans = 'N'; +static char trans = 'T'; +static char nonunit = 'N'; +static char lower = 'L'; +static char right = 'R'; +static int intone = 1; + +void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c) +{ + int M = c.rows(); int N = c.cols(); int K = a.cols(); + int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows(); + + sgemm_(¬rans,¬rans,&M,&N,&K,&fone, + const_cast(a.data()),&lda, + const_cast(b.data()),&ldb,&fone, + c.data(),&ldc); +} + +void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c) +{ + int M = c.rows(); int N = c.cols(); int K = a.cols(); + int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows(); + + dgemm_(¬rans,¬rans,&M,&N,&K,&done, + const_cast(a.data()),&lda, + const_cast(b.data()),&ldb,&done, + c.data(),&ldc); +} + +#endif + void gemm(const M& a, const M& b, M& c) { c.noalias() += a * b; @@ -22,21 +68,42 @@ void gemm(const M& a, const M& b, M& c) int main(int argc, char ** argv) { - int rep = 1; + int rep = 1; // number of repetitions per try + int tries = 5; // number of tries, we keep the best + int s = 2048; int m = s; int n = s; int p = s; - M a(m,n); a.setOnes(); - M b(n,p); b.setOnes(); + M a(m,n); a.setRandom(); + M b(n,p); b.setRandom(); M c(m,p); 
c.setOnes(); BenchTimer t; - BENCH(t, 5, rep, gemm(a,b,c)); + M r = c; - std::cerr << "cpu " << t.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n"; - std::cerr << "real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n"; + // check the parallel product is correct + #ifdef HAVE_BLAS + blas_gemm(a,b,r); + #else + int procs = omp_get_max_threads(); + omp_set_num_threads(1); + r.noalias() += a * b; + omp_set_num_threads(procs); + #endif + c.noalias() += a * b; + if(!r.isApprox(c)) std::cerr << "Warning, your parallel product is crap!\n\n"; + + #ifdef HAVE_BLAS + BENCH(t, tries, rep, blas_gemm(a,b,c)); + std::cerr << "blas cpu " << t.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n"; + std::cerr << "blas real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n"; + #endif + + BENCH(t, tries, rep, gemm(a,b,c)); + std::cerr << "eigen cpu " << t.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n"; + std::cerr << "eigen real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n"; return 0; } diff --git a/bench/bench_gemm_blas.cpp b/bench/bench_gemm_blas.cpp deleted file mode 100644 index babf1ec2c..000000000 --- a/bench/bench_gemm_blas.cpp +++ /dev/null @@ -1,83 +0,0 @@ - -#include -#include - -extern "C" -{ - #include - #include -} - -using namespace std; -using namespace Eigen; - -#ifndef SCALAR -#define SCALAR float -#endif - -typedef SCALAR Scalar; -typedef Matrix M; - -static float fone = 1; -static float fzero = 0; -static double done = 1; -static double szero = 0; -static char notrans = 'N'; 
-static char trans = 'T'; -static char nonunit = 'N'; -static char lower = 'L'; -static char right = 'R'; -static int intone = 1; - -void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c) -{ - int M = c.rows(); - int N = c.cols(); - int K = a.cols(); - - int lda = a.rows(); - int ldb = b.rows(); - int ldc = c.rows(); - - sgemm_(¬rans,¬rans,&M,&N,&K,&fone, - const_cast(a.data()),&lda, - const_cast(b.data()),&ldb,&fone, - c.data(),&ldc); -} - -void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c) -{ - int M = c.rows(); - int N = c.cols(); - int K = a.cols(); - - int lda = a.rows(); - int ldb = b.rows(); - int ldc = c.rows(); - - dgemm_(¬rans,¬rans,&M,&N,&K,&done, - const_cast(a.data()),&lda, - const_cast(b.data()),&ldb,&done, - c.data(),&ldc); -} - -int main(int argc, char **argv) -{ - int rep = 1; - int s = 2048; - int m = s; - int n = s; - int p = s; - M a(m,n); a.setOnes(); - M b(n,p); b.setOnes(); - M c(m,p); c.setOnes(); - - BenchTimer t; - - BENCH(t, 5, rep, blas_gemm(a,b,c)); - - std::cerr << "cpu " << t.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n"; - std::cerr << "real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n"; - return 0; -} - From c05047d28e7d84d2a5312c3c958063d4415b6dcc Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 26 Feb 2010 12:51:20 +0100 Subject: [PATCH 036/122] fix some BTL issues --- bench/btl/actions/action_lu_decomp.hh | 2 +- bench/btl/data/action_settings.txt | 2 +- bench/btl/data/go_mean | 2 +- bench/btl/libs/eigen2/eigen2_interface.hh | 12 ++++++------ 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/bench/btl/actions/action_lu_decomp.hh b/bench/btl/actions/action_lu_decomp.hh index 147884e76..93de7866b 100644 --- a/bench/btl/actions/action_lu_decomp.hh +++ b/bench/btl/actions/action_lu_decomp.hh @@ -76,7 +76,7 @@ public : 
static inline std::string name( void ) { - return "lu_decomp_"+Interface::name(); + return "complete_lu_decomp_"+Interface::name(); } double nb_op_base( void ){ diff --git a/bench/btl/data/action_settings.txt b/bench/btl/data/action_settings.txt index 9bee1651c..b7382ec43 100644 --- a/bench/btl/data/action_settings.txt +++ b/bench/btl/data/action_settings.txt @@ -8,7 +8,7 @@ matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:3000 trisolve ; "{/*1.5 triangular solver (X = inv(L) X)}" ; "size" ; 4:3000 matrix_trisolve ; "{/*1.5 matrix triangular solver (M = inv(L) M)}" ; "size" ; 4:3000 cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:3000 -lu_decomp ; "{/*1.5 Complete LU decomposition}" ; "matrix size" ; 4:3000 +complete_lu_decomp ; "{/*1.5 Complete LU decomposition}" ; "matrix size" ; 4:3000 partial_lu_decomp ; "{/*1.5 Partial LU decomposition}" ; "matrix size" ; 4:3000 tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:3000 hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:3000 diff --git a/bench/btl/data/go_mean b/bench/btl/data/go_mean index 4e4fd295a..fdc8d8c75 100755 --- a/bench/btl/data/go_mean +++ b/bench/btl/data/go_mean @@ -41,7 +41,7 @@ source mk_mean_script.sh ata $1 11 100 300 1000 $mode $prefix source mk_mean_script.sh trisolve $1 11 100 300 1000 $mode $prefix source mk_mean_script.sh matrix_trisolve $1 11 100 300 1000 $mode $prefix source mk_mean_script.sh cholesky $1 11 100 300 1000 $mode $prefix -source mk_mean_script.sh lu_decomp $1 11 100 300 1000 $mode $prefix +source mk_mean_script.sh complete_lu_decomp $1 11 100 300 1000 $mode $prefix source mk_mean_script.sh partial_lu_decomp $1 11 100 300 1000 $mode $prefix source mk_mean_script.sh tridiagonalization $1 11 100 300 1000 $mode $prefix source mk_mean_script.sh hessenberg $1 11 100 300 1000 $mode $prefix diff --git a/bench/btl/libs/eigen2/eigen2_interface.hh b/bench/btl/libs/eigen2/eigen2_interface.hh index a8b5b884f..d8d1a607f 
100644 --- a/bench/btl/libs/eigen2/eigen2_interface.hh +++ b/bench/btl/libs/eigen2/eigen2_interface.hh @@ -166,7 +166,7 @@ public : } static EIGEN_DONT_INLINE void rot(gene_vector & A, gene_vector & B, real c, real s, int N){ - ei_apply_rotation_in_the_plane(A, B, c, s); + ei_apply_rotation_in_the_plane(A, B, PlanarRotation(c,s)); } static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){ @@ -207,15 +207,15 @@ public : } static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int N){ + C = X.fullPivLu().matrixLU(); + } + + static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){ RowVectorXi piv(N); int nb; C = X; ei_partial_lu_inplace(C,piv,nb); - //C = X.lu().matrixLU(); - } - - static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){ - C = X.partialPivLu().matrixLU(); +// C = X.partialPivLu().matrixLU(); } static inline void tridiagonalization(const gene_matrix & X, gene_matrix & C, int N){ From 8d4a0e6753117cbabf80e4b6fa13d3d3b6ba0327 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 26 Feb 2010 14:57:22 +0100 Subject: [PATCH 037/122] fix compilation without openmp --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 2 ++ Eigen/src/Core/products/Parallelizer.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index ca3d1d200..6f7dee743 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -87,6 +87,7 @@ static void run(int rows, int cols, int depth, ei_gemm_pack_lhs pack_lhs; ei_gebp_kernel > gebp; + #ifdef EIGEN_HAS_OPENMP if(info) { // this is the parallel version! @@ -145,6 +146,7 @@ static void run(int rows, int cols, int depth, ei_aligned_stack_delete(Scalar, w, sizeW); } else + #endif { // this is the sequential version! 
Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index e7a940992..ad998572b 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -101,7 +101,7 @@ template void ei_run_parallel_gemm(const Functor& func, int rows, int cols) { #ifndef EIGEN_HAS_OPENMP - func(0,size1, 0,size2); + func(0,rows, 0,cols); #else int threads = omp_get_max_threads(); From ac425090f389e34f9aee71b5957cca529ac74a38 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 26 Feb 2010 14:57:49 +0100 Subject: [PATCH 038/122] BTL: allow to bench real time --- bench/bench_gemm.cpp | 5 +- bench/bench_gemm_blas.cpp | 109 ++++++++++++++++++ bench/btl/generic_bench/bench_parameter.hh | 2 +- bench/btl/generic_bench/btl.hh | 21 +++- .../timers/portable_perf_analyzer.hh | 2 +- .../generic_bench/timers/portable_timer.hh | 44 ++----- 6 files changed, 142 insertions(+), 41 deletions(-) create mode 100644 bench/bench_gemm_blas.cpp diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp index d958cc1bf..c7a3db619 100644 --- a/bench/bench_gemm.cpp +++ b/bench/bench_gemm.cpp @@ -3,6 +3,7 @@ // icpc bench_gemm.cpp -I .. 
-O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out #include + #include using namespace std; @@ -68,10 +69,10 @@ void gemm(const M& a, const M& b, M& c) int main(int argc, char ** argv) { - int rep = 1; // number of repetitions per try + int rep = 2048; // number of repetitions per try int tries = 5; // number of tries, we keep the best - int s = 2048; + int s = 512; int m = s; int n = s; int p = s; diff --git a/bench/bench_gemm_blas.cpp b/bench/bench_gemm_blas.cpp new file mode 100644 index 000000000..254302312 --- /dev/null +++ b/bench/bench_gemm_blas.cpp @@ -0,0 +1,109 @@ + +#include +#include + +extern "C" +{ + #include + #include + + void sgemm_kernel(int actual_mc, int cols, int actual_kc, float alpha, + float* blockA, float* blockB, float* res, int resStride); + + void sgemm_otcopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB); + void sgemm_oncopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB); + void sgemm_itcopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB); + void sgemm_incopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB); +} + +using namespace std; +using namespace Eigen; + +#ifndef SCALAR +#define SCALAR float +#endif + +typedef SCALAR Scalar; +typedef Matrix M; + +static float fone = 1; +static float fzero = 0; +static double done = 1; +static double szero = 0; +static char notrans = 'N'; +static char trans = 'T'; +static char nonunit = 'N'; +static char lower = 'L'; +static char right = 'R'; +static int intone = 1; + +void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c) +{ + int M = c.rows(); + int N = c.cols(); + int K = a.cols(); + + int lda = a.rows(); + int ldb = b.rows(); + int ldc = c.rows(); + +// c.noalias() += a * b; + sgemm_(¬rans,¬rans,&M,&N,&K,&fone, + const_cast(a.data()),&lda, + const_cast(b.data()),&ldb,&fone, + c.data(),&ldc); +} + +void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c) +{ + int M = 
c.rows(); + int N = c.cols(); + int K = a.cols(); + + int lda = a.rows(); + int ldb = b.rows(); + int ldc = c.rows(); + +// c.noalias() += a * b; + + dgemm_(¬rans,¬rans,&M,&N,&K,&done, + const_cast(a.data()),&lda, + const_cast(b.data()),&ldb,&done, + c.data(),&ldc); +} + +int main(int argc, char **argv) +{ + int rep = 1; + int s = 2048; + int m = s; + int n = s; + int p = s; + const int N = 1; + M a[N]; + M b[N]; + M c[N]; + + for (int k=0; k m_selectedActionNames; diff --git a/bench/btl/generic_bench/timers/portable_perf_analyzer.hh b/bench/btl/generic_bench/timers/portable_perf_analyzer.hh index 6b1f8e7d7..5c337471e 100644 --- a/bench/btl/generic_bench/timers/portable_perf_analyzer.hh +++ b/bench/btl/generic_bench/timers/portable_perf_analyzer.hh @@ -53,7 +53,7 @@ public: } // optimize - for (int i=1; i Date: Fri, 26 Feb 2010 09:03:13 -0500 Subject: [PATCH 039/122] * add VERIFY_IS_EQUAL, should compile faster and it's natural when no arithmetic is involved. * rename 'submatrices' test to 'block' * add block-inside-of-block tests * remove old cruft * split diagonal() tests into separate file --- Eigen/src/Core/Assign.h | 2 +- Eigen/src/Core/Block.h | 2 +- Eigen/src/Core/Matrix.h | 2 +- test/CMakeLists.txt | 3 +- test/{submatrices.cpp => block.cpp} | 140 +++++++++++++++------------- test/diagonal.cpp | 81 ++++++++++++++++ test/main.h | 36 +++++++ 7 files changed, 199 insertions(+), 67 deletions(-) rename test/{submatrices.cpp => block.cpp} (61%) create mode 100644 test/diagonal.cpp diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h index 99d497449..38c68778f 100644 --- a/Eigen/src/Core/Assign.h +++ b/Eigen/src/Core/Assign.h @@ -2,7 +2,7 @@ // for linear algebra. 
// // Copyright (C) 2007 Michael Olbrich -// Copyright (C) 2006-2008 Benoit Jacob +// Copyright (C) 2006-2010 Benoit Jacob // Copyright (C) 2008 Gael Guennebaud // // Eigen is free software; you can redistribute it and/or diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index d3c4dfa99..e6cfb0859 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -2,7 +2,7 @@ // for linear algebra. // // Copyright (C) 2008 Gael Guennebaud -// Copyright (C) 2006-2008 Benoit Jacob +// Copyright (C) 2006-2010 Benoit Jacob // // Eigen is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index eae2711f4..dc1be9ea2 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2006-2008 Benoit Jacob +// Copyright (C) 2006-2010 Benoit Jacob // Copyright (C) 2008-2009 Gael Guennebaud // // Eigen is free software; you can redistribute it and/or diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b0da2a1d8..c6b359ce9 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -104,12 +104,13 @@ ei_add_test(cwiseop) ei_add_test(unalignedcount) ei_add_test(redux) ei_add_test(visitor) +ei_add_test(block) ei_add_test(product_small) ei_add_test(product_large) ei_add_test(product_extra) ei_add_test(diagonalmatrices) ei_add_test(adjoint) -ei_add_test(submatrices) +ei_add_test(diagonal) ei_add_test(miscmatrices) ei_add_test(commainitializer) ei_add_test(smallvectors) diff --git a/test/submatrices.cpp b/test/block.cpp similarity index 61% rename from test/submatrices.cpp rename to test/block.cpp index e71c28ceb..c180afb75 100644 --- a/test/submatrices.cpp +++ b/test/block.cpp @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. 
// -// Copyright (C) 2006-2008 Benoit Jacob +// Copyright (C) 2006-2010 Benoit Jacob // // Eigen is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public @@ -51,16 +51,18 @@ template struct CheckMinor CheckMinor(MatrixType&, int, int) {} }; -template void submatrices(const MatrixType& m) +template void block(const MatrixType& m) { /* this test covers the following files: - Row.h Column.h Block.h Minor.h DiagonalCoeffs.h + Row.h Column.h Block.h Minor.h */ typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; typedef Matrix VectorType; typedef Matrix RowVectorType; - typedef Matrix SquareMatrixType; + typedef Matrix DynamicMatrixType; + typedef Matrix DynamicVectorType; + int rows = m.rows(); int cols = m.cols(); @@ -69,8 +71,6 @@ template void submatrices(const MatrixType& m) m3(rows, cols), mzero = MatrixType::Zero(rows, cols), ones = MatrixType::Ones(rows, cols); - SquareMatrixType identity = SquareMatrixType::Identity(rows, rows), - square = SquareMatrixType::Random(rows, rows); VectorType v1 = VectorType::Random(rows), v2 = VectorType::Random(rows), v3 = VectorType::Random(rows), @@ -84,9 +84,7 @@ template void submatrices(const MatrixType& m) int c2 = ei_random(c1,cols-1); //check row() and col() - VERIFY_IS_APPROX(m1.col(c1).transpose(), m1.transpose().row(c1)); - // FIXME perhaps we should re-enable that without the .eval() - VERIFY_IS_APPROX(m1.col(c1).dot(square.row(r1)), (square * m1.conjugate()).eval()(r1,c1)); + VERIFY_IS_EQUAL(m1.col(c1).transpose(), m1.transpose().row(c1)); //check operator(), both constant and non-constant, on row() and col() m1.row(r1) += s1 * m1.row(r2); m1.col(c1) += s1 * m1.col(c2); @@ -96,9 +94,9 @@ template void submatrices(const MatrixType& m) RowVectorType br1(m1.block(r1,0,1,cols)); VectorType bc1(m1.block(0,c1,rows,1)); - VERIFY_IS_APPROX(b1, m1.block(r1,c1,1,1)); - VERIFY_IS_APPROX(m1.row(r1), br1); - 
VERIFY_IS_APPROX(m1.col(c1), bc1); + VERIFY_IS_EQUAL(b1, m1.block(r1,c1,1,1)); + VERIFY_IS_EQUAL(m1.row(r1), br1); + VERIFY_IS_EQUAL(m1.col(c1), bc1); //check operator(), both constant and non-constant, on block() m1.block(r1,c1,r2-r1+1,c2-c1+1) = s1 * m2.block(0, 0, r2-r1+1,c2-c1+1); m1.block(r1,c1,r2-r1+1,c2-c1+1)(r2-r1,c2-c1) = m2.block(0, 0, r2-r1+1,c2-c1+1)(0,0); @@ -106,11 +104,6 @@ template void submatrices(const MatrixType& m) //check minor() CheckMinor checkminor(m1,r1,c1); - //check diagonal() - VERIFY_IS_APPROX(m1.diagonal(), m1.transpose().diagonal()); - m2.diagonal() = 2 * m1.diagonal(); - m2.diagonal()[0] *= 3; - const int BlockRows = EIGEN_ENUM_MIN(MatrixType::RowsAtCompileTime,2); const int BlockCols = EIGEN_ENUM_MIN(MatrixType::ColsAtCompileTime,5); if (rows>=5 && cols>=8) @@ -121,45 +114,23 @@ template void submatrices(const MatrixType& m) m1.template block(1,1)(0, 3) = m1.template block<2,5>(1,1)(1,2); // check that fixed block() and block() agree Matrix b = m1.template block(3,3); - VERIFY_IS_APPROX(b, m1.block(3,3,BlockRows,BlockCols)); + VERIFY_IS_EQUAL(b, m1.block(3,3,BlockRows,BlockCols)); } if (rows>2) { // test sub vectors - VERIFY_IS_APPROX(v1.template head<2>(), v1.block(0,0,2,1)); - VERIFY_IS_APPROX(v1.template head<2>(), v1.head(2)); - VERIFY_IS_APPROX(v1.template head<2>(), v1.segment(0,2)); - VERIFY_IS_APPROX(v1.template head<2>(), v1.template segment<2>(0)); + VERIFY_IS_EQUAL(v1.template head<2>(), v1.block(0,0,2,1)); + VERIFY_IS_EQUAL(v1.template head<2>(), v1.head(2)); + VERIFY_IS_EQUAL(v1.template head<2>(), v1.segment(0,2)); + VERIFY_IS_EQUAL(v1.template head<2>(), v1.template segment<2>(0)); int i = rows-2; - VERIFY_IS_APPROX(v1.template tail<2>(), v1.block(i,0,2,1)); - VERIFY_IS_APPROX(v1.template tail<2>(), v1.tail(2)); - VERIFY_IS_APPROX(v1.template tail<2>(), v1.segment(i,2)); - VERIFY_IS_APPROX(v1.template tail<2>(), v1.template segment<2>(i)); + VERIFY_IS_EQUAL(v1.template tail<2>(), v1.block(i,0,2,1)); + 
VERIFY_IS_EQUAL(v1.template tail<2>(), v1.tail(2)); + VERIFY_IS_EQUAL(v1.template tail<2>(), v1.segment(i,2)); + VERIFY_IS_EQUAL(v1.template tail<2>(), v1.template segment<2>(i)); i = ei_random(0,rows-2); - VERIFY_IS_APPROX(v1.segment(i,2), v1.template segment<2>(i)); - - enum { - N1 = MatrixType::RowsAtCompileTime>1 ? 1 : 0, - N2 = MatrixType::RowsAtCompileTime>2 ? -2 : 0 - }; - - // check sub/super diagonal - m2.template diagonal() = 2 * m1.template diagonal(); - m2.template diagonal()[0] *= 3; - VERIFY_IS_APPROX(m2.template diagonal()[0], static_cast(6) * m1.template diagonal()[0]); - - m2.template diagonal() = 2 * m1.template diagonal(); - m2.template diagonal()[0] *= 3; - VERIFY_IS_APPROX(m2.template diagonal()[0], static_cast(6) * m1.template diagonal()[0]); - - m2.diagonal(N1) = 2 * m1.diagonal(N1); - m2.diagonal(N1)[0] *= 3; - VERIFY_IS_APPROX(m2.diagonal(N1)[0], static_cast(6) * m1.diagonal(N1)[0]); - - m2.diagonal(N2) = 2 * m1.diagonal(N2); - m2.diagonal(N2)[0] *= 3; - VERIFY_IS_APPROX(m2.diagonal(N2)[0], static_cast(6) * m1.diagonal(N2)[0]); + VERIFY_IS_EQUAL(v1.segment(i,2), v1.template segment<2>(i)); } // stress some basic stuffs with block matrices @@ -168,6 +139,49 @@ template void submatrices(const MatrixType& m) VERIFY(ei_real(ones.col(c1).dot(ones.col(c2))) == RealScalar(rows)); VERIFY(ei_real(ones.row(r1).dot(ones.row(r2))) == RealScalar(cols)); + + // now test some block-inside-of-block. 
+ + // expressions with direct access + VERIFY_IS_EQUAL( (m1.block(r1,c1,rows-r1,cols-c1).block(r2-r1,c2-c1,rows-r2,cols-c2)) , (m1.block(r2,c2,rows-r2,cols-c2)) ); + VERIFY_IS_EQUAL( (m1.block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , (m1.row(r1).segment(c1,c2-c1+1)) ); + VERIFY_IS_EQUAL( (m1.block(r1,c1,r2-r1+1,c2-c1+1).col(0)) , (m1.col(c1).segment(r1,r2-r1+1)) ); + VERIFY_IS_EQUAL( (m1.block(r1,c1,r2-r1+1,c2-c1+1).transpose().col(0)) , (m1.row(r1).segment(c1,c2-c1+1)) ); + VERIFY_IS_EQUAL( (m1.transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0)) , (m1.row(r1).segment(c1,c2-c1+1)) ); + + // expressions without direct access + VERIFY_IS_EQUAL( ((m1+m2).block(r1,c1,rows-r1,cols-c1).block(r2-r1,c2-c1,rows-r2,cols-c2)) , ((m1+m2).block(r2,c2,rows-r2,cols-c2)) ); + VERIFY_IS_EQUAL( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)) ); + VERIFY_IS_EQUAL( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).col(0)) , ((m1+m2).col(c1).segment(r1,r2-r1+1)) ); + VERIFY_IS_EQUAL( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).transpose().col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)) ); + VERIFY_IS_EQUAL( ((m1+m2).transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)) ); + + // evaluation into plain matrices from expressions with direct access (stress MapBase) + DynamicMatrixType dm; + DynamicVectorType dv; + dm.setZero(); + dm = m1.block(r1,c1,rows-r1,cols-c1).block(r2-r1,c2-c1,rows-r2,cols-c2); + VERIFY_IS_EQUAL(dm, (m1.block(r2,c2,rows-r2,cols-c2))); + dm.setZero(); + dv.setZero(); + dm = m1.block(r1,c1,r2-r1+1,c2-c1+1).row(0).transpose(); + dv = m1.row(r1).segment(c1,c2-c1+1); + VERIFY_IS_EQUAL(dv, dm); + dm.setZero(); + dv.setZero(); + dm = m1.col(c1).segment(r1,r2-r1+1); + dv = m1.block(r1,c1,r2-r1+1,c2-c1+1).col(0); + VERIFY_IS_EQUAL(dv, dm); + dm.setZero(); + dv.setZero(); + dm = m1.block(r1,c1,r2-r1+1,c2-c1+1).transpose().col(0); + dv = m1.row(r1).segment(c1,c2-c1+1); + VERIFY_IS_EQUAL(dv, dm); + dm.setZero(); + dv.setZero(); + dm 
= m1.row(r1).segment(c1,c2-c1+1).transpose(); + dv = m1.transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0); + VERIFY_IS_EQUAL(dv, dm); } @@ -185,22 +199,22 @@ void compare_using_data_and_stride(const MatrixType& m) for(int j=0;j()) ); - CALL_SUBTEST_2( submatrices(Matrix4d()) ); - CALL_SUBTEST_3( submatrices(MatrixXcf(3, 3)) ); - CALL_SUBTEST_4( submatrices(MatrixXi(8, 12)) ); - CALL_SUBTEST_5( submatrices(MatrixXcd(20, 20)) ); - CALL_SUBTEST_6( submatrices(MatrixXf(20, 20)) ); + CALL_SUBTEST_1( block(Matrix()) ); + CALL_SUBTEST_2( block(Matrix4d()) ); + CALL_SUBTEST_3( block(MatrixXcf(3, 3)) ); + CALL_SUBTEST_4( block(MatrixXi(8, 12)) ); + CALL_SUBTEST_5( block(MatrixXcd(20, 20)) ); + CALL_SUBTEST_6( block(MatrixXf(20, 20)) ); - CALL_SUBTEST_8( submatrices(Matrix(3, 4)) ); + CALL_SUBTEST_8( block(Matrix(3, 4)) ); #ifndef EIGEN_DEFAULT_TO_ROW_MAJOR CALL_SUBTEST_6( data_and_stride(MatrixXf(ei_random(5,50), ei_random(5,50))) ); diff --git a/test/diagonal.cpp b/test/diagonal.cpp new file mode 100644 index 000000000..288d58c6e --- /dev/null +++ b/test/diagonal.cpp @@ -0,0 +1,81 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2006-2010 Benoit Jacob +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#include "main.h" + +template void diagonal(const MatrixType& m) +{ + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef Matrix VectorType; + typedef Matrix RowVectorType; + int rows = m.rows(); + int cols = m.cols(); + + MatrixType m1 = MatrixType::Random(rows, cols), + m2 = MatrixType::Random(rows, cols); + + //check diagonal() + VERIFY_IS_APPROX(m1.diagonal(), m1.transpose().diagonal()); + m2.diagonal() = 2 * m1.diagonal(); + m2.diagonal()[0] *= 3; + + if (rows>2) + { + enum { + N1 = MatrixType::RowsAtCompileTime>1 ? 1 : 0, + N2 = MatrixType::RowsAtCompileTime>2 ? -2 : 0 + }; + + // check sub/super diagonal + m2.template diagonal() = 2 * m1.template diagonal(); + m2.template diagonal()[0] *= 3; + VERIFY_IS_APPROX(m2.template diagonal()[0], static_cast(6) * m1.template diagonal()[0]); + + m2.template diagonal() = 2 * m1.template diagonal(); + m2.template diagonal()[0] *= 3; + VERIFY_IS_APPROX(m2.template diagonal()[0], static_cast(6) * m1.template diagonal()[0]); + + m2.diagonal(N1) = 2 * m1.diagonal(N1); + m2.diagonal(N1)[0] *= 3; + VERIFY_IS_APPROX(m2.diagonal(N1)[0], static_cast(6) * m1.diagonal(N1)[0]); + + m2.diagonal(N2) = 2 * m1.diagonal(N2); + m2.diagonal(N2)[0] *= 3; + VERIFY_IS_APPROX(m2.diagonal(N2)[0], static_cast(6) * m1.diagonal(N2)[0]); + } +} + +void test_diagonal() +{ + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_1( diagonal(Matrix()) ); + CALL_SUBTEST_2( diagonal(Matrix4d()) ); + CALL_SUBTEST_2( diagonal(MatrixXcf(3, 3)) ); + CALL_SUBTEST_2( diagonal(MatrixXi(8, 12)) ); + CALL_SUBTEST_2( diagonal(MatrixXcd(20, 20)) ); + CALL_SUBTEST_1( diagonal(MatrixXf(21, 19)) ); + CALL_SUBTEST_1( diagonal(Matrix(3, 4)) ); + } +} diff 
--git a/test/main.h b/test/main.h index 96324de33..5ca9395c2 100644 --- a/test/main.h +++ b/test/main.h @@ -157,6 +157,7 @@ namespace Eigen exit(2); \ } } while (0) +#define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b)) #define VERIFY_IS_APPROX(a, b) VERIFY(test_ei_isApprox(a, b)) #define VERIFY_IS_NOT_APPROX(a, b) VERIFY(!test_ei_isApprox(a, b)) #define VERIFY_IS_MUCH_SMALLER_THAN(a, b) VERIFY(test_ei_isMuchSmallerThan(a, b)) @@ -342,6 +343,41 @@ inline bool test_isUnitary(const MatrixBase& m) return m.isUnitary(test_precision::Scalar>()); } +template +struct test_is_equal_impl +{ + static bool run(const Derived1& a1, const Derived2& a2) + { + if(a1.size() != a2.size()) return false; + // we evaluate a2 into a temporary of the shape of a1. this allows to let Assign.h handle the transposing if needed. + typename Derived1::PlainObject a2_evaluated(a2); + for(int i = 0; i < a1.size(); ++i) + if(a1.coeff(i) != a2_evaluated.coeff(i)) return false; + return true; + } +}; + +template +struct test_is_equal_impl +{ + static bool run(const Derived1& a1, const Derived2& a2) + { + if(a1.rows() != a2.rows()) return false; + if(a1.cols() != a2.cols()) return false; + for(int j = 0; j < a1.cols(); ++j) + for(int i = 0; i < a1.rows(); ++i) + if(a1.coeff(i,j) != a2.coeff(i,j)) return false; + return true; + } +}; + +template +bool test_is_equal(const Derived1& a1, const Derived2& a2) +{ + return test_is_equal_impl::run(a1, a2); +} + /** Creates a random Partial Isometry matrix of given rank. * * A partial isometry is a matrix all of whose singular values are either 0 or 1. From c72a5074e681b6680378a2231a8c4270aa7f23db Mon Sep 17 00:00:00 2001 From: nerbonne Date: Fri, 26 Feb 2010 15:46:43 +0100 Subject: [PATCH 040/122] Fixed perf problems for vector subtraction: inlining wasn't always happening when necessary. 
--- Eigen/src/Core/Assign.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h index 9440cebf1..f760b6f6a 100644 --- a/Eigen/src/Core/Assign.h +++ b/Eigen/src/Core/Assign.h @@ -406,7 +406,7 @@ struct ei_unaligned_assign_impl template struct ei_assign_impl { - inline static void run(Derived1 &dst, const Derived2 &src) + EIGEN_STRONG_INLINE static void run(Derived1 &dst, const Derived2 &src) { const int size = dst.size(); const int packetSize = ei_packet_traits::size; From 6924bf2e99749773e1ae93caa2f60a1e3b386a0c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 26 Feb 2010 15:58:22 +0100 Subject: [PATCH 041/122] implement Aron's idea of interleaving the packing with the first computations --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 48 ++++++++++++++----- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 6f7dee743..b6123ca8b 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -77,21 +77,19 @@ static void run(int rows, int cols, int depth, typedef typename ei_packet_traits::type PacketType; typedef ei_product_blocking_traits Blocking; -// int kc = std::min(Blocking::Max_kc,depth); // cache block size along the K direction -// int mc = std::min(Blocking::Max_mc,rows); // cache block size along the M direction - - int kc = std::min(256,depth); // cache block size along the K direction - int mc = std::min(512,rows); // cache block size along the M direction + int kc = std::min(Blocking::Max_kc,depth); // cache block size along the K direction + int mc = std::min(Blocking::Max_mc,rows); // cache block size along the M direction ei_gemm_pack_rhs pack_rhs; ei_gemm_pack_lhs pack_lhs; ei_gebp_kernel > gebp; - #ifdef EIGEN_HAS_OPENMP +#ifdef EIGEN_HAS_OPENMP if(info) { // this is the parallel version! 
int tid = omp_get_thread_num(); + int threads = omp_get_num_threads(); Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); std::size_t sizeW = kc*Blocking::PacketSize*Blocking::nr*8; @@ -109,20 +107,48 @@ static void run(int rows, int cols, int depth, // (==GEMM_VAR1) for(int k=0; k rows of B', and cols of the A' + #ifndef USEGOTOROUTINES pack_rhs(blockB+info[tid].rhs_start*kc, &rhs(k,info[tid].rhs_start), rhsStride, alpha, actual_kc, info[tid].rhs_length); #else sgemm_oncopy(actual_kc, info[tid].rhs_length, &rhs(k,info[tid].rhs_start), rhsStride, blockB+info[tid].rhs_start*kc); #endif - #pragma omp barrier +#if 0 + // this is an attempt to implement a smarter strategy as suggested by Aron + // the layout is good, but there is no synchronization yet + { + const int actual_mc = mc; + // pack to A' + pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, actual_mc); + + // use our current thread's B' part right away, no need to wait for the other threads + sgemm_kernel(actual_mc, info[tid].rhs_length, actual_kc, alpha, blockA, blockB+info[tid].rhs_start*kc, res+info[tid].rhs_start*resStride, resStride); + + for(int shift=1; shift Date: Fri, 26 Feb 2010 20:12:51 -0500 Subject: [PATCH 042/122] Fix Map-with-Stride and cover it by new unit tests. 
--- Eigen/src/Core/DenseStorageBase.h | 3 + Eigen/src/Core/Map.h | 35 +++--- Eigen/src/Core/Stride.h | 30 ++--- Eigen/src/Core/util/Constants.h | 10 +- Eigen/src/Core/util/ForwardDeclarations.h | 2 +- test/CMakeLists.txt | 1 + test/map.cpp | 14 +-- test/mapstride.cpp | 139 ++++++++++++++++++++++ test/vectorization_logic.cpp | 17 +++ 9 files changed, 203 insertions(+), 48 deletions(-) create mode 100644 test/mapstride.cpp diff --git a/Eigen/src/Core/DenseStorageBase.h b/Eigen/src/Core/DenseStorageBase.h index 89e6e7112..12ffd2e43 100644 --- a/Eigen/src/Core/DenseStorageBase.h +++ b/Eigen/src/Core/DenseStorageBase.h @@ -362,6 +362,9 @@ class DenseStorageBase : public _Base * while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned * \a data pointers. * + * These methods do not allow to specify strides. If you need to specify strides, you have to + * use the Map class directly. + * * \see class Map */ //@{ diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index d9ccb1b20..f8b70b866 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -52,11 +52,23 @@ template struct ei_traits > : public ei_traits { + typedef typename MatrixType::Scalar Scalar; enum { + InnerStride = StrideType::InnerStrideAtCompileTime, + OuterStride = StrideType::OuterStrideAtCompileTime, + HasNoInnerStride = InnerStride <= 1, + HasNoOuterStride = OuterStride == 0, + HasNoStride = HasNoInnerStride && HasNoOuterStride, + IsAligned = int(int(Options)&Aligned)==Aligned, + IsDynamicSize = MatrixType::SizeAtCompileTime==Dynamic, + KeepsPacketAccess = bool(HasNoInnerStride) + && ( bool(IsDynamicSize) + || HasNoOuterStride + || ( OuterStride!=Dynamic && ((int(OuterStride)*sizeof(Scalar))%16)==0 ) ), Flags0 = ei_traits::Flags, - Flags1 = ((Options&Aligned)==Aligned ? Flags0 | AlignedBit - : Flags0 & ~AlignedBit), - Flags = int(StrideType::InnerStrideAtCompileTime)==1 ? Flags1 : (Flags1 & ~PacketAccessBit) + Flags1 = IsAligned ? 
int(Flags0) | AlignedBit : int(Flags0) & ~AlignedBit, + Flags2 = HasNoStride ? int(Flags1) : int(Flags1 & ~LinearAccessBit), + Flags = KeepsPacketAccess ? int(Flags2) : (int(Flags2) & ~PacketAccessBit) }; }; @@ -94,23 +106,6 @@ template class Map inline Map(const Scalar* data, int rows, int cols, const StrideType& stride = StrideType()) : Base(data, rows, cols), m_stride(stride) {} -/* - inline void resize(int rows, int cols) - { - EIGEN_ONLY_USED_FOR_DEBUG(rows); - EIGEN_ONLY_USED_FOR_DEBUG(cols); - ei_assert(rows == this->rows()); - ei_assert(cols == this->cols()); - } - - inline void resize(int size) - { - EIGEN_STATIC_ASSERT_VECTOR_ONLY(MatrixType) - EIGEN_ONLY_USED_FOR_DEBUG(size); - ei_assert(size == this->size()); - } -*/ - EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map) protected: diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h index 7982035fd..f04039e7d 100644 --- a/Eigen/src/Core/Stride.h +++ b/Eigen/src/Core/Stride.h @@ -25,7 +25,7 @@ #ifndef EIGEN_STRIDE_H #define EIGEN_STRIDE_H -template +template class Stride { public: @@ -36,45 +36,45 @@ class Stride }; Stride() - : m_inner(InnerStrideAtCompileTime), m_outer(OuterStrideAtCompileTime) + : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime) { ei_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic); } - Stride(int innerStride, int outerStride) - : m_inner(innerStride), m_outer(outerStride) + Stride(int outerStride, int innerStride) + : m_outer(outerStride), m_inner(innerStride) { ei_assert(innerStride>=0 && outerStride>=0); } Stride(const Stride& other) - : m_inner(other.inner()), m_outer(other.outer()) + : m_outer(other.outer()), m_inner(other.inner()) {} - inline int inner() const { return m_inner.value(); } inline int outer() const { return m_outer.value(); } + inline int inner() const { return m_inner.value(); } protected: - ei_int_if_dynamic m_inner; ei_int_if_dynamic m_outer; + ei_int_if_dynamic m_inner; }; -template -class InnerStride : public 
Stride +template +class InnerStride : public Stride<0, Value> { - typedef Stride Base; + typedef Stride<0, Value> Base; public: InnerStride() : Base() {} - InnerStride(int v) : Base(v,0) {} + InnerStride(int v) : Base(0, v) {} }; -template -class OuterStride : public Stride<0, Value> +template +class OuterStride : public Stride { - typedef Stride<0,Value> Base; + typedef Stride Base; public: OuterStride() : Base() {} - OuterStride(int v) : Base(0,v) {} + OuterStride(int v) : Base(v,0) {} }; #endif // EIGEN_STRIDE_H diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index c27c979a6..c167df697 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -86,11 +86,11 @@ const unsigned int EvalBeforeAssigningBit = 0x4; * Long version: means that the coefficients can be handled by packets * and start at a memory location whose alignment meets the requirements * of the present CPU architecture for optimized packet access. In the fixed-size - * case, there is the additional condition that the total size of the coefficients - * array is a multiple of the packet size, so that it is possible to access all the - * coefficients by packets. In the dynamic-size case, there is no such condition - * on the total size, so it might not be possible to access the few last coeffs - * by packets. + * case, there is the additional condition that it be possible to access all the + * coefficients by packets (this implies the requirement that the size be a multiple of 16 bytes, + * and that any nontrivial strides don't break the alignment). In the dynamic-size case, + * there is no such condition on the total size and strides, so it might not be possible to access + * all coeffs by packets. * * \note This bit can be set regardless of whether vectorization is actually enabled. * To check for actual vectorizability, see \a ActualPacketAccessBit. 
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index 6096272fa..8451d0ebe 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -61,7 +61,7 @@ template class DiagonalProduct; template class Diagonal; -template class Stride; +template class Stride; template > class Map; template class TriangularBase; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c6b359ce9..072f63e1d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -115,6 +115,7 @@ ei_add_test(miscmatrices) ei_add_test(commainitializer) ei_add_test(smallvectors) ei_add_test(map) +ei_add_test(mapstride) ei_add_test(array) ei_add_test(array_for_matrix) ei_add_test(array_replicate) diff --git a/test/map.cpp b/test/map.cpp index 603b6159b..acaa8fecc 100644 --- a/test/map.cpp +++ b/test/map.cpp @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2006-2008 Benoit Jacob +// Copyright (C) 2006-2010 Benoit Jacob // // Eigen is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public @@ -42,8 +42,8 @@ template void map_class_vector(const VectorType& m) VectorType ma1 = Map(array1, size); VectorType ma2 = Map(array2, size); VectorType ma3 = Map(array3unaligned, size); - VERIFY_IS_APPROX(ma1, ma2); - VERIFY_IS_APPROX(ma1, ma3); + VERIFY_IS_EQUAL(ma1, ma2); + VERIFY_IS_EQUAL(ma1, ma3); VERIFY_RAISES_ASSERT((Map(array3unaligned, size))); ei_aligned_delete(array1, size); @@ -70,9 +70,9 @@ template void map_class_matrix(const MatrixType& m) Map(array3unaligned, rows, cols) = Map(array1, rows, cols); MatrixType ma1 = Map(array1, rows, cols); MatrixType ma2 = Map(array2, rows, cols); - VERIFY_IS_APPROX(ma1, ma2); + VERIFY_IS_EQUAL(ma1, ma2); MatrixType ma3 = Map(array3unaligned, rows, cols); - VERIFY_IS_APPROX(ma1, ma3); + VERIFY_IS_EQUAL(ma1, ma3); ei_aligned_delete(array1, size); 
ei_aligned_delete(array2, size); @@ -97,8 +97,8 @@ template void map_static_methods(const VectorType& m) VectorType ma1 = VectorType::Map(array1, size); VectorType ma2 = VectorType::MapAligned(array2, size); VectorType ma3 = VectorType::Map(array3unaligned, size); - VERIFY_IS_APPROX(ma1, ma2); - VERIFY_IS_APPROX(ma1, ma3); + VERIFY_IS_EQUAL(ma1, ma2); + VERIFY_IS_EQUAL(ma1, ma3); ei_aligned_delete(array1, size); ei_aligned_delete(array2, size); diff --git a/test/mapstride.cpp b/test/mapstride.cpp new file mode 100644 index 000000000..7a1605681 --- /dev/null +++ b/test/mapstride.cpp @@ -0,0 +1,139 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2010 Benoit Jacob +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . 
+ +#include "main.h" + +template void map_class_vector(const VectorType& m) +{ + typedef typename VectorType::Scalar Scalar; + + int size = m.size(); + + VectorType v = VectorType::Random(size); + + int arraysize = 3*size; + + Scalar* array = ei_aligned_new(arraysize); + + { + Map > map(array, size); + map = v; + for(int i = 0; i < size; ++i) + { + VERIFY(array[3*i] == v[i]); + VERIFY(map[i] == v[i]); + } + } + + { + Map > map(array, size, InnerStride(2)); + map = v; + for(int i = 0; i < size; ++i) + { + VERIFY(array[2*i] == v[i]); + VERIFY(map[i] == v[i]); + } + } + + ei_aligned_delete(array, arraysize); +} + +template void map_class_matrix(const MatrixType& _m) +{ + typedef typename MatrixType::Scalar Scalar; + + int rows = _m.rows(), cols = _m.cols(); + + MatrixType m = MatrixType::Random(rows,cols); + + int arraysize = 2*(rows+4)*(cols+4); + + Scalar* array = ei_aligned_new(arraysize); + + // test no inner stride and some dynamic outer stride + { + Map > map(array, rows, cols, OuterStride(m.innerSize()+1)); + map = m; + VERIFY(map.outerStride() == map.innerSize()+1); + for(int i = 0; i < m.outerSize(); ++i) + for(int j = 0; j < m.innerSize(); ++j) + { + VERIFY(array[map.outerStride()*i+j] == m.coeffByOuterInner(i,j)); + VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j)); + } + } + + // test no inner stride and an outer stride of +4. This is quite important as for fixed-size matrices, + // this allows to hit the special case where it's vectorizable. + { + enum { + InnerSize = MatrixType::InnerSizeAtCompileTime, + OuterStrideAtCompileTime = InnerSize==Dynamic ? 
Dynamic : InnerSize+4 + }; + Map > + map(array, rows, cols, OuterStride(m.innerSize()+4)); + map = m; + VERIFY(map.outerStride() == map.innerSize()+4); + for(int i = 0; i < m.outerSize(); ++i) + for(int j = 0; j < m.innerSize(); ++j) + { + VERIFY(array[map.outerStride()*i+j] == m.coeffByOuterInner(i,j)); + VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j)); + } + } + + // test both inner stride and outer stride + { + Map > map(array, rows, cols, Stride(2*m.innerSize()+1, 2)); + map = m; + VERIFY(map.outerStride() == 2*map.innerSize()+1); + VERIFY(map.innerStride() == 2); + for(int i = 0; i < m.outerSize(); ++i) + for(int j = 0; j < m.innerSize(); ++j) + { + VERIFY(array[map.outerStride()*i+map.innerStride()*j] == m.coeffByOuterInner(i,j)); + VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j)); + } + } + + ei_aligned_delete(array, arraysize); +} + +void test_mapstride() +{ + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_1( map_class_vector(Matrix()) ); + CALL_SUBTEST_2( map_class_vector(Vector4d()) ); + CALL_SUBTEST_3( map_class_vector(RowVector4f()) ); + CALL_SUBTEST_4( map_class_vector(VectorXcf(8)) ); + CALL_SUBTEST_5( map_class_vector(VectorXi(12)) ); + + CALL_SUBTEST_1( map_class_matrix(Matrix()) ); + CALL_SUBTEST_2( map_class_matrix(Matrix4d()) ); + CALL_SUBTEST_3( map_class_matrix(Matrix()) ); + CALL_SUBTEST_3( map_class_matrix(Matrix()) ); + CALL_SUBTEST_4( map_class_matrix(MatrixXcf(ei_random(1,10),ei_random(1,10))) ); + CALL_SUBTEST_5( map_class_matrix(MatrixXi(5,5)));//ei_random(1,10),ei_random(1,10))) ); + } +} diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index 5d86df7b3..ae9911831 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -33,6 +33,14 @@ bool test_assign(const Dst&, const Src&, int traversal, int unrolling) && ei_assign_traits::Unrolling==unrolling; } +template +bool test_assign(int traversal, int unrolling) +{ + ei_assign_traits::debug(); + return 
ei_assign_traits::Traversal==traversal + && ei_assign_traits::Unrolling==unrolling; +} + template bool test_redux(const Xpr&, int traversal, int unrolling) { @@ -86,6 +94,15 @@ void test_vectorization_logic() VERIFY(test_assign(MatrixXf(10,10),MatrixXf(20,20).block(10,10,2,3), SliceVectorizedTraversal,NoUnrolling)); + VERIFY((test_assign< + Map, Aligned, OuterStride<12> >, + Matrix + >(InnerVectorizedTraversal,CompleteUnrolling))); + + VERIFY((test_assign< + Map, Aligned, InnerStride<12> >, + Matrix + >(DefaultTraversal,CompleteUnrolling))); VERIFY(test_redux(VectorXf(10), LinearVectorizedTraversal,NoUnrolling)); From 4927841cba82e9fe2898e4164fa9105436f739a7 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 26 Feb 2010 21:29:04 -0500 Subject: [PATCH 043/122] Document Map and Stride, add examples. --- Eigen/src/Array/Reverse.h | 2 +- Eigen/src/Core/Map.h | 45 ++++++++++++++++++++- Eigen/src/Core/Stride.h | 33 +++++++++++++++ unsupported/doc/examples/MatrixFunction.cpp | 2 +- 4 files changed, 79 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Array/Reverse.h b/Eigen/src/Array/Reverse.h index fe7de53b6..07b9f77b7 100644 --- a/Eigen/src/Array/Reverse.h +++ b/Eigen/src/Array/Reverse.h @@ -81,11 +81,11 @@ template class Reverse typedef typename MatrixType::template MakeBase< Reverse >::Type Base; EIGEN_DENSE_PUBLIC_INTERFACE(Reverse) + using Base::IsRowMajor; protected: enum { PacketSize = ei_packet_traits::size, - IsRowMajor = MatrixType::IsRowMajor, IsColMajor = !IsRowMajor, ReverseRow = (Direction == Vertical) || (Direction == BothDirections), ReverseCol = (Direction == Horizontal) || (Direction == BothDirections), diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index f8b70b866..6e9a5439e 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -33,10 +33,35 @@ * \param MatrixType the equivalent matrix type of the mapped data * \param Options specifies whether the pointer is \c Aligned, or \c Unaligned. * The default is \c Unaligned. 
+ * \param StrideType optionally specifies strides. By default, Map assumes the memory layout + * of an ordinary, contiguous array. This can be overridden by specifying strides. + * The type passed here must be a specialization of the Stride template, see examples below. * * This class represents a matrix or vector expression mapping an existing array of data. * It can be used to let Eigen interface without any overhead with non-Eigen data structures, - * such as plain C arrays or structures from other libraries. + * such as plain C arrays or structures from other libraries. By default, it assumes that the + * data is laid out contiguously in memory. You can however override this by explicitly specifying + * inner and outer strides. + * + * Here's an example of simply mapping a contiguous array as a column-major matrix: + * \include Map_simple.cpp + * Output: \verbinclude Map_simple.out + * + * If you need to map non-contiguous arrays, you can do so by specifying strides: + * + * Here's an example of mapping an array as a vector, specifying an inner stride, that is, the pointer + * increment between two consecutive coefficients. Here, we're specifying the inner stride as a compile-time + * fixed value. + * \include Map_inner_stride.cpp + * Output: \verbinclude Map_inner_stride.out + * + * Here's an example of mapping an array while specifying an outer stride. Here, since we're mapping + * as a column-major matrix, 'outer stride' means the pointer increment between two consecutive columns. + * Here, we're specifying the outer stride as a runtime parameter. + * \include Map_outer_stride.cpp + * Output: \verbinclude Map_outer_stride.out + * + * For more details and for an example of specifying both an inner and an outer stride, see class Stride. * * \b Tip: to change the array of data mapped by a Map object, you can use the C++ * placement new syntax: @@ -97,12 +122,30 @@ template class Map : this->rows(); } + /** Constructor in the fixed-size case. 
+ * + * \param data pointer to the array to map + * \param stride optional Stride object, passing the strides. + */ inline Map(const Scalar* data, const StrideType& stride = StrideType()) : Base(data), m_stride(stride) {} + /** Constructor in the dynamic-size vector case. + * + * \param data pointer to the array to map + * \param size the size of the vector expression + * \param stride optional Stride object, passing the strides. + */ inline Map(const Scalar* data, int size, const StrideType& stride = StrideType()) : Base(data, size), m_stride(stride) {} + /** Constructor in the dynamic-size matrix case. + * + * \param data pointer to the array to map + * \param rows the number of rows of the matrix expression + * \param cols the number of columns of the matrix expression + * \param stride optional Stride object, passing the strides. + */ inline Map(const Scalar* data, int rows, int cols, const StrideType& stride = StrideType()) : Base(data, rows, cols), m_stride(stride) {} diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h index f04039e7d..d960dd2fc 100644 --- a/Eigen/src/Core/Stride.h +++ b/Eigen/src/Core/Stride.h @@ -25,6 +25,32 @@ #ifndef EIGEN_STRIDE_H #define EIGEN_STRIDE_H +/** \class Stride + * + * \brief Holds strides information for Map + * + * This class holds the strides information for mapping arrays with strides with class Map. + * + * It holds two values: the inner stride and the outer stride. + * + * The inner stride is the pointer increment between two consecutive entries within a given row of a + * row-major matrix or within a given column of a column-major matrix. + * + * The outer stride is the pointer increment between two consecutive rows of a row-major matrix or + * between two consecutive columns of a column-major matrix. + * + * These two values can be passed either at compile-time as template parameters, or at runtime as + * arguments to the constructor. 
+ * + * Indeed, this class takes two template parameters: + * \param _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime. + * \param _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime. + * + * \include Map_general_stride.cpp + * Output: \verbinclude Map_general_stride.out + * + * \sa class InnerStride, class OuterStride + */ template class Stride { @@ -35,23 +61,28 @@ class Stride OuterStrideAtCompileTime = _OuterStrideAtCompileTime }; + /** Default constructor, for use when strides are fixed at compile time */ Stride() : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime) { ei_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic); } + /** Constructor allowing to pass the strides at runtime */ Stride(int outerStride, int innerStride) : m_outer(outerStride), m_inner(innerStride) { ei_assert(innerStride>=0 && outerStride>=0); } + /** Copy constructor */ Stride(const Stride& other) : m_outer(other.outer()), m_inner(other.inner()) {} + /** \returns the outer stride */ inline int outer() const { return m_outer.value(); } + /** \returns the inner stride */ inline int inner() const { return m_inner.value(); } protected: @@ -59,6 +90,7 @@ class Stride ei_int_if_dynamic m_inner; }; +/** \brief Convenience specialization of Stride to specify only an inner stride */ template class InnerStride : public Stride<0, Value> { @@ -68,6 +100,7 @@ class InnerStride : public Stride<0, Value> InnerStride(int v) : Base(0, v) {} }; +/** \brief Convenience specialization of Stride to specify only an outer stride */ template class OuterStride : public Stride { diff --git a/unsupported/doc/examples/MatrixFunction.cpp b/unsupported/doc/examples/MatrixFunction.cpp index 075fe7361..9b594cf39 100644 --- a/unsupported/doc/examples/MatrixFunction.cpp +++ b/unsupported/doc/examples/MatrixFunction.cpp @@ -18,5 +18,5 @@ int main() std::cout << "The matrix A is:\n" << A << 
"\n\n"; std::cout << "The matrix exponential of A is:\n" - << ei_matrix_function(A, expfn) << "\n\n"; + << ei_matrix_function(A, expfn) << "\n\n"; } From 814e40c72af6277875179672341204efa0a2c502 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 26 Feb 2010 21:46:43 -0500 Subject: [PATCH 044/122] let redux use the new ByOuterInner accessors --- Eigen/src/Core/Redux.h | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index d5b0c60c2..75297dcb9 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -40,7 +40,7 @@ struct ei_redux_traits private: enum { PacketSize = ei_packet_traits::size, - InnerMaxSize = int(Derived::Flags)&RowMajorBit + InnerMaxSize = int(Derived::IsRowMajor) ? Derived::MaxColsAtCompileTime : Derived::MaxRowsAtCompileTime }; @@ -100,15 +100,15 @@ template struct ei_redux_novec_unroller { enum { - col = Start / Derived::RowsAtCompileTime, - row = Start % Derived::RowsAtCompileTime + outer = Start / Derived::InnerSizeAtCompileTime, + inner = Start % Derived::InnerSizeAtCompileTime }; typedef typename Derived::Scalar Scalar; EIGEN_STRONG_INLINE static Scalar run(const Derived &mat, const Func&) { - return mat.coeff(row, col); + return mat.coeffByOuterInner(outer, inner); } }; @@ -148,12 +148,8 @@ struct ei_redux_vec_unroller { enum { index = Start * ei_packet_traits::size, - row = int(Derived::Flags)&RowMajorBit - ? index / int(Derived::ColsAtCompileTime) - : index % Derived::RowsAtCompileTime, - col = int(Derived::Flags)&RowMajorBit - ? index % int(Derived::ColsAtCompileTime) - : index / Derived::RowsAtCompileTime, + outer = index / int(Derived::InnerSizeAtCompileTime), + inner = index % int(Derived::InnerSizeAtCompileTime), alignment = (Derived::Flags & AlignedBit) ? 
Aligned : Unaligned }; @@ -162,7 +158,7 @@ struct ei_redux_vec_unroller EIGEN_STRONG_INLINE static PacketScalar run(const Derived &mat, const Func&) { - return mat.template packet(row, col); + return mat.template packetByOuterInner(outer, inner); } }; @@ -184,12 +180,12 @@ struct ei_redux_impl { ei_assert(mat.rows()>0 && mat.cols()>0 && "you are using a non initialized matrix"); Scalar res; - res = mat.coeff(0, 0); - for(int i = 1; i < mat.rows(); ++i) - res = func(res, mat.coeff(i, 0)); - for(int j = 1; j < mat.cols(); ++j) - for(int i = 0; i < mat.rows(); ++i) - res = func(res, mat.coeff(i, j)); + res = mat.coeffByOuterInner(0, 0); + for(int i = 1; i < mat.innerSize(); ++i) + res = func(res, mat.coeffByOuterInner(0, i)); + for(int i = 1; i < mat.outerSize(); ++i) + for(int j = 0; j < mat.innerSize(); ++j) + res = func(res, mat.coeffByOuterInner(i, j)); return res; } }; @@ -253,8 +249,7 @@ struct ei_redux_impl const int innerSize = mat.innerSize(); const int outerSize = mat.outerSize(); enum { - packetSize = ei_packet_traits::size, - isRowMajor = Derived::Flags&RowMajorBit?1:0 + packetSize = ei_packet_traits::size }; const int packetedInnerSize = ((innerSize)/packetSize)*packetSize; Scalar res; @@ -263,13 +258,12 @@ struct ei_redux_impl PacketScalar packet_res = mat.template packet(0,0); for(int j=0; j - (isRowMajor?j:i, isRowMajor?i:j)); + packet_res = func.packetOp(packet_res, mat.template packetByOuterInner(j,i)); res = func.predux(packet_res); for(int j=0; j Date: Fri, 26 Feb 2010 22:26:21 -0500 Subject: [PATCH 045/122] add examples --- doc/snippets/Map_general_stride.cpp | 5 +++++ doc/snippets/Map_inner_stride.cpp | 5 +++++ doc/snippets/Map_outer_stride.cpp | 5 +++++ doc/snippets/Map_simple.cpp | 3 +++ 4 files changed, 18 insertions(+) create mode 100644 doc/snippets/Map_general_stride.cpp create mode 100644 doc/snippets/Map_inner_stride.cpp create mode 100644 doc/snippets/Map_outer_stride.cpp create mode 100644 doc/snippets/Map_simple.cpp diff --git 
a/doc/snippets/Map_general_stride.cpp b/doc/snippets/Map_general_stride.cpp new file mode 100644 index 000000000..0657e7f84 --- /dev/null +++ b/doc/snippets/Map_general_stride.cpp @@ -0,0 +1,5 @@ +int array[24]; +for(int i = 0; i < 24; ++i) array[i] = i; +cout << Map > + (array, 3, 3, Stride(8, 2)) + << endl; diff --git a/doc/snippets/Map_inner_stride.cpp b/doc/snippets/Map_inner_stride.cpp new file mode 100644 index 000000000..d95ae9b3e --- /dev/null +++ b/doc/snippets/Map_inner_stride.cpp @@ -0,0 +1,5 @@ +int array[12]; +for(int i = 0; i < 12; ++i) array[i] = i; +cout << Map > + (array, 6) // the inner stride has already been passed as template parameter + << endl; diff --git a/doc/snippets/Map_outer_stride.cpp b/doc/snippets/Map_outer_stride.cpp new file mode 100644 index 000000000..4bedaa508 --- /dev/null +++ b/doc/snippets/Map_outer_stride.cpp @@ -0,0 +1,5 @@ +int array[12]; +for(int i = 0; i < 12; ++i) array[i] = i; +cout << Map > + (array, 3, 3, OuterStride(4)) + << endl; diff --git a/doc/snippets/Map_simple.cpp b/doc/snippets/Map_simple.cpp new file mode 100644 index 000000000..423bb52ad --- /dev/null +++ b/doc/snippets/Map_simple.cpp @@ -0,0 +1,3 @@ +int array[9]; +for(int i = 0; i < 9; ++i) array[i] = i; +cout << Map(array) << endl; From d9f638049994b90ed388c68c8a0ab7efc2e5615c Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sat, 27 Feb 2010 10:03:27 -0500 Subject: [PATCH 046/122] Remove the dot product's separate implementation and use cwiseProduct.sum instead. Also take special care to get nicely working static assertions. --- Eigen/src/Core/Dot.h | 227 ++------------------ Eigen/src/Core/products/CoeffBasedProduct.h | 20 +- 2 files changed, 18 insertions(+), 229 deletions(-) diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index 201bd23ca..72f6c571d 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. 
// -// Copyright (C) 2006-2008 Benoit Jacob +// Copyright (C) 2006-2008, 2010 Benoit Jacob // // Eigen is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public @@ -25,224 +25,28 @@ #ifndef EIGEN_DOT_H #define EIGEN_DOT_H -/*************************************************************************** -* Part 1 : the logic deciding a strategy for vectorization and unrolling -***************************************************************************/ - -template -struct ei_dot_traits +// helper function for dot(). The problem is that if we put that in the body of dot(), then upon calling dot +// with mismatched types, the compiler emits errors about failing to instantiate cwiseProduct BEFORE +// looking at the static assertions. Thus this is a trick to get better compile errors. +template::ret> +struct ei_dot_nocheck { -public: - enum { - Traversal = (int(Derived1::Flags)&int(Derived2::Flags)&ActualPacketAccessBit) - && (int(Derived1::Flags)&int(Derived2::Flags)&LinearAccessBit) - ? LinearVectorizedTraversal - : DefaultTraversal - }; - -private: - typedef typename Derived1::Scalar Scalar; - enum { - PacketSize = ei_packet_traits::size, - Cost = Derived1::SizeAtCompileTime * (Derived1::CoeffReadCost + Derived2::CoeffReadCost + NumTraits::MulCost) - + (Derived1::SizeAtCompileTime-1) * NumTraits::AddCost, - UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize)) - }; - -public: - enum { - Unrolling = Cost <= UnrollingLimit - ? 
CompleteUnrolling - : NoUnrolling - }; -}; - -/*************************************************************************** -* Part 2 : unrollers -***************************************************************************/ - -/*** no vectorization ***/ - -template -struct ei_dot_novec_unroller -{ - enum { - HalfLength = Length/2 - }; - - typedef typename Derived1::Scalar Scalar; - - inline static Scalar run(const Derived1& v1, const Derived2& v2) + static inline typename ei_traits::Scalar run(const MatrixBase& a, const MatrixBase& b) { - return ei_dot_novec_unroller::run(v1, v2) - + ei_dot_novec_unroller::run(v1, v2); + return a.conjugate().cwiseProduct(b).sum(); } }; -template -struct ei_dot_novec_unroller +template +struct ei_dot_nocheck { - typedef typename Derived1::Scalar Scalar; - - inline static Scalar run(const Derived1& v1, const Derived2& v2) + static inline typename ei_traits::Scalar run(const MatrixBase&, const MatrixBase&) { - return ei_conj(v1.coeff(Start)) * v2.coeff(Start); + return typename ei_traits::Scalar(0); } }; -/*** vectorization ***/ - -template::size)> -struct ei_dot_vec_unroller -{ - typedef typename Derived1::Scalar Scalar; - typedef typename ei_packet_traits::type PacketScalar; - - enum { - row1 = Derived1::RowsAtCompileTime == 1 ? 0 : Index, - col1 = Derived1::RowsAtCompileTime == 1 ? Index : 0, - row2 = Derived2::RowsAtCompileTime == 1 ? 0 : Index, - col2 = Derived2::RowsAtCompileTime == 1 ? Index : 0 - }; - - inline static PacketScalar run(const Derived1& v1, const Derived2& v2) - { - return ei_pmadd( - v1.template packet(row1, col1), - v2.template packet(row2, col2), - ei_dot_vec_unroller::size, Stop>::run(v1, v2) - ); - } -}; - -template -struct ei_dot_vec_unroller -{ - enum { - row1 = Derived1::RowsAtCompileTime == 1 ? 0 : Index, - col1 = Derived1::RowsAtCompileTime == 1 ? Index : 0, - row2 = Derived2::RowsAtCompileTime == 1 ? 0 : Index, - col2 = Derived2::RowsAtCompileTime == 1 ? 
Index : 0, - alignment1 = (Derived1::Flags & AlignedBit) ? Aligned : Unaligned, - alignment2 = (Derived2::Flags & AlignedBit) ? Aligned : Unaligned - }; - - typedef typename Derived1::Scalar Scalar; - typedef typename ei_packet_traits::type PacketScalar; - - inline static PacketScalar run(const Derived1& v1, const Derived2& v2) - { - return ei_pmul(v1.template packet(row1, col1), v2.template packet(row2, col2)); - } -}; - -/*************************************************************************** -* Part 3 : implementation of all cases -***************************************************************************/ - -template::Traversal, - int Unrolling = ei_dot_traits::Unrolling -> -struct ei_dot_impl; - -template -struct ei_dot_impl -{ - typedef typename Derived1::Scalar Scalar; - static Scalar run(const Derived1& v1, const Derived2& v2) - { - ei_assert(v1.size()>0 && "you are using a non initialized vector"); - Scalar res; - res = ei_conj(v1.coeff(0)) * v2.coeff(0); - for(int i = 1; i < v1.size(); ++i) - res += ei_conj(v1.coeff(i)) * v2.coeff(i); - return res; - } -}; - -template -struct ei_dot_impl - : public ei_dot_novec_unroller -{}; - -template -struct ei_dot_impl -{ - typedef typename Derived1::Scalar Scalar; - typedef typename ei_packet_traits::type PacketScalar; - - static Scalar run(const Derived1& v1, const Derived2& v2) - { - const int size = v1.size(); - const int packetSize = ei_packet_traits::size; - const int alignedSize = (size/packetSize)*packetSize; - enum { - alignment1 = (Derived1::Flags & AlignedBit) ? Aligned : Unaligned, - alignment2 = (Derived2::Flags & AlignedBit) ? 
Aligned : Unaligned - }; - Scalar res; - - // do the vectorizable part of the sum - if(size >= packetSize) - { - PacketScalar packet_res = ei_pmul( - v1.template packet(0), - v2.template packet(0) - ); - for(int index = packetSize; index(index), - v2.template packet(index), - packet_res - ); - } - res = ei_predux(packet_res); - - // now we must do the rest without vectorization. - if(alignedSize == size) return res; - } - else // too small to vectorize anything. - // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize. - { - res = Scalar(0); - } - - // do the remainder of the vector - for(int index = alignedSize; index < size; ++index) - { - res += v1.coeff(index) * v2.coeff(index); - } - - return res; - } -}; - -template -struct ei_dot_impl -{ - typedef typename Derived1::Scalar Scalar; - typedef typename ei_packet_traits::type PacketScalar; - enum { - PacketSize = ei_packet_traits::size, - Size = Derived1::SizeAtCompileTime, - VectorizedSize = (Size / PacketSize) * PacketSize - }; - static Scalar run(const Derived1& v1, const Derived2& v2) - { - Scalar res = ei_predux(ei_dot_vec_unroller::run(v1, v2)); - if (VectorizedSize != Size) - res += ei_dot_novec_unroller::run(v1, v2); - return res; - } -}; - -/*************************************************************************** -* Part 4 : implementation of MatrixBase methods -***************************************************************************/ - /** \returns the dot product of *this with other. 
* * \only_for_vectors @@ -266,10 +70,7 @@ MatrixBase::dot(const MatrixBase& other) const ei_assert(size() == other.size()); - // dot() must honor EvalBeforeNestingBit (eg: v.dot(M*v) ) - typedef typename ei_cleantype::type ThisNested; - typedef typename ei_cleantype::type OtherNested; - return ei_dot_impl::run(derived(), other.derived()); + return ei_dot_nocheck::run(*this, other); } /** \returns the squared \em l2 norm of *this, i.e., for vectors, the dot product of *this with itself. diff --git a/Eigen/src/Core/products/CoeffBasedProduct.h b/Eigen/src/Core/products/CoeffBasedProduct.h index 3343b1875..e8016e915 100644 --- a/Eigen/src/Core/products/CoeffBasedProduct.h +++ b/Eigen/src/Core/products/CoeffBasedProduct.h @@ -305,10 +305,7 @@ struct ei_product_coeff_vectorized_dyn_selector { EIGEN_STRONG_INLINE static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) { - res = ei_dot_impl< - Block::ColsAtCompileTime>, - Block::RowsAtCompileTime, 1>, - LinearVectorizedTraversal, NoUnrolling>::run(lhs.row(row), rhs.col(col)); + res = lhs.row(row).cwiseProduct(rhs.col(col)).sum(); } }; @@ -319,10 +316,7 @@ struct ei_product_coeff_vectorized_dyn_selector { EIGEN_STRONG_INLINE static void run(int /*row*/, int col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) { - res = ei_dot_impl< - Lhs, - Block::RowsAtCompileTime, 1>, - LinearVectorizedTraversal, NoUnrolling>::run(lhs, rhs.col(col)); + res = lhs.cwiseProduct(rhs.col(col)).sum(); } }; @@ -331,10 +325,7 @@ struct ei_product_coeff_vectorized_dyn_selector { EIGEN_STRONG_INLINE static void run(int row, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) { - res = ei_dot_impl< - Block::ColsAtCompileTime>, - Rhs, - LinearVectorizedTraversal, NoUnrolling>::run(lhs.row(row), rhs); + res = lhs.row(row).cwiseProduct(rhs).sum(); } }; @@ -343,10 +334,7 @@ struct ei_product_coeff_vectorized_dyn_selector { EIGEN_STRONG_INLINE static void run(int /*row*/, int /*col*/, 
const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) { - res = ei_dot_impl< - Lhs, - Rhs, - LinearVectorizedTraversal, NoUnrolling>::run(lhs, rhs); + res = lhs.cwiseProduct(rhs).sum(); } }; From 15a33622acfb195935adf190508d7e9e8238c4ee Mon Sep 17 00:00:00 2001 From: Thomas Capricelli Date: Sat, 27 Feb 2010 16:30:15 +0100 Subject: [PATCH 047/122] * define COMPARE(,), which prints expected/actual results in case of failure * use it in test/NonLinearOptimization.cpp --- test/main.h | 7 ++ unsupported/test/NonLinearOptimization.cpp | 94 +++++++++++----------- 2 files changed, 54 insertions(+), 47 deletions(-) diff --git a/test/main.h b/test/main.h index 5ca9395c2..f4cfa11c5 100644 --- a/test/main.h +++ b/test/main.h @@ -157,6 +157,13 @@ namespace Eigen exit(2); \ } } while (0) +// Use COMPARE for exact comparison of scalar values (mostly, int) +#define COMPARE(actual, expected) do { if (actual!=expected) { \ + std::cerr << "Test " << g_test_stack.back() << ". Comparison failed in "EI_PP_MAKE_STRING(__FILE__) << " (" << EI_PP_MAKE_STRING(__LINE__) << ")" \ + << std::endl << " actual = " << actual \ + << std::endl << " expected = " << expected << std::endl << std::endl; \ + exit(2); \ + } } while (0) #define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b)) #define VERIFY_IS_APPROX(a, b) VERIFY(test_ei_isApprox(a, b)) #define VERIFY_IS_NOT_APPROX(a, b) VERIFY(!test_ei_isApprox(a, b)) diff --git a/unsupported/test/NonLinearOptimization.cpp b/unsupported/test/NonLinearOptimization.cpp index 7aea7b361..38d7b7766 100644 --- a/unsupported/test/NonLinearOptimization.cpp +++ b/unsupported/test/NonLinearOptimization.cpp @@ -172,9 +172,9 @@ void testLmder1() info = lm.lmder1(x); // check return value - VERIFY( 1 == info); - VERIFY(lm.nfev==6); - VERIFY(lm.njev==5); + COMPARE(info, 1); + COMPARE(lm.nfev, 6); + COMPARE(lm.njev, 5); // check norm VERIFY_IS_APPROX(lm.fvec.blueNorm(), 0.09063596); @@ -201,9 +201,9 @@ void testLmder() info = lm.minimize(x); // check return values 
- VERIFY( 1 == info); - VERIFY(lm.nfev==6); - VERIFY(lm.njev==5); + COMPARE(info, 1); + COMPARE(lm.nfev, 6); + COMPARE(lm.njev, 5); // check norm fnorm = lm.fvec.blueNorm(); @@ -286,7 +286,7 @@ void testHybrj1() info = solver.hybrj1(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY(solver.nfev==11); VERIFY(solver.njev==1); @@ -321,7 +321,7 @@ void testHybrj() info = solver.solve(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY(solver.nfev==11); VERIFY(solver.njev==1); @@ -375,7 +375,7 @@ void testHybrd1() info = solver.hybrd1(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY(solver.nfev==20); // check norm @@ -406,7 +406,7 @@ void testHybrd() info = solver.solveNumericalDiff(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY(solver.nfev==14); // check norm @@ -477,9 +477,9 @@ void testLmstr1() info = lm.lmstr1(x); // check return value - VERIFY( 1 == info); - VERIFY(lm.nfev==6); - VERIFY(lm.njev==5); + COMPARE(info, 1); + COMPARE(lm.nfev, 6); + COMPARE(lm.njev, 5); // check norm VERIFY_IS_APPROX(lm.fvec.blueNorm(), 0.09063596); @@ -506,9 +506,9 @@ void testLmstr() info = lm.minimizeOptimumStorage(x); // check return values - VERIFY( 1 == info); - VERIFY(lm.nfev==6); - VERIFY(lm.njev==5); + COMPARE(info, 1); + COMPARE(lm.nfev, 6); + COMPARE(lm.njev, 5); // check norm fnorm = lm.fvec.blueNorm(); @@ -562,7 +562,7 @@ void testLmdif1() info = LevenbergMarquardt::lmdif1(functor, x, &nfev); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY(nfev==26); // check norm @@ -593,8 +593,8 @@ void testLmdif() info = lm.minimize(x); // check return values - VERIFY( 1 == info); - VERIFY(lm.nfev==26); + COMPARE(info, 1); + COMPARE(lm.nfev, 26); // check norm fnorm = lm.fvec.blueNorm(); @@ -678,7 +678,7 @@ void testNistChwirut2(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 10 == lm.nfev); VERIFY( 8 == lm.njev); 
// check norm^2 @@ -699,7 +699,7 @@ void testNistChwirut2(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 7 == lm.nfev); VERIFY( 6 == lm.njev); // check norm^2 @@ -758,7 +758,7 @@ void testNistMisra1a(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 19 == lm.nfev); VERIFY( 15 == lm.njev); // check norm^2 @@ -775,7 +775,7 @@ void testNistMisra1a(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 5 == lm.nfev); VERIFY( 4 == lm.njev); // check norm^2 @@ -844,7 +844,7 @@ void testNistHahn1(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 11== lm.nfev); VERIFY( 10== lm.njev); // check norm^2 @@ -866,7 +866,7 @@ void testNistHahn1(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 11 == lm.nfev); VERIFY( 10 == lm.njev); // check norm^2 @@ -930,7 +930,7 @@ void testNistMisra1d(void) info = lm.minimize(x); // check return value - VERIFY( 3 == info); + COMPARE(info, 3); VERIFY( 9 == lm.nfev); VERIFY( 7 == lm.njev); // check norm^2 @@ -947,7 +947,7 @@ void testNistMisra1d(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 4 == lm.nfev); VERIFY( 3 == lm.njev); // check norm^2 @@ -1008,7 +1008,7 @@ void testNistLanczos1(void) info = lm.minimize(x); // check return value - VERIFY( 2 == info); + COMPARE(info, 2); VERIFY( 79 == lm.nfev); VERIFY( 72 == lm.njev); // check norm^2 @@ -1029,7 +1029,7 @@ void testNistLanczos1(void) info = lm.minimize(x); // check return value - VERIFY( 2 == info); + COMPARE(info, 2); VERIFY( 9 == lm.nfev); VERIFY( 8 == lm.njev); // check norm^2 @@ -1094,7 +1094,7 @@ void testNistRat42(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 10 == lm.nfev); VERIFY( 8 == lm.njev); // check norm^2 @@ -1112,7 
+1112,7 @@ void testNistRat42(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 6 == lm.nfev); VERIFY( 5 == lm.njev); // check norm^2 @@ -1172,7 +1172,7 @@ void testNistMGH10(void) info = lm.minimize(x); // check return value - VERIFY( 2 == info); + COMPARE(info, 2); VERIFY( 284 == lm.nfev); VERIFY( 249 == lm.njev); // check norm^2 @@ -1190,7 +1190,7 @@ void testNistMGH10(void) info = lm.minimize(x); // check return value - VERIFY( 3 == info); + COMPARE(info, 3); VERIFY( 126 == lm.nfev); VERIFY( 116 == lm.njev); // check norm^2 @@ -1251,7 +1251,7 @@ void testNistBoxBOD(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 31 == lm.nfev); VERIFY( 25 == lm.njev); // check norm^2 @@ -1271,7 +1271,7 @@ void testNistBoxBOD(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 15 == lm.nfev); VERIFY( 14 == lm.njev); // check norm^2 @@ -1333,7 +1333,7 @@ void testNistMGH17(void) info = lm.minimize(x); // check return value - VERIFY( 2 == info); + COMPARE(info, 2); VERIFY( 602 == lm.nfev); VERIFY( 545 == lm.njev); // check norm^2 @@ -1354,7 +1354,7 @@ void testNistMGH17(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 18 == lm.nfev); VERIFY( 15 == lm.njev); // check norm^2 @@ -1420,7 +1420,7 @@ void testNistMGH09(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 490 == lm.nfev); VERIFY( 376 == lm.njev); // check norm^2 @@ -1440,7 +1440,7 @@ void testNistMGH09(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 18 == lm.nfev); VERIFY( 16 == lm.njev); // check norm^2 @@ -1503,7 +1503,7 @@ void testNistBennett5(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 758 == lm.nfev); VERIFY( 744 == lm.njev); // check norm^2 @@ -1521,7 
+1521,7 @@ void testNistBennett5(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 203 == lm.nfev); VERIFY( 192 == lm.njev); // check norm^2 @@ -1591,7 +1591,7 @@ void testNistThurber(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 39 == lm.nfev); VERIFY( 36== lm.njev); // check norm^2 @@ -1616,7 +1616,7 @@ void testNistThurber(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 29 == lm.nfev); VERIFY( 28 == lm.njev); // check norm^2 @@ -1683,7 +1683,7 @@ void testNistRat43(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 27 == lm.nfev); VERIFY( 20 == lm.njev); // check norm^2 @@ -1705,7 +1705,7 @@ void testNistRat43(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 9 == lm.nfev); VERIFY( 8 == lm.njev); // check norm^2 @@ -1768,7 +1768,7 @@ void testNistEckerle4(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 18 == lm.nfev); VERIFY( 15 == lm.njev); // check norm^2 @@ -1786,7 +1786,7 @@ void testNistEckerle4(void) info = lm.minimize(x); // check return value - VERIFY( 1 == info); + COMPARE(info, 1); VERIFY( 7 == lm.nfev); VERIFY( 6 == lm.njev); // check norm^2 From 3f393490adc2e5ca705b660ffac3465e8200ff5d Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sat, 27 Feb 2010 11:19:14 -0500 Subject: [PATCH 048/122] dot: handle the rowvector.dot(colvector) case where one needs to transpose. 
--- Eigen/src/Core/Dot.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index 72f6c571d..9acc98eba 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -29,7 +29,15 @@ // with mismatched types, the compiler emits errors about failing to instantiate cwiseProduct BEFORE // looking at the static assertions. Thus this is a trick to get better compile errors. template::ret> + bool IsSameType = ei_is_same_type::ret, +// the NeedToTranspose condition here is taken straight from Assign.h + bool NeedToTranspose = T::IsVectorAtCompileTime + && U::IsVectorAtCompileTime + && ((int(T::RowsAtCompileTime) == 1 && int(U::ColsAtCompileTime) == 1) + | // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&". + // revert to || as soon as not needed anymore. + (int(T::ColsAtCompileTime) == 1 && int(U::RowsAtCompileTime) == 1)) +> struct ei_dot_nocheck { static inline typename ei_traits::Scalar run(const MatrixBase& a, const MatrixBase& b) @@ -39,7 +47,16 @@ struct ei_dot_nocheck }; template -struct ei_dot_nocheck +struct ei_dot_nocheck +{ + static inline typename ei_traits::Scalar run(const MatrixBase& a, const MatrixBase& b) + { + return a.adjoint().cwiseProduct(b).sum(); + } +}; + +template +struct ei_dot_nocheck { static inline typename ei_traits::Scalar run(const MatrixBase&, const MatrixBase&) { From 78b2c7e16e2ee0e3ef12f9d203f5d34f9e4cb72e Mon Sep 17 00:00:00 2001 From: Hauke Heibel Date: Sat, 27 Feb 2010 17:24:42 +0100 Subject: [PATCH 049/122] Fixed a typo. 
--- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 18e913b0e..237f5ca17 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -75,8 +75,8 @@ struct ei_gebp_kernel if(nr==4) C7 = ei_ploadu(&res[(j2+3)*resStride + i + PacketSize]); // performs "inner" product - // TODO let's check wether the flowing peeled loop could not be - // optimized via optimal prefetching from one loop to the other + // TODO let's check whether the flowing peeled loop could not be + // optimized via optimal pre-fetching from one loop to the other const Scalar* blB = &blockB[j2*strideB*PacketSize+offsetB*nr]; for(int k=0; k Date: Sat, 27 Feb 2010 17:25:07 +0100 Subject: [PATCH 050/122] Added support for realloc based conservative resizing. --- Eigen/src/Core/DenseStorageBase.h | 66 +++++++++++++------ Eigen/src/Core/Matrix.h | 3 + Eigen/src/Core/MatrixStorage.h | 38 +++++++---- Eigen/src/Core/util/Memory.h | 105 +++++++++++++++++++++++++++++- 4 files changed, 178 insertions(+), 34 deletions(-) diff --git a/Eigen/src/Core/DenseStorageBase.h b/Eigen/src/Core/DenseStorageBase.h index 12ffd2e43..c7f903c7a 100644 --- a/Eigen/src/Core/DenseStorageBase.h +++ b/Eigen/src/Core/DenseStorageBase.h @@ -530,11 +530,21 @@ struct ei_conservative_resize_like_impl { if (_this.rows() == rows && _this.cols() == cols) return; EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived) - typename Derived::PlainObject tmp(rows,cols); - const int common_rows = std::min(rows, _this.rows()); - const int common_cols = std::min(cols, _this.cols()); - tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols); - _this.derived().swap(tmp); + + if ( ( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows + (!Derived::IsRowMajor 
&& _this.rows() == rows) ) // column-major and we change only the number of columns + { + _this.derived().m_storage.conservativeResize(rows*cols,rows,cols); + } + else + { + // The storage order does not allow us to use reallocation. + typename Derived::PlainObject tmp(rows,cols); + const int common_rows = std::min(rows, _this.rows()); + const int common_cols = std::min(cols, _this.cols()); + tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols); + _this.derived().swap(tmp); + } } static void run(DenseBase& _this, const DenseBase& other) @@ -549,11 +559,26 @@ struct ei_conservative_resize_like_impl EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived) EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived) - typename Derived::PlainObject tmp(other); - const int common_rows = std::min(tmp.rows(), _this.rows()); - const int common_cols = std::min(tmp.cols(), _this.cols()); - tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols); - _this.derived().swap(tmp); + if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows + (!Derived::IsRowMajor && _this.rows() == other.rows()) ) // column-major and we change only the number of columns + { + const int new_rows = other.rows() - _this.rows(); + const int new_cols = other.cols() - _this.cols(); + _this.derived().m_storage.conservativeResize(other.size(),other.rows(),other.cols()); + if (new_rows>0) + _this.corner(BottomRight, new_rows, other.cols()) = other.corner(BottomRight, new_rows, other.cols()); + else if (new_cols>0) + _this.corner(BottomRight, other.rows(), new_cols) = other.corner(BottomRight, other.rows(), new_cols); + } + else + { + // The storage order does not allow us to use reallocation. 
+ typename Derived::PlainObject tmp(other); + const int common_rows = std::min(tmp.rows(), _this.rows()); + const int common_cols = std::min(tmp.cols(), _this.cols()); + tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols); + _this.derived().swap(tmp); + } } }; @@ -562,22 +587,23 @@ struct ei_conservative_resize_like_impl { static void run(DenseBase& _this, int size) { - if (_this.size() == size) return; - typename Derived::PlainObject tmp(size); - const int common_size = std::min(_this.size(),size); - tmp.segment(0,common_size) = _this.segment(0,common_size); - _this.derived().swap(tmp); + const int new_rows = Derived::RowsAtCompileTime==1 ? 1 : size; + const int new_cols = Derived::RowsAtCompileTime==1 ? size : 1; + _this.derived().m_storage.conservativeResize(size,new_rows,new_cols); } static void run(DenseBase& _this, const DenseBase& other) { if (_this.rows() == other.rows() && _this.cols() == other.cols()) return; - // segment(...) will check whether Derived/OtherDerived are vectors! - typename Derived::PlainObject tmp(other); - const int common_size = std::min(_this.size(),tmp.size()); - tmp.segment(0,common_size) = _this.segment(0,common_size); - _this.derived().swap(tmp); + const int num_new_elements = other.size() - _this.size(); + + const int new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows(); + const int new_cols = Derived::RowsAtCompileTime==1 ? 
other.cols() : 1; + _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols); + + if (num_new_elements > 0) + _this.tail(num_new_elements) = other.tail(num_new_elements); } }; diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index dc1be9ea2..e7422457c 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -334,6 +334,9 @@ class Matrix #endif protected: + template + friend struct ei_conservative_resize_like_impl; + using Base::m_storage; }; diff --git a/Eigen/src/Core/MatrixStorage.h b/Eigen/src/Core/MatrixStorage.h index 046670452..3303b2663 100644 --- a/Eigen/src/Core/MatrixStorage.h +++ b/Eigen/src/Core/MatrixStorage.h @@ -3,6 +3,7 @@ // // Copyright (C) 2008 Gael Guennebaud // Copyright (C) 2006-2009 Benoit Jacob +// Copyright (C) 2010 Hauke Heibel // // Eigen is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public @@ -92,6 +93,7 @@ template class ei_matr inline void swap(ei_matrix_storage& other) { std::swap(m_data,other.m_data); } inline static int rows(void) {return _Rows;} inline static int cols(void) {return _Cols;} + inline void conservativeResize(int,int,int) {} inline void resize(int,int,int) {} inline const T *data() const { return m_data.array; } inline T *data() { return m_data.array; } @@ -107,6 +109,7 @@ template class ei_matrix_storage inline void swap(ei_matrix_storage& ) {} inline static int rows(void) {return _Rows;} inline static int cols(void) {return _Cols;} + inline void conservativeResize(int,int,int) {} inline void resize(int,int,int) {} inline const T *data() const { return 0; } inline T *data() { return 0; } @@ -127,11 +130,8 @@ template class ei_matrix_storage class ei_matrix_storage< inline void swap(ei_matrix_storage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); } inline int rows(void) const {return m_rows;} inline int cols(void) const {return _Cols;} - inline void resize(int /*size*/, int rows, int) - { 
- m_rows = rows; - } + inline void conservativeResize(int, int rows, int) { m_rows = rows; } + inline void resize(int, int rows, int) { m_rows = rows; } inline const T *data() const { return m_data.array; } inline T *data() { return m_data.array; } }; @@ -170,10 +168,8 @@ template class ei_matrix_storage< inline void swap(ei_matrix_storage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); } inline int rows(void) const {return _Rows;} inline int cols(void) const {return m_cols;} - inline void resize(int, int, int cols) - { - m_cols = cols; - } + inline void conservativeResize(int, int, int cols) { m_cols = cols; } + inline void resize(int, int, int cols) { m_cols = cols; } inline const T *data() const { return m_data.array; } inline T *data() { return m_data.array; } }; @@ -196,6 +192,12 @@ template class ei_matrix_storage(m_data, size, m_rows*m_cols); + m_rows = rows; + m_cols = cols; + } void resize(int size, int rows, int cols) { if(size != m_rows*m_cols) @@ -228,6 +230,11 @@ template class ei_matrix_storage(m_data, size, _Rows*m_cols); + m_cols = cols; + } void resize(int size, int, int cols) { if(size != _Rows*m_cols) @@ -259,6 +266,11 @@ template class ei_matrix_storage(m_data, size, m_rows*_Cols); + m_rows = rows; + } void resize(int size, int rows, int) { if(size != m_rows*_Cols) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index c7b95d334..5cab12ad3 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -4,6 +4,7 @@ // Copyright (C) 2008 Gael Guennebaud // Copyright (C) 2008-2009 Benoit Jacob // Copyright (C) 2009 Kenneth Riddile +// Copyright (C) 2010 Hauke Heibel // // Eigen is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public @@ -74,6 +75,60 @@ inline void ei_handmade_aligned_free(void *ptr) std::free(*(reinterpret_cast(ptr) - 1)); } +inline void* ei_handmade_aligned_realloc(void* ptr, size_t size) +{ + // 0. 
Handle corner cases according to the standard + if (ptr!=0 && size==0) + { + ei_handmade_aligned_free(ptr); + return NULL; + } + + if (ptr==0) return ei_handmade_aligned_malloc(size); + + // 1. compute the original base address + // 2. compute the new reallocated address + // 3. compute the aligned address and store the original one + void *base = *(reinterpret_cast(ptr) - 1); + void *original = std::realloc(base, size+16); + void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(size_t(15))) + 16); + *(reinterpret_cast(aligned) - 1) = original; + return aligned; +} + +#if EIGEN_HAS_MM_MALLOC +void* ei_mm_realloc(void *ptr, size_t size, size_t old_size) +{ + // 0. Check if size==0 and act according to the standard. + if (ptr!=0 && size==0) + { + _mm_free(ptr); + return NULL; + } + + // 1. Allocate new memory + void* newptr = _mm_malloc(size,16); + + // 2. Verify the allocation success + // Testing for size!=0 is important since the standard says that + // for size==0, the object pointer (i.e. ptr) should be freed. + if (newptr == NULL) + { + /*errno = ENOMEM;*/ // according to the standard we should set errno = ENOMEM + return NULL; + } + + // 3. Copy the overlapping data and free the old data + if (ptr != NULL) + { + std::memcpy(newptr, ptr, std::min(size,old_size)); + _mm_free(ptr); + } + + return newptr; +} +#endif + /** \internal allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment. * On allocation error, the returned pointer is null, and if exceptions are enabled then a std::bad_alloc is thrown. */ @@ -182,6 +237,54 @@ template<> inline void ei_conditional_aligned_free(void *ptr) std::free(ptr); } +inline void* ei_aligned_realloc(void *ptr, size_t new_size, size_t old_size) +{ + (void)old_size; // Suppress 'unused variable' warning. Seen in boost tee. 
+ + void *result; +#if !EIGEN_ALIGN + result = realloc(ptr,new_size); +#elif EIGEN_MALLOC_ALREADY_ALIGNED + result =realloc(ptr,new_size); +#elif EIGEN_HAS_POSIX_MEMALIGN + realloc(ptr,new_size); +#elif EIGEN_HAS_MM_MALLOC +#if defined(_MSC_VER) && defined(_mm_free) + result = _aligned_realloc(ptr,new_size,16); +#else + result = ei_mm_realloc(ptr,new_size,old_size); +#endif +#elif defined(_MSC_VER) + result = _aligned_realloc(ptr,new_size,16); +#else + result = ei_handmade_aligned_realloc(ptr,new_size); +#endif + +#ifdef EIGEN_EXCEPTIONS + if (result==0 && new_size!=0) + throw std::bad_alloc(); +#endif + return result; +} + +template inline void* ei_conditional_aligned_realloc(void* ptr, size_t new_size, size_t old_size) +{ + return ei_aligned_realloc(ptr, new_size, old_size); +} + +template<> inline void* ei_conditional_aligned_realloc(void* ptr, size_t new_size, size_t) +{ + return std::realloc(ptr, new_size); +} + +template inline T* ei_conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size) +{ + T *result = reinterpret_cast(ei_conditional_aligned_realloc(reinterpret_cast(pts), sizeof(T)*new_size, sizeof(T)*old_size)); + if (new_size > old_size) + ei_construct_elements_of_array(result+old_size, new_size-old_size); + return result; +} + /** \internal destruct the elements of an array. * The \a size parameters tells on how many objects to call the destructor of T. */ @@ -236,7 +339,7 @@ inline static Integer ei_first_aligned(const Scalar* array, Integer size) if(PacketSize==1) { // Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements - // of the array have the same aligment. + // of the array have the same alignment. 
return 0; } else if(size_t(array) & (sizeof(Scalar)-1)) From e0830cb1b703d6eb4aa50c3f8332c11a32a7a764 Mon Sep 17 00:00:00 2001 From: Thomas Capricelli Date: Sat, 27 Feb 2010 17:56:22 +0100 Subject: [PATCH 051/122] Use a specialization of test_is_equal() instead of defining COMPARE() --- test/main.h | 18 +- unsupported/test/NonLinearOptimization.cpp | 258 ++++++++++----------- 2 files changed, 140 insertions(+), 136 deletions(-) diff --git a/test/main.h b/test/main.h index f4cfa11c5..d4f96e3d2 100644 --- a/test/main.h +++ b/test/main.h @@ -157,13 +157,6 @@ namespace Eigen exit(2); \ } } while (0) -// Use COMPARE for exact comparison of scalar values (mostly, int) -#define COMPARE(actual, expected) do { if (actual!=expected) { \ - std::cerr << "Test " << g_test_stack.back() << ". Comparison failed in "EI_PP_MAKE_STRING(__FILE__) << " (" << EI_PP_MAKE_STRING(__LINE__) << ")" \ - << std::endl << " actual = " << actual \ - << std::endl << " expected = " << expected << std::endl << std::endl; \ - exit(2); \ - } } while (0) #define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b)) #define VERIFY_IS_APPROX(a, b) VERIFY(test_ei_isApprox(a, b)) #define VERIFY_IS_NOT_APPROX(a, b) VERIFY(!test_ei_isApprox(a, b)) @@ -385,6 +378,17 @@ bool test_is_equal(const Derived1& a1, const Derived2& a2) return test_is_equal_impl::run(a1, a2); } +bool test_is_equal(const int actual, const int expected) +{ + if (actual==expected) + return true; + // false: + std::cerr + << std::endl << " actual = " << actual + << std::endl << " expected = " << expected << std::endl << std::endl; + return false; +} + /** Creates a random Partial Isometry matrix of given rank. * * A partial isometry is a matrix all of whose singular values are either 0 or 1. 
diff --git a/unsupported/test/NonLinearOptimization.cpp b/unsupported/test/NonLinearOptimization.cpp index 38d7b7766..e68745ad1 100644 --- a/unsupported/test/NonLinearOptimization.cpp +++ b/unsupported/test/NonLinearOptimization.cpp @@ -172,9 +172,9 @@ void testLmder1() info = lm.lmder1(x); // check return value - COMPARE(info, 1); - COMPARE(lm.nfev, 6); - COMPARE(lm.njev, 5); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 6); + VERIFY_IS_EQUAL(lm.njev, 5); // check norm VERIFY_IS_APPROX(lm.fvec.blueNorm(), 0.09063596); @@ -201,9 +201,9 @@ void testLmder() info = lm.minimize(x); // check return values - COMPARE(info, 1); - COMPARE(lm.nfev, 6); - COMPARE(lm.njev, 5); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 6); + VERIFY_IS_EQUAL(lm.njev, 5); // check norm fnorm = lm.fvec.blueNorm(); @@ -286,9 +286,9 @@ void testHybrj1() info = solver.hybrj1(x); // check return value - COMPARE(info, 1); - VERIFY(solver.nfev==11); - VERIFY(solver.njev==1); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(solver.nfev, 11); + VERIFY_IS_EQUAL(solver.njev, 1); // check norm VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08); @@ -321,9 +321,9 @@ void testHybrj() info = solver.solve(x); // check return value - COMPARE(info, 1); - VERIFY(solver.nfev==11); - VERIFY(solver.njev==1); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(solver.nfev, 11); + VERIFY_IS_EQUAL(solver.njev, 1); // check norm VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08); @@ -375,8 +375,8 @@ void testHybrd1() info = solver.hybrd1(x); // check return value - COMPARE(info, 1); - VERIFY(solver.nfev==20); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(solver.nfev, 20); // check norm VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08); @@ -406,8 +406,8 @@ void testHybrd() info = solver.solveNumericalDiff(x); // check return value - COMPARE(info, 1); - VERIFY(solver.nfev==14); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(solver.nfev, 14); // check norm VERIFY_IS_APPROX(solver.fvec.blueNorm(), 
1.192636e-08); @@ -477,9 +477,9 @@ void testLmstr1() info = lm.lmstr1(x); // check return value - COMPARE(info, 1); - COMPARE(lm.nfev, 6); - COMPARE(lm.njev, 5); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 6); + VERIFY_IS_EQUAL(lm.njev, 5); // check norm VERIFY_IS_APPROX(lm.fvec.blueNorm(), 0.09063596); @@ -506,9 +506,9 @@ void testLmstr() info = lm.minimizeOptimumStorage(x); // check return values - COMPARE(info, 1); - COMPARE(lm.nfev, 6); - COMPARE(lm.njev, 5); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 6); + VERIFY_IS_EQUAL(lm.njev, 5); // check norm fnorm = lm.fvec.blueNorm(); @@ -562,8 +562,8 @@ void testLmdif1() info = LevenbergMarquardt::lmdif1(functor, x, &nfev); // check return value - COMPARE(info, 1); - VERIFY(nfev==26); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(nfev, 26); // check norm functor(x, fvec); @@ -593,8 +593,8 @@ void testLmdif() info = lm.minimize(x); // check return values - COMPARE(info, 1); - COMPARE(lm.nfev, 26); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 26); // check norm fnorm = lm.fvec.blueNorm(); @@ -678,9 +678,9 @@ void testNistChwirut2(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 10 == lm.nfev); - VERIFY( 8 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 10); + VERIFY_IS_EQUAL(lm.njev, 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02); // check x @@ -699,9 +699,9 @@ void testNistChwirut2(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 7 == lm.nfev); - VERIFY( 6 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 7); + VERIFY_IS_EQUAL(lm.njev, 6); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02); // check x @@ -758,9 +758,9 @@ void testNistMisra1a(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 19 == lm.nfev); - VERIFY( 15 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 19); + 
VERIFY_IS_EQUAL(lm.njev, 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01); // check x @@ -775,9 +775,9 @@ void testNistMisra1a(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 5 == lm.nfev); - VERIFY( 4 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 5); + VERIFY_IS_EQUAL(lm.njev, 4); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01); // check x @@ -844,19 +844,19 @@ void testNistHahn1(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 11== lm.nfev); - VERIFY( 10== lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 11); + VERIFY_IS_EQUAL(lm.njev, 10); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00); // check x - VERIFY_IS_APPROX(x[0], 1.0776351733E+00 ); - VERIFY_IS_APPROX(x[1],-1.2269296921E-01 ); - VERIFY_IS_APPROX(x[2], 4.0863750610E-03 ); + VERIFY_IS_APPROX(x[0], 1.0776351733E+00); + VERIFY_IS_APPROX(x[1],-1.2269296921E-01); + VERIFY_IS_APPROX(x[2], 4.0863750610E-03); VERIFY_IS_APPROX(x[3],-1.426264e-06); // shoulde be : -1.4262662514E-06 - VERIFY_IS_APPROX(x[4],-5.7609940901E-03 ); - VERIFY_IS_APPROX(x[5], 2.4053735503E-04 ); - VERIFY_IS_APPROX(x[6],-1.2314450199E-07 ); + VERIFY_IS_APPROX(x[4],-5.7609940901E-03); + VERIFY_IS_APPROX(x[5], 2.4053735503E-04); + VERIFY_IS_APPROX(x[6],-1.2314450199E-07); /* * Second try @@ -866,9 +866,9 @@ void testNistHahn1(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 11 == lm.nfev); - VERIFY( 10 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 11); + VERIFY_IS_EQUAL(lm.njev, 10); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00); // check x @@ -876,7 +876,7 @@ void testNistHahn1(void) VERIFY_IS_APPROX(x[1], -0.1226933); // should be : -1.2269296921E-01 VERIFY_IS_APPROX(x[2], 0.004086383); // should be : 4.0863750610E-03 VERIFY_IS_APPROX(x[3], -1.426277e-06); // shoulde be : 
-1.4262662514E-06 - VERIFY_IS_APPROX(x[4],-5.7609940901E-03 ); + VERIFY_IS_APPROX(x[4],-5.7609940901E-03); VERIFY_IS_APPROX(x[5], 0.00024053772); // should be : 2.4053735503E-04 VERIFY_IS_APPROX(x[6], -1.231450e-07); // should be : -1.2314450199E-07 @@ -930,9 +930,9 @@ void testNistMisra1d(void) info = lm.minimize(x); // check return value - COMPARE(info, 3); - VERIFY( 9 == lm.nfev); - VERIFY( 7 == lm.njev); + VERIFY_IS_EQUAL(info, 3); + VERIFY_IS_EQUAL(lm.nfev, 9); + VERIFY_IS_EQUAL(lm.njev, 7); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02); // check x @@ -947,9 +947,9 @@ void testNistMisra1d(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 4 == lm.nfev); - VERIFY( 3 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 4); + VERIFY_IS_EQUAL(lm.njev, 3); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02); // check x @@ -1008,18 +1008,18 @@ void testNistLanczos1(void) info = lm.minimize(x); // check return value - COMPARE(info, 2); - VERIFY( 79 == lm.nfev); - VERIFY( 72 == lm.njev); + VERIFY_IS_EQUAL(info, 2); + VERIFY_IS_EQUAL(lm.nfev, 79); + VERIFY_IS_EQUAL(lm.njev, 72); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.430899764097e-25); // should be 1.4307867721E-25, but nist results are on 128-bit floats // check x - VERIFY_IS_APPROX(x[0], 9.5100000027E-02 ); - VERIFY_IS_APPROX(x[1], 1.0000000001E+00 ); - VERIFY_IS_APPROX(x[2], 8.6070000013E-01 ); - VERIFY_IS_APPROX(x[3], 3.0000000002E+00 ); - VERIFY_IS_APPROX(x[4], 1.5575999998E+00 ); - VERIFY_IS_APPROX(x[5], 5.0000000001E+00 ); + VERIFY_IS_APPROX(x[0], 9.5100000027E-02); + VERIFY_IS_APPROX(x[1], 1.0000000001E+00); + VERIFY_IS_APPROX(x[2], 8.6070000013E-01); + VERIFY_IS_APPROX(x[3], 3.0000000002E+00); + VERIFY_IS_APPROX(x[4], 1.5575999998E+00); + VERIFY_IS_APPROX(x[5], 5.0000000001E+00); /* * Second try @@ -1029,18 +1029,18 @@ void testNistLanczos1(void) info = lm.minimize(x); // check return value - 
COMPARE(info, 2); - VERIFY( 9 == lm.nfev); - VERIFY( 8 == lm.njev); + VERIFY_IS_EQUAL(info, 2); + VERIFY_IS_EQUAL(lm.nfev, 9); + VERIFY_IS_EQUAL(lm.njev, 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.428595533845e-25); // should be 1.4307867721E-25, but nist results are on 128-bit floats // check x - VERIFY_IS_APPROX(x[0], 9.5100000027E-02 ); - VERIFY_IS_APPROX(x[1], 1.0000000001E+00 ); - VERIFY_IS_APPROX(x[2], 8.6070000013E-01 ); - VERIFY_IS_APPROX(x[3], 3.0000000002E+00 ); - VERIFY_IS_APPROX(x[4], 1.5575999998E+00 ); - VERIFY_IS_APPROX(x[5], 5.0000000001E+00 ); + VERIFY_IS_APPROX(x[0], 9.5100000027E-02); + VERIFY_IS_APPROX(x[1], 1.0000000001E+00); + VERIFY_IS_APPROX(x[2], 8.6070000013E-01); + VERIFY_IS_APPROX(x[3], 3.0000000002E+00); + VERIFY_IS_APPROX(x[4], 1.5575999998E+00); + VERIFY_IS_APPROX(x[5], 5.0000000001E+00); } @@ -1094,9 +1094,9 @@ void testNistRat42(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 10 == lm.nfev); - VERIFY( 8 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 10); + VERIFY_IS_EQUAL(lm.njev, 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00); // check x @@ -1112,9 +1112,9 @@ void testNistRat42(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 6 == lm.nfev); - VERIFY( 5 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 6); + VERIFY_IS_EQUAL(lm.njev, 5); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00); // check x @@ -1172,9 +1172,9 @@ void testNistMGH10(void) info = lm.minimize(x); // check return value - COMPARE(info, 2); - VERIFY( 284 == lm.nfev); - VERIFY( 249 == lm.njev); + VERIFY_IS_EQUAL(info, 2); + VERIFY_IS_EQUAL(lm.nfev, 284 ); + VERIFY_IS_EQUAL(lm.njev, 249 ); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01); // check x @@ -1190,9 +1190,9 @@ void testNistMGH10(void) info = lm.minimize(x); // check return value - COMPARE(info, 3); - VERIFY( 
126 == lm.nfev); - VERIFY( 116 == lm.njev); + VERIFY_IS_EQUAL(info, 3); + VERIFY_IS_EQUAL(lm.nfev, 126); + VERIFY_IS_EQUAL(lm.njev, 116); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01); // check x @@ -1251,9 +1251,9 @@ void testNistBoxBOD(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 31 == lm.nfev); - VERIFY( 25 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 31); + VERIFY_IS_EQUAL(lm.njev, 25); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03); // check x @@ -1271,9 +1271,9 @@ void testNistBoxBOD(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 15 == lm.nfev); - VERIFY( 14 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 15 ); + VERIFY_IS_EQUAL(lm.njev, 14 ); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03); // check x @@ -1333,9 +1333,9 @@ void testNistMGH17(void) info = lm.minimize(x); // check return value - COMPARE(info, 2); - VERIFY( 602 == lm.nfev); - VERIFY( 545 == lm.njev); + VERIFY_IS_EQUAL(info, 2); + VERIFY_IS_EQUAL(lm.nfev, 602 ); + VERIFY_IS_EQUAL(lm.njev, 545 ); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.4648946975E-05); // check x @@ -1354,9 +1354,9 @@ void testNistMGH17(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 18 == lm.nfev); - VERIFY( 15 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 18); + VERIFY_IS_EQUAL(lm.njev, 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.4648946975E-05); // check x @@ -1420,9 +1420,9 @@ void testNistMGH09(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 490 == lm.nfev); - VERIFY( 376 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 490 ); + VERIFY_IS_EQUAL(lm.njev, 376 ); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04); // check x @@ -1440,9 +1440,9 @@ void testNistMGH09(void) 
info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 18 == lm.nfev); - VERIFY( 16 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 18); + VERIFY_IS_EQUAL(lm.njev, 16); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04); // check x @@ -1503,9 +1503,9 @@ void testNistBennett5(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 758 == lm.nfev); - VERIFY( 744 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 758); + VERIFY_IS_EQUAL(lm.njev, 744); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04); // check x @@ -1521,9 +1521,9 @@ void testNistBennett5(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 203 == lm.nfev); - VERIFY( 192 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 203); + VERIFY_IS_EQUAL(lm.njev, 192); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04); // check x @@ -1591,9 +1591,9 @@ void testNistThurber(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 39 == lm.nfev); - VERIFY( 36== lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 39); + VERIFY_IS_EQUAL(lm.njev, 36); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03); // check x @@ -1616,9 +1616,9 @@ void testNistThurber(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 29 == lm.nfev); - VERIFY( 28 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 29); + VERIFY_IS_EQUAL(lm.njev, 28); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03); // check x @@ -1683,9 +1683,9 @@ void testNistRat43(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 27 == lm.nfev); - VERIFY( 20 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 27); + VERIFY_IS_EQUAL(lm.njev, 20); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 
8.7864049080E+03); // check x @@ -1705,9 +1705,9 @@ void testNistRat43(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 9 == lm.nfev); - VERIFY( 8 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 9); + VERIFY_IS_EQUAL(lm.njev, 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7864049080E+03); // check x @@ -1768,9 +1768,9 @@ void testNistEckerle4(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 18 == lm.nfev); - VERIFY( 15 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 18); + VERIFY_IS_EQUAL(lm.njev, 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03); // check x @@ -1786,9 +1786,9 @@ void testNistEckerle4(void) info = lm.minimize(x); // check return value - COMPARE(info, 1); - VERIFY( 7 == lm.nfev); - VERIFY( 6 == lm.njev); + VERIFY_IS_EQUAL(info, 1); + VERIFY_IS_EQUAL(lm.nfev, 7); + VERIFY_IS_EQUAL(lm.njev, 6); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03); // check x From 22fabb8940a15fd8d8d359fefc9c24a7a226a4c1 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sat, 27 Feb 2010 17:51:48 -0500 Subject: [PATCH 052/122] add missing inline keyword, thanks to Eamon. --- Eigen/src/Core/util/Memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 5cab12ad3..5739f7ec8 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -97,7 +97,7 @@ inline void* ei_handmade_aligned_realloc(void* ptr, size_t size) } #if EIGEN_HAS_MM_MALLOC -void* ei_mm_realloc(void *ptr, size_t size, size_t old_size) +inline void* ei_mm_realloc(void *ptr, size_t size, size_t old_size) { // 0. Check if size==0 and act according to the standard. 
if (ptr!=0 && size==0) From e84f7e07e9ef5a4f5200c7ca6a501df95bcc188e Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sat, 27 Feb 2010 18:57:07 -0500 Subject: [PATCH 053/122] add ei_posix_memalign_realloc --- Eigen/src/Core/util/Memory.h | 61 ++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 5739f7ec8..b9c879e70 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -99,27 +99,26 @@ inline void* ei_handmade_aligned_realloc(void* ptr, size_t size) #if EIGEN_HAS_MM_MALLOC inline void* ei_mm_realloc(void *ptr, size_t size, size_t old_size) { - // 0. Check if size==0 and act according to the standard. - if (ptr!=0 && size==0) + // 0. Check if size==0 and act according to the standard, which says that + // for size==0, the object pointer (i.e. ptr) should be freed. + if (size==0) { _mm_free(ptr); - return NULL; + return 0; } // 1. Allocate new memory void* newptr = _mm_malloc(size,16); // 2. Verify the allocation success - // Testing for size!=0 is important since the standard says that - // for size==0, the object pointer (i.e. ptr) should be freed. - if (newptr == NULL) + if (newptr == 0) { /*errno = ENOMEM;*/ // according to the standard we should set errno = ENOMEM - return NULL; + return 0; } // 3. Copy the overlapping data and free the old data - if (ptr != NULL) + if (ptr != 0) { std::memcpy(newptr, ptr, std::min(size,old_size)); _mm_free(ptr); @@ -127,7 +126,37 @@ inline void* ei_mm_realloc(void *ptr, size_t size, size_t old_size) return newptr; } -#endif +#endif // EIGEN_HAS_MM_MALLOC + +#if EIGEN_HAS_POSIX_MEMALIGN +inline void* ei_posix_memalign_realloc(void *ptr, size_t size, size_t old_size) +{ + // 0. Check if size==0 and act according to the standard, which says that + // for size==0, the object pointer (i.e. ptr) should be freed. + if (size==0) + { + free(ptr); + return 0; + } + + // 1. 
Allocate new memory and verify the allocation success + void *newptr; + if(posix_memalign(&newptr, 16, size)) + { + /*errno = ENOMEM;*/ // according to the standard we should set errno = ENOMEM + return 0; + } + + // 2. Copy the overlapping data and free the old data + if (ptr != 0) + { + std::memcpy(newptr, ptr, std::min(size,old_size)); + free(ptr); + } + + return newptr; +} +#endif // EIGEN_HAS_POSIX_MEMALIGN /** \internal allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment. * On allocation error, the returned pointer is null, and if exceptions are enabled then a std::bad_alloc is thrown. @@ -245,15 +274,15 @@ inline void* ei_aligned_realloc(void *ptr, size_t new_size, size_t old_size) #if !EIGEN_ALIGN result = realloc(ptr,new_size); #elif EIGEN_MALLOC_ALREADY_ALIGNED - result =realloc(ptr,new_size); + result = realloc(ptr,new_size); #elif EIGEN_HAS_POSIX_MEMALIGN - realloc(ptr,new_size); + result = ei_posix_memalign_realloc(ptr,new_size,old_size); #elif EIGEN_HAS_MM_MALLOC -#if defined(_MSC_VER) && defined(_mm_free) - result = _aligned_realloc(ptr,new_size,16); -#else - result = ei_mm_realloc(ptr,new_size,old_size); -#endif + #if defined(_MSC_VER) && defined(_mm_free) + result = _aligned_realloc(ptr,new_size,16); + #else + result = ei_mm_realloc(ptr,new_size,old_size); + #endif #elif defined(_MSC_VER) result = _aligned_realloc(ptr,new_size,16); #else From 27f52502581b559e5804b4b6dd42774791d2ca1c Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sat, 27 Feb 2010 19:04:22 -0500 Subject: [PATCH 054/122] Only include unless either EIGEN_DEBUG_ASSIGN is defined or we're in eigen2 support mode --- Eigen/Core | 7 ++++++- Eigen/Eigen2Support | 6 +++++- Eigen/src/Core/Assign.h | 2 ++ test/vectorization_logic.cpp | 1 + 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 1af04a7ee..3abf6e254 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -119,13 +119,18 @@ #include #include #include -#include 
+#include #include #include #include // for min/max: #include +// for outputting debug info +#ifdef EIGEN_DEBUG_ASSIGN +#include +#endif + #if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_NO_EXCEPTIONS) #define EIGEN_EXCEPTIONS #endif diff --git a/Eigen/Eigen2Support b/Eigen/Eigen2Support index bd6306aff..26b547b9b 100644 --- a/Eigen/Eigen2Support +++ b/Eigen/Eigen2Support @@ -26,7 +26,7 @@ #define EIGEN2SUPPORT_H #if (!defined(EIGEN2_SUPPORT)) || (!defined(EIGEN_CORE_H)) -#error Eigen2 support must be enabled by defining EIGEN2_SUPPORT before any other Eigen header +#error Eigen2 support must be enabled by defining EIGEN2_SUPPORT before including any Eigen header #endif #include "src/Core/util/DisableMSVCWarnings.h" @@ -36,6 +36,7 @@ namespace Eigen { /** \defgroup Eigen2Support_Module Eigen2 support module * This module provides a couple of deprecated functions improving the compatibility with Eigen2. * + * To use it, define EIGEN2_SUPPORT before including any Eigen header * \code * #define EIGEN2_SUPPORT * \endcode @@ -51,4 +52,7 @@ namespace Eigen { #include "src/Core/util/EnableMSVCWarnings.h" +// Eigen2 used to include iostream +#include + #endif // EIGEN2SUPPORT_H diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h index f806ba572..eb7bca1da 100644 --- a/Eigen/src/Core/Assign.h +++ b/Eigen/src/Core/Assign.h @@ -105,6 +105,7 @@ public: : int(NoUnrolling) }; +#ifdef EIGEN_DEBUG_ASSIGN static void debug() { EIGEN_DEBUG_VAR(DstIsAligned) @@ -125,6 +126,7 @@ public: EIGEN_DEBUG_VAR(MayUnrollInner) EIGEN_DEBUG_VAR(Unrolling) } +#endif }; /*************************************************************************** diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index ae9911831..94a8a5c96 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -22,6 +22,7 @@ // License and a copy of the GNU General Public License along with // Eigen. If not, see . 
+#define EIGEN_DEBUG_ASSIGN #include "main.h" #include From eb3a3351cca47cb8fe13855786938c8f2cae4217 Mon Sep 17 00:00:00 2001 From: Thomas Capricelli Date: Sun, 28 Feb 2010 02:51:35 +0100 Subject: [PATCH 055/122] misc cleaning --- .../NonLinearOptimization/HybridNonLinearSolver.h | 8 ++++++++ .../src/NonLinearOptimization/LevenbergMarquardt.h | 12 +++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h index 35dc332e0..d75b1407c 100644 --- a/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h +++ b/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h @@ -194,6 +194,8 @@ template HybridNonLinearSolverSpace::Status HybridNonLinearSolver::solveOneStep(FVectorType &x) { + assert(x.size()==n); // check the caller is not cheating us + int j; std::vector > v_givens(n), w_givens(n); @@ -350,6 +352,8 @@ HybridNonLinearSolverSpace::Status HybridNonLinearSolver::solve(FVectorType &x) { HybridNonLinearSolverSpace::Status status = solveInit(x); + if (status==HybridNonLinearSolverSpace::ImproperInputParameters) + return status; while (status==HybridNonLinearSolverSpace::Running) status = solveOneStep(x); return status; @@ -429,6 +433,8 @@ template HybridNonLinearSolverSpace::Status HybridNonLinearSolver::solveNumericalDiffOneStep(FVectorType &x) { + assert(x.size()==n); // check the caller is not cheating us + int j; std::vector > v_givens(n), w_givens(n); @@ -587,6 +593,8 @@ HybridNonLinearSolverSpace::Status HybridNonLinearSolver::solveNumericalDiff(FVectorType &x) { HybridNonLinearSolverSpace::Status status = solveNumericalDiffInit(x); + if (status==HybridNonLinearSolverSpace::ImproperInputParameters) + return status; while (status==HybridNonLinearSolverSpace::Running) status = solveNumericalDiffOneStep(x); return status; diff --git 
a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h index 8bae1e131..f99366bbc 100644 --- a/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h +++ b/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h @@ -161,6 +161,8 @@ LevenbergMarquardtSpace::Status LevenbergMarquardt::minimize(FVectorType &x) { LevenbergMarquardtSpace::Status status = minimizeInit(x); + if (status==LevenbergMarquardtSpace::ImproperInputParameters) + return status; do { status = minimizeOneStep(x); } while (status==LevenbergMarquardtSpace::Running); @@ -214,7 +216,7 @@ template LevenbergMarquardtSpace::Status LevenbergMarquardt::minimizeOneStep(FVectorType &x) { - int j; + assert(x.size()==n); // check the caller is not cheating us /* calculate the jacobian matrix. */ int df_ret = functor.df(x, fjac); @@ -235,7 +237,7 @@ LevenbergMarquardt::minimizeOneStep(FVectorType &x) /* to the norms of the columns of the initial jacobian. */ if (iter == 1) { if (!useExternalScaling) - for (j = 0; j < n; ++j) + for (int j = 0; j < n; ++j) diag[j] = (wa2[j]==0.)? 1. : wa2[j]; /* on the first iteration, calculate the norm of the scaled x */ @@ -255,7 +257,7 @@ LevenbergMarquardt::minimizeOneStep(FVectorType &x) /* compute the norm of the scaled gradient. */ gnorm = 0.; if (fnorm != 0.) - for (j = 0; j < n; ++j) + for (int j = 0; j < n; ++j) if (wa2[permutation.indices()[j]] != 0.) 
gnorm = std::max(gnorm, ei_abs( fjac.col(j).head(j+1).dot(qtf.head(j+1)/fnorm) / wa2[permutation.indices()[j]])); @@ -431,6 +433,8 @@ template LevenbergMarquardtSpace::Status LevenbergMarquardt::minimizeOptimumStorageOneStep(FVectorType &x) { + assert(x.size()==n); // check the caller is not cheating us + int i, j; bool sing; @@ -606,6 +610,8 @@ LevenbergMarquardtSpace::Status LevenbergMarquardt::minimizeOptimumStorage(FVectorType &x) { LevenbergMarquardtSpace::Status status = minimizeOptimumStorageInit(x); + if (status==LevenbergMarquardtSpace::ImproperInputParameters) + return status; do { status = minimizeOptimumStorageOneStep(x); } while (status==LevenbergMarquardtSpace::Running); From 1d9c18a8f3795e160577198ee79d77fa4052bbce Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sun, 28 Feb 2010 00:53:06 -0500 Subject: [PATCH 056/122] comment out cerr's --- Eigen/src/Eigenvalues/ComplexSchur.h | 2 +- Eigen/src/Sparse/CholmodSupport.h | 2 +- Eigen/src/Sparse/DynamicSparseMatrix.h | 2 +- Eigen/src/Sparse/SuperLUSupport.h | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Eigenvalues/ComplexSchur.h b/Eigen/src/Eigenvalues/ComplexSchur.h index 531ebf709..c45151e82 100644 --- a/Eigen/src/Eigenvalues/ComplexSchur.h +++ b/Eigen/src/Eigenvalues/ComplexSchur.h @@ -191,7 +191,7 @@ void ComplexSchur::compute(const MatrixType& matrix, bool skipU) if(iter >= 30) { // FIXME : what to do when iter==MAXITER ?? 
- std::cerr << "MAXITER" << std::endl; + //std::cerr << "MAXITER" << std::endl; return; } diff --git a/Eigen/src/Sparse/CholmodSupport.h b/Eigen/src/Sparse/CholmodSupport.h index 248f56533..fbd035ce4 100644 --- a/Eigen/src/Sparse/CholmodSupport.h +++ b/Eigen/src/Sparse/CholmodSupport.h @@ -233,7 +233,7 @@ bool SparseLLT::solveInPlace(MatrixBase &b) const cholmod_dense* x = cholmod_solve(CHOLMOD_A, m_cholmodFactor, &cdb, &m_cholmod); if(!x) { - std::cerr << "Eigen: cholmod_solve failed\n"; + //std::cerr << "Eigen: cholmod_solve failed\n"; return false; } b = Matrix::Map(reinterpret_cast(x->x),b.rows()); diff --git a/Eigen/src/Sparse/DynamicSparseMatrix.h b/Eigen/src/Sparse/DynamicSparseMatrix.h index 2594ffebc..d73dce229 100644 --- a/Eigen/src/Sparse/DynamicSparseMatrix.h +++ b/Eigen/src/Sparse/DynamicSparseMatrix.h @@ -236,7 +236,7 @@ class DynamicSparseMatrix { // remove all coefficients with innerCoord>=innerSize // TODO - std::cerr << "not implemented yet\n"; + //std::cerr << "not implemented yet\n"; exit(2); } if (m_data.size() != outerSize) diff --git a/Eigen/src/Sparse/SuperLUSupport.h b/Eigen/src/Sparse/SuperLUSupport.h index 9a5bec554..18a967539 100644 --- a/Eigen/src/Sparse/SuperLUSupport.h +++ b/Eigen/src/Sparse/SuperLUSupport.h @@ -397,7 +397,7 @@ void SparseLU::compute(const MatrixType& a) case MinimumDegree_ATA : m_sluOptions.ColPerm = MMD_ATA; break; case ColApproxMinimumDegree : m_sluOptions.ColPerm = COLAMD; break; default: - std::cerr << "Eigen: ordering method \"" << Base::orderingMethod() << "\" not supported by the SuperLU backend\n"; + //std::cerr << "Eigen: ordering method \"" << Base::orderingMethod() << "\" not supported by the SuperLU backend\n"; m_sluOptions.ColPerm = NATURAL; }; @@ -448,7 +448,7 @@ void SparseLU::compute(const MatrixType& a) &recip_pivot_gross, &rcond, &m_sluStat, &info, Scalar()); #else - std::cerr << "Incomplete factorization is only available in SuperLU v4\n"; + //std::cerr << "Incomplete factorization is only 
available in SuperLU v4\n"; Base::m_succeeded = false; return; #endif @@ -486,7 +486,7 @@ bool SparseLU::solve(const MatrixBase &b, case SvTranspose : m_sluOptions.Trans = TRANS; break; case SvAdjoint : m_sluOptions.Trans = CONJ; break; default: - std::cerr << "Eigen: transposition option \"" << transposed << "\" not supported by the SuperLU backend\n"; + //std::cerr << "Eigen: transposition option \"" << transposed << "\" not supported by the SuperLU backend\n"; m_sluOptions.Trans = NOTRANS; } @@ -513,7 +513,7 @@ bool SparseLU::solve(const MatrixBase &b, &recip_pivot_gross, &rcond, &m_sluStat, &info, Scalar()); #else - std::cerr << "Incomplete factorization is only available in SuperLU v4\n"; + //std::cerr << "Incomplete factorization is only available in SuperLU v4\n"; return false; #endif } From 40bd69fbaa4260ea55f67d714bc4fbc3d90e1aae Mon Sep 17 00:00:00 2001 From: Hauke Heibel Date: Sun, 28 Feb 2010 12:56:37 +0100 Subject: [PATCH 057/122] Hide some internal stuff from the docs. --- Eigen/src/Core/util/XprHelper.h | 2 +- Eigen/src/LU/PartialPivLU.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index eceb5ab2a..a09475e20 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -214,7 +214,7 @@ struct ei_is_reference }; /** -* The reference selector for template expressions. The idea is that we don't +* \internal The reference selector for template expressions. The idea is that we don't * need to use references for expressions since they are light weight proxy * objects which should generate no copying overhead. 
**/ diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index a60596668..df36cb04d 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -196,7 +196,7 @@ PartialPivLU::PartialPivLU(const MatrixType& matrix) compute(matrix); } -/** This is the blocked version of ei_fullpivlu_unblocked() */ +/** \internal This is the blocked version of ei_fullpivlu_unblocked() */ template struct ei_partial_lu_impl { From ff8c2149c1d5874430c0bba14c8f8e61ed0e4fec Mon Sep 17 00:00:00 2001 From: Hauke Heibel Date: Sun, 28 Feb 2010 14:32:57 +0100 Subject: [PATCH 058/122] Added a generic reallocation implementation based on ei_aligned_malloc/_free. Rewrote ei_handmade_aligned_realloc such that it is now using std::realloc. Reorganized functions in Memory.h for better readability. Add missing include to Core (it's now required in Memory.h). --- Eigen/Core | 1 + Eigen/src/Core/util/Memory.h | 315 +++++++++++++++++------------------ 2 files changed, 152 insertions(+), 164 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 3abf6e254..f984a96c6 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -114,6 +114,7 @@ #endif #endif +#include #include #include #include diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index b9c879e70..1e9d31624 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -56,109 +56,83 @@ #define EIGEN_HAS_MM_MALLOC 0 #endif -/** \internal like malloc, but the returned pointer is guaranteed to be 16-byte aligned. - * Fast, but wastes 16 additional bytes of memory. - * Does not throw any exception. + +// Forward declarations required for the implementation +// of ei_handmade_aligned_realloc. +void* ei_aligned_malloc(size_t size); +void ei_aligned_free(void *ptr); + +/* ----- Hand made implementations of aligned malloc/free and realloc ----- */ + +/** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned. + * Fast, but wastes 16 additional bytes of memory. 
Does not throw any exception. */ inline void* ei_handmade_aligned_malloc(size_t size) { void *original = std::malloc(size+16); + if (original == 0) return 0; void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(size_t(15))) + 16); *(reinterpret_cast(aligned) - 1) = original; return aligned; } -/** \internal frees memory allocated with ei_handmade_aligned_malloc */ +/** \internal Frees memory allocated with ei_handmade_aligned_malloc */ inline void ei_handmade_aligned_free(void *ptr) { - if(ptr) - std::free(*(reinterpret_cast(ptr) - 1)); + if (ptr) std::free(*(reinterpret_cast(ptr) - 1)); } -inline void* ei_handmade_aligned_realloc(void* ptr, size_t size) +/** \internal + * \brief Reallocates aligned memory. + * Since we know that our handmade version is based on std::realloc + * we can use std::realloc to implement efficient reallocation. + */ +inline void* ei_handmade_aligned_realloc(void* ptr, size_t size, size_t) { - // 0. Handle corner cases according to the standard - if (ptr!=0 && size==0) - { - ei_handmade_aligned_free(ptr); - return NULL; - } - - if (ptr==0) return ei_handmade_aligned_malloc(size); - - // 1. compute the original base address - // 2. compute the new reallocated address - // 3. compute the aligned address and store the original one - void *base = *(reinterpret_cast(ptr) - 1); - void *original = std::realloc(base, size+16); + if (ptr == 0) return ei_handmade_aligned_malloc(size); + void *original = *(reinterpret_cast(ptr) - 1); + original = std::realloc(ptr,size+16); + if (original == 0) return 0; void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(size_t(15))) + 16); *(reinterpret_cast(aligned) - 1) = original; return aligned; } -#if EIGEN_HAS_MM_MALLOC -inline void* ei_mm_realloc(void *ptr, size_t size, size_t old_size) +/** \internal + * \brief Reallocates aligned memory. + * Allows reallocation with aligned ptr types. This implementation will + * always create a new memory chunk and copy the old data. 
+ */ +inline void* ei_generic_aligned_realloc(void* ptr, size_t size, size_t old_size) { - // 0. Check if size==0 and act according to the standard, which says that - // for size==0, the object pointer (i.e. ptr) should be freed. + if (ptr==0) + return ei_aligned_malloc(size); + if (size==0) { - _mm_free(ptr); + ei_aligned_free(ptr); return 0; } - // 1. Allocate new memory - void* newptr = _mm_malloc(size,16); - - // 2. Verify the allocation success + void* newptr = ei_aligned_malloc(size); if (newptr == 0) { - /*errno = ENOMEM;*/ // according to the standard we should set errno = ENOMEM - return 0; + errno = ENOMEM; // according to the standard + return 0; } - // 3. Copy the overlapping data and free the old data if (ptr != 0) { std::memcpy(newptr, ptr, std::min(size,old_size)); - _mm_free(ptr); + ei_aligned_free(ptr); } return newptr; } -#endif // EIGEN_HAS_MM_MALLOC -#if EIGEN_HAS_POSIX_MEMALIGN -inline void* ei_posix_memalign_realloc(void *ptr, size_t size, size_t old_size) -{ - // 0. Check if size==0 and act according to the standard, which says that - // for size==0, the object pointer (i.e. ptr) should be freed. - if (size==0) - { - free(ptr); - return 0; - } +/* --- Eigen internal implementations of aligned malloc/free and realloc --- */ - // 1. Allocate new memory and verify the allocation success - void *newptr; - if(posix_memalign(&newptr, 16, size)) - { - /*errno = ENOMEM;*/ // according to the standard we should set errno = ENOMEM - return 0; - } - - // 2. Copy the overlapping data and free the old data - if (ptr != 0) - { - std::memcpy(newptr, ptr, std::min(size,old_size)); - free(ptr); - } - - return newptr; -} -#endif // EIGEN_HAS_POSIX_MEMALIGN - -/** \internal allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment. +/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment. 
* On allocation error, the returned pointer is null, and if exceptions are enabled then a std::bad_alloc is thrown. */ inline void* ei_aligned_malloc(size_t size) @@ -189,7 +163,65 @@ inline void* ei_aligned_malloc(size_t size) return result; } -/** allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned. +/** \internal Frees memory allocated with ei_aligned_malloc. */ +inline void ei_aligned_free(void *ptr) +{ + #if !EIGEN_ALIGN + std::free(ptr); + #elif EIGEN_MALLOC_ALREADY_ALIGNED + std::free(ptr); + #elif EIGEN_HAS_POSIX_MEMALIGN + std::free(ptr); + #elif EIGEN_HAS_MM_MALLOC + _mm_free(ptr); + #elif defined(_MSC_VER) + _aligned_free(ptr); + #else + ei_handmade_aligned_free(ptr); + #endif +} + +/** +* \internal +* \brief Reallocates an aligned block of memory. +* \throws std::bad_alloc if EIGEN_EXCEPTIONS are defined. +**/ +inline void* ei_aligned_realloc(void *ptr, size_t new_size, size_t old_size) +{ + (void)old_size; // Suppress 'unused variable' warning. Seen in boost tee. + + void *result; +#if !EIGEN_ALIGN + result = std::realloc(ptr,new_size); +#elif EIGEN_MALLOC_ALREADY_ALIGNED + result = std::realloc(ptr,new_size); +#elif EIGEN_HAS_POSIX_MEMALIGN + result = ei_generic_aligned_realloc(ptr,new_size,old_size); +#elif EIGEN_HAS_MM_MALLOC + // The defined(_mm_free) is just here to verify that this MSVC version + // implements _mm_malloc/_mm_free based on the corresponding _aligned_ + // functions. This may not always be the case and we just try to be safe. 
+#if defined(_MSC_VER) && defined(_mm_free) + result = _aligned_realloc(ptr,new_size,16); +#else + result = ei_generic_aligned_realloc(ptr,new_size,old_size); +#endif +#elif defined(_MSC_VER) + result = _aligned_realloc(ptr,new_size,16); +#else + result = ei_handmade_aligned_realloc(ptr,new_size,old_size); +#endif + +#ifdef EIGEN_EXCEPTIONS + if (result==0 && new_size!=0) + throw std::bad_alloc(); +#endif + return result; +} + +/* ---- Conditional implementations of aligned malloc/free and realloc ---- */ + +/** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned. * On allocation error, the returned pointer is null, and if exceptions are enabled then a std::bad_alloc is thrown. */ template inline void* ei_conditional_aligned_malloc(size_t size) @@ -210,7 +242,30 @@ template<> inline void* ei_conditional_aligned_malloc(size_t size) return result; } -/** \internal construct the elements of an array. +/** \internal Frees memory allocated with ei_conditional_aligned_malloc */ +template inline void ei_conditional_aligned_free(void *ptr) +{ + ei_aligned_free(ptr); +} + +template<> inline void ei_conditional_aligned_free(void *ptr) +{ + std::free(ptr); +} + +template inline void* ei_conditional_aligned_realloc(void* ptr, size_t new_size, size_t old_size) +{ + return ei_aligned_realloc(ptr, new_size, old_size); +} + +template<> inline void* ei_conditional_aligned_realloc(void* ptr, size_t new_size, size_t) +{ + return std::realloc(ptr, new_size); +} + +/* ---------- Eigen internal memory management of array elements --------- */ + +/** \internal Constructs the elements of an array. * The \a size parameter tells on how many objects to call the constructor of T. */ template inline T* ei_construct_elements_of_array(T *ptr, size_t size) @@ -219,7 +274,18 @@ template inline T* ei_construct_elements_of_array(T *ptr, size_t siz return ptr; } -/** allocates \a size objects of type T. 
The returned pointer is guaranteed to have 16 bytes alignment. +/** \internal Destructs the elements of an array. + * The \a size parameters tells on how many objects to call the destructor of T. + */ +template inline void ei_destruct_elements_of_array(T *ptr, size_t size) +{ + // always destruct an array starting from the end. + while(size) ptr[--size].~T(); +} + +/* -- Memory management of arrays (allocation & in-place creation of elements) -- */ + +/** \internal Allocates \a size objects of type T. The returned pointer is guaranteed to have 16 bytes alignment. * On allocation error, the returned pointer is undefined, but if exceptions are enabled then a std::bad_alloc is thrown. * The default constructor of T is called. */ @@ -235,75 +301,22 @@ template inline T* ei_conditional_aligned_new(size_t siz return ei_construct_elements_of_array(result, size); } -/** \internal free memory allocated with ei_aligned_malloc +/** \internal Deletes objects constructed with ei_aligned_new + * The \a size parameters tells on how many objects to call the destructor of T. */ -inline void ei_aligned_free(void *ptr) -{ - #if !EIGEN_ALIGN - free(ptr); - #elif EIGEN_MALLOC_ALREADY_ALIGNED - free(ptr); - #elif EIGEN_HAS_POSIX_MEMALIGN - free(ptr); - #elif EIGEN_HAS_MM_MALLOC - _mm_free(ptr); - #elif defined(_MSC_VER) - _aligned_free(ptr); - #else - ei_handmade_aligned_free(ptr); - #endif -} - -/** \internal free memory allocated with ei_conditional_aligned_malloc - */ -template inline void ei_conditional_aligned_free(void *ptr) +template inline void ei_aligned_delete(T *ptr, size_t size) { + ei_destruct_elements_of_array(ptr, size); ei_aligned_free(ptr); } -template<> inline void ei_conditional_aligned_free(void *ptr) +/** \internal Deletes objects constructed with ei_conditional_aligned_new + * The \a size parameters tells on how many objects to call the destructor of T. 
+ */ +template inline void ei_conditional_aligned_delete(T *ptr, size_t size) { - std::free(ptr); -} - -inline void* ei_aligned_realloc(void *ptr, size_t new_size, size_t old_size) -{ - (void)old_size; // Suppress 'unused variable' warning. Seen in boost tee. - - void *result; -#if !EIGEN_ALIGN - result = realloc(ptr,new_size); -#elif EIGEN_MALLOC_ALREADY_ALIGNED - result = realloc(ptr,new_size); -#elif EIGEN_HAS_POSIX_MEMALIGN - result = ei_posix_memalign_realloc(ptr,new_size,old_size); -#elif EIGEN_HAS_MM_MALLOC - #if defined(_MSC_VER) && defined(_mm_free) - result = _aligned_realloc(ptr,new_size,16); - #else - result = ei_mm_realloc(ptr,new_size,old_size); - #endif -#elif defined(_MSC_VER) - result = _aligned_realloc(ptr,new_size,16); -#else - result = ei_handmade_aligned_realloc(ptr,new_size); -#endif - -#ifdef EIGEN_EXCEPTIONS - if (result==0 && new_size!=0) - throw std::bad_alloc(); -#endif - return result; -} - -template inline void* ei_conditional_aligned_realloc(void* ptr, size_t new_size, size_t old_size) -{ - return ei_aligned_realloc(ptr, new_size, old_size); -} - -template<> inline void* ei_conditional_aligned_realloc(void* ptr, size_t new_size, size_t) -{ - return std::realloc(ptr, new_size); + ei_destruct_elements_of_array(ptr, size); + ei_conditional_aligned_free(ptr); } template inline T* ei_conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size) @@ -314,34 +327,8 @@ template inline T* ei_conditional_aligned_realloc_new(T* return result; } -/** \internal destruct the elements of an array. - * The \a size parameters tells on how many objects to call the destructor of T. - */ -template inline void ei_destruct_elements_of_array(T *ptr, size_t size) -{ - // always destruct an array starting from the end. - while(size) ptr[--size].~T(); -} -/** \internal delete objects constructed with ei_aligned_new - * The \a size parameters tells on how many objects to call the destructor of T. 
- */ -template inline void ei_aligned_delete(T *ptr, size_t size) -{ - ei_destruct_elements_of_array(ptr, size); - ei_aligned_free(ptr); -} - -/** \internal delete objects constructed with ei_conditional_aligned_new - * The \a size parameters tells on how many objects to call the destructor of T. - */ -template inline void ei_conditional_aligned_delete(T *ptr, size_t size) -{ - ei_destruct_elements_of_array(ptr, size); - ei_conditional_aligned_free(ptr); -} - -/** \internal \returns the index of the first element of the array that is well aligned for vectorization. +/** \internal Returns the index of the first element of the array that is well aligned for vectorization. * * \param array the address of the start of the array * \param size the size of the array @@ -385,11 +372,11 @@ inline static Integer ei_first_aligned(const Scalar* array, Integer size) } /** \internal - * ei_aligned_stack_alloc(SIZE) allocates an aligned buffer of SIZE bytes - * on the stack if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and - * if stack allocation is supported by the platform (currently, this is linux only). - * Otherwise the memory is allocated on the heap. - * Data allocated with ei_aligned_stack_alloc \b must be freed by calling ei_aligned_stack_free(PTR,SIZE). + * Allocates an aligned buffer of SIZE bytes on the stack if SIZE is smaller than + * EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform + * (currently, this is Linux only). Otherwise the memory is allocated on the heap. + * Data allocated with ei_aligned_stack_alloc \b must be freed by calling + * ei_aligned_stack_free(PTR,SIZE). * \code * float * data = ei_aligned_stack_alloc(float,array.size()); * // ... 
@@ -457,7 +444,7 @@ inline static Integer ei_first_aligned(const Scalar* array, Integer size) /** \class aligned_allocator * -* \brief stl compatible allocator to use with with 16 byte aligned types +* \brief STL compatible allocator to use with with 16 byte aligned types * * Example: * \code From a480e7e60fa00fb4ee9bccd1277808c339243604 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sun, 28 Feb 2010 09:10:41 -0500 Subject: [PATCH 059/122] * fix ei_handmade_aligned_realloc (was calling realloc on wrong ptr) * add missing std:: (at least for QNX compatibility) * add big comments to "structure" the file --- Eigen/src/Core/util/Memory.h | 64 ++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 1e9d31624..98f88d171 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -25,6 +25,11 @@ // License and a copy of the GNU General Public License along with // Eigen. If not, see . + +/***************************************************************************** +*** Platform checks for aligned malloc functions *** +*****************************************************************************/ + #ifndef EIGEN_MEMORY_H #define EIGEN_MEMORY_H @@ -56,11 +61,9 @@ #define EIGEN_HAS_MM_MALLOC 0 #endif - -// Forward declarations required for the implementation -// of ei_handmade_aligned_realloc. 
-void* ei_aligned_malloc(size_t size); -void ei_aligned_free(void *ptr); +/***************************************************************************** +*** Implementation of handmade aligned functions *** +*****************************************************************************/ /* ----- Hand made implementations of aligned malloc/free and realloc ----- */ @@ -87,17 +90,24 @@ inline void ei_handmade_aligned_free(void *ptr) * Since we know that our handmade version is based on std::realloc * we can use std::realloc to implement efficient reallocation. */ -inline void* ei_handmade_aligned_realloc(void* ptr, size_t size, size_t) +inline void* ei_handmade_aligned_realloc(void* ptr, size_t size, size_t = 0) { if (ptr == 0) return ei_handmade_aligned_malloc(size); void *original = *(reinterpret_cast(ptr) - 1); - original = std::realloc(ptr,size+16); + original = std::realloc(original,size+16); if (original == 0) return 0; void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(size_t(15))) + 16); *(reinterpret_cast(aligned) - 1) = original; return aligned; } +/***************************************************************************** +*** Implementation of generic aligned realloc (when no realloc can be used)*** +*****************************************************************************/ + +void* ei_aligned_malloc(size_t size); +void ei_aligned_free(void *ptr); + /** \internal * \brief Reallocates aligned memory. * Allows reallocation with aligned ptr types. This implementation will @@ -130,7 +140,9 @@ inline void* ei_generic_aligned_realloc(void* ptr, size_t size, size_t old_size) return newptr; } -/* --- Eigen internal implementations of aligned malloc/free and realloc --- */ +/***************************************************************************** +*** Implementation of portable aligned versions of malloc/free/realloc *** +*****************************************************************************/ /** \internal Allocates \a size bytes. 
The returned pointer is guaranteed to have 16 bytes alignment. * On allocation error, the returned pointer is null, and if exceptions are enabled then a std::bad_alloc is thrown. @@ -143,9 +155,9 @@ inline void* ei_aligned_malloc(size_t size) void *result; #if !EIGEN_ALIGN - result = malloc(size); + result = std::malloc(size); #elif EIGEN_MALLOC_ALREADY_ALIGNED - result = malloc(size); + result = std::malloc(size); #elif EIGEN_HAS_POSIX_MEMALIGN if(posix_memalign(&result, 16, size)) result = 0; #elif EIGEN_HAS_MM_MALLOC @@ -201,11 +213,11 @@ inline void* ei_aligned_realloc(void *ptr, size_t new_size, size_t old_size) // The defined(_mm_free) is just here to verify that this MSVC version // implements _mm_malloc/_mm_free based on the corresponding _aligned_ // functions. This may not always be the case and we just try to be safe. -#if defined(_MSC_VER) && defined(_mm_free) - result = _aligned_realloc(ptr,new_size,16); -#else - result = ei_generic_aligned_realloc(ptr,new_size,old_size); -#endif + #if defined(_MSC_VER) && defined(_mm_free) + result = _aligned_realloc(ptr,new_size,16); + #else + result = ei_generic_aligned_realloc(ptr,new_size,old_size); + #endif #elif defined(_MSC_VER) result = _aligned_realloc(ptr,new_size,16); #else @@ -219,7 +231,9 @@ inline void* ei_aligned_realloc(void *ptr, size_t new_size, size_t old_size) return result; } -/* ---- Conditional implementations of aligned malloc/free and realloc ---- */ +/***************************************************************************** +*** Implementation of conditionally aligned functions *** +*****************************************************************************/ /** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned. * On allocation error, the returned pointer is null, and if exceptions are enabled then a std::bad_alloc is thrown. 
@@ -263,7 +277,9 @@ template<> inline void* ei_conditional_aligned_realloc(void* ptr, size_t return std::realloc(ptr, new_size); } -/* ---------- Eigen internal memory management of array elements --------- */ +/***************************************************************************** +*** Construction/destruction of array elements *** +*****************************************************************************/ /** \internal Constructs the elements of an array. * The \a size parameter tells on how many objects to call the constructor of T. @@ -283,7 +299,9 @@ template inline void ei_destruct_elements_of_array(T *ptr, size_t si while(size) ptr[--size].~T(); } -/* -- Memory management of arrays (allocation & in-place creation of elements) -- */ +/***************************************************************************** +*** Implementation of aligned new/delete-like functions *** +*****************************************************************************/ /** \internal Allocates \a size objects of type T. The returned pointer is guaranteed to have 16 bytes alignment. * On allocation error, the returned pointer is undefined, but if exceptions are enabled then a std::bad_alloc is thrown. @@ -327,6 +345,7 @@ template inline T* ei_conditional_aligned_realloc_new(T* return result; } +/****************************************************************************/ /** \internal Returns the index of the first element of the array that is well aligned for vectorization. 
* @@ -371,6 +390,10 @@ inline static Integer ei_first_aligned(const Scalar* array, Integer size) } } +/***************************************************************************** +*** Implementation of runtime stack allocation (falling back to malloc) *** +*****************************************************************************/ + /** \internal * Allocates an aligned buffer of SIZE bytes on the stack if SIZE is smaller than * EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform @@ -398,6 +421,10 @@ inline static Integer ei_first_aligned(const Scalar* array, Integer size) ei_aligned_stack_free(PTR,sizeof(TYPE)*SIZE);} while(0) +/***************************************************************************** +*** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF] *** +*****************************************************************************/ + #if EIGEN_ALIGN #ifdef EIGEN_EXCEPTIONS #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ @@ -441,6 +468,7 @@ inline static Integer ei_first_aligned(const Scalar* array, Integer size) #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%16==0)) +/****************************************************************************/ /** \class aligned_allocator * From 9334ed444483825bb784135b94b20a483e838292 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sun, 28 Feb 2010 10:10:53 -0500 Subject: [PATCH 060/122] on 64-bit systems, glibc's malloc returns 16-byte aligned ptrs, and we now take advantage of that. 
--- Eigen/src/Core/util/Memory.h | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 98f88d171..db4a79b8a 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -33,23 +33,41 @@ #ifndef EIGEN_MEMORY_H #define EIGEN_MEMORY_H -// FreeBSD 6 seems to have 16-byte aligned malloc -// See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup -// FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures -// See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup -#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) -#define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1 +// On 64-bit systems, glibc's malloc returns 16-byte-aligned pointers, see: +// http://www.gnu.org/s/libc/manual/html_node/Aligned-Memory-Blocks.html +// This is true at least since glibc 2.8. +// This leaves the question how to detect 64-bit. According to this document, +// http://gcc.fyxm.net/summit/2003/Porting%20to%2064%20bit.pdf +// page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed +// quite safe, at least within the context of glibc, to equate 64-bit with LP64. 
+#if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \ + && __LP64__ + #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1 #else -#define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0 + #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0 #endif -#if defined(__APPLE__) || defined(_WIN64) || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED +// FreeBSD 6 seems to have 16-byte aligned malloc +// See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup +// FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures +// See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup +#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) + #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1 +#else + #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0 +#endif + +#if defined(__APPLE__) \ + || defined(_WIN64) \ + || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \ + || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED #define EIGEN_MALLOC_ALREADY_ALIGNED 1 #else #define EIGEN_MALLOC_ALREADY_ALIGNED 0 #endif -#if ((defined _GNU_SOURCE) || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) && (defined _POSIX_ADVISORY_INFO) && (_POSIX_ADVISORY_INFO > 0) +#if ((defined _GNU_SOURCE) || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) \ + && (defined _POSIX_ADVISORY_INFO) && (_POSIX_ADVISORY_INFO > 0) #define EIGEN_HAS_POSIX_MEMALIGN 1 #else #define EIGEN_HAS_POSIX_MEMALIGN 0 From 07023b94d89d06bb9eb6930902e70bc2f13f9fcf Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sun, 28 Feb 2010 10:11:28 -0500 Subject: [PATCH 061/122] forgot defined(...) --- Eigen/src/Core/util/Memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index db4a79b8a..fbb1ef4d6 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -41,7 +41,7 @@ // page 114, "[The] LP64 model [...] 
is used by all 64-bit UNIX ports" so it's indeed // quite safe, at least within the context of glibc, to equate 64-bit with LP64. #if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \ - && __LP64__ + && defined(__LP64__) #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1 #else #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0 From f1f3c30ddc0e957a0165ae197d6c61b0ee9f5cf2 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sun, 28 Feb 2010 11:10:13 -0500 Subject: [PATCH 062/122] remove the hack to make the static assertion on types actually show up. indeed, now that we use the meta selector for transposing as needed, the static asserts work very well. --- Eigen/src/Core/Dot.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index 9acc98eba..fbdc67bd3 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -29,7 +29,6 @@ // with mismatched types, the compiler emits errors about failing to instantiate cwiseProduct BEFORE // looking at the static assertions. Thus this is a trick to get better compile errors. template::ret, // the NeedToTranspose condition here is taken straight from Assign.h bool NeedToTranspose = T::IsVectorAtCompileTime && U::IsVectorAtCompileTime @@ -47,7 +46,7 @@ struct ei_dot_nocheck }; template -struct ei_dot_nocheck +struct ei_dot_nocheck { static inline typename ei_traits::Scalar run(const MatrixBase& a, const MatrixBase& b) { @@ -55,15 +54,6 @@ struct ei_dot_nocheck } }; -template -struct ei_dot_nocheck -{ - static inline typename ei_traits::Scalar run(const MatrixBase&, const MatrixBase&) - { - return typename ei_traits::Scalar(0); - } -}; - /** \returns the dot product of *this with other. 
* * \only_for_vectors From aeff3ff391958734b5e1b2411f4ac5fdc30da08b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 1 Mar 2010 10:57:32 +0100 Subject: [PATCH 063/122] make Aron's idea work using Qt's atomic implementation for the synchronisation --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 30 ++++++---- Eigen/src/Core/products/Parallelizer.h | 18 +++--- bench/bench_gemm.cpp | 55 +++++++++++++------ 3 files changed, 67 insertions(+), 36 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index b6123ca8b..418ed720f 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -107,12 +107,14 @@ static void run(int rows, int cols, int depth, // (==GEMM_VAR1) for(int k=0; k rows of B', and cols of the A' @@ -122,7 +124,9 @@ static void run(int rows, int cols, int depth, sgemm_oncopy(actual_kc, info[tid].rhs_length, &rhs(k,info[tid].rhs_start), rhsStride, blockB+info[tid].rhs_start*kc); #endif -#if 0 + // mark that the parts B'_j is uptodate and can be used. 
+ info[tid].sync.fetchAndStoreOrdered(k); + // this is an attempt to implement a smarter strategy as suggested by Aron // the layout is good, but there is no synchronization yet { @@ -138,17 +142,16 @@ static void run(int rows, int cols, int depth, { int j = (tid+shift)%threads; - // TODO here we have to makes sure that thread j is done with packing B'_j + // At this point we have to make sure that B'_j has been updated by the thread j, + // we use testAndSetOrdered to mimic a volatile integer + while(!info[j].sync.testAndSetOrdered(k,k)) {} + sgemm_kernel(actual_mc, info[j].rhs_length, actual_kc, alpha, blockA, blockB+info[j].rhs_start*kc, res+info[j].rhs_start*resStride, resStride); } } // then keep going as usual with the remaining A' for(int i=mc; i void ei_run_parallel_1d(const Functor& func, int size) { @@ -97,6 +90,15 @@ void ei_run_parallel_2d(const Functor& func, int size1, int size2) #endif } +struct GemmParallelInfo +{ + QAtomicInt sync; + QAtomicInt users; + int rhs_start; + int rhs_length; + float* blockB; +}; + template void ei_run_parallel_gemm(const Functor& func, int rows, int cols) { @@ -128,6 +130,8 @@ void ei_run_parallel_gemm(const Functor& func, int rows, int cols) info[i].rhs_start = c0; info[i].rhs_length = actualBlockCols; info[i].blockB = sharedBlockB; + info[i].sync.fetchAndStoreOrdered(-1); + info[i].users.fetchAndStoreOrdered(0); func(r0, actualBlockRows, 0,cols, info); } diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp index c7a3db619..12df7bcbc 100644 --- a/bench/bench_gemm.cpp +++ b/bench/bench_gemm.cpp @@ -2,6 +2,8 @@ // g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2 ./a.out // icpc bench_gemm.cpp -I .. 
-O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out +#include + #include #include @@ -69,10 +71,10 @@ void gemm(const M& a, const M& b, M& c) int main(int argc, char ** argv) { - int rep = 2048; // number of repetitions per try + int rep = 1; // number of repetitions per try int tries = 5; // number of tries, we keep the best - int s = 512; + int s = 2048; int m = s; int n = s; int p = s; @@ -80,31 +82,48 @@ int main(int argc, char ** argv) M b(n,p); b.setRandom(); M c(m,p); c.setOnes(); - BenchTimer t; - M r = c; // check the parallel product is correct - #ifdef HAVE_BLAS - blas_gemm(a,b,r); - #else + #ifdef EIGEN_HAS_OPENMP int procs = omp_get_max_threads(); - omp_set_num_threads(1); - r.noalias() += a * b; - omp_set_num_threads(procs); + if(procs>1) + { + #ifdef HAVE_BLAS + blas_gemm(a,b,r); + #else + omp_set_num_threads(1); + r.noalias() += a * b; + omp_set_num_threads(procs); + #endif + c.noalias() += a * b; + if(!r.isApprox(c)) std::cerr << "Warning, your parallel product is crap!\n\n"; + } #endif - c.noalias() += a * b; - if(!r.isApprox(c)) std::cerr << "Warning, your parallel product is crap!\n\n"; #ifdef HAVE_BLAS - BENCH(t, tries, rep, blas_gemm(a,b,c)); - std::cerr << "blas cpu " << t.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n"; - std::cerr << "blas real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n"; + BenchTimer tblas; + BENCH(tblas, tries, rep, blas_gemm(a,b,c)); + std::cout << "blas cpu " << tblas.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tblas.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tblas.total(CPU_TIMER) << "s)\n"; + std::cout << "blas real " << tblas.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tblas.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tblas.total(REAL_TIMER) << "s)\n"; #endif - BENCH(t, tries, rep, gemm(a,b,c)); - std::cerr << "eigen cpu " << 
t.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER) << "s)\n"; - std::cerr << "eigen real " << t.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/t.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER) << "s)\n"; + BenchTimer tmt; + BENCH(tmt, tries, rep, gemm(a,b,c)); + std::cout << "eigen cpu " << tmt.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(CPU_TIMER) << "s)\n"; + std::cout << "eigen real " << tmt.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n"; + + #ifdef EIGEN_HAS_OPENMP + if(procs>1) + { + BenchTimer tmono; + omp_set_num_threads(1); + BENCH(tmono, tries, rep, gemm(a,b,c)); + std::cout << "eigen mono cpu " << tmono.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmono.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmono.total(CPU_TIMER) << "s)\n"; + std::cout << "eigen mono real " << tmono.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmono.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmono.total(REAL_TIMER) << "s)\n"; + std::cout << "mt speed up x" << tmono.best(CPU_TIMER) / tmt.best(REAL_TIMER) << " => " << (100.0*tmono.best(CPU_TIMER) / tmt.best(REAL_TIMER))/procs << "%\n"; + } + #endif return 0; } From 31aa17e4efafa25a5f9e27a3ba02b5ca030ad3f5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 1 Mar 2010 11:10:30 +0100 Subject: [PATCH 064/122] GEMM: move the first packing of A' before the packing of B' --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 51 ++++++++++--------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 418ed720f..da700f8b7 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -107,50 +107,55 @@ static void run(int 
rows, int cols, int depth, // (==GEMM_VAR1) for(int k=0; k rows of B', and cols of the A' + + // In order to reduce the chance that a thread has to wait for the other, + // let's start by packing A'. + #ifndef USEGOTOROUTINES + pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc); + #else + sgemm_itcopy(actual_kc, mc, &lhs(0,k), lhsStride, blockA); + #endif + + + // Pack B_k to B' in parallel fashion: // each thread packs the sub block B_k,j to B'_j where j is the thread id. - // Before copying to B'_j, we have to make sure that no other thread is still using it, + // However, before copying to B'_j, we have to make sure that no other thread is still using it, // i.e., we test that info[tid].users equals 0. // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. while(!info[tid].users.testAndSetOrdered(0,threads)) {} - const int actual_kc = std::min(k+kc,depth)-k; // => rows of B', and cols of the A' - #ifndef USEGOTOROUTINES pack_rhs(blockB+info[tid].rhs_start*kc, &rhs(k,info[tid].rhs_start), rhsStride, alpha, actual_kc, info[tid].rhs_length); #else sgemm_oncopy(actual_kc, info[tid].rhs_length, &rhs(k,info[tid].rhs_start), rhsStride, blockB+info[tid].rhs_start*kc); #endif - // mark that the parts B'_j is uptodate and can be used. + // Notify the other threads that the part B'_j is ready to go. 
info[tid].sync.fetchAndStoreOrdered(k); - // this is an attempt to implement a smarter strategy as suggested by Aron - // the layout is good, but there is no synchronization yet + // Computes C_i += A' * B' per B'_j + for(int shift=0; shift0) while(!info[j].sync.testAndSetOrdered(k,k)) {} - sgemm_kernel(actual_mc, info[j].rhs_length, actual_kc, alpha, blockA, blockB+info[j].rhs_start*kc, res+info[j].rhs_start*resStride, resStride); - } + #ifndef USEGOTOROUTINES + gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*kc, mc, actual_kc, info[j].rhs_length, -1,-1,0,0, w); + #else + sgemm_kernel(mc, info[j].rhs_length, actual_kc, alpha, blockA, blockB+info[j].rhs_start*kc, res+info[j].rhs_start*resStride, resStride); + #endif + } - // then keep going as usual with the remaining A' + // Then keep going as usual with the remaining A' for(int i=mc; i Date: Mon, 1 Mar 2010 12:05:57 +0000 Subject: [PATCH 065/122] Make MatrixFunctions tests more robust. * Use absolute error instead of relative error. * Test on well-conditioned matrices. * Do not repeat the same test g_repeat times (bug fix). * Correct diagnostic output in matrix_exponential.cpp . 
--- unsupported/test/matrix_exponential.cpp | 2 +- unsupported/test/matrix_function.cpp | 72 ++++++++++++++----------- 2 files changed, 42 insertions(+), 32 deletions(-) diff --git a/unsupported/test/matrix_exponential.cpp b/unsupported/test/matrix_exponential.cpp index 86e942edb..61f30334d 100644 --- a/unsupported/test/matrix_exponential.cpp +++ b/unsupported/test/matrix_exponential.cpp @@ -133,7 +133,7 @@ void randomTest(const MatrixType& m, double tol) m1 = MatrixType::Random(rows, cols); m2 = ei_matrix_function(m1, expfn) * ei_matrix_function(-m1, expfn); - std::cout << "randomTest: error funm = " << relerr(identity, m2 * m3); + std::cout << "randomTest: error funm = " << relerr(identity, m2); VERIFY(identity.isApprox(m2, static_cast(tol))); m2 = ei_matrix_exponential(m1) * ei_matrix_exponential(-m1); diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp index 7a1501da2..e40af7b4e 100644 --- a/unsupported/test/matrix_function.cpp +++ b/unsupported/test/matrix_function.cpp @@ -25,6 +25,17 @@ #include "main.h" #include +// Variant of VERIFY_IS_APPROX which uses absolute error instead of +// relative error. +#define VERIFY_IS_APPROX_ABS(a, b) VERIFY(test_isApprox_abs(a, b)) + +template +inline bool test_isApprox_abs(const Type1& a, const Type2& b) +{ + return ((a-b).array().abs() < test_precision()).all(); +} + + // Returns a matrix with eigenvalues clustered around 0, 1 and 2. 
template MatrixType randomMatrixWithRealEivals(const int size) @@ -37,7 +48,8 @@ MatrixType randomMatrixWithRealEivals(const int size) + ei_random() * Scalar(RealScalar(0.01)); } MatrixType A = MatrixType::Random(size, size); - return A.inverse() * diag * A; + HouseholderQR QRofA(A); + return QRofA.householderQ().inverse() * diag * QRofA.householderQ(); } template ::Scalar>::IsComplex> @@ -69,7 +81,8 @@ struct randomMatrixWithImagEivals } } MatrixType A = MatrixType::Random(size, size); - return A.inverse() * diag * A; + HouseholderQR QRofA(A); + return QRofA.householderQ().inverse() * diag * QRofA.householderQ(); } }; @@ -88,10 +101,12 @@ struct randomMatrixWithImagEivals + ei_random() * Scalar(RealScalar(0.01)); } MatrixType A = MatrixType::Random(size, size); - return A.inverse() * diag * A; + HouseholderQR QRofA(A); + return QRofA.householderQ().inverse() * diag * QRofA.householderQ(); } }; + template void testMatrixExponential(const MatrixType& A) { @@ -99,50 +114,45 @@ void testMatrixExponential(const MatrixType& A) typedef typename NumTraits::Real RealScalar; typedef std::complex ComplexScalar; - for (int i = 0; i < g_repeat; i++) { - VERIFY_IS_APPROX(ei_matrix_exponential(A), - ei_matrix_function(A, StdStemFunctions::exp)); - } + VERIFY_IS_APPROX(ei_matrix_exponential(A), + ei_matrix_function(A, StdStemFunctions::exp)); } template void testHyperbolicFunctions(const MatrixType& A) { - for (int i = 0; i < g_repeat; i++) { - MatrixType expA = ei_matrix_exponential(A); - MatrixType expmA = ei_matrix_exponential(-A); - VERIFY_IS_APPROX(ei_matrix_sinh(A), (expA - expmA) / 2); - VERIFY_IS_APPROX(ei_matrix_cosh(A), (expA + expmA) / 2); - } + // Need to use absolute error because of possible cancellation when + // adding/subtracting expA and expmA. 
+ MatrixType expA = ei_matrix_exponential(A); + MatrixType expmA = ei_matrix_exponential(-A); + VERIFY_IS_APPROX_ABS(ei_matrix_sinh(A), (expA - expmA) / 2); + VERIFY_IS_APPROX_ABS(ei_matrix_cosh(A), (expA + expmA) / 2); } template void testGonioFunctions(const MatrixType& A) { - typedef ei_traits Traits; - typedef typename Traits::Scalar Scalar; + typedef typename MatrixType::Scalar Scalar; typedef typename NumTraits::Real RealScalar; typedef std::complex ComplexScalar; - typedef Matrix ComplexMatrix; + typedef Matrix ComplexMatrix; ComplexScalar imagUnit(0,1); ComplexScalar two(2,0); - for (int i = 0; i < g_repeat; i++) { - ComplexMatrix Ac = A.template cast(); - - ComplexMatrix exp_iA = ei_matrix_exponential(imagUnit * Ac); - ComplexMatrix exp_miA = ei_matrix_exponential(-imagUnit * Ac); - - MatrixType sinA = ei_matrix_sin(A); - ComplexMatrix sinAc = sinA.template cast(); - VERIFY_IS_APPROX(sinAc, (exp_iA - exp_miA) / (two*imagUnit)); - - MatrixType cosA = ei_matrix_cos(A); - ComplexMatrix cosAc = cosA.template cast(); - VERIFY_IS_APPROX(cosAc, (exp_iA + exp_miA) / 2); - } + ComplexMatrix Ac = A.template cast(); + + ComplexMatrix exp_iA = ei_matrix_exponential(imagUnit * Ac); + ComplexMatrix exp_miA = ei_matrix_exponential(-imagUnit * Ac); + + MatrixType sinA = ei_matrix_sin(A); + ComplexMatrix sinAc = sinA.template cast(); + VERIFY_IS_APPROX_ABS(sinAc, (exp_iA - exp_miA) / (two*imagUnit)); + + MatrixType cosA = ei_matrix_cos(A); + ComplexMatrix cosAc = cosA.template cast(); + VERIFY_IS_APPROX_ABS(cosAc, (exp_iA + exp_miA) / 2); } template From 1710c07f63aa4be8d3ef11e2b4977ce7fe545948 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 1 Mar 2010 13:09:47 +0100 Subject: [PATCH 066/122] remove Qt's atomic dependency, I don't know what I was doing wrong... 
--- Eigen/src/Core/products/GeneralMatrixMatrix.h | 11 ++++++----- Eigen/src/Core/products/Parallelizer.h | 11 ++++++----- bench/bench_gemm.cpp | 2 -- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index da700f8b7..cf42855eb 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -98,7 +98,6 @@ static void run(int rows, int cols, int depth, // if you have the GOTO blas library you can try our parallelization strategy // using GOTO's optimized routines. -// #define USEGOTOROUTINES #ifdef USEGOTOROUTINES void* u = alloca(4096+sizeW); #endif @@ -125,7 +124,8 @@ static void run(int rows, int cols, int depth, // However, before copying to B'_j, we have to make sure that no other thread is still using it, // i.e., we test that info[tid].users equals 0. // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. - while(!info[tid].users.testAndSetOrdered(0,threads)) {} + while(info[tid].users!=0) {} + info[tid].users += threads; #ifndef USEGOTOROUTINES pack_rhs(blockB+info[tid].rhs_start*kc, &rhs(k,info[tid].rhs_start), rhsStride, alpha, actual_kc, info[tid].rhs_length); @@ -134,7 +134,7 @@ static void run(int rows, int cols, int depth, #endif // Notify the other threads that the part B'_j is ready to go. 
- info[tid].sync.fetchAndStoreOrdered(k); + info[tid].sync = k; // Computes C_i += A' * B' per B'_j for(int shift=0; shift0) - while(!info[j].sync.testAndSetOrdered(k,k)) {} + while(info[j].sync!=k) {} #ifndef USEGOTOROUTINES gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*kc, mc, actual_kc, info[j].rhs_length, -1,-1,0,0, w); @@ -178,7 +178,8 @@ static void run(int rows, int cols, int depth, // Release all the sub blocks B'_j of B' for the current thread, // i.e., we simply decrement the number of users by 1 for(int j=0; j - #include #include From 65eba35f98941a1d5c7ff6f854ed17224ef65b40 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 1 Mar 2010 13:34:44 +0100 Subject: [PATCH 067/122] rm useless omp shared directive --- Eigen/src/Core/products/Parallelizer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 404b8d390..439ce1565 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -121,7 +121,7 @@ void ei_run_parallel_gemm(const Functor& func, int rows, int cols) GemmParallelInfo* info = new GemmParallelInfo[threads]; - #pragma omp parallel for schedule(static,1) shared(info) + #pragma omp parallel for schedule(static,1) for(int i=0; i Date: Mon, 1 Mar 2010 13:46:41 +0000 Subject: [PATCH 068/122] Add (failing) test for computing HouseholderQR of a 1x1 matrix. 
--- test/qr.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/qr.cpp b/test/qr.cpp index f2393c13b..1ce1f46e6 100644 --- a/test/qr.cpp +++ b/test/qr.cpp @@ -117,6 +117,7 @@ void test_qr() CALL_SUBTEST_3(( qr_fixedsize, 2 >() )); CALL_SUBTEST_4(( qr_fixedsize, 4 >() )); CALL_SUBTEST_5(( qr_fixedsize, 7 >() )); + CALL_SUBTEST_11( qr(Matrix()) ); } for(int i = 0; i < g_repeat; i++) { From a7b9250ad04fe02f9c51085164478bc1687577f3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 1 Mar 2010 19:06:07 +0100 Subject: [PATCH 069/122] blas interface: fix compilation, fix GEMM, SYMM, TRMM, and TRSM, i,e., they all pass the blas test suite. More to come --- blas/CMakeLists.txt | 3 +- blas/common.h | 49 ++++++----- blas/level1_impl.h | 16 ++-- blas/level3_impl.h | 204 +++++++++++++++++++++++++++----------------- 4 files changed, 159 insertions(+), 113 deletions(-) diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index a6c330a5c..ee67fe519 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -4,7 +4,8 @@ add_custom_target(blas) set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp) -add_library(eigen_blas SHARED ${EigenBlas_SRCS}) +add_library(eigen_blas ${EigenBlas_SRCS}) +# add_library(eigen_blas SHARED ${EigenBlas_SRCS}) add_dependencies(blas eigen_blas) install(TARGETS eigen_blas diff --git a/blas/common.h b/blas/common.h index e7bfda570..8b9c6ff09 100644 --- a/blas/common.h +++ b/blas/common.h @@ -25,6 +25,8 @@ #ifndef EIGEN_BLAS_COMMON_H #define EIGEN_BLAS_COMMON_H +#include + #ifndef SCALAR #error the token SCALAR must be defined to compile this file #endif @@ -34,13 +36,12 @@ extern "C" { #endif -#include +#include "../bench/btl/libs/C_BLAS/blas.h" #ifdef __cplusplus } #endif - #define NOTR 0 #define TR 1 #define ADJ 2 @@ -75,27 +76,6 @@ extern "C" #include using namespace Eigen; -template -Block >, Dynamic, Dynamic> -matrix(T* data, int rows, int cols, int stride) -{ - return Map >(data, stride, 
cols).block(0,0,rows,cols); -} - -template -Block >, Dynamic, 1> -vector(T* data, int size, int incr) -{ - return Map >(data, size, incr).col(0); -} - -template -Map > -vector(T* data, int size) -{ - return Map >(data, size); -} - typedef SCALAR Scalar; typedef NumTraits::Real RealScalar; typedef std::complex Complex; @@ -106,10 +86,29 @@ enum Conj = IsComplex }; -typedef Block >, Dynamic, Dynamic> MatrixType; -typedef Block >, Dynamic, 1> StridedVectorType; +typedef Map, 0, OuterStride > MatrixType; +typedef Map, 0, InnerStride > StridedVectorType; typedef Map > CompactVectorType; +template +Map, 0, OuterStride > +matrix(T* data, int rows, int cols, int stride) +{ + return Map, 0, OuterStride >(data, rows, cols, OuterStride(stride)); +} + +template +Map, 0, InnerStride > vector(T* data, int size, int incr) +{ + return Map, 0, InnerStride >(data, size, InnerStride(incr)); +} + +template +Map > vector(T* data, int size) +{ + return Map >(data, size); +} + #define EIGEN_BLAS_FUNC(X) EIGEN_CAT(SCALAR_SUFFIX,X##_) #endif // EIGEN_BLAS_COMMON_H diff --git a/blas/level1_impl.h b/blas/level1_impl.h index c508626db..5326c6917 100644 --- a/blas/level1_impl.h +++ b/blas/level1_impl.h @@ -45,9 +45,9 @@ RealScalar EIGEN_BLAS_FUNC(asum)(int *n, RealScalar *px, int *incx) int size = IsComplex ? 
2* *n : *n; if(*incx==1) - return vector(px,size).cwise().abs().sum(); + return vector(px,size).cwiseAbs().sum(); else - return vector(px,size,*incx).cwise().abs().sum(); + return vector(px,size,*incx).cwiseAbs().sum(); return 1; } @@ -71,9 +71,9 @@ Scalar EIGEN_BLAS_FUNC(dot)(int *n, RealScalar *px, int *incx, RealScalar *py, i Scalar* y = reinterpret_cast(py); if(*incx==1 && *incy==1) - return (vector(x,*n).cwise()*vector(y,*n)).sum(); + return (vector(x,*n).cwiseProduct(vector(y,*n))).sum(); - return (vector(x,*n,*incx).cwise()*vector(y,*n,*incy)).sum(); + return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum(); } /* @@ -114,9 +114,9 @@ Scalar EIGEN_BLAS_FUNC(dotu)(int *n, RealScalar *px, int *incx, RealScalar *py, Scalar* y = reinterpret_cast(py); if(*incx==1 && *incy==1) - return (vector(x,*n).cwise()*vector(y,*n)).sum(); + return (vector(x,*n).cwiseProduct(vector(y,*n))).sum(); - return (vector(x,*n,*incx).cwise()*vector(y,*n,*incy)).sum(); + return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum(); } #endif // ISCOMPLEX @@ -215,9 +215,9 @@ RealScalar EIGEN_BLAS_FUNC(casum)(int *n, RealScalar *px, int *incx) Complex* x = reinterpret_cast(px); if(*incx==1) - return vector(x,*n).cwise().abs().sum(); + return vector(x,*n).cwiseAbs().sum(); else - return vector(x,*n,*incx).cwise().abs().sum(); + return vector(x,*n,*incx).cwiseAbs().sum(); return 1; } diff --git a/blas/level3_impl.h b/blas/level3_impl.h index d44de1b5d..76497ec26 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -26,8 +26,9 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc) { +// std::cerr << "in gemm " << *opa << " " << *opb << " " << *m << " " << *n << " " << *k << " " << *lda << " " << *ldb << " " << *ldc << " " << *palpha << " " << *pbeta << "\n"; typedef void (*functype)(int, int, int, const Scalar *, int, const Scalar *, 
int, Scalar *, int, Scalar); - functype func[12]; + static functype func[12]; static bool init = false; if(!init) @@ -52,21 +53,29 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal Scalar alpha = *reinterpret_cast(palpha); Scalar beta = *reinterpret_cast(pbeta); - if(beta!=Scalar(1)) - matrix(c, *m, *n, *ldc) *= beta; - int code = OP(*opa) | (OP(*opb) << 2); - if(code>=12 || func[code]==0) + if(code>=12 || func[code]==0 || (*m<0) || (*n<0) || (*k<0)) + { + int info = 1; + xerbla_("GEMM", &info, 4); return 0; + } + + if(beta!=Scalar(1)) + if(beta==Scalar(0)) + matrix(c, *m, *n, *ldc).setZero(); + else + matrix(c, *m, *n, *ldc) *= beta; func[code](*m, *n, *k, a, *lda, b, *ldb, c, *ldc, alpha); - return 1; + return 0; } int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb) { +// std::cerr << "in trsm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << "," << *n << " " << *palpha << " " << *lda << " " << *ldb<< "\n"; typedef void (*functype)(int, int, const Scalar *, int, Scalar *, int); - functype func[32]; + static functype func[32]; static bool init = false; if(!init) @@ -74,38 +83,38 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, for(int k=0; k<32; ++k) func[k] = 0; - func[NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); - func[TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); - func[ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); + func[NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); + func[TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); + func[ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); - func[NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = 
(ei_triangular_solve_matrix::run); - func[TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); - func[ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); + func[NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); + func[TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); + func[ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); - func[NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); - func[TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); - func[ADJ | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); + func[NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); + func[TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); + func[ADJ | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); - func[NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); - func[TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); - func[ADJ | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); + func[NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); + func[TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); + func[ADJ | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_triangular_solve_matrix::run); - func[NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); - func[TR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); - func[ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); + func[NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); + func[TR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = 
(ei_triangular_solve_matrix::run); + func[ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); - func[NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); - func[TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); - func[ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); + func[NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); + func[TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); + func[ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); - func[NOTR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); - func[TR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); - func[ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); + func[NOTR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); + func[TR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); + func[ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); - func[NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); - func[TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); - func[ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); + func[NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); + func[TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); + func[ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (ei_triangular_solve_matrix::run); init = true; } @@ -114,14 +123,23 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, Scalar* b = reinterpret_cast(pb); Scalar alpha = *reinterpret_cast(palpha); - // TODO handle alpha - int code = OP(*opa) | (SIDE(*side) << 2) | 
(UPLO(*uplo) << 3) | (DIAG(*diag) << 4); - if(code>=32 || func[code]==0) + if(code>=32 || func[code]==0 || *m<0 || *n <0) + { + int info=1; + xerbla_("TRSM",&info,4); return 0; + } - func[code](*m, *n, a, *lda, b, *ldb); - return 1; + if(SIDE(*side)==LEFT) + func[code](*m, *n, a, *lda, b, *ldb); + else + func[code](*n, *m, a, *lda, b, *ldb); + + if(alpha!=Scalar(1)) + matrix(b,*m,*n,*ldb) *= alpha; + + return 0; } @@ -129,46 +147,46 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m, // b = alpha*b*op(a) for side = 'R'or'r' int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb) { +// std::cerr << "in trmm " << *side << " " << *uplo << " " << *opa << " " << *diag << " " << *m << " " << *n << " " << *lda << " " << *ldb << " " << *palpha << "\n"; typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int, Scalar *, int, Scalar); - functype func[32]; - + static functype func[32]; static bool init = false; if(!init) { for(int k=0; k<32; ++k) func[k] = 0; - func[NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[NOTR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[TR | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[ADJ | (LEFT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = 
(ei_product_triangular_matrix_matrix::run); + func[NOTR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[TR | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[ADJ | (RIGHT << 2) | (UP << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[ADJ | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[NOTR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[TR | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[ADJ | (LEFT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[ADJ | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[NOTR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[TR | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[ADJ | (RIGHT << 2) | (LO << 3) | (NUNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[TR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[NOTR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[TR | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = 
(ei_product_triangular_matrix_matrix::run); + func[ADJ | (LEFT << 2) | (UP << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[NOTR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[TR | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[ADJ | (RIGHT << 2) | (UP << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[NOTR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[TR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[NOTR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[TR | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[ADJ | (LEFT << 2) | (LO << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); - func[ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[NOTR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[TR | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); + func[ADJ | (RIGHT << 2) | (LO << 3) | (UNIT << 4)] = (ei_product_triangular_matrix_matrix::run); init = true; } @@ -178,10 +196,21 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char 
*uplo, char *opa, char *diag, int *m, Scalar alpha = *reinterpret_cast(palpha); int code = OP(*opa) | (SIDE(*side) << 2) | (UPLO(*uplo) << 3) | (DIAG(*diag) << 4); - if(code>=32 || func[code]==0) + if(code>=32 || func[code]==0 || *m<0 || *n <0) + { + int info=1; + xerbla_("TRMM",&info,4); return 0; + } - func[code](*m, *n, a, *lda, b, *ldb, b, *ldb, alpha); + // FIXME find a way to avoid this copy + Matrix tmp = matrix(b,*m,*n,*ldb); + matrix(b,*m,*n,*ldb).setZero(); + + if(SIDE(*side)==LEFT) + func[code](*m, *n, a, *lda, tmp.data(), tmp.outerStride(), b, *ldb, alpha); + else + func[code](*n, *m, tmp.data(), tmp.outerStride(), a, *lda, b, *ldb, alpha); return 1; } @@ -189,14 +218,26 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, // c = alpha*b*a + beta*c for side = 'R'or'r int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc) { +// std::cerr << "in symm " << *side << " " << *uplo << " " << *m << "x" << *n << " lda:" << *lda << " ldb:" << *ldb << " ldc:" << *ldc << " alpha:" << *palpha << " beta:" << *pbeta << " " +// << pa << " " << pb << " " << pc << "\n"; Scalar* a = reinterpret_cast(pa); Scalar* b = reinterpret_cast(pb); Scalar* c = reinterpret_cast(pc); Scalar alpha = *reinterpret_cast(palpha); Scalar beta = *reinterpret_cast(pbeta); + if(*m<0 || *n<0) + { + int info=1; + xerbla_("SYMM",&info,4); + return 0; + } + if(beta!=Scalar(1)) - matrix(c, *m, *n, *ldc) *= beta; + if(beta==Scalar(0)) + matrix(c, *m, *n, *ldc).setZero(); + else + matrix(c, *m, *n, *ldc) *= beta; if(SIDE(*side)==LEFT) if(UPLO(*uplo)==UP) @@ -215,15 +256,16 @@ int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *pa else return 0; - return 1; + return 0; } // c = alpha*a*a' + beta*c for op = 'N'or'n' // c = alpha*a'*a + beta*c for op = 'T'or't','C'or'c' int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int 
*n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pbeta, RealScalar *pc, int *ldc) { +// std::cerr << "in syrk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << "\n"; typedef void (*functype)(int, int, const Scalar *, int, Scalar *, int, Scalar); - functype func[8]; + static functype func[8]; static bool init = false; if(!init) @@ -231,13 +273,13 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp for(int k=0; k<8; ++k) func[k] = 0; - func[NOTR | (UP << 2)] = (ei_selfadjoint_product::run); - func[TR | (UP << 2)] = (ei_selfadjoint_product::run); - func[ADJ | (UP << 2)] = (ei_selfadjoint_product::run); + func[NOTR | (UP << 2)] = (ei_selfadjoint_product::run); + func[TR | (UP << 2)] = (ei_selfadjoint_product::run); + func[ADJ | (UP << 2)] = (ei_selfadjoint_product::run); - func[NOTR | (LO << 2)] = (ei_selfadjoint_product::run); - func[TR | (LO << 2)] = (ei_selfadjoint_product::run); - func[ADJ | (LO << 2)] = (ei_selfadjoint_product::run); + func[NOTR | (LO << 2)] = (ei_selfadjoint_product::run); + func[TR | (LO << 2)] = (ei_selfadjoint_product::run); + func[ADJ | (LO << 2)] = (ei_selfadjoint_product::run); init = true; } @@ -248,8 +290,12 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp Scalar beta = *reinterpret_cast(pbeta); int code = OP(*op) | (UPLO(*uplo) << 2); - if(code>=8 || func[code]==0) + if(code>=8 || func[code]==0 || *n<0 || *k<0) + { + int info=1; + xerbla_("SYRK",&info,4); return 0; + } if(beta!=Scalar(1)) matrix(c, *n, *n, *ldc) *= beta; @@ -314,7 +360,7 @@ int EIGEN_BLAS_FUNC(hemm)(char *side, char *uplo, int *m, int *n, RealScalar *pa int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pbeta, RealScalar *pc, int *ldc) { typedef void (*functype)(int, int, const Scalar *, int, Scalar *, int, Scalar); - functype func[8]; + static functype func[8]; 
static bool init = false; if(!init) @@ -322,11 +368,11 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, int *k, RealScalar *palp for(int k=0; k<8; ++k) func[k] = 0; - func[NOTR | (UP << 2)] = (ei_selfadjoint_product::run); - func[ADJ | (UP << 2)] = (ei_selfadjoint_product::run); + func[NOTR | (UP << 2)] = (ei_selfadjoint_product::run); + func[ADJ | (UP << 2)] = (ei_selfadjoint_product::run); - func[NOTR | (LO << 2)] = (ei_selfadjoint_product::run); - func[ADJ | (LO << 2)] = (ei_selfadjoint_product::run); + func[NOTR | (LO << 2)] = (ei_selfadjoint_product::run); + func[ADJ | (LO << 2)] = (ei_selfadjoint_product::run); init = true; } From 3cc9e3f5bb22c3f20efc576b7b24302cc604a64d Mon Sep 17 00:00:00 2001 From: Hauke Heibel Date: Mon, 1 Mar 2010 19:56:24 +0100 Subject: [PATCH 070/122] Fixes a compilation issue for MSVC. --- Eigen/src/Core/util/ForwardDeclarations.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index 8451d0ebe..aa01fdab2 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -41,8 +41,10 @@ template class NestByValue; template class ForceAlignedAccess; template class SwapWrapper; template class Minor; +// MSVC will not compile when the expression ei_traits::Flags&DirectAccessBit +// is put into brackets like (ei_traits::Flags&DirectAccessBit)! template::Flags&DirectAccessBit) ? HasDirectAccess : NoDirectAccess> class Block; + int _DirectAccessStatus = ei_traits::Flags&DirectAccessBit ? 
HasDirectAccess : NoDirectAccess> class Block; template class VectorBlock; template class Transpose; template class Conjugate; From c7828ac45c151caad4afdbe6fc26ca989fef4337 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 2 Mar 2010 17:38:40 +0100 Subject: [PATCH 071/122] add missing implementation of uniform scaling products --- Eigen/src/Geometry/Scaling.h | 27 +++++++++++++++++++++++++-- test/geo_transformations.cpp | 11 +++++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Geometry/Scaling.h b/Eigen/src/Geometry/Scaling.h index 2ff8eaba3..27bbec8cd 100644 --- a/Eigen/src/Geometry/Scaling.h +++ b/Eigen/src/Geometry/Scaling.h @@ -72,8 +72,8 @@ public: inline Transform operator* (const Translation& t) const; /** Concatenates a uniform scaling and an affine transformation */ - template - inline Transform operator* (const Transform& t) const; + template + inline Transform operator* (const Transform& t) const; /** Concatenates a uniform scaling and a linear transformation matrix */ // TODO returns an expression @@ -156,4 +156,27 @@ typedef DiagonalMatrix AlignedScaling3f; typedef DiagonalMatrix AlignedScaling3d; //@} +template +template +inline Transform +UniformScaling::operator* (const Translation& t) const +{ + Transform res; + res.matrix().setZero(); + res.linear().diagonal().fill(factor()); + res.translation() = factor() * t.vector(); + res(Dim,Dim) = Scalar(1); + return res; +} + +template +template +inline Transform +UniformScaling::operator* (const Transform& t) const +{ + Transform res = t; + res.prescale(factor()); + return res; +} + #endif // EIGEN_SCALING_H diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp index 895fe0f08..b1a50f6b2 100644 --- a/test/geo_transformations.cpp +++ b/test/geo_transformations.cpp @@ -59,6 +59,7 @@ template void transformations(void) Matrix3 matrot1, m; Scalar a = ei_random(-Scalar(M_PI), Scalar(M_PI)); + Scalar s0 = ei_random(); VERIFY_IS_APPROX(v0, AngleAxisx(a, 
v0.normalized()) * v0); VERIFY_IS_APPROX(-v0, AngleAxisx(Scalar(M_PI), v0.unitOrthogonal()) * v0); @@ -234,6 +235,16 @@ template void transformations(void) t1 = Matrix3(q1) * (AlignedScaling3(v0) * Translation3(v0)); VERIFY_IS_APPROX(t0.matrix(), t1.matrix()); + + t0.setIdentity(); + t0.scale(s0).translate(v0); + t1 = Scaling(s0) * Translation3(v0); + VERIFY_IS_APPROX(t0.matrix(), t1.matrix()); + t0.prescale(s0); + t1 = Scaling(s0) * t1; + VERIFY_IS_APPROX(t0.matrix(), t1.matrix()); + + t0.setIdentity(); t0.prerotate(q1).prescale(v0).pretranslate(v0); // translation * aligned scaling and transformation * mat From 3efb3cc828b70e2b565fd100a2cd736922689136 Mon Sep 17 00:00:00 2001 From: Eamon Nerbonne Date: Tue, 2 Mar 2010 12:08:49 +0100 Subject: [PATCH 072/122] Changed product type selector to fix perf regression. --- Eigen/src/Core/Product.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 236e4f130..5577cb3bc 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -96,7 +96,7 @@ template<> struct ei_product_type_selector template<> struct ei_product_type_selector<1, Large,Small> { enum { ret = GemvProduct }; }; template<> struct ei_product_type_selector<1, Large,Large> { enum { ret = GemvProduct }; }; template<> struct ei_product_type_selector<1, Small,Large> { enum { ret = CoeffBasedProductMode }; }; -template<> struct ei_product_type_selector { enum { ret = GemvProduct }; }; +template<> struct ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; template<> struct ei_product_type_selector { enum { ret = GemvProduct }; }; template<> struct ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; From afad108b5f893b9436e37e2405d5ca4b4da5d132 Mon Sep 17 00:00:00 2001 From: Hauke Heibel Date: Tue, 2 Mar 2010 19:36:21 +0100 Subject: [PATCH 073/122] Added a comment to prevent 
placing an EIGEN_STRONG_INLINE where it makes no sense. --- Eigen/src/Core/Product.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 5577cb3bc..33764e465 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -419,6 +419,9 @@ template<> struct ei_gemv_selector /** \returns the matrix product of \c *this and \a other. * * \note If instead of the matrix product you want the coefficient-wise product, see Cwise::operator*(). + * \note In MSVC, this function will not be inlined since ei_matrix_storage is an + * unwindable object for dynamic matrices. Thus it does not help tagging + * this function with EIGEN_STRONG_INLINE. * * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*() */ From 3295c1c3e6e7cac184b677fc2016672593aff3d4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 2 Mar 2010 23:18:13 +0100 Subject: [PATCH 074/122] product selector: the symmetric case --- Eigen/src/Core/Product.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 33764e465..04f89e743 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -68,9 +68,9 @@ template struct ei_product_type // is to work around an internal compiler error with gcc 4.1 and 4.2. private: enum { - rows_select = Rows >=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ? Large : (Rows==1 ? 1 : Small), - cols_select = Cols >=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ? Large : (Cols==1 ? 1 : Small), - depth_select = Depth>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ? Large : (Depth==1 ? 1 : Small) + rows_select = Rows >=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ? Large : (Rows==1 ? 1 : Small), + cols_select = Cols >=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ? Large : (Cols==1 ? 1 : Small), + depth_select = Depth>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ? Large : (Depth==1 ? 
1 : Small) }; typedef ei_product_type_selector product_type_selector; @@ -93,7 +93,7 @@ template<> struct ei_product_type_selector template<> struct ei_product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; template<> struct ei_product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; template<> struct ei_product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; -template<> struct ei_product_type_selector<1, Large,Small> { enum { ret = GemvProduct }; }; +template<> struct ei_product_type_selector<1, Large,Small> { enum { ret = CoeffBasedProductMode }; }; template<> struct ei_product_type_selector<1, Large,Large> { enum { ret = GemvProduct }; }; template<> struct ei_product_type_selector<1, Small,Large> { enum { ret = CoeffBasedProductMode }; }; template<> struct ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; From 32823caa629a4f440a8a4af2c4ffb9f751d711ab Mon Sep 17 00:00:00 2001 From: Hauke Heibel Date: Wed, 3 Mar 2010 07:52:19 +0100 Subject: [PATCH 075/122] Adapted the comment and removed it from the public dox. --- Eigen/src/Core/Product.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 04f89e743..865387b11 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -419,9 +419,6 @@ template<> struct ei_gemv_selector /** \returns the matrix product of \c *this and \a other. * * \note If instead of the matrix product you want the coefficient-wise product, see Cwise::operator*(). - * \note In MSVC, this function will not be inlined since ei_matrix_storage is an - * unwindable object for dynamic matrices. Thus it does not help tagging - * this function with EIGEN_STRONG_INLINE. 
* * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*() */ @@ -430,6 +427,10 @@ template inline const typename ProductReturnType::Type MatrixBase::operator*(const MatrixBase &other) const { + // A note regarding the function declaration: In MSVC, this function will sometimes + // not be inlined since ei_matrix_storage is an unwindable object for dynamic + // matrices and product types are holding a member to store the result. + // Thus it does not help tagging this function with EIGEN_STRONG_INLINE. enum { ProductIsValid = Derived::ColsAtCompileTime==Dynamic || OtherDerived::RowsAtCompileTime==Dynamic From f1d310195653e1d84f6b606354ea1c5babfa3d5d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 3 Mar 2010 09:32:10 +0100 Subject: [PATCH 076/122] blas: add warnings for non implemented functions --- blas/level1_impl.h | 5 +++++ blas/level2_impl.h | 33 +++++++++++++++++++++++---------- blas/level3_impl.h | 2 ++ blas/xerbla.cpp | 1 + 4 files changed, 31 insertions(+), 10 deletions(-) diff --git a/blas/level1_impl.h b/blas/level1_impl.h index 08fd0b6d6..9f3e4d166 100644 --- a/blas/level1_impl.h +++ b/blas/level1_impl.h @@ -157,6 +157,9 @@ Scalar EIGEN_BLAS_FUNC(sdot)(int *n, RealScalar *px, int *incx, RealScalar *py, // computes a dot product of a conjugated vector with another vector. void EIGEN_BLAS_FUNC(dotc)(RealScalar* dot, int *n, RealScalar *px, int *incx, RealScalar *py, int *incy) { + + std::cerr << "Eigen BLAS: _dotc is not implemented yet\n"; + return; // TODO: find how to return a complex to fortran @@ -175,6 +178,8 @@ void EIGEN_BLAS_FUNC(dotc)(RealScalar* dot, int *n, RealScalar *px, int *incx, R // computes a vector-vector dot product without complex conjugation. 
void EIGEN_BLAS_FUNC(dotu)(RealScalar* dot, int *n, RealScalar *px, int *incx, RealScalar *py, int *incy) { + std::cerr << "Eigen BLAS: _dotu is not implemented yet\n"; + return; // TODO: find how to return a complex to fortran diff --git a/blas/level2_impl.h b/blas/level2_impl.h index 5691e8a7f..68be9a806 100644 --- a/blas/level2_impl.h +++ b/blas/level2_impl.h @@ -56,9 +56,11 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca return 1; } -/* + int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pa, int *lda, RealScalar *pb, int *incb) { + return 0; + typedef void (*functype)(int, const Scalar *, int, Scalar *, int); functype func[16]; @@ -95,13 +97,14 @@ int EIGEN_BLAS_FUNC(trsv)(char *uplo, char *opa, char *diag, int *n, RealScalar return 0; func[code](*n, a, *lda, b, *incb); - return 1; + return 0; } -*/ -/* + + int EIGEN_BLAS_FUNC(trmv)(char *uplo, char *opa, char *diag, int *n, RealScalar *pa, int *lda, RealScalar *pb, int *incb) { + return 0; // TODO typedef void (*functype)(int, const Scalar *, int, const Scalar *, int, Scalar *, int); @@ -140,13 +143,21 @@ int EIGEN_BLAS_FUNC(trmv)(char *uplo, char *opa, char *diag, int *n, RealScalar return 0; func[code](*n, a, *lda, b, *incb, b, *incb); - return 1; + return 0; +} + +// y = alpha*A*x + beta*y +int EIGEN_BLAS_FUNC(ssymv) (char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *px, int *incx, RealScalar *pbeta, RealScalar *py, int *incy) +{ + return 0; + + // TODO } -*/ -/* int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa, int *inca, RealScalar *pc, int *ldc) { + return 0; + // TODO typedef void (*functype)(int, const Scalar *, int, Scalar *, int, Scalar); functype func[2]; @@ -174,11 +185,13 @@ int EIGEN_BLAS_FUNC(syr)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa, func[code](*n, a, *inca, c, *ldc, alpha); return 1; } -*/ -/* + + int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, 
RealScalar *palpha, RealScalar *pa, int *inca, RealScalar *pb, int *incb, RealScalar *pc, int *ldc) { + return 0; + // TODO typedef void (*functype)(int, const Scalar *, int, const Scalar *, int, Scalar *, int, Scalar); functype func[2]; @@ -207,7 +220,7 @@ int EIGEN_BLAS_FUNC(syr2)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa func[code](*n, a, *inca, b, *incb, c, *ldc, alpha); return 1; } -*/ + #if ISCOMPLEX diff --git a/blas/level3_impl.h b/blas/level3_impl.h index c9023ab37..6a0e64392 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -308,6 +308,7 @@ int EIGEN_BLAS_FUNC(syr2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal Scalar beta = *reinterpret_cast(pbeta); // TODO + std::cerr << "Eigen BLAS: _syr2k is not implemented yet\n"; return 0; } @@ -422,6 +423,7 @@ int EIGEN_BLAS_FUNC(her2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal } // TODO + std::cerr << "Eigen BLAS: _her2k is not implemented yet\n"; return 0; } diff --git a/blas/xerbla.cpp b/blas/xerbla.cpp index ce43d3c38..2e7ad6eff 100644 --- a/blas/xerbla.cpp +++ b/blas/xerbla.cpp @@ -9,6 +9,7 @@ extern "C" int xerbla_(char * msg, int *info, int) { std::cerr << "Eigen BLAS ERROR #" << *info << ": " << msg << "\n"; + return 0; } #ifdef __cplusplus From ff6b94d6d089e3def456c4eceecde94bd17b9a5f Mon Sep 17 00:00:00 2001 From: Eamon Nerbonne Date: Tue, 2 Mar 2010 08:46:11 +0100 Subject: [PATCH 077/122] BenchTimer: avoid warning about symbol redefinition on win32, and include (required to compile) --- bench/BenchTimer.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bench/BenchTimer.h b/bench/BenchTimer.h index e49afa07f..5e5945b57 100644 --- a/bench/BenchTimer.h +++ b/bench/BenchTimer.h @@ -27,8 +27,12 @@ #define EIGEN_BENCH_TIMERR_H #if defined(_WIN32) || defined(__CYGWIN__) +#ifndef NOMINMAX #define NOMINMAX +#endif +#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif #include #else #include @@ -39,6 +43,7 @@ #include #include #include +#include 
namespace Eigen { From abfed301cb474c27fbb76a41cc459602db2b145f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 2 Mar 2010 09:37:10 +0100 Subject: [PATCH 078/122] blas: fix SYRK --- blas/level3_impl.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/blas/level3_impl.h b/blas/level3_impl.h index 76497ec26..32b49b118 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -263,7 +263,7 @@ int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *pa // c = alpha*a'*a + beta*c for op = 'T'or't','C'or'c' int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pbeta, RealScalar *pc, int *ldc) { -// std::cerr << "in syrk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << "\n"; +// std::cerr << "in syrk " << *uplo << " " << *op << " " << *n << " " << *k << " " << *palpha << " " << *lda << " " << *pbeta << " " << *ldc << "\n"; typedef void (*functype)(int, int, const Scalar *, int, Scalar *, int, Scalar); static functype func[8]; @@ -298,10 +298,12 @@ int EIGEN_BLAS_FUNC(syrk)(char *uplo, char *op, int *n, int *k, RealScalar *palp } if(beta!=Scalar(1)) - matrix(c, *n, *n, *ldc) *= beta; + if(UPLO(*uplo)==UP) matrix(c, *n, *n, *ldc).triangularView() *= beta; + else matrix(c, *n, *n, *ldc).triangularView() *= beta; func[code](*n, *k, a, *lda, c, *ldc, alpha); - return 1; + + return 0; } // c = alpha*a*b' + alpha*b*a' + beta*c for op = 'N'or'n' From 7fd6458fec694f213323d6dd0718d315513adbb5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 2 Mar 2010 12:43:55 +0100 Subject: [PATCH 079/122] selfadjoint: do not reference the imaginary part of the diagonal --- .../Core/products/SelfadjointMatrixMatrix.h | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 
89cbc3ac0..84d056c5d 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -43,7 +43,10 @@ struct ei_symm_pack_lhs { for(int w=0; w * RhsBlasTraits::extractScalarFactor(m_rhs); ei_product_selfadjoint_matrix::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint, NumTraits::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)), EIGEN_LOGICAL_XOR(RhsIsUpper, From a2d7c239f54190ddb40febb6b4b65d74c261f008 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 2 Mar 2010 12:44:40 +0100 Subject: [PATCH 080/122] blas: fix HEMM and HERK --- blas/complex_double.cpp | 2 +- blas/complex_single.cpp | 2 +- blas/level3_impl.h | 90 ++++++++++++++++++++++++----------------- 3 files changed, 55 insertions(+), 39 deletions(-) diff --git a/blas/complex_double.cpp b/blas/complex_double.cpp index f51ccb25b..be2104a56 100644 --- a/blas/complex_double.cpp +++ b/blas/complex_double.cpp @@ -23,7 +23,7 @@ // Eigen. If not, see . #define SCALAR std::complex -#define SCALAR_SUFFIX c +#define SCALAR_SUFFIX z #define ISCOMPLEX 1 #include "level1_impl.h" diff --git a/blas/complex_single.cpp b/blas/complex_single.cpp index b6617e7b9..2b13bc7ce 100644 --- a/blas/complex_single.cpp +++ b/blas/complex_single.cpp @@ -23,7 +23,7 @@ // Eigen. If not, see . 
#define SCALAR std::complex -#define SCALAR_SUFFIX z +#define SCALAR_SUFFIX c #define ISCOMPLEX 1 #include "level1_impl.h" diff --git a/blas/level3_impl.h b/blas/level3_impl.h index 32b49b118..c9023ab37 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -218,8 +218,7 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m, // c = alpha*b*a + beta*c for side = 'R'or'r int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *ldb, RealScalar *pbeta, RealScalar *pc, int *ldc) { -// std::cerr << "in symm " << *side << " " << *uplo << " " << *m << "x" << *n << " lda:" << *lda << " ldb:" << *ldb << " ldc:" << *ldc << " alpha:" << *palpha << " beta:" << *pbeta << " " -// << pa << " " << pb << " " << pc << "\n"; +// std::cerr << "in symm " << *side << " " << *uplo << " " << *m << "x" << *n << " lda:" << *lda << " ldb:" << *ldb << " ldc:" << *ldc << " alpha:" << *palpha << " beta:" << *pbeta << "\n"; Scalar* a = reinterpret_cast(pa); Scalar* b = reinterpret_cast(pb); Scalar* c = reinterpret_cast(pc); @@ -234,25 +233,17 @@ int EIGEN_BLAS_FUNC(symm)(char *side, char *uplo, int *m, int *n, RealScalar *pa } if(beta!=Scalar(1)) - if(beta==Scalar(0)) - matrix(c, *m, *n, *ldc).setZero(); - else - matrix(c, *m, *n, *ldc) *= beta; + if(beta==Scalar(0)) matrix(c, *m, *n, *ldc).setZero(); + else matrix(c, *m, *n, *ldc) *= beta; if(SIDE(*side)==LEFT) - if(UPLO(*uplo)==UP) - ei_product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); - else if(UPLO(*uplo)==LO) - ei_product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); - else - return 0; + if(UPLO(*uplo)==UP) ei_product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); + else if(UPLO(*uplo)==LO) ei_product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); + else return 0; else if(SIDE(*side)==RIGHT) - if(UPLO(*uplo)==UP) - 
ei_product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha); - else if(UPLO(*uplo)==LO) - ei_product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha); - else - return 0; + if(UPLO(*uplo)==UP) ei_product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha); + else if(UPLO(*uplo)==LO) ei_product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha); + else return 0; else return 0; @@ -334,27 +325,30 @@ int EIGEN_BLAS_FUNC(hemm)(char *side, char *uplo, int *m, int *n, RealScalar *pa Scalar alpha = *reinterpret_cast(palpha); Scalar beta = *reinterpret_cast(pbeta); +// std::cerr << "in hemm " << *side << " " << *uplo << " " << *m << " " << *n << " " << alpha << " " << *lda << " " << beta << " " << *ldc << "\n"; + + if(*m<0 || *n<0) + { + return 0; + } + if(beta!=Scalar(1)) matrix(c, *m, *n, *ldc) *= beta; if(SIDE(*side)==LEFT) - if(UPLO(*uplo)==UP) - ei_product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); - else if(UPLO(*uplo)==LO) - ei_product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); - else - return 0; + if(UPLO(*uplo)==UP) ei_product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); + else if(UPLO(*uplo)==LO) ei_product_selfadjoint_matrix::run(*m, *n, a, *lda, b, *ldb, c, *ldc, alpha); + else return 0; else if(SIDE(*side)==RIGHT) - if(UPLO(*uplo)==UP) - ei_product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha); - else if(UPLO(*uplo)==LO) - ei_product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha); - else - return 0; + if(UPLO(*uplo)==UP) ei_product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha); + else if(UPLO(*uplo)==LO) ei_product_selfadjoint_matrix::run(*m, *n, b, *ldb, a, *lda, c, *ldc, alpha); + else return 0; else + { return 0; + } - return 1; + return 0; } // c = alpha*a*conj(a') + beta*c for op = 'N'or'n' @@ -381,18 +375,35 @@ int EIGEN_BLAS_FUNC(herk)(char *uplo, char *op, int *n, 
int *k, RealScalar *palp Scalar* a = reinterpret_cast(pa); Scalar* c = reinterpret_cast(pc); - Scalar alpha = *reinterpret_cast(palpha); - Scalar beta = *reinterpret_cast(pbeta); + RealScalar alpha = *palpha; + RealScalar beta = *pbeta; + +// std::cerr << "in herk " << *uplo << " " << *op << " " << *n << " " << *k << " " << alpha << " " << *lda << " " << beta << " " << *ldc << "\n"; + + if(*n<0 || *k<0) + { + return 0; + } int code = OP(*op) | (UPLO(*uplo) << 2); if(code>=8 || func[code]==0) return 0; - if(beta!=Scalar(1)) - matrix(c, *n, *n, *ldc) *= beta; + if(beta!=RealScalar(1)) + { + if(UPLO(*uplo)==UP) matrix(c, *n, *n, *ldc).triangularView() *= beta; + else matrix(c, *n, *n, *ldc).triangularView() *= beta; - func[code](*n, *k, a, *lda, c, *ldc, alpha); - return 1; + matrix(c, *n, *n, *ldc).diagonal().real() *= beta; + matrix(c, *n, *n, *ldc).diagonal().imag().setZero(); + } + + if(*k>0 && alpha!=RealScalar(0)) + { + func[code](*n, *k, a, *lda, c, *ldc, alpha); + matrix(c, *n, *n, *ldc).diagonal().imag().setZero(); + } + return 0; } // c = alpha*a*conj(b') + conj(alpha)*b*conj(a') + beta*c, for op = 'N'or'n' @@ -405,6 +416,11 @@ int EIGEN_BLAS_FUNC(her2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal Scalar alpha = *reinterpret_cast(palpha); Scalar beta = *reinterpret_cast(pbeta); + if(*n<0 || *k<0) + { + return 0; + } + // TODO return 0; From bca04bd9836bf7c499dc08619dfd36aa120b1909 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 2 Mar 2010 08:41:35 -0500 Subject: [PATCH 081/122] fix compilation --- test/mixingtypes.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp index 71c2dcb18..8b8e8302e 100644 --- a/test/mixingtypes.cpp +++ b/test/mixingtypes.cpp @@ -78,7 +78,9 @@ template void mixingtypes(int size = SizeAtCompileType) // check dot product vf.dot(vf); +#if 0 // we get other compilation errors here than just static asserts VERIFY_RAISES_ASSERT(vd.dot(vf)); +#endif 
VERIFY_RAISES_ASSERT(vcf.dot(vf)); // yeah eventually we should allow this but i'm too lazy to make that change now in Dot.h // especially as that might be rewritten as cwise product .sum() which would make that automatic. From a76c296e7f56e912e265ee44e565c284cbdd011e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 2 Mar 2010 14:45:43 +0100 Subject: [PATCH 082/122] blas: fix most of level1 functions --- blas/complex_double.cpp | 1 + blas/complex_single.cpp | 1 + blas/level1_impl.h | 232 ++++++++++++++++++++++++++++++---------- 3 files changed, 175 insertions(+), 59 deletions(-) diff --git a/blas/complex_double.cpp b/blas/complex_double.cpp index be2104a56..fd6db03b5 100644 --- a/blas/complex_double.cpp +++ b/blas/complex_double.cpp @@ -24,6 +24,7 @@ #define SCALAR std::complex #define SCALAR_SUFFIX z +#define REAL_SCALAR_SUFFIX d #define ISCOMPLEX 1 #include "level1_impl.h" diff --git a/blas/complex_single.cpp b/blas/complex_single.cpp index 2b13bc7ce..54c7ca1e5 100644 --- a/blas/complex_single.cpp +++ b/blas/complex_single.cpp @@ -24,6 +24,7 @@ #define SCALAR std::complex #define SCALAR_SUFFIX c +#define REAL_SCALAR_SUFFIX s #define ISCOMPLEX 1 #include "level1_impl.h" diff --git a/blas/level1_impl.h b/blas/level1_impl.h index 5326c6917..fd680b819 100644 --- a/blas/level1_impl.h +++ b/blas/level1_impl.h @@ -30,52 +30,111 @@ int EIGEN_BLAS_FUNC(axpy)(int *n, RealScalar *palpha, RealScalar *px, int *incx, Scalar* y = reinterpret_cast(py); Scalar alpha = *reinterpret_cast(palpha); - if(*incx==1 && *incy==1) - vector(y,*n) += alpha * vector(x,*n); - else - vector(y,*n,*incy) += alpha * vector(x,*n,*incx); +// std::cerr << "axpy " << *n << " " << alpha << " " << *incx << " " << *incy << "\n"; - return 1; + if(*incx==1 && *incy==1) vector(y,*n) += alpha * vector(x,*n); + else if(*incx>0 && *incy>0) vector(y,*n,*incy) += alpha * vector(x,*n,*incx); + else if(*incx>0 && *incy<0) vector(y,*n,-*incy).reverse() += alpha * vector(x,*n,*incx); + else if(*incx<0 && 
*incy>0) vector(y,*n,*incy) += alpha * vector(x,*n,-*incx).reverse(); + else if(*incx<0 && *incy<0) vector(y,*n,-*incy).reverse() += alpha * vector(x,*n,-*incx).reverse(); + + return 0; } +#if !ISCOMPLEX // computes the sum of magnitudes of all vector elements or, for a complex vector x, the sum // res = |Rex1| + |Imx1| + |Rex2| + |Imx2| + ... + |Rexn| + |Imxn|, where x is a vector of order n RealScalar EIGEN_BLAS_FUNC(asum)(int *n, RealScalar *px, int *incx) { - int size = IsComplex ? 2* *n : *n; +// std::cerr << "_asum " << *n << " " << *incx << "\n"; - if(*incx==1) - return vector(px,size).cwiseAbs().sum(); - else - return vector(px,size,*incx).cwiseAbs().sum(); + Scalar* x = reinterpret_cast(px); - return 1; + if(*n<=0) return 0; + + if(*incx==1) return vector(x,*n).cwiseAbs().sum(); + else return vector(x,*n,std::abs(*incx)).cwiseAbs().sum(); } +#else + +struct ei_scalar_norm1_op { + typedef RealScalar result_type; + EIGEN_EMPTY_STRUCT_CTOR(ei_scalar_norm1_op) + inline RealScalar operator() (const Scalar& a) const { return ei_norm1(a); } +}; +namespace Eigen { +template<> struct ei_functor_traits +{ + enum { Cost = 3 * NumTraits::AddCost, PacketAccess = 0 }; +}; +} + +RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),asum_)(int *n, RealScalar *px, int *incx) +{ +// std::cerr << "__asum " << *n << " " << *incx << "\n"; + + Complex* x = reinterpret_cast(px); + + if(*n<=0) return 0; + + if(*incx==1) return vector(x,*n).unaryExpr().sum(); + else return vector(x,*n,std::abs(*incx)).unaryExpr().sum(); +} +#endif int EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy) { - int size = IsComplex ? 
2* *n : *n; +// std::cerr << "_copy " << *n << " " << *incx << " " << *incy << "\n"; - if(*incx==1 && *incy==1) - vector(py,size) = vector(px,size); - else - vector(py,size,*incy) = vector(px,size,*incx); + Scalar* x = reinterpret_cast(px); + Scalar* y = reinterpret_cast(py); - return 1; + if(*incx==1 && *incy==1) vector(y,*n) = vector(x,*n); + else if(*incx>0 && *incy>0) vector(y,*n,*incy) = vector(x,*n,*incx); + else if(*incx>0 && *incy<0) vector(y,*n,-*incy).reverse() = vector(x,*n,*incx); + else if(*incx<0 && *incy>0) vector(y,*n,*incy) = vector(x,*n,-*incx).reverse(); + else if(*incx<0 && *incy<0) vector(y,*n,-*incy).reverse() = vector(x,*n,-*incx).reverse(); + + return 0; } // computes a vector-vector dot product. Scalar EIGEN_BLAS_FUNC(dot)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy) { +// std::cerr << "_dot " << *n << " " << *incx << " " << *incy << "\n"; + + if(*n<=0) + return 0; + Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); - if(*incx==1 && *incy==1) - return (vector(x,*n).cwiseProduct(vector(y,*n))).sum(); - - return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum(); + if(*incx==1 && *incy==1) return (vector(x,*n).cwiseProduct(vector(y,*n))).sum(); + else if(*incx>0 && *incy>0) return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum(); + else if(*incx<0 && *incy>0) return (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,*incy))).sum(); + else if(*incx>0 && *incy<0) return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,-*incy).reverse())).sum(); + else if(*incx<0 && *incy<0) return (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,-*incy).reverse())).sum(); + else return 0; } +int EIGEN_CAT(EIGEN_CAT(i,SCALAR_SUFFIX),amax_)(int *n, RealScalar *px, int *incx) +{ +// std::cerr << "i_amax " << *n << " " << *incx << "\n"; + + Scalar* x = reinterpret_cast(px); + + if(*n<=0) + return 0; + + int ret; + + if(*incx==1) vector(x,*n).cwiseAbs().maxCoeff(&ret); + else 
vector(x,*n,std::abs(*incx)).cwiseAbs().maxCoeff(&ret); + + return ret+1; +} + + /* // computes a vector-vector dot product with extended precision. @@ -96,53 +155,95 @@ Scalar EIGEN_BLAS_FUNC(sdot)(int *n, RealScalar *px, int *incx, RealScalar *py, #if ISCOMPLEX // computes a dot product of a conjugated vector with another vector. -Scalar EIGEN_BLAS_FUNC(dotc)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy) +void EIGEN_BLAS_FUNC(dotc)(RealScalar* dot, int *n, RealScalar *px, int *incx, RealScalar *py, int *incy) { + return; + + // TODO: find how to return a complex to fortran + +// std::cerr << "_dotc " << *n << " " << *incx << " " << *incy << "\n"; + Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); if(*incx==1 && *incy==1) - return vector(x,*n).dot(vector(y,*n)); - - return vector(x,*n,*incx).dot(vector(y,*n,*incy)); + *reinterpret_cast(dot) = vector(x,*n).dot(vector(y,*n)); + else + *reinterpret_cast(dot) = vector(x,*n,*incx).dot(vector(y,*n,*incy)); } // computes a vector-vector dot product without complex conjugation. -Scalar EIGEN_BLAS_FUNC(dotu)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy) +void EIGEN_BLAS_FUNC(dotu)(RealScalar* dot, int *n, RealScalar *px, int *incx, RealScalar *py, int *incy) { + return; + + // TODO: find how to return a complex to fortran + +// std::cerr << "_dotu " << *n << " " << *incx << " " << *incy << "\n"; + Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); if(*incx==1 && *incy==1) - return (vector(x,*n).cwiseProduct(vector(y,*n))).sum(); - - return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum(); + *reinterpret_cast(dot) = (vector(x,*n).cwiseProduct(vector(y,*n))).sum(); + else + *reinterpret_cast(dot) = (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum(); } #endif // ISCOMPLEX +#if !ISCOMPLEX // computes the Euclidean norm of a vector. 
Scalar EIGEN_BLAS_FUNC(nrm2)(int *n, RealScalar *px, int *incx) { +// std::cerr << "_nrm2 " << *n << " " << *incx << "\n"; Scalar* x = reinterpret_cast(px); + if(*n<=0) + return 0; + + if(*incx==1) return vector(x,*n).norm(); + else return vector(x,*n,std::abs(*incx)).norm(); +} +#else +RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),nrm2_)(int *n, RealScalar *px, int *incx) +{ +// std::cerr << "__nrm2 " << *n << " " << *incx << "\n"; + Scalar* x = reinterpret_cast(px); + + if(*n<=0) + return 0; + if(*incx==1) return vector(x,*n).norm(); return vector(x,*n,*incx).norm(); } +#endif int EIGEN_BLAS_FUNC(rot)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps) { +// std::cerr << "_rot " << *n << " " << *incx << " " << *incy << "\n"; Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); Scalar c = *reinterpret_cast(pc); Scalar s = *reinterpret_cast(ps); - StridedVectorType vx(vector(x,*n,*incx)); - StridedVectorType vy(vector(y,*n,*incy)); - ei_apply_rotation_in_the_plane(vx, vy, PlanarRotation(c,s)); - return 1; + if(*n<=0) + return 0; + + StridedVectorType vx(vector(x,*n,std::abs(*incx))); + StridedVectorType vy(vector(y,*n,std::abs(*incy))); + + Reverse rvx(vx); + Reverse rvy(vy); + + if(*incx<0 && *incy>0) ei_apply_rotation_in_the_plane(rvx, vy, PlanarRotation(c,s)); + else if(*incx>0 && *incy<0) ei_apply_rotation_in_the_plane(vx, rvy, PlanarRotation(c,s)); + else ei_apply_rotation_in_the_plane(vx, vy, PlanarRotation(c,s)); + + + return 0; } int EIGEN_BLAS_FUNC(rotg)(RealScalar *pa, RealScalar *pb, RealScalar *pc, RealScalar *ps) @@ -157,7 +258,7 @@ int EIGEN_BLAS_FUNC(rotg)(RealScalar *pa, RealScalar *pb, RealScalar *pc, RealSc *c = r.c(); *s = r.s(); - return 1; + return 0; } #if !ISCOMPLEX @@ -183,43 +284,56 @@ int EIGEN_BLAS_FUNC(rotmg)(RealScalar *d1, RealScalar *d2, RealScalar *x1, RealS */ #endif // !ISCOMPLEX -int EIGEN_BLAS_FUNC(scal)(int *n, RealScalar *px, int *incx, RealScalar 
*palpha) +int EIGEN_BLAS_FUNC(scal)(int *n, RealScalar *palpha, RealScalar *px, int *incx) { Scalar* x = reinterpret_cast(px); Scalar alpha = *reinterpret_cast(palpha); - if(*incx==1) - vector(x,*n) *= alpha; + std::cerr << "_scal " << *n << " " << alpha << " " << *incx << "\n"; - vector(x,*n,*incx) *= alpha; + if(*n<=0) + return 0; - return 1; + if(*incx==1) vector(x,*n) *= alpha; + else vector(x,*n,std::abs(*incx)) *= alpha; + + return 0; } +#if ISCOMPLEX +int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),scal_)(int *n, RealScalar *palpha, RealScalar *px, int *incx) +{ + Scalar* x = reinterpret_cast(px); + RealScalar alpha = *palpha; + + std::cerr << "__scal " << *n << " " << alpha << " " << *incx << "\n"; + + if(*n<=0) + return 0; + + if(*incx==1) vector(x,*n) *= alpha; + else vector(x,*n,std::abs(*incx)) *= alpha; + + return 0; +} +#endif // ISCOMPLEX + int EIGEN_BLAS_FUNC(swap)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy) { - int size = IsComplex ? 2* *n : *n; + std::cerr << "_swap " << *n << " " << *incx << " " << *incy << "\n"; - if(*incx==1 && *incy==1) - vector(py,size).swap(vector(px,size)); - else - vector(py,size,*incy).swap(vector(px,size,*incx)); + Scalar* x = reinterpret_cast(px); + Scalar* y = reinterpret_cast(py); + + if(*n<=0) + return 0; + + if(*incx==1 && *incy==1) vector(y,*n).swap(vector(x,*n)); + else if(*incx>0 && *incy>0) vector(y,*n,*incy).swap(vector(x,*n,*incx)); + else if(*incx>0 && *incy<0) vector(y,*n,-*incy).reverse().swap(vector(x,*n,*incx)); + else if(*incx<0 && *incy>0) vector(y,*n,*incy).swap(vector(x,*n,-*incx).reverse()); + else if(*incx<0 && *incy<0) vector(y,*n,-*incy).reverse().swap(vector(x,*n,-*incx).reverse()); return 1; } -#if !ISCOMPLEX - -RealScalar EIGEN_BLAS_FUNC(casum)(int *n, RealScalar *px, int *incx) -{ - Complex* x = reinterpret_cast(px); - - if(*incx==1) - return vector(x,*n).cwiseAbs().sum(); - else - return vector(x,*n,*incx).cwiseAbs().sum(); - - return 1; -} - -#endif // ISCOMPLEX 
From 0ed5edd24dc72404dd9f2a998d7a6b742d29ec9d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 2 Mar 2010 14:50:41 +0100 Subject: [PATCH 083/122] blas: add a default implementation of xerbla --- blas/CMakeLists.txt | 2 +- blas/level1_impl.h | 6 +++--- blas/xerbla.cpp | 16 ++++++++++++++++ 3 files changed, 20 insertions(+), 4 deletions(-) create mode 100644 blas/xerbla.cpp diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index ee67fe519..e71076f9d 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -2,7 +2,7 @@ project(EigenBlas) add_custom_target(blas) -set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp) +set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp xerbla.cpp) add_library(eigen_blas ${EigenBlas_SRCS}) # add_library(eigen_blas SHARED ${EigenBlas_SRCS}) diff --git a/blas/level1_impl.h b/blas/level1_impl.h index fd680b819..08fd0b6d6 100644 --- a/blas/level1_impl.h +++ b/blas/level1_impl.h @@ -289,7 +289,7 @@ int EIGEN_BLAS_FUNC(scal)(int *n, RealScalar *palpha, RealScalar *px, int *incx) Scalar* x = reinterpret_cast(px); Scalar alpha = *reinterpret_cast(palpha); - std::cerr << "_scal " << *n << " " << alpha << " " << *incx << "\n"; +// std::cerr << "_scal " << *n << " " << alpha << " " << *incx << "\n"; if(*n<=0) return 0; @@ -306,7 +306,7 @@ int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),scal_)(int *n, RealSca Scalar* x = reinterpret_cast(px); RealScalar alpha = *palpha; - std::cerr << "__scal " << *n << " " << alpha << " " << *incx << "\n"; +// std::cerr << "__scal " << *n << " " << alpha << " " << *incx << "\n"; if(*n<=0) return 0; @@ -320,7 +320,7 @@ int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),scal_)(int *n, RealSca int EIGEN_BLAS_FUNC(swap)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy) { - std::cerr << "_swap " << *n << " " << *incx << " " << *incy << "\n"; +// std::cerr << "_swap " << *n << " " << *incx << " " << *incy << "\n"; Scalar* 
x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); diff --git a/blas/xerbla.cpp b/blas/xerbla.cpp new file mode 100644 index 000000000..ce43d3c38 --- /dev/null +++ b/blas/xerbla.cpp @@ -0,0 +1,16 @@ + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +int xerbla_(char * msg, int *info, int) +{ + std::cerr << "Eigen BLAS ERROR #" << *info << ": " << msg << "\n"; +} + +#ifdef __cplusplus +} +#endif From 2f3d685e0c687ae1121428dab6bc0ec868b14fe3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 2 Mar 2010 15:31:39 +0100 Subject: [PATCH 084/122] a matrix (or array) does not always have the LinearAccessBit! => fixes in outerStride and matrix flags --- Eigen/src/Array/Array.h | 3 --- Eigen/src/Core/DenseStorageBase.h | 9 ++++++++- Eigen/src/Core/Matrix.h | 3 --- Eigen/src/Core/util/XprHelper.h | 9 +++++++-- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Array/Array.h b/Eigen/src/Array/Array.h index 91a091152..5a398d849 100644 --- a/Eigen/src/Array/Array.h +++ b/Eigen/src/Array/Array.h @@ -213,9 +213,6 @@ class Array void swap(ArrayBase EIGEN_REF_TO_TEMPORARY other) { this->_swap(other.derived()); } - inline int innerStride() const { return 1; } - inline int outerStride() const { return this->innerSize(); } - #ifdef EIGEN_ARRAY_PLUGIN #include EIGEN_ARRAY_PLUGIN #endif diff --git a/Eigen/src/Core/DenseStorageBase.h b/Eigen/src/Core/DenseStorageBase.h index c7f903c7a..a0f3de542 100644 --- a/Eigen/src/Core/DenseStorageBase.h +++ b/Eigen/src/Core/DenseStorageBase.h @@ -139,6 +139,13 @@ class DenseStorageBase : public _Base EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + inline int innerStride() const { return 1; } + inline int outerStride() const + { + static const int MaxInnerSize = Base::IsRowMajor ? MaxColsAtCompileTime : MaxRowsAtCompileTime; + return (!IsVectorAtCompileTime) && MaxInnerSize!=Dynamic ? MaxInnerSize : this->innerSize(); + } + /** Resizes \c *this to a \a rows x \a cols matrix. 
* * This method is intended for dynamic-size matrices, although it is legal to call it on any @@ -601,7 +608,7 @@ struct ei_conservative_resize_like_impl const int new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows(); const int new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1; _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols); - + if (num_new_elements > 0) _this.tail(num_new_elements) = other.tail(num_new_elements); } diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index e7422457c..3cd3f7814 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -318,9 +318,6 @@ class Matrix void swap(MatrixBase EIGEN_REF_TO_TEMPORARY other) { this->_swap(other.derived()); } - inline int innerStride() const { return 1; } - inline int outerStride() const { return this->innerSize(); } - /////////// Geometry module /////////// template diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index a09475e20..fc4c01468 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -90,14 +90,19 @@ class ei_compute_matrix_flags inner_max_size = MaxCols==1 ? MaxRows : MaxRows==1 ? MaxCols : row_major_bit ? MaxCols : MaxRows, + inner_size = Cols==1 ? Rows + : Rows==1 ? Cols + : row_major_bit ? Cols : Rows, is_big = inner_max_size == Dynamic, + is_matrix = Cols!=1 && Rows!=1, is_packet_size_multiple = MaxRows==Dynamic || MaxCols==Dynamic || ((MaxCols*MaxRows) % ei_packet_traits::size) == 0, aligned_bit = (((Options&DontAlign)==0) && (is_big || is_packet_size_multiple)) ? AlignedBit : 0, - packet_access_bit = ei_packet_traits::size > 1 && aligned_bit ? PacketAccessBit : 0 + packet_access_bit = ei_packet_traits::size > 1 && aligned_bit ? PacketAccessBit : 0, + linear_access_bit = (inner_max_size!=Dynamic && inner_size!=inner_max_size && is_matrix) ? 
0 : LinearAccessBit }; public: - enum { ret = LinearAccessBit | DirectAccessBit | NestByRefBit | packet_access_bit | row_major_bit | aligned_bit }; + enum { ret = DirectAccessBit | NestByRefBit | packet_access_bit | row_major_bit | aligned_bit | linear_access_bit }; }; template struct ei_size_at_compile_time From b0ffd9bf0414fa13aacb3edc0d467592c4a06848 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 3 Mar 2010 09:41:29 +0100 Subject: [PATCH 085/122] clean #defined tokens, and use clock_gettime for the real time --- bench/BenchTimer.h | 49 ++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/bench/BenchTimer.h b/bench/BenchTimer.h index 5e5945b57..0a0a5e154 100644 --- a/bench/BenchTimer.h +++ b/bench/BenchTimer.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008 Gael Guennebaud +// Copyright (C) 2008-2010 Gael Guennebaud // Copyright (C) 2009 Benoit Jacob // // Eigen is free software; you can redistribute it and/or @@ -27,22 +27,19 @@ #define EIGEN_BENCH_TIMERR_H #if defined(_WIN32) || defined(__CYGWIN__) -#ifndef NOMINMAX -#define NOMINMAX -#endif -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include +# ifndef NOMINMAX +# define NOMINMAX +# define EIGEN_BT_UNDEF_NOMINMAX +# endif +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# define EIGEN_BT_UNDEF_WIN32_LEAN_AND_MEAN +# endif +# include #else -#include -#include -#include +# include #endif -#include -#include -#include #include namespace Eigen @@ -131,14 +128,13 @@ public: inline double getRealTime() { #ifdef WIN32 - SYSTEMTIME st; - GetSystemTime(&st); - return (double)st.wSecond + 1.e-3 * (double)st.wMilliseconds; + SYSTEMTIME st; + GetSystemTime(&st); + return (double)st.wSecond + 1.e-3 * (double)st.wMilliseconds; #else - struct timeval tv; - struct timezone tz; - gettimeofday(&tv, &tz); - return (double)tv.tv_sec + 1.e-6 * 
(double)tv.tv_usec; + timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + return double(ts.tv_sec) + 1e-9 * double(ts.tv_nsec); #endif } @@ -166,4 +162,15 @@ protected: } +// clean #defined tokens +#ifdef EIGEN_BT_UNDEF_NOMINMAX +# undef EIGEN_BT_UNDEF_NOMINMAX +# undef NOMINMAX +#endif + +#ifdef EIGEN_BT_UNDEF_WIN32_LEAN_AND_MEAN +# undef EIGEN_BT_UNDEF_WIN32_LEAN_AND_MEAN +# undef WIN32_LEAN_AND_MEAN +#endif + #endif // EIGEN_BENCH_TIMERR_H From aa6570c3a39fd0d8094b8aa32a2710b95eec6894 Mon Sep 17 00:00:00 2001 From: Hauke Heibel Date: Wed, 3 Mar 2010 15:24:58 +0100 Subject: [PATCH 086/122] Added a missing inline hints. Removed a useless Nested temporary. --- Eigen/src/LU/Determinant.h | 5 ++--- Eigen/src/LU/Inverse.h | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Eigen/src/LU/Determinant.h b/Eigen/src/LU/Determinant.h index fb6577f08..d0b70a31c 100644 --- a/Eigen/src/LU/Determinant.h +++ b/Eigen/src/LU/Determinant.h @@ -69,7 +69,7 @@ template struct ei_determinant_impl template struct ei_determinant_impl { - static typename ei_traits::Scalar run(const Derived& m) + static inline typename ei_traits::Scalar run(const Derived& m) { return ei_bruteforce_det3_helper(m,0,1,2) - ei_bruteforce_det3_helper(m,1,0,2) @@ -100,8 +100,7 @@ inline typename ei_traits::Scalar MatrixBase::determinant() co { assert(rows() == cols()); typedef typename ei_nested::type Nested; - Nested nested(derived()); - return ei_determinant_impl::type>::run(nested); + return ei_determinant_impl::type>::run(derived()); } #endif // EIGEN_DETERMINANT_H diff --git a/Eigen/src/LU/Inverse.h b/Eigen/src/LU/Inverse.h index e20da70d6..116a614e1 100644 --- a/Eigen/src/LU/Inverse.h +++ b/Eigen/src/LU/Inverse.h @@ -123,7 +123,7 @@ struct ei_compute_inverse_and_det_with_check ****************************/ template -void ei_compute_inverse_size3_helper( +inline void ei_compute_inverse_size3_helper( const MatrixType& matrix, const typename ResultType::Scalar& invdet, const Matrix& 
cofactors_col0, From 6a92168915ed91c77abf24d641e263e3876157f3 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 3 Mar 2010 09:54:50 -0500 Subject: [PATCH 087/122] Backed out changeset 2f3d685e0c687ae1121428dab6bc0ec868b14fe3 This was implementing deep changes that after discussion on the mailing list seem to need further discussion/thinking. --- Eigen/src/Array/Array.h | 3 +++ Eigen/src/Core/DenseStorageBase.h | 9 +-------- Eigen/src/Core/Matrix.h | 3 +++ Eigen/src/Core/util/XprHelper.h | 9 ++------- 4 files changed, 9 insertions(+), 15 deletions(-) diff --git a/Eigen/src/Array/Array.h b/Eigen/src/Array/Array.h index 5a398d849..91a091152 100644 --- a/Eigen/src/Array/Array.h +++ b/Eigen/src/Array/Array.h @@ -213,6 +213,9 @@ class Array void swap(ArrayBase EIGEN_REF_TO_TEMPORARY other) { this->_swap(other.derived()); } + inline int innerStride() const { return 1; } + inline int outerStride() const { return this->innerSize(); } + #ifdef EIGEN_ARRAY_PLUGIN #include EIGEN_ARRAY_PLUGIN #endif diff --git a/Eigen/src/Core/DenseStorageBase.h b/Eigen/src/Core/DenseStorageBase.h index a0f3de542..c7f903c7a 100644 --- a/Eigen/src/Core/DenseStorageBase.h +++ b/Eigen/src/Core/DenseStorageBase.h @@ -139,13 +139,6 @@ class DenseStorageBase : public _Base EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } - inline int innerStride() const { return 1; } - inline int outerStride() const - { - static const int MaxInnerSize = Base::IsRowMajor ? MaxColsAtCompileTime : MaxRowsAtCompileTime; - return (!IsVectorAtCompileTime) && MaxInnerSize!=Dynamic ? MaxInnerSize : this->innerSize(); - } - /** Resizes \c *this to a \a rows x \a cols matrix. * * This method is intended for dynamic-size matrices, although it is legal to call it on any @@ -608,7 +601,7 @@ struct ei_conservative_resize_like_impl const int new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows(); const int new_cols = Derived::RowsAtCompileTime==1 ? 
other.cols() : 1; _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols); - + if (num_new_elements > 0) _this.tail(num_new_elements) = other.tail(num_new_elements); } diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index 3cd3f7814..e7422457c 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -318,6 +318,9 @@ class Matrix void swap(MatrixBase EIGEN_REF_TO_TEMPORARY other) { this->_swap(other.derived()); } + inline int innerStride() const { return 1; } + inline int outerStride() const { return this->innerSize(); } + /////////// Geometry module /////////// template diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index fc4c01468..a09475e20 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -90,19 +90,14 @@ class ei_compute_matrix_flags inner_max_size = MaxCols==1 ? MaxRows : MaxRows==1 ? MaxCols : row_major_bit ? MaxCols : MaxRows, - inner_size = Cols==1 ? Rows - : Rows==1 ? Cols - : row_major_bit ? Cols : Rows, is_big = inner_max_size == Dynamic, - is_matrix = Cols!=1 && Rows!=1, is_packet_size_multiple = MaxRows==Dynamic || MaxCols==Dynamic || ((MaxCols*MaxRows) % ei_packet_traits::size) == 0, aligned_bit = (((Options&DontAlign)==0) && (is_big || is_packet_size_multiple)) ? AlignedBit : 0, - packet_access_bit = ei_packet_traits::size > 1 && aligned_bit ? PacketAccessBit : 0, - linear_access_bit = (inner_max_size!=Dynamic && inner_size!=inner_max_size && is_matrix) ? 0 : LinearAccessBit + packet_access_bit = ei_packet_traits::size > 1 && aligned_bit ? 
PacketAccessBit : 0 }; public: - enum { ret = DirectAccessBit | NestByRefBit | packet_access_bit | row_major_bit | aligned_bit | linear_access_bit }; + enum { ret = LinearAccessBit | DirectAccessBit | NestByRefBit | packet_access_bit | row_major_bit | aligned_bit }; }; template struct ei_size_at_compile_time From 45d19afb18c0ac8d07de349dd80544f4b662210d Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 3 Mar 2010 09:58:43 -0500 Subject: [PATCH 088/122] cleanup/simplification in computation of matrix flags --- Eigen/src/Core/util/XprHelper.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index a09475e20..69c63e7bd 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -87,12 +87,13 @@ class ei_compute_matrix_flags { enum { row_major_bit = Options&RowMajor ? RowMajorBit : 0, - inner_max_size = MaxCols==1 ? MaxRows - : MaxRows==1 ? MaxCols - : row_major_bit ? MaxCols : MaxRows, - is_big = inner_max_size == Dynamic, - is_packet_size_multiple = MaxRows==Dynamic || MaxCols==Dynamic || ((MaxCols*MaxRows) % ei_packet_traits::size) == 0, - aligned_bit = (((Options&DontAlign)==0) && (is_big || is_packet_size_multiple)) ? AlignedBit : 0, + is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic, + is_fixed_size_aligned + = (!is_dynamic_size_storage) + && (((MaxCols*MaxRows) % ei_packet_traits::size) == 0), + aligned_bit = (((Options&DontAlign)==0) + && (is_dynamic_size_storage || is_fixed_size_aligned)) + ? AlignedBit : 0, packet_access_bit = ei_packet_traits::size > 1 && aligned_bit ? 
PacketAccessBit : 0 }; From 112c550b4a3988f39e7d23e13c2f1bbd857bf55c Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 3 Mar 2010 11:25:41 -0600 Subject: [PATCH 089/122] Added initial NEON support, most tests pass however we had to use some hackish workarounds as gcc on ARM (both CodeSourcery 4.4.1 used and experimental 4.5) fail to ensure proper alignment with __attribute__((aligned(16))). This has to be fixed upstream to remove the workarounds. --- CMakeLists.txt | 7 + Eigen/Core | 8 + Eigen/src/Core/MatrixStorage.h | 6 + Eigen/src/Core/arch/AltiVec/PacketMath.h | 5 + Eigen/src/Core/arch/CMakeLists.txt | 3 +- Eigen/src/Core/arch/NEON/CMakeLists.txt | 6 + Eigen/src/Core/arch/NEON/PacketMath.h | 371 +++++++++++++++++++++++ Eigen/src/Core/util/Macros.h | 2 +- Eigen/src/Core/util/Memory.h | 2 +- Eigen/src/Core/util/XprHelper.h | 5 + cmake/EigenTesting.cmake | 8 +- test/packetmath.cpp | 5 +- 12 files changed, 423 insertions(+), 5 deletions(-) create mode 100644 Eigen/src/Core/arch/NEON/CMakeLists.txt create mode 100644 Eigen/src/Core/arch/NEON/PacketMath.h diff --git a/CMakeLists.txt b/CMakeLists.txt index eefaf4b47..7162e4457 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,6 +103,13 @@ if(CMAKE_COMPILER_IS_GNUCXX) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec") message("Enabling AltiVec in tests/examples") endif() + + option(EIGEN_TEST_NEON "Enable/Disable altivec in tests/examples" OFF) + if(EIGEN_TEST_NEON) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=softfp -mfpu=neon -mcpu=cortex-a8") + message("Enabling NEON in tests/examples") + endif() + endif(CMAKE_COMPILER_IS_GNUCXX) if(MSVC) diff --git a/Eigen/Core b/Eigen/Core index f984a96c6..2908fef2e 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -111,6 +111,10 @@ #undef bool #undef vector #undef pixel + #elif defined __ARM_NEON__ + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_NEON + #include "arm_neon.h" #endif #endif @@ -165,6 +169,8 @@ inline static const char 
*SimdInstructionSetsInUse(void) { return "SSE, SSE2"; #elif defined(EIGEN_VECTORIZE_ALTIVEC) return "AltiVec"; +#elif defined(EIGEN_VECTORIZE_NEON) + return "ARM NEON"; #else return "None"; #endif @@ -204,6 +210,8 @@ struct Dense {}; #include "src/Core/arch/SSE/MathFunctions.h" #elif defined EIGEN_VECTORIZE_ALTIVEC #include "src/Core/arch/AltiVec/PacketMath.h" +#elif defined EIGEN_VECTORIZE_NEON + #include "src/Core/arch/NEON/PacketMath.h" #endif #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD diff --git a/Eigen/src/Core/MatrixStorage.h b/Eigen/src/Core/MatrixStorage.h index 3303b2663..ece603ffa 100644 --- a/Eigen/src/Core/MatrixStorage.h +++ b/Eigen/src/Core/MatrixStorage.h @@ -50,6 +50,12 @@ struct ei_matrix_array ei_matrix_array(ei_constructor_without_unaligned_array_assert) {} }; +// FIXME!!! This is a hack because ARM gcc does not honour __attribute__((aligned(16))) properly +#ifdef __ARM_NEON__ + #ifndef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT + #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT + #endif +#endif #ifdef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) #else diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 1526a4b97..449de2078 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -169,6 +169,11 @@ template<> inline v4f ei_pdiv(const v4f& a, const v4f& b) { return res; } +template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv(const Packet4i& /*a*/, const Packet4i& /*b*/) +{ ei_assert(false && "packet integer division are not supported by AltiVec"); + return ei_pset1(0); +} + template<> inline v4f ei_pmadd(const v4f& a, const v4f& b, const v4f& c) { return vec_madd(a, b, c); } template<> inline v4f ei_pmin(const v4f& a, const v4f& b) { return vec_min(a,b); } diff --git a/Eigen/src/Core/arch/CMakeLists.txt b/Eigen/src/Core/arch/CMakeLists.txt index 8ddba284e..5470ed8f3 100644 --- a/Eigen/src/Core/arch/CMakeLists.txt +++ 
b/Eigen/src/Core/arch/CMakeLists.txt @@ -1,2 +1,3 @@ ADD_SUBDIRECTORY(SSE) -ADD_SUBDIRECTORY(AltiVec) \ No newline at end of file +ADD_SUBDIRECTORY(AltiVec) +ADD_SUBDIRECTORY(NEON) diff --git a/Eigen/src/Core/arch/NEON/CMakeLists.txt b/Eigen/src/Core/arch/NEON/CMakeLists.txt new file mode 100644 index 000000000..fd4d4af50 --- /dev/null +++ b/Eigen/src/Core/arch/NEON/CMakeLists.txt @@ -0,0 +1,6 @@ +FILE(GLOB Eigen_Core_arch_NEON_SRCS "*.h") + +INSTALL(FILES + ${Eigen_Core_arch_NEON_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/NEON COMPONENT Devel +) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h new file mode 100644 index 000000000..9df485105 --- /dev/null +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -0,0 +1,371 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud +// Copyright (C) 2010 Konstantinos Margaritis +// Heavily based on Gael's SSE version. +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . 
+ +#ifndef EIGEN_PACKET_MATH_NEON_H +#define EIGEN_PACKET_MATH_NEON_H + +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif + +#ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE +#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE 4*96*96 +#endif + +typedef float32x4_t Packet4f; +typedef int32x4_t Packet4i; + +#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ + const Packet4f ei_p4f_##NAME = ei_pset1(X) + +#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ + const Packet4f ei_p4f_##NAME = vreinterpretq_f32_u32(ei_pset1(X)) + +#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ + const Packet4i ei_p4i_##NAME = ei_pset1(X) + +template<> struct ei_packet_traits : ei_default_packet_traits +{ + typedef Packet4f type; enum {size=4}; + enum { + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 0, + HasSqrt = 0 + }; +}; +template<> struct ei_packet_traits : ei_default_packet_traits +{ typedef Packet4i type; enum {size=4}; }; + +template<> struct ei_unpacket_traits { typedef float type; enum {size=4}; }; +template<> struct ei_unpacket_traits { typedef int type; enum {size=4}; }; + +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1(const float& from) { return vdupq_n_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pset1(const int& from) { return vdupq_n_s32(from); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_plset(const float& a) +{ + Packet4f countdown = { 3, 2, 1, 0 }; + return vaddq_f32(ei_pset1(a), countdown); +} +template<> EIGEN_STRONG_INLINE Packet4i ei_plset(const int& a) +{ + Packet4i countdown = { 3, 2, 1, 0 }; + return vaddq_s32(ei_pset1(a), countdown); +} + +template<> EIGEN_STRONG_INLINE Packet4f ei_padd(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i ei_padd(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_psub(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); } +template<> EIGEN_STRONG_INLINE 
Packet4i ei_psub(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pnegate(const Packet4f& a) { return vnegq_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pnegate(const Packet4i& a) { return vnegq_s32(a); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pmul(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pmul(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv(const Packet4f& a, const Packet4f& b) +{ + Packet4f inv, restep, div; + + // NEON does not offer a divide instruction, we have to do a reciprocal approximation + // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers + // a reciprocal estimate AND a reciprocal step -which saves a few instructions + // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with + // Newton-Raphson and vrecpsq_f32() + inv = vrecpeq_f32(b); + + // This returns a differential, by which we will have to multiply inv to get a better + // approximation of 1/b. + restep = vrecpsq_f32(b, inv); + inv = vmulq_f32(restep, inv); + + // Finally, multiply a by 1/b and get the wanted result of the division. 
+ div = vmulq_f32(a, inv); + + return div; +} +template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv(const Packet4i& /*a*/, const Packet4i& /*b*/) +{ ei_assert(false && "packet integer division are not supported by NEON"); + return ei_pset1(0); +} + +// for some weird raisons, it has to be overloaded for packet of integers +template<> EIGEN_STRONG_INLINE Packet4i ei_pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return ei_padd(ei_pmul(a,b), c); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pmin(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pmin(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pmax(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pmax(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); } + +// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics +template<> EIGEN_STRONG_INLINE Packet4f ei_pand(const Packet4f& a, const Packet4f& b) +{ + return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); +} +template<> EIGEN_STRONG_INLINE Packet4i ei_pand(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_por(const Packet4f& a, const Packet4f& b) +{ + return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); +} +template<> EIGEN_STRONG_INLINE Packet4i ei_por(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pxor(const Packet4f& a, const Packet4f& b) +{ + return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); +} +template<> EIGEN_STRONG_INLINE Packet4i ei_pxor(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot(const 
Packet4f& a, const Packet4f& b) +{ + return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); +} +template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } + +template<> EIGEN_STRONG_INLINE void ei_pstore(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } +template<> EIGEN_STRONG_INLINE void ei_pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } + +template<> EIGEN_STRONG_INLINE void ei_pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } +template<> EIGEN_STRONG_INLINE void ei_pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } + +template<> EIGEN_STRONG_INLINE float ei_pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE int ei_pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } + +template<> EIGEN_STRONG_INLINE Packet4f ei_preverse(const Packet4f& a) { + float32x2_t a_lo, a_hi; + Packet4f a_r64, a_r128; + + a_r64 = vrev64q_f32(a); + a_lo = vget_low_f32(a_r64); + a_hi = vget_high_f32(a_r64); + a_r128 = vcombine_f32(a_hi, a_lo); + + return a_r128; +} +template<> EIGEN_STRONG_INLINE Packet4i ei_preverse(const Packet4i& a) { + int32x2_t a_lo, a_hi; + Packet4i a_r64, a_r128; + + a_r64 = vrev64q_s32(a); + a_lo = vget_low_s32(a_r64); + a_hi = 
vget_high_s32(a_r64); + a_r128 = vcombine_s32(a_hi, a_lo); + + return a_r128; +} +template<> EIGEN_STRONG_INLINE Packet4f ei_pabs(const Packet4f& a) { return vabsq_f32(a); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a) { return vabsq_s32(a); } + +template<> EIGEN_STRONG_INLINE float ei_predux(const Packet4f& a) +{ + float32x2_t a_lo, a_hi, sum; + float s[2]; + + a_lo = vget_low_f32(a); + a_hi = vget_high_f32(a); + sum = vpadd_f32(a_lo, a_hi); + sum = vpadd_f32(sum, sum); + vst1_f32(s, sum); + + return s[0]; +} + +template<> EIGEN_STRONG_INLINE Packet4f ei_preduxp(const Packet4f* vecs) +{ + float32x4x2_t vtrn1, vtrn2, res1, res2; + Packet4f sum1, sum2, sum; + + // NEON zip performs interleaving of the supplied vectors. + // We perform two interleaves in a row to acquire the transposed vector + vtrn1 = vzipq_f32(vecs[0], vecs[2]); + vtrn2 = vzipq_f32(vecs[1], vecs[3]); + res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]); + res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]); + + // Do the addition of the resulting vectors + sum1 = vaddq_f32(res1.val[0], res1.val[1]); + sum2 = vaddq_f32(res2.val[0], res2.val[1]); + sum = vaddq_f32(sum1, sum2); + + return sum; +} + +template<> EIGEN_STRONG_INLINE int ei_predux(const Packet4i& a) +{ + int32x2_t a_lo, a_hi, sum; + int32_t s[2]; + + a_lo = vget_low_s32(a); + a_hi = vget_high_s32(a); + sum = vpadd_s32(a_lo, a_hi); + sum = vpadd_s32(sum, sum); + vst1_s32(s, sum); + + return s[0]; +} + +template<> EIGEN_STRONG_INLINE Packet4i ei_preduxp(const Packet4i* vecs) +{ + int32x4x2_t vtrn1, vtrn2, res1, res2; + Packet4i sum1, sum2, sum; + + // NEON zip performs interleaving of the supplied vectors. 
+ // We perform two interleaves in a row to acquire the transposed vector + vtrn1 = vzipq_s32(vecs[0], vecs[2]); + vtrn2 = vzipq_s32(vecs[1], vecs[3]); + res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]); + res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]); + + // Do the addition of the resulting vectors + sum1 = vaddq_s32(res1.val[0], res1.val[1]); + sum2 = vaddq_s32(res2.val[0], res2.val[1]); + sum = vaddq_s32(sum1, sum2); + + return sum; +} + +// Other reduction functions: +// mul +template<> EIGEN_STRONG_INLINE float ei_predux_mul(const Packet4f& a) +{ + float32x2_t a_lo, a_hi, prod; + float s[2]; + + // Get a_lo = |a1|a2| and a_hi = |a3|a4| + a_lo = vget_low_f32(a); + a_hi = vget_high_f32(a); + // Get the product of a_lo * a_hi -> |a1*a3|a2*a4| + prod = vmul_f32(a_lo, a_hi); + // Multiply prod with its swapped value |a2*a4|a1*a3| + prod = vmul_f32(prod, vrev64_f32(prod)); + vst1_f32(s, prod); + + return s[0]; +} +template<> EIGEN_STRONG_INLINE int ei_predux_mul(const Packet4i& a) +{ + int32x2_t a_lo, a_hi, prod; + int32_t s[2]; + + // Get a_lo = |a1|a2| and a_hi = |a3|a4| + a_lo = vget_low_s32(a); + a_hi = vget_high_s32(a); + // Get the product of a_lo * a_hi -> |a1*a3|a2*a4| + prod = vmul_s32(a_lo, a_hi); + // Multiply prod with its swapped value |a2*a4|a1*a3| + prod = vmul_s32(prod, vrev64_s32(prod)); + vst1_s32(s, prod); + + return s[0]; +} + +// min +template<> EIGEN_STRONG_INLINE float ei_predux_min(const Packet4f& a) +{ + float32x2_t a_lo, a_hi, min; + float s[2]; + + a_lo = vget_low_f32(a); + a_hi = vget_high_f32(a); + min = vpmin_f32(a_lo, a_hi); + min = vpmin_f32(min, min); + vst1_f32(s, min); + + return s[0]; +} +template<> EIGEN_STRONG_INLINE int ei_predux_min(const Packet4i& a) +{ + int32x2_t a_lo, a_hi, min; + int32_t s[2]; + + a_lo = vget_low_s32(a); + a_hi = vget_high_s32(a); + min = vpmin_s32(a_lo, a_hi); + min = vpmin_s32(min, min); + vst1_s32(s, min); + + return s[0]; +} + +// max +template<> EIGEN_STRONG_INLINE float ei_predux_max(const Packet4f& a) 
+{ + float32x2_t a_lo, a_hi, max; + float s[2]; + + a_lo = vget_low_f32(a); + a_hi = vget_high_f32(a); + max = vpmax_f32(a_lo, a_hi); + max = vpmax_f32(max, max); + vst1_f32(s, max); + + return s[0]; +} +template<> EIGEN_STRONG_INLINE int ei_predux_max(const Packet4i& a) +{ + int32x2_t a_lo, a_hi, max; + int32_t s[2]; + + a_lo = vget_low_s32(a); + a_hi = vget_high_s32(a); + max = vpmax_s32(a_lo, a_hi); + max = vpmax_s32(max, max); + vst1_s32(s, max); + + return s[0]; +} + +template +struct ei_palign_impl +{ + EIGEN_STRONG_INLINE static void run(Packet4f& first, const Packet4f& second) + { + if (Offset!=0) + first = vextq_f32(first, second, Offset); + } +}; + +template +struct ei_palign_impl +{ + EIGEN_STRONG_INLINE static void run(Packet4i& first, const Packet4i& second) + { + if (Offset!=0) + first = vextq_s32(first, second, Offset); + } +}; +#endif // EIGEN_PACKET_MATH_NEON_H diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 37ccef047..7970b3bb0 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -39,7 +39,7 @@ // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable 16 byte alignment on all // platforms where vectorization might be enabled. In theory we could always enable alignment, but it can be a cause of problems // on some platforms, so we just disable it in certain common platform (compiler+architecture combinations) to avoid these problems. 
-#if defined(__GNUC__) && !(defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__ppc__) || defined(__ia64__)) +#if defined(__GNUC__) && !(defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__ppc__) || defined(__ia64__) || defined(__ARM_NEON__)) #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_ALIGNMENT 1 #else #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_ALIGNMENT 0 diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index fbb1ef4d6..4d037b998 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -424,7 +424,7 @@ inline static Integer ei_first_aligned(const Scalar* array, Integer size) * ei_aligned_stack_free(data,float,array.size()); * \endcode */ -#ifdef __linux__ +#if (defined __linux__) && !(defined __ARM_NEON__) #define ei_aligned_stack_alloc(SIZE) (SIZE<=EIGEN_STACK_ALLOCATION_LIMIT) \ ? alloca(SIZE) \ : ei_aligned_malloc(SIZE) diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 69c63e7bd..67665d91d 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -88,8 +88,13 @@ class ei_compute_matrix_flags enum { row_major_bit = Options&RowMajor ? RowMajorBit : 0, is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic, +#if !defined(__ARM_NEON__) is_fixed_size_aligned = (!is_dynamic_size_storage) +#else +// FIXME!!! 
This is a hack because ARM gcc does not honour __attribute__((aligned(16))) properly + is_fixed_size_aligned = 0 +#endif && (((MaxCols*MaxRows) % ei_packet_traits::size) == 0), aligned_bit = (((Options&DontAlign)==0) && (is_dynamic_size_storage || is_fixed_size_aligned)) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 7d90882a2..b08f8c340 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -185,11 +185,17 @@ macro(ei_testing_print_summary) endif() if(EIGEN_TEST_ALTIVEC) - message("Altivec: Using architecture defaults") + message("Altivec: ON") else() message("Altivec: Using architecture defaults") endif() + if(EIGEN_TEST_NEON) + message("ARM NEON: ON") + else() + message("ARM NEON: Using architecture defaults") + endif() + if(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION) message("Explicit vec: OFF") else() diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 7d863e616..e0cb61525 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -32,7 +32,10 @@ template T ei_negate(const T& x) { return -x; } template bool areApprox(const Scalar* a, const Scalar* b, int size) { for (int i=0; i Date: Wed, 3 Mar 2010 18:47:58 +0100 Subject: [PATCH 090/122] factorize default performance related settings to a single file included after the architecture specific files such that they can be adapted by each platform. 
--- Eigen/Core | 4 +- Eigen/src/Core/DenseStorageBase.h | 2 +- Eigen/src/Core/arch/Default/Settings.h | 65 ++++++++++++++++++++++++++ Eigen/src/Core/util/Macros.h | 24 ---------- test/lu.cpp | 14 +++--- 5 files changed, 74 insertions(+), 35 deletions(-) create mode 100644 Eigen/src/Core/arch/Default/Settings.h diff --git a/Eigen/Core b/Eigen/Core index 2908fef2e..01456fe7e 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -214,9 +214,7 @@ struct Dense {}; #include "src/Core/arch/NEON/PacketMath.h" #endif -#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 -#endif +#include "src/Core/arch/Default/Settings.h" #include "src/Core/Functors.h" #include "src/Core/DenseBase.h" diff --git a/Eigen/src/Core/DenseStorageBase.h b/Eigen/src/Core/DenseStorageBase.h index c7f903c7a..dac2142a4 100644 --- a/Eigen/src/Core/DenseStorageBase.h +++ b/Eigen/src/Core/DenseStorageBase.h @@ -601,7 +601,7 @@ struct ei_conservative_resize_like_impl const int new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows(); const int new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1; _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols); - + if (num_new_elements > 0) _this.tail(num_new_elements) = other.tail(num_new_elements); } diff --git a/Eigen/src/Core/arch/Default/Settings.h b/Eigen/src/Core/arch/Default/Settings.h new file mode 100644 index 000000000..1e7cebdba --- /dev/null +++ b/Eigen/src/Core/arch/Default/Settings.h @@ -0,0 +1,65 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2010 Gael Guennebaud +// Copyright (C) 2006-2008 Benoit Jacob +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. 
+// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + + +/* All the parameters defined in this file can be specialized in the + * architecture specific files, and/or by the user. + * More to come... */ + +#ifndef EIGEN_DEFAULT_SETTINGS_H +#define EIGEN_DEFAULT_SETTINGS_H + +/** Defines the maximal loop size to enable meta unrolling of loops. + * Note that the value here is expressed in Eigen's own notion of "number of FLOPS", + * it does not correspond to the number of iterations or the number of instructions + */ +#ifndef EIGEN_UNROLLING_LIMIT +#define EIGEN_UNROLLING_LIMIT 100 +#endif + +/** Defines the threshold between a "small" and a "large" matrix. + * This threshold is mainly used to select the proper product implementation. + */ +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif + +/** Defines the maximal size in Bytes of blocks fitting in CPU cache. 
+ * The current value is set to generate blocks of 256x256 for float + * + * Typically for a single-threaded application you would set that to 25% of the size of your CPU caches in bytes + */ +#ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE +#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (sizeof(float)*256*256) +#endif + +/** Defines the maximal width of the blocks used in the triangular product and solver + * for vectors (level 2 blas xTRMV and xTRSV). The default is 8. + */ +#ifndef EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH +#define EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH 8 +#endif + +#endif // EIGEN_DEFAULT_SETTINGS_H diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 7970b3bb0..19420f610 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -78,30 +78,6 @@ #define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ColMajor #endif -/** Defines the maximal loop size to enable meta unrolling of loops. - * Note that the value here is expressed in Eigen's own notion of "number of FLOPS", - * it does not correspond to the number of iterations or the number of instructions - */ -#ifndef EIGEN_UNROLLING_LIMIT -#define EIGEN_UNROLLING_LIMIT 100 -#endif - -/** Defines the maximal size in Bytes of blocks fitting in CPU cache. - * The current value is set to generate blocks of 256x256 for float - * - * Typically for a single-threaded application you would set that to 25% of the size of your CPU caches in bytes - */ -#ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE -#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (sizeof(float)*256*256) -#endif - -/** Defines the maximal width of the blocks used in the triangular product and solver - * for vectors (level 2 blas xTRMV and xTRSV). The default is 8. - */ -#ifndef EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH -#define EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH 8 -#endif - /** Allows to disable some optimizations which might affect the accuracy of the result. * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. 
* They currently include: diff --git a/test/lu.cpp b/test/lu.cpp index 1ed38cb2b..37e2990d2 100644 --- a/test/lu.cpp +++ b/test/lu.cpp @@ -67,7 +67,7 @@ template void lu_non_invertible() // The image of the zero matrix should consist of a single (zero) column vector VERIFY((MatrixType::Zero(rows,cols).fullPivLu().image(MatrixType::Zero(rows,cols)).cols() == 1)); - + MatrixType m1(rows, cols), m3(rows, cols2); CMatrixType m2(cols, cols2); createRandomPIMatrixOfRank(rank, rows, cols, m1); @@ -85,9 +85,9 @@ template void lu_non_invertible() RMatrixType l = RMatrixType::Identity(rows,rows); l.block(0,0,rows,std::min(rows,cols)).template triangularView() = lu.matrixLU().block(0,0,rows,std::min(rows,cols)); - + VERIFY_IS_APPROX(lu.permutationP() * m1 * lu.permutationQ(), l*u); - + KernelMatrixType m1kernel = lu.kernel(); ImageMatrixType m1image = lu.image(m1); @@ -192,20 +192,20 @@ void test_lu() CALL_SUBTEST_2( (lu_non_invertible >()) ); CALL_SUBTEST_2( (lu_verify_assert >()) ); - + CALL_SUBTEST_3( lu_non_invertible() ); CALL_SUBTEST_3( lu_invertible() ); CALL_SUBTEST_3( lu_verify_assert() ); - + CALL_SUBTEST_4( lu_non_invertible() ); CALL_SUBTEST_4( lu_invertible() ); CALL_SUBTEST_4( lu_partial_piv() ); CALL_SUBTEST_4( lu_verify_assert() ); - + CALL_SUBTEST_5( lu_non_invertible() ); CALL_SUBTEST_5( lu_invertible() ); CALL_SUBTEST_5( lu_verify_assert() ); - + CALL_SUBTEST_6( lu_non_invertible() ); CALL_SUBTEST_6( lu_invertible() ); CALL_SUBTEST_6( lu_partial_piv() ); From 6c89fd4df0137823cb1ce5cf9ac94d909a54dd6c Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 3 Mar 2010 13:16:21 -0500 Subject: [PATCH 091/122] minor cleanup --- CMakeLists.txt | 4 ++-- Eigen/src/Core/util/XprHelper.h | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7162e4457..4016de370 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,13 +98,13 @@ if(CMAKE_COMPILER_IS_GNUCXX) message("Enabling SSE4.2 in tests/examples") endif() - 
option(EIGEN_TEST_ALTIVEC "Enable/Disable altivec in tests/examples" OFF) + option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF) if(EIGEN_TEST_ALTIVEC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec") message("Enabling AltiVec in tests/examples") endif() - option(EIGEN_TEST_NEON "Enable/Disable altivec in tests/examples" OFF) + option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF) if(EIGEN_TEST_NEON) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=softfp -mfpu=neon -mcpu=cortex-a8") message("Enabling NEON in tests/examples") diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 67665d91d..eff055b04 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -90,12 +90,11 @@ class ei_compute_matrix_flags is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic, #if !defined(__ARM_NEON__) is_fixed_size_aligned - = (!is_dynamic_size_storage) + = (!is_dynamic_size_storage) && (((MaxCols*MaxRows) % ei_packet_traits::size) == 0), #else // FIXME!!! This is a hack because ARM gcc does not honour __attribute__((aligned(16))) properly - is_fixed_size_aligned = 0 + is_fixed_size_aligned = 0, #endif - && (((MaxCols*MaxRows) % ei_packet_traits::size) == 0), aligned_bit = (((Options&DontAlign)==0) && (is_dynamic_size_storage || is_fixed_size_aligned)) ? 
AlignedBit : 0, From 710bc073a739673308374925826bbf2bf6f54daf Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 3 Mar 2010 12:15:34 -0600 Subject: [PATCH 092/122] arm_neon.h is a standard header file, fixed --- Eigen/Core | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/Core b/Eigen/Core index 01456fe7e..56849116c 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -114,7 +114,7 @@ #elif defined __ARM_NEON__ #define EIGEN_VECTORIZE #define EIGEN_VECTORIZE_NEON - #include "arm_neon.h" + #include #endif #endif From 68d94d914ed7023e998e392b8a127966d05b5af8 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Thu, 4 Mar 2010 09:03:06 -0500 Subject: [PATCH 093/122] integer division is vectorizable on no SIMD platform, not just SSE. --- Eigen/src/Core/Functors.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/Functors.h b/Eigen/src/Core/Functors.h index 31d0cff70..c2b317cc0 100644 --- a/Eigen/src/Core/Functors.h +++ b/Eigen/src/Core/Functors.h @@ -179,7 +179,7 @@ struct ei_functor_traits > { enum { Cost = 2 * NumTraits::MulCost, PacketAccess = ei_packet_traits::size>1 - #if (defined EIGEN_VECTORIZE_SSE) + #if (defined EIGEN_VECTORIZE) && NumTraits::HasFloatingPoint #endif }; From 8ed1ef446998dc35f738ad9984cf479dbfc2cc6c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 4 Mar 2010 18:30:28 +0100 Subject: [PATCH 094/122] add a minor FIXME --- Eigen/src/Core/arch/NEON/PacketMath.h | 33 ++++++++++++++------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 9df485105..f71b92a75 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -96,13 +96,13 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv(const Packet4f& a, con // NEON does not offer a divide instruction, we have to do a reciprocal approximation // However NEON in contrast to other SIMD engines 
(AltiVec/SSE), offers - // a reciprocal estimate AND a reciprocal step -which saves a few instructions - // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with + // a reciprocal estimate AND a reciprocal step -which saves a few instructions + // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with // Newton-Raphson and vrecpsq_f32() inv = vrecpeq_f32(b); // This returns a differential, by which we will have to multiply inv to get a better - // approximation of 1/b. + // approximation of 1/b. restep = vrecpsq_f32(b, inv); inv = vmulq_f32(restep, inv); @@ -139,13 +139,13 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_por(const Packet4f& a, cons template<> EIGEN_STRONG_INLINE Packet4i ei_por(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); } template<> EIGEN_STRONG_INLINE Packet4f ei_pxor(const Packet4f& a, const Packet4f& b) -{ +{ return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } template<> EIGEN_STRONG_INLINE Packet4i ei_pxor(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); } template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot(const Packet4f& a, const Packet4f& b) -{ +{ return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); } @@ -162,13 +162,14 @@ template<> EIGEN_STRONG_INLINE void ei_pstore(int* to, const Packet4i template<> EIGEN_STRONG_INLINE void ei_pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } template<> EIGEN_STRONG_INLINE void ei_pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } +// FIXME only store the 2 first elements ? 
template<> EIGEN_STRONG_INLINE float ei_pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE int ei_pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE Packet4f ei_preverse(const Packet4f& a) { +template<> EIGEN_STRONG_INLINE Packet4f ei_preverse(const Packet4f& a) { float32x2_t a_lo, a_hi; Packet4f a_r64, a_r128; - + a_r64 = vrev64q_f32(a); a_lo = vget_low_f32(a_r64); a_hi = vget_high_f32(a_r64); @@ -179,7 +180,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_preverse(const Packet4f& a) { template<> EIGEN_STRONG_INLINE Packet4i ei_preverse(const Packet4i& a) { int32x2_t a_lo, a_hi; Packet4i a_r64, a_r128; - + a_r64 = vrev64q_s32(a); a_lo = vget_low_s32(a_r64); a_hi = vget_high_s32(a_r64); @@ -200,7 +201,7 @@ template<> EIGEN_STRONG_INLINE float ei_predux(const Packet4f& a) sum = vpadd_f32(a_lo, a_hi); sum = vpadd_f32(sum, sum); vst1_f32(s, sum); - + return s[0]; } @@ -220,7 +221,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_preduxp(const Packet4f* vec sum1 = vaddq_f32(res1.val[0], res1.val[1]); sum2 = vaddq_f32(res2.val[0], res2.val[1]); sum = vaddq_f32(sum1, sum2); - + return sum; } @@ -234,7 +235,7 @@ template<> EIGEN_STRONG_INLINE int ei_predux(const Packet4i& a) sum = vpadd_s32(a_lo, a_hi); sum = vpadd_s32(sum, sum); vst1_s32(s, sum); - + return s[0]; } @@ -254,7 +255,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_preduxp(const Packet4i* vec sum1 = vaddq_s32(res1.val[0], res1.val[1]); sum2 = vaddq_s32(res2.val[0], res2.val[1]); sum = vaddq_s32(sum1, sum2); - + return sum; } @@ -273,7 +274,7 @@ template<> EIGEN_STRONG_INLINE float ei_predux_mul(const Packet4f& a) // Multiply prod with its swapped value |a2*a4|a1*a3| prod = vmul_f32(prod, vrev64_f32(prod)); vst1_f32(s, prod); - + return s[0]; } template<> EIGEN_STRONG_INLINE int ei_predux_mul(const Packet4i& a) @@ -289,7 +290,7 @@ template<> EIGEN_STRONG_INLINE int ei_predux_mul(const 
Packet4i& a) // Multiply prod with its swapped value |a2*a4|a1*a3| prod = vmul_s32(prod, vrev64_s32(prod)); vst1_s32(s, prod); - + return s[0]; } @@ -317,7 +318,7 @@ template<> EIGEN_STRONG_INLINE int ei_predux_min(const Packet4i& a) min = vpmin_s32(a_lo, a_hi); min = vpmin_s32(min, min); vst1_s32(s, min); - + return s[0]; } @@ -345,7 +346,7 @@ template<> EIGEN_STRONG_INLINE int ei_predux_max(const Packet4i& a) max = vpmax_s32(a_lo, a_hi); max = vpmax_s32(max, max); vst1_s32(s, max); - + return s[0]; } From 17230686944e63cca8d9ce1c981d6e869fcbcb62 Mon Sep 17 00:00:00 2001 From: Hauke Heibel Date: Thu, 4 Mar 2010 18:33:51 +0100 Subject: [PATCH 095/122] Moved x()/y()/z() and w() access functions to DenseBase; they are now available for Arrays as well. --- Eigen/src/Core/Coeffs.h | 16 ++++++++-------- Eigen/src/Core/DenseBase.h | 9 +++++++++ Eigen/src/Core/MatrixBase.h | 9 --------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/Eigen/src/Core/Coeffs.h b/Eigen/src/Core/Coeffs.h index da7b9153f..727dea75e 100644 --- a/Eigen/src/Core/Coeffs.h +++ b/Eigen/src/Core/Coeffs.h @@ -239,42 +239,42 @@ EIGEN_STRONG_INLINE typename ei_traits::Scalar& DenseBase /** equivalent to operator[](0). */ template -EIGEN_STRONG_INLINE const typename MatrixBase::CoeffReturnType MatrixBase +EIGEN_STRONG_INLINE const typename DenseBase::CoeffReturnType DenseBase ::x() const { return (*this)[0]; } /** equivalent to operator[](1). */ template -EIGEN_STRONG_INLINE const typename MatrixBase::CoeffReturnType MatrixBase +EIGEN_STRONG_INLINE const typename DenseBase::CoeffReturnType DenseBase ::y() const { return (*this)[1]; } /** equivalent to operator[](2). */ template -EIGEN_STRONG_INLINE const typename MatrixBase::CoeffReturnType MatrixBase +EIGEN_STRONG_INLINE const typename DenseBase::CoeffReturnType DenseBase ::z() const { return (*this)[2]; } /** equivalent to operator[](3). 
*/ template -EIGEN_STRONG_INLINE const typename MatrixBase::CoeffReturnType MatrixBase +EIGEN_STRONG_INLINE const typename DenseBase::CoeffReturnType DenseBase ::w() const { return (*this)[3]; } /** equivalent to operator[](0). */ template -EIGEN_STRONG_INLINE typename ei_traits::Scalar& MatrixBase +EIGEN_STRONG_INLINE typename ei_traits::Scalar& DenseBase ::x() { return (*this)[0]; } /** equivalent to operator[](1). */ template -EIGEN_STRONG_INLINE typename ei_traits::Scalar& MatrixBase +EIGEN_STRONG_INLINE typename ei_traits::Scalar& DenseBase ::y() { return (*this)[1]; } /** equivalent to operator[](2). */ template -EIGEN_STRONG_INLINE typename ei_traits::Scalar& MatrixBase +EIGEN_STRONG_INLINE typename ei_traits::Scalar& DenseBase ::z() { return (*this)[2]; } /** equivalent to operator[](3). */ template -EIGEN_STRONG_INLINE typename ei_traits::Scalar& MatrixBase +EIGEN_STRONG_INLINE typename ei_traits::Scalar& DenseBase ::w() { return (*this)[3]; } /** \returns the packet of coefficients starting at the given row and column. It is your responsibility diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index 67540bd8c..52a883811 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -280,6 +280,15 @@ template class DenseBase typedef Block::ColsAtCompileTime> RowXpr; #endif // not EIGEN_PARSED_BY_DOXYGEN + const CoeffReturnType x() const; + const CoeffReturnType y() const; + const CoeffReturnType z() const; + const CoeffReturnType w() const; + Scalar& x(); + Scalar& y(); + Scalar& z(); + Scalar& w(); + /** Copies \a other into *this. \returns a reference to *this. 
*/ template Derived& operator=(const DenseBase& other); diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 9c62163ba..ac79de66d 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -169,15 +169,6 @@ template class MatrixBase Derived& lazyAssign(const ProductBase& other); #endif // not EIGEN_PARSED_BY_DOXYGEN - const CoeffReturnType x() const; - const CoeffReturnType y() const; - const CoeffReturnType z() const; - const CoeffReturnType w() const; - Scalar& x(); - Scalar& y(); - Scalar& z(); - Scalar& w(); - template Derived& operator+=(const MatrixBase& other); template From ea8cad51518cfae6eb7406268aef6c28ff62389f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 4 Mar 2010 18:58:12 +0100 Subject: [PATCH 096/122] make the number of registers easier to configure per architectures --- Eigen/src/Core/arch/Default/Settings.h | 13 +++++++++++++ Eigen/src/Core/arch/NEON/PacketMath.h | 6 ++++++ Eigen/src/Core/util/BlasUtil.h | 8 ++------ 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/arch/Default/Settings.h b/Eigen/src/Core/arch/Default/Settings.h index 1e7cebdba..1ab2877b6 100644 --- a/Eigen/src/Core/arch/Default/Settings.h +++ b/Eigen/src/Core/arch/Default/Settings.h @@ -62,4 +62,17 @@ #define EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH 8 #endif + +/** Defines the default number of registers available for that architecture. + * Currently it must be 8 or 16. Other values will fail. 
+ */ +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#if (defined __i386__) +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8 +#else +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 +#endif + +#endif + #endif // EIGEN_DEFAULT_SETTINGS_H diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index f71b92a75..2acb3633a 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -35,6 +35,12 @@ #define EIGEN_TUNE_FOR_CPU_CACHE_SIZE 4*96*96 #endif +// FIXME NEON has 16 quad registers, but since the current register allocator +// is so bad, it is much better to reduce it to 8 +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8 +#endif + typedef float32x4_t Packet4f; typedef int32x4_t Packet4i; diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 4d216d77a..95ff446c7 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -130,14 +130,10 @@ struct ei_product_blocking_traits typedef typename ei_packet_traits::type PacketType; enum { PacketSize = sizeof(PacketType)/sizeof(Scalar), - #if (defined __i386__) - HalfRegisterCount = 4, - #else - HalfRegisterCount = 8, - #endif + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, // register block size along the N direction (must be either 2 or 4) - nr = HalfRegisterCount/2, + nr = NumberOfRegisters/4, // register block size along the M direction (currently, this one cannot be modified) mr = 2 * PacketSize, From 620bd2848034a82dc3d63c972f20fcd4e96831e1 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 5 Mar 2010 09:44:21 +0100 Subject: [PATCH 097/122] enable posix_memalign for QNX --- Eigen/src/Core/util/Memory.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 4d037b998..aa0073d44 100644 --- a/Eigen/src/Core/util/Memory.h +++ 
b/Eigen/src/Core/util/Memory.h @@ -66,7 +66,7 @@ #define EIGEN_MALLOC_ALREADY_ALIGNED 0 #endif -#if ((defined _GNU_SOURCE) || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) \ +#if ((defined __QNXNTO__) || (defined _GNU_SOURCE) || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) \ && (defined _POSIX_ADVISORY_INFO) && (_POSIX_ADVISORY_INFO > 0) #define EIGEN_HAS_POSIX_MEMALIGN 1 #else @@ -103,8 +103,8 @@ inline void ei_handmade_aligned_free(void *ptr) if (ptr) std::free(*(reinterpret_cast(ptr) - 1)); } -/** \internal - * \brief Reallocates aligned memory. +/** \internal + * \brief Reallocates aligned memory. * Since we know that our handmade version is based on std::realloc * we can use std::realloc to implement efficient reallocation. */ @@ -126,10 +126,10 @@ inline void* ei_handmade_aligned_realloc(void* ptr, size_t size, size_t = 0) void* ei_aligned_malloc(size_t size); void ei_aligned_free(void *ptr); -/** \internal +/** \internal * \brief Reallocates aligned memory. * Allows reallocation with aligned ptr types. This implementation will - * always create a new memory chunk and copy the old data. + * always create a new memory chunk and copy the old data. */ inline void* ei_generic_aligned_realloc(void* ptr, size_t size, size_t old_size) { @@ -143,13 +143,13 @@ inline void* ei_generic_aligned_realloc(void* ptr, size_t size, size_t old_size) } void* newptr = ei_aligned_malloc(size); - if (newptr == 0) - { + if (newptr == 0) + { errno = ENOMEM; // according to the standard return 0; } - if (ptr != 0) + if (ptr != 0) { std::memcpy(newptr, ptr, std::min(size,old_size)); ei_aligned_free(ptr); @@ -212,7 +212,7 @@ inline void ei_aligned_free(void *ptr) } /** -* \internal +* \internal * \brief Reallocates an aligned block of memory. * \throws std::bad_alloc if EIGEN_EXCEPTIONS are defined. 
**/ @@ -413,10 +413,10 @@ inline static Integer ei_first_aligned(const Scalar* array, Integer size) *****************************************************************************/ /** \internal - * Allocates an aligned buffer of SIZE bytes on the stack if SIZE is smaller than - * EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform + * Allocates an aligned buffer of SIZE bytes on the stack if SIZE is smaller than + * EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform * (currently, this is Linux only). Otherwise the memory is allocated on the heap. - * Data allocated with ei_aligned_stack_alloc \b must be freed by calling + * Data allocated with ei_aligned_stack_alloc \b must be freed by calling * ei_aligned_stack_free(PTR,SIZE). * \code * float * data = ei_aligned_stack_alloc(float,array.size()); From 24ef5fedcda79246b337780bc5da63be188e2a75 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 5 Mar 2010 09:57:04 +0100 Subject: [PATCH 098/122] minor cleaning --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 30 +------------------ bench/bench_gemm.cpp | 2 +- 2 files changed, 2 insertions(+), 30 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index cf42855eb..b20a16cd0 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -96,12 +96,6 @@ static void run(int rows, int cols, int depth, Scalar* w = ei_aligned_stack_new(Scalar, sizeW); Scalar* blockB = (Scalar*)info[tid].blockB; - // if you have the GOTO blas library you can try our parallelization strategy - // using GOTO's optimized routines. - #ifdef USEGOTOROUTINES - void* u = alloca(4096+sizeW); - #endif - // For each horizontal panel of the rhs, and corresponding panel of the lhs... 
// (==GEMM_VAR1) for(int k=0; k0) while(info[j].sync!=k) {} - #ifndef USEGOTOROUTINES gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*kc, mc, actual_kc, info[j].rhs_length, -1,-1,0,0, w); - #else - sgemm_kernel(mc, info[j].rhs_length, actual_kc, alpha, blockA, blockB+info[j].rhs_start*kc, res+info[j].rhs_start*resStride, resStride); - #endif - } // Then keep going as usual with the remaining A' @@ -161,18 +140,10 @@ static void run(int rows, int cols, int depth, const int actual_mc = std::min(i+mc,rows)-i; // pack A_i,k to A' - #ifndef USEGOTOROUTINES pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc); - #else - sgemm_itcopy(actual_kc, actual_mc, &lhs(i,k), lhsStride, blockA); - #endif // C_i += A' * B' - #ifndef USEGOTOROUTINES gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, -1,-1,0,0, w); - #else - sgemm_kernel(actual_mc, cols, actual_kc, alpha, blockA, blockB, res+i, resStride); - #endif } // Release all the sub blocks B'_j of B' for the current thread, @@ -188,6 +159,7 @@ static void run(int rows, int cols, int depth, else #endif // EIGEN_HAS_OPENMP { + (void)info; // info is not used // this is the sequential version! Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*cols; diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp index 653a880a8..3cb75c17a 100644 --- a/bench/bench_gemm.cpp +++ b/bench/bench_gemm.cpp @@ -3,7 +3,7 @@ // icpc bench_gemm.cpp -I .. 
-O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out #include - +#include #include using namespace std; From d13b877014928c80a7cf0ae2e563d4e2e60e2c3c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 5 Mar 2010 10:04:17 +0100 Subject: [PATCH 099/122] remove the 1D and 2D parallelizer, keep only the GEMM specialized one --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 6 +- Eigen/src/Core/products/Parallelizer.h | 72 +------------------ 2 files changed, 5 insertions(+), 73 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index b20a16cd0..cbb389542 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -273,11 +273,9 @@ class GeneralProduct (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, _ActualLhsType, _ActualRhsType, - Dest> Functor; + Dest> GemmFunctor; -// ei_run_parallel_1d(Functor(lhs, rhs, dst, actualAlpha), this->rows()); -// ei_run_parallel_2d(Functor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols()); - ei_run_parallel_gemm(Functor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols()); + ei_parallelize_gemm32>(GemmFunctor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols()); } }; diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 439ce1565..62cf16047 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -25,71 +25,6 @@ #ifndef EIGEN_PARALLELIZER_H #define EIGEN_PARALLELIZER_H -template -void ei_run_parallel_1d(const Functor& func, int size) -{ -#ifndef EIGEN_HAS_OPENMP - func(0,size); -#else - if(!Parallelize) - return func(0,size); - - int threads = omp_get_num_procs(); - int blockSize = size / threads; - #pragma omp parallel for schedule(static,1) - for(int i=0; i -void ei_run_parallel_2d(const Functor& func, int size1, int size2) -{ -#ifndef EIGEN_HAS_OPENMP - func(0,size1, 0,size2); -#else - - int threads = 
omp_get_max_threads(); - if((!Parallelize)||(threads==1)) - return func(0,size1, 0,size2); - - // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 - static const int divide1[17] = { 0, 1, 2, 3, 2, 5, 3, 7, 4, 3, 5, 1, 4, 1, 7, 5, 4}; - static const int divide2[17] = { 0, 1, 1, 1, 2, 1, 2, 1, 2, 3, 2, 11, 3, 13, 2, 3, 4}; - - - - ei_assert(threads<=16 && "too many threads !"); - int blockSize1 = size1 / divide1[threads]; - int blockSize2 = size2 / divide2[threads]; - - Matrix ranges(4,threads); - int k = 0; - for(int i1=0; i1 -void ei_run_parallel_gemm(const Functor& func, int rows, int cols) +template +void ei_parallelize_gemm(const Functor& func, int rows, int cols) { #ifndef EIGEN_HAS_OPENMP func(0,rows, 0,cols); #else int threads = omp_get_max_threads(); - if((!Parallelize)||(threads==1)) + if((!Condition)||(threads==1)) return func(0,rows, 0,cols); - int blockCols = (cols / threads) & ~0x3; int blockRows = (rows / threads) & ~0x7; From 62ac0216060045619ff1e6035643ecf9dbefa14f Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 5 Mar 2010 10:16:25 +0100 Subject: [PATCH 100/122] fix openmp version for scalar types different than float --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 16 ++++++++++++---- Eigen/src/Core/products/Parallelizer.h | 11 ++++++----- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index cbb389542..c1d42d387 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -40,7 +40,7 @@ struct ei_general_matrix_matrix_product* info = 0) { // transpose the product such that the result is column major ei_general_matrix_matrix_product* info = 0) { ei_const_blas_data_mapper lhs(_lhs,lhsStride); ei_const_blas_data_mapper rhs(_rhs,rhsStride); @@ -218,11 +218,13 @@ struct ei_traits > template struct ei_gemm_functor { + typedef typename Rhs::Scalar BlockBScalar; + 
ei_gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, Scalar actualAlpha) : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha) {} - void operator() (int row, int rows, int col=0, int cols=-1, GemmParallelInfo* info=0) const + void operator() (int row, int rows, int col=0, int cols=-1, GemmParallelInfo* info=0) const { if(cols==-1) cols = m_rhs.cols(); @@ -234,6 +236,12 @@ struct ei_gemm_functor info); } + + int sharedBlockBSize() const + { + return std::min(ei_product_blocking_traits::Max_kc,m_rhs.rows()) * m_rhs.cols(); + } + protected: const Lhs& m_lhs; const Rhs& m_rhs; @@ -275,7 +283,7 @@ class GeneralProduct _ActualRhsType, Dest> GemmFunctor; - ei_parallelize_gemm32>(GemmFunctor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols()); + ei_parallelize_gemm<(Dest::MaxRowsAtCompileTime>32)>(GemmFunctor(lhs, rhs, dst, actualAlpha), this->rows(), this->cols()); } }; diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 62cf16047..03d85c1ce 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -25,16 +25,16 @@ #ifndef EIGEN_PARALLELIZER_H #define EIGEN_PARALLELIZER_H -struct GemmParallelInfo +template struct GemmParallelInfo { - GemmParallelInfo() : sync(-1), users(0) {} + GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0), blockB(0) {} int volatile sync; int volatile users; int rhs_start; int rhs_length; - float* blockB; + BlockBScalar* blockB; }; template @@ -51,9 +51,10 @@ void ei_parallelize_gemm(const Functor& func, int rows, int cols) int blockCols = (cols / threads) & ~0x3; int blockRows = (rows / threads) & ~0x7; - float* sharedBlockB = new float[2048*2048*4]; + typedef typename Functor::BlockBScalar BlockBScalar; + BlockBScalar* sharedBlockB = new BlockBScalar[func.sharedBlockBSize()]; - GemmParallelInfo* info = new GemmParallelInfo[threads]; + GemmParallelInfo* info = new GemmParallelInfo[threads]; #pragma omp parallel for 
schedule(static,1) for(int i=0; i Date: Fri, 5 Mar 2010 10:22:27 +0100 Subject: [PATCH 101/122] add an option to test ompenmp --- CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4016de370..4e9c4533d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -110,6 +110,15 @@ if(CMAKE_COMPILER_IS_GNUCXX) message("Enabling NEON in tests/examples") endif() + check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP) + if(COMPILER_SUPPORT_OPENMP) + option(EIGEN_TEST_OPENMP "Enable/Disable OpenMP in tests/examples" OFF) + if(EIGEN_TEST_OPENMP) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") + message("Enabling OpenMP in tests/examples") + endif() + endif() + endif(CMAKE_COMPILER_IS_GNUCXX) if(MSVC) From 48d0595c29eef24ef98b82d23ed075de4819e39c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 5 Mar 2010 10:44:31 +0100 Subject: [PATCH 102/122] * dynamically adjust the number of threads * disbale parallelisation if we already are in a parallel session --- Eigen/src/Core/products/Parallelizer.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 03d85c1ce..304dc7ed0 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -44,8 +44,24 @@ void ei_parallelize_gemm(const Functor& func, int rows, int cols) func(0,rows, 0,cols); #else - int threads = omp_get_max_threads(); - if((!Condition)||(threads==1)) + // Dynamically check whether we should enable or disable OpenMP. + // The conditions are: + // - the max number of threads we can create is greater than 1 + // - we are not already in a parallel code + // - the sizes are large enough + + // 1- are we already in a parallel session? 
+ if((!Condition) || (omp_get_num_threads()>1)) + return func(0,rows, 0,cols); + + // 2- compute the maximal number of threads from the size of the product: + // FIXME this has to be fine tuned + int max_threads = std::max(1,rows / 32); + + // 3 - compute the number of threads we are going to use + int threads = std::min(omp_get_max_threads(), max_threads); + + if(threads==1) return func(0,rows, 0,cols); int blockCols = (cols / threads) & ~0x3; @@ -56,7 +72,7 @@ void ei_parallelize_gemm(const Functor& func, int rows, int cols) GemmParallelInfo* info = new GemmParallelInfo[threads]; - #pragma omp parallel for schedule(static,1) + #pragma omp parallel for schedule(static,1) num_threads(threads) for(int i=0; i Date: Fri, 5 Mar 2010 10:45:29 +0100 Subject: [PATCH 103/122] add a FIXME --- Eigen/src/Core/products/Parallelizer.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 304dc7ed0..6e15e0ff5 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -51,6 +51,7 @@ void ei_parallelize_gemm(const Functor& func, int rows, int cols) // - the sizes are large enough // 1- are we already in a parallel session? + // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp? 
if((!Condition) || (omp_get_num_threads()>1)) return func(0,rows, 0,cols); From c44220835864d2994a2a2b030100125fa00aa378 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 5 Mar 2010 11:35:43 +0100 Subject: [PATCH 104/122] clean a bit the bench_gemm files --- bench/bench_gemm.cpp | 9 ++-- bench/bench_gemm_blas.cpp | 109 -------------------------------------- 2 files changed, 3 insertions(+), 115 deletions(-) delete mode 100644 bench/bench_gemm_blas.cpp diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp index 3cb75c17a..5c55d4b7c 100644 --- a/bench/bench_gemm.cpp +++ b/bench/bench_gemm.cpp @@ -20,11 +20,6 @@ typedef Matrix M; extern "C" { #include - - void sgemm_kernel(int actual_mc, int cols, int actual_kc, float alpha, - float* blockA, float* blockB, float* res, int resStride); - void sgemm_oncopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB); - void sgemm_itcopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB); } static float fone = 1; @@ -72,7 +67,9 @@ int main(int argc, char ** argv) int rep = 1; // number of repetitions per try int tries = 5; // number of tries, we keep the best - int s = 2048; + int s = argc==2 ? 
std::atoi(argv[1]) : 2048; + std::cout << "Matrix size = " << s << "\n"; + int m = s; int n = s; int p = s; diff --git a/bench/bench_gemm_blas.cpp b/bench/bench_gemm_blas.cpp deleted file mode 100644 index 254302312..000000000 --- a/bench/bench_gemm_blas.cpp +++ /dev/null @@ -1,109 +0,0 @@ - -#include -#include - -extern "C" -{ - #include - #include - - void sgemm_kernel(int actual_mc, int cols, int actual_kc, float alpha, - float* blockA, float* blockB, float* res, int resStride); - - void sgemm_otcopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB); - void sgemm_oncopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB); - void sgemm_itcopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB); - void sgemm_incopy(int actual_kc, int cols, const float* rhs, int rhsStride, float* blockB); -} - -using namespace std; -using namespace Eigen; - -#ifndef SCALAR -#define SCALAR float -#endif - -typedef SCALAR Scalar; -typedef Matrix M; - -static float fone = 1; -static float fzero = 0; -static double done = 1; -static double szero = 0; -static char notrans = 'N'; -static char trans = 'T'; -static char nonunit = 'N'; -static char lower = 'L'; -static char right = 'R'; -static int intone = 1; - -void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c) -{ - int M = c.rows(); - int N = c.cols(); - int K = a.cols(); - - int lda = a.rows(); - int ldb = b.rows(); - int ldc = c.rows(); - -// c.noalias() += a * b; - sgemm_(¬rans,¬rans,&M,&N,&K,&fone, - const_cast(a.data()),&lda, - const_cast(b.data()),&ldb,&fone, - c.data(),&ldc); -} - -void blas_gemm(const MatrixXd& a, const MatrixXd& b, MatrixXd& c) -{ - int M = c.rows(); - int N = c.cols(); - int K = a.cols(); - - int lda = a.rows(); - int ldb = b.rows(); - int ldc = c.rows(); - -// c.noalias() += a * b; - - dgemm_(¬rans,¬rans,&M,&N,&K,&done, - const_cast(a.data()),&lda, - const_cast(b.data()),&ldb,&done, - c.data(),&ldc); -} - -int main(int argc, char 
**argv) -{ - int rep = 1; - int s = 2048; - int m = s; - int n = s; - int p = s; - const int N = 1; - M a[N]; - M b[N]; - M c[N]; - - for (int k=0; k Date: Fri, 5 Mar 2010 17:16:19 +0100 Subject: [PATCH 105/122] add a small program to bench all combinations of small products --- Eigen/src/Core/Product.h | 46 ++++++------ bench/product_threshold.cpp | 143 ++++++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 23 deletions(-) create mode 100644 bench/product_threshold.cpp diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 865387b11..f814382a9 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -84,28 +84,28 @@ public: * based on the three dimensions of the product. * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */ // FIXME I'm not sure the current mapping is the ideal one. -template struct ei_product_type_selector { enum { ret = OuterProduct }; }; -template struct ei_product_type_selector<1, 1, Depth> { enum { ret = InnerProduct }; }; -template<> struct ei_product_type_selector<1, 1, 1> { enum { ret = InnerProduct }; }; -template<> struct ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; -template<> struct ei_product_type_selector<1, Small,Small> { enum { ret = CoeffBasedProductMode }; }; -template<> struct ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; -template<> struct ei_product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; -template<> struct ei_product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; -template<> struct ei_product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; -template<> struct ei_product_type_selector<1, Large,Small> { enum { ret = CoeffBasedProductMode }; }; -template<> struct ei_product_type_selector<1, Large,Large> { enum { ret = GemvProduct }; }; -template<> struct ei_product_type_selector<1, Small,Large> { enum { ret = CoeffBasedProductMode }; }; -template<> struct 
ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; -template<> struct ei_product_type_selector { enum { ret = GemvProduct }; }; -template<> struct ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; -template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; -template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; -template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; -template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; -template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; -template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; -template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; +template<> struct ei_product_type_selector { enum { ret = OuterProduct }; }; +template struct ei_product_type_selector<1, 1, Depth> { enum { ret = InnerProduct }; }; +template<> struct ei_product_type_selector<1, 1, 1> { enum { ret = InnerProduct }; }; +template<> struct ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; +template<> struct ei_product_type_selector<1, Small,Small> { enum { ret = CoeffBasedProductMode }; }; +template<> struct ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; +template<> struct ei_product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; +template<> struct ei_product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; +template<> struct ei_product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; +template<> struct ei_product_type_selector<1, Large,Small> { enum { ret = CoeffBasedProductMode }; }; +template<> struct ei_product_type_selector<1, Large,Large> { enum { ret = GemvProduct }; }; +template<> struct ei_product_type_selector<1, Small,Large> { enum { ret = CoeffBasedProductMode }; }; +template<> struct ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; +template<> struct 
ei_product_type_selector { enum { ret = GemvProduct }; }; +template<> struct ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; +template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; +template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; +template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; +template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; +template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; +template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; +template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; /** \class ProductReturnType * @@ -298,7 +298,7 @@ struct ei_gemv_selector { Transpose destT(dest); ei_gemv_selector - ::run(GeneralProduct,Transpose > + ::run(GeneralProduct,Transpose, GemvProduct> (prod.rhs().transpose(), prod.lhs().transpose()), destT, alpha); } }; diff --git a/bench/product_threshold.cpp b/bench/product_threshold.cpp new file mode 100644 index 000000000..9e73731e8 --- /dev/null +++ b/bench/product_threshold.cpp @@ -0,0 +1,143 @@ + +#include +#include +#include + +using namespace Eigen; +using namespace std; + +#define END 9 + +template struct map_size { enum { ret = S }; }; +template<> struct map_size<10> { enum { ret = 20 }; }; +template<> struct map_size<11> { enum { ret = 50 }; }; +template<> struct map_size<12> { enum { ret = 100 }; }; +template<> struct map_size<13> { enum { ret = 300 }; }; + +template struct alt_prod +{ + enum { + ret = M==1 && N==1 ? InnerProduct + : K==1 ? OuterProduct + : M==1 ? GemvProduct + : N==1 ? 
GemvProduct + : GemmProduct + }; +}; + +void print_mode(int mode) +{ + if(mode==InnerProduct) std::cout << "i"; + if(mode==OuterProduct) std::cout << "o"; + if(mode==CoeffBasedProductMode) std::cout << "c"; + if(mode==LazyCoeffBasedProductMode) std::cout << "l"; + if(mode==GemvProduct) std::cout << "v"; + if(mode==GemmProduct) std::cout << "m"; +} + +template +EIGEN_DONT_INLINE void prod(const Lhs& a, const Rhs& b, Res& c) +{ + c.noalias() += typename ProductReturnType::Type(a,b); +} + +template +EIGEN_DONT_INLINE void bench_prod() +{ + typedef Matrix Lhs; Lhs a; a.setRandom(); + typedef Matrix Rhs; Rhs b; b.setRandom(); + typedef Matrix Res; Res c; c.setRandom(); + + BenchTimer t; + double n = 2.*double(M)*double(N)*double(K); + int rep = 100000./n; + rep /= 2; + if(rep<1) rep = 1; + do { + rep *= 2; + t.reset(); + BENCH(t,1,rep,prod(a,b,c)); + } while(t.best()<0.1); + + t.reset(); + BENCH(t,5,rep,prod(a,b,c)); + + print_mode(Mode); + std::cout << int(1e-6*n*rep/t.best()) << "\t"; +} + +template struct print_n; +template struct loop_on_m; +template struct loop_on_n; + +template +struct loop_on_k +{ + static void run() + { + std::cout << "K=" << K << "\t"; + print_n::run(); + std::cout << "\n"; + + loop_on_m::run(); + std::cout << "\n\n"; + + loop_on_k::run(); + } +}; + +template +struct loop_on_k { static void run(){} }; + + +template +struct loop_on_m +{ + static void run() + { + std::cout << M << "f\t"; + loop_on_n::run(); + std::cout << "\n"; + + std::cout << M << "f\t"; + loop_on_n::run(); + std::cout << "\n"; + + loop_on_m::run(); + } +}; + +template +struct loop_on_m { static void run(){} }; + +template +struct loop_on_n +{ + static void run() + { + bench_prod::ret : Mode>(); + + loop_on_n::run(); + } +}; + +template +struct loop_on_n { static void run(){} }; + +template struct print_n +{ + static void run() + { + std::cout << map_size::ret << "\t"; + print_n::run(); + } +}; + +template<> struct print_n { static void run(){} }; + +int main() +{ + 
loop_on_k<1,1,1>::run(); + + return 0; +} From 51b0159c96ed47181c14d269154be72340261c5e Mon Sep 17 00:00:00 2001 From: Hauke Heibel Date: Fri, 5 Mar 2010 18:11:54 +0100 Subject: [PATCH 106/122] Fixed line endings. --- bench/product_threshold.cpp | 286 ++++++++++++++++++------------------ 1 file changed, 143 insertions(+), 143 deletions(-) diff --git a/bench/product_threshold.cpp b/bench/product_threshold.cpp index 9e73731e8..dd6d15a07 100644 --- a/bench/product_threshold.cpp +++ b/bench/product_threshold.cpp @@ -1,143 +1,143 @@ - -#include -#include -#include - -using namespace Eigen; -using namespace std; - -#define END 9 - -template struct map_size { enum { ret = S }; }; -template<> struct map_size<10> { enum { ret = 20 }; }; -template<> struct map_size<11> { enum { ret = 50 }; }; -template<> struct map_size<12> { enum { ret = 100 }; }; -template<> struct map_size<13> { enum { ret = 300 }; }; - -template struct alt_prod -{ - enum { - ret = M==1 && N==1 ? InnerProduct - : K==1 ? OuterProduct - : M==1 ? GemvProduct - : N==1 ? 
GemvProduct - : GemmProduct - }; -}; - -void print_mode(int mode) -{ - if(mode==InnerProduct) std::cout << "i"; - if(mode==OuterProduct) std::cout << "o"; - if(mode==CoeffBasedProductMode) std::cout << "c"; - if(mode==LazyCoeffBasedProductMode) std::cout << "l"; - if(mode==GemvProduct) std::cout << "v"; - if(mode==GemmProduct) std::cout << "m"; -} - -template -EIGEN_DONT_INLINE void prod(const Lhs& a, const Rhs& b, Res& c) -{ - c.noalias() += typename ProductReturnType::Type(a,b); -} - -template -EIGEN_DONT_INLINE void bench_prod() -{ - typedef Matrix Lhs; Lhs a; a.setRandom(); - typedef Matrix Rhs; Rhs b; b.setRandom(); - typedef Matrix Res; Res c; c.setRandom(); - - BenchTimer t; - double n = 2.*double(M)*double(N)*double(K); - int rep = 100000./n; - rep /= 2; - if(rep<1) rep = 1; - do { - rep *= 2; - t.reset(); - BENCH(t,1,rep,prod(a,b,c)); - } while(t.best()<0.1); - - t.reset(); - BENCH(t,5,rep,prod(a,b,c)); - - print_mode(Mode); - std::cout << int(1e-6*n*rep/t.best()) << "\t"; -} - -template struct print_n; -template struct loop_on_m; -template struct loop_on_n; - -template -struct loop_on_k -{ - static void run() - { - std::cout << "K=" << K << "\t"; - print_n::run(); - std::cout << "\n"; - - loop_on_m::run(); - std::cout << "\n\n"; - - loop_on_k::run(); - } -}; - -template -struct loop_on_k { static void run(){} }; - - -template -struct loop_on_m -{ - static void run() - { - std::cout << M << "f\t"; - loop_on_n::run(); - std::cout << "\n"; - - std::cout << M << "f\t"; - loop_on_n::run(); - std::cout << "\n"; - - loop_on_m::run(); - } -}; - -template -struct loop_on_m { static void run(){} }; - -template -struct loop_on_n -{ - static void run() - { - bench_prod::ret : Mode>(); - - loop_on_n::run(); - } -}; - -template -struct loop_on_n { static void run(){} }; - -template struct print_n -{ - static void run() - { - std::cout << map_size::ret << "\t"; - print_n::run(); - } -}; - -template<> struct print_n { static void run(){} }; - -int main() -{ - 
loop_on_k<1,1,1>::run(); - - return 0; -} + +#include +#include +#include + +using namespace Eigen; +using namespace std; + +#define END 9 + +template struct map_size { enum { ret = S }; }; +template<> struct map_size<10> { enum { ret = 20 }; }; +template<> struct map_size<11> { enum { ret = 50 }; }; +template<> struct map_size<12> { enum { ret = 100 }; }; +template<> struct map_size<13> { enum { ret = 300 }; }; + +template struct alt_prod +{ + enum { + ret = M==1 && N==1 ? InnerProduct + : K==1 ? OuterProduct + : M==1 ? GemvProduct + : N==1 ? GemvProduct + : GemmProduct + }; +}; + +void print_mode(int mode) +{ + if(mode==InnerProduct) std::cout << "i"; + if(mode==OuterProduct) std::cout << "o"; + if(mode==CoeffBasedProductMode) std::cout << "c"; + if(mode==LazyCoeffBasedProductMode) std::cout << "l"; + if(mode==GemvProduct) std::cout << "v"; + if(mode==GemmProduct) std::cout << "m"; +} + +template +EIGEN_DONT_INLINE void prod(const Lhs& a, const Rhs& b, Res& c) +{ + c.noalias() += typename ProductReturnType::Type(a,b); +} + +template +EIGEN_DONT_INLINE void bench_prod() +{ + typedef Matrix Lhs; Lhs a; a.setRandom(); + typedef Matrix Rhs; Rhs b; b.setRandom(); + typedef Matrix Res; Res c; c.setRandom(); + + BenchTimer t; + double n = 2.*double(M)*double(N)*double(K); + int rep = 100000./n; + rep /= 2; + if(rep<1) rep = 1; + do { + rep *= 2; + t.reset(); + BENCH(t,1,rep,prod(a,b,c)); + } while(t.best()<0.1); + + t.reset(); + BENCH(t,5,rep,prod(a,b,c)); + + print_mode(Mode); + std::cout << int(1e-6*n*rep/t.best()) << "\t"; +} + +template struct print_n; +template struct loop_on_m; +template struct loop_on_n; + +template +struct loop_on_k +{ + static void run() + { + std::cout << "K=" << K << "\t"; + print_n::run(); + std::cout << "\n"; + + loop_on_m::run(); + std::cout << "\n\n"; + + loop_on_k::run(); + } +}; + +template +struct loop_on_k { static void run(){} }; + + +template +struct loop_on_m +{ + static void run() + { + std::cout << M << "f\t"; + loop_on_n::run(); 
+ std::cout << "\n"; + + std::cout << M << "f\t"; + loop_on_n::run(); + std::cout << "\n"; + + loop_on_m::run(); + } +}; + +template +struct loop_on_m { static void run(){} }; + +template +struct loop_on_n +{ + static void run() + { + bench_prod::ret : Mode>(); + + loop_on_n::run(); + } +}; + +template +struct loop_on_n { static void run(){} }; + +template struct print_n +{ + static void run() + { + std::cout << map_size::ret << "\t"; + print_n::run(); + } +}; + +template<> struct print_n { static void run(){} }; + +int main() +{ + loop_on_k<1,1,1>::run(); + + return 0; +} From 273b236f72a3d1cb45d689a2d79b18d1eb5f3bf7 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 5 Mar 2010 22:28:49 +0200 Subject: [PATCH 107/122] Altivec brought up to date. Most tests pass and performance is better than before too! --- Eigen/src/Core/arch/AltiVec/PacketMath.h | 481 ++++++++++++----------- 1 file changed, 257 insertions(+), 224 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 449de2078..f5fbeb5d8 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -29,34 +29,80 @@ #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 #endif -typedef __vector float v4f; -typedef __vector int v4i; -typedef __vector unsigned int v4ui; -typedef __vector __bool int v4bi; +#ifndef EIGEN_HAS_FUSE_CJMADD +#define EIGEN_HAS_FUSE_CJMADD 1 +#endif + +#ifndef EIGEN_TUNE_FOR_CPU_CACHE_SIZE +#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE 8*128*128 +#endif + +// FIXME NEON has 16 quad registers, but since the current register allocator +// is so bad, it is much better to reduce it to 8 +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 +#endif + +typedef __vector float Packet4f; +typedef __vector int Packet4i; +typedef __vector unsigned int Packet4ui; +typedef __vector __bool int Packet4bi; +typedef __vector short int Packet8i; +typedef __vector 
unsigned char Packet16uc; // We don't want to write the same code all the time, but we need to reuse the constants // and it doesn't really work to declare them global, so we define macros instead -#define USE_CONST_v0i const v4i v0i = vec_splat_s32(0) -#define USE_CONST_v1i const v4i v1i = vec_splat_s32(1) -#define USE_CONST_v16i_ const v4i v16i_ = vec_splat_s32(-16) -#define USE_CONST_v0f USE_CONST_v0i; const v4f v0f = (v4f) v0i -#define USE_CONST_v1f USE_CONST_v1i; const v4f v1f = vec_ctf(v1i, 0) -#define USE_CONST_v1i_ const v4ui v1i_ = vec_splat_u32(-1) -#define USE_CONST_v0f_ USE_CONST_v1i_; const v4f v0f_ = (v4f) vec_sl(v1i_, v1i_) +#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ + Packet4f ei_p4f_##NAME = (Packet4f) vec_splat_s32(X) -template<> struct ei_packet_traits : ei_default_packet_traits -{ typedef v4f type; enum {size=4}; }; -template<> struct ei_packet_traits : ei_default_packet_traits -{ typedef v4i type; enum {size=4}; }; +#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ + Packet4i ei_p4i_##NAME = vec_splat_s32(X) -template<> struct ei_unpacket_traits { typedef float type; enum {size=4}; }; -template<> struct ei_unpacket_traits { typedef int type; enum {size=4}; }; +#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ + Packet4f ei_p4f_##NAME = ei_pset1(X) -inline std::ostream & operator <<(std::ostream & s, const v4f & v) +#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ + Packet4f ei_p4f_##NAME = vreinterpretq_f32_u32(ei_pset1(X)) + +#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ + Packet4i ei_p4i_##NAME = ei_pset1(X) + + +// Define global static constants: +static Packet4f ei_p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 }; +static Packet4i ei_p4i_COUNTDOWN = { 3, 2, 1, 0 }; +static Packet16uc ei_p16uc_REVERSE = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3}; + +static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); +static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); +static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); +static 
_EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); +static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); +static Packet4f ei_p4f_ONE = vec_ctf(ei_p4i_ONE, 0); +static Packet4f ei_p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)ei_p4i_MINUS1, (Packet4ui)ei_p4i_MINUS1); + +template<> struct ei_packet_traits : ei_default_packet_traits +{ + typedef Packet4f type; enum {size=4}; + enum { + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 0, + HasSqrt = 0 + }; +}; +template<> struct ei_packet_traits : ei_default_packet_traits +{ typedef Packet4i type; enum {size=4}; }; + +template<> struct ei_unpacket_traits { typedef float type; enum {size=4}; }; +template<> struct ei_unpacket_traits { typedef int type; enum {size=4}; }; +/* +inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) { union { - v4f v; + Packet4f v; float n[4]; } vt; vt.v = v; @@ -64,10 +110,10 @@ inline std::ostream & operator <<(std::ostream & s, const v4f & v) return s; } -inline std::ostream & operator <<(std::ostream & s, const v4i & v) +inline std::ostream & operator <<(std::ostream & s, const Packet4i & v) { union { - v4i v; + Packet4i v; int n[4]; } vt; vt.v = v; @@ -75,10 +121,10 @@ inline std::ostream & operator <<(std::ostream & s, const v4i & v) return s; } -inline std::ostream & operator <<(std::ostream & s, const v4ui & v) +inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) { union { - v4ui v; + Packet4ui v; unsigned int n[4]; } vt; vt.v = v; @@ -86,65 +132,73 @@ inline std::ostream & operator <<(std::ostream & s, const v4ui & v) return s; } -inline std::ostream & operator <<(std::ostream & s, const v4bi & v) +inline std::ostream & operator <<(std::ostream & s, const Packetbi & v) { union { - __vector __bool int v; + Packet4bi v; unsigned int n[4]; } vt; vt.v = v; s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; return s; } - -template<> inline v4f ei_padd(const v4f& a, const v4f& b) { return vec_add(a,b); } -template<> inline v4i 
ei_padd(const v4i& a, const v4i& b) { return vec_add(a,b); } - -template<> inline v4f ei_psub(const v4f& a, const v4f& b) { return vec_sub(a,b); } -template<> inline v4i ei_psub(const v4i& a, const v4i& b) { return vec_sub(a,b); } - -template<> EIGEN_STRONG_INLINE v4f ei_pnegate(const v4f& a) -{ - v4i mask = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; - return vec_xor(a,(v4f) mask); +*/ +template<> EIGEN_STRONG_INLINE Packet4f ei_pset1(const float& from) { + // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html + float EIGEN_ALIGN16 af[4]; + af[0] = from; + Packet4f vc = vec_ld(0, af); + vc = vec_splat(vc, 0); + return vc; } -template<> EIGEN_STRONG_INLINE v4i ei_pnegate(const v4i& a) -{ - USE_CONST_v0i; - return ei_psub(v0i, a); +template<> EIGEN_STRONG_INLINE Packet4i ei_pset1(const int& from) { + int EIGEN_ALIGN16 ai[4]; + ai[0] = from; + Packet4i vc = vec_ld(0, ai); + vc = vec_splat(vc, 0); + return vc; } -template<> inline v4f ei_pmul(const v4f& a, const v4f& b) { USE_CONST_v0f; return vec_madd(a,b, v0f); } -template<> inline v4i ei_pmul(const v4i& a, const v4i& b) +template<> EIGEN_STRONG_INLINE Packet4f ei_plset(const float& a) { return vec_add(ei_pset1(a), ei_p4f_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet4i ei_plset(const int& a) { return vec_add(ei_pset1(a), ei_p4i_COUNTDOWN); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_padd(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i ei_padd(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_psub(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i ei_psub(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pnegate(const Packet4f& a) { return ei_psub(ei_p4f_ZERO, a); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pnegate(const Packet4i& a) { return 
ei_psub(ei_p4i_ZERO, a); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pmul(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,ei_p4f_ZERO); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pmul(const Packet4i& a, const Packet4i& b) { // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec //Set up constants, variables - v4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel; - USE_CONST_v0i; - USE_CONST_v1i; - USE_CONST_v16i_; + Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel; // Get the absolute values a1 = vec_abs(a); b1 = vec_abs(b); // Get the signs using xor - v4bi sgn = (v4bi) vec_cmplt(vec_xor(a, b), v0i); + Packet4bi sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), ei_p4i_ZERO); // Do the multiplication for the asbolute values. - bswap = (v4i) vec_rl((v4ui) b1, (v4ui) v16i_ ); - low_prod = vec_mulo((__vector short)a1, (__vector short)b1); - high_prod = vec_msum((__vector short)a1, (__vector short)bswap, v0i); - high_prod = (v4i) vec_sl((v4ui) high_prod, (v4ui) v16i_); + bswap = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) ei_p4i_MINUS16 ); + low_prod = vec_mulo((Packet8i) a1, (Packet8i)b1); + high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, ei_p4i_ZERO); + high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) ei_p4i_MINUS16); prod = vec_add( low_prod, high_prod ); // NOR the product and select only the negative elements according to the sign mask prod_ = vec_nor(prod, prod); - prod_ = vec_sel(v0i, prod_, sgn); + prod_ = vec_sel(ei_p4i_ZERO, prod_, sgn); // Add 1 to the result to get the negative numbers - v1sel = vec_sel(v0i, v1i, sgn); + v1sel = vec_sel(ei_p4i_ZERO, ei_p4i_ONE, sgn); prod_ = vec_add(prod_, v1sel); // Merge the results back to the final vector. 
@@ -152,20 +206,18 @@ template<> inline v4i ei_pmul(const v4i& a, const v4i& b) return prod; } - -template<> inline v4f ei_pdiv(const v4f& a, const v4f& b) { - v4f t, y_0, y_1, res; - USE_CONST_v0f; - USE_CONST_v1f; +template<> EIGEN_STRONG_INLINE Packet4f ei_pdiv(const Packet4f& a, const Packet4f& b) +{ + Packet4f t, y_0, y_1, res; // Altivec does not offer a divide instruction, we have to do a reciprocal approximation y_0 = vec_re(b); // Do one Newton-Raphson iteration to get the needed accuracy - t = vec_nmsub(y_0, b, v1f); + t = vec_nmsub(y_0, b, ei_p4f_ONE); y_1 = vec_madd(y_0, t, y_0); - res = vec_madd(a, y_1, v0f); + res = vec_madd(a, y_1, ei_p4f_ZERO); return res; } @@ -174,265 +226,246 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pdiv(const Packet4i& /*a*/, return ei_pset1(0); } -template<> inline v4f ei_pmadd(const v4f& a, const v4f& b, const v4f& c) { return vec_madd(a, b, c); } +// for some weird raisons, it has to be overloaded for packet of integers +template<> EIGEN_STRONG_INLINE Packet4f ei_pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return ei_padd(ei_pmul(a,b), c); } -template<> inline v4f ei_pmin(const v4f& a, const v4f& b) { return vec_min(a,b); } -template<> inline v4i ei_pmin(const v4i& a, const v4i& b) { return vec_min(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f ei_pmin(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } -template<> inline v4f ei_pmax(const v4f& a, const v4f& b) { return vec_max(a,b); } -template<> inline v4i ei_pmax(const v4i& a, const v4i& b) { return vec_max(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f ei_pmax(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pmax(const Packet4i& 
a, const Packet4i& b) { return vec_max(a, b); } -template<> EIGEN_STRONG_INLINE v4f ei_pabs(const v4f& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE v4i ei_pabs(const v4i& a) { return vec_abs(a); } +// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics +template<> EIGEN_STRONG_INLINE Packet4f ei_pand(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } -template<> inline v4f ei_pload(const float* from) { return vec_ld(0, from); } -template<> inline v4i ei_pload(const int* from) { return vec_ld(0, from); } +template<> EIGEN_STRONG_INLINE Packet4f ei_por(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); } +template<> EIGEN_STRONG_INLINE Packet4i ei_por(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } -template<> inline v4f ei_ploadu(const float* from) +template<> EIGEN_STRONG_INLINE Packet4f ei_pxor(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pxor(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { + EIGEN_DEBUG_ALIGNED_LOAD // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - __vector unsigned char MSQ, LSQ; - __vector unsigned char mask; + Packet16uc MSQ, LSQ; + Packet16uc mask; MSQ = vec_ld(0, 
(unsigned char *)from); // most significant quadword LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword mask = vec_lvsl(0, from); // create the permute mask - return (v4f) vec_perm(MSQ, LSQ, mask); // align the data -} + return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data -template<> inline v4i ei_ploadu(const int* from) +} +template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) { + EIGEN_DEBUG_ALIGNED_LOAD // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - __vector unsigned char MSQ, LSQ; - __vector unsigned char mask; + Packet16uc MSQ, LSQ; + Packet16uc mask; MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword mask = vec_lvsl(0, from); // create the permute mask - return (v4i) vec_perm(MSQ, LSQ, mask); // align the data + return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data } -template<> inline v4f ei_pset1(const float& from) -{ - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - float __attribute__(aligned(16)) af[4]; - af[0] = from; - v4f vc = vec_ld(0, af); - vc = vec_splat(vc, 0); - return vc; -} +template<> EIGEN_STRONG_INLINE void ei_pstore(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); } +template<> EIGEN_STRONG_INLINE void ei_pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); } -template<> inline v4i ei_pset1(const int& from) -{ - int __attribute__(aligned(16)) ai[4]; - ai[0] = from; - v4i vc = vec_ld(0, ai); - vc = vec_splat(vc, 0); - return vc; -} - -template<> inline void ei_pstore(float* to, const v4f& from) { vec_st(from, 0, to); } -template<> inline void ei_pstore(int* to, const v4i& from) { vec_st(from, 0, to); } - -template<> inline void ei_pstoreu(float* to, const v4f& from) +template<> EIGEN_STRONG_INLINE void ei_pstoreu(float* to, const Packet4f& from) { + EIGEN_DEBUG_UNALIGNED_STORE // 
Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html // Warning: not thread safe! - __vector unsigned char MSQ, LSQ, edges; - __vector unsigned char edgeAlign, align; + Packet16uc MSQ, LSQ, edges; + Packet16uc edgeAlign, align; MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword edgeAlign = vec_lvsl(0, to); // permute map to extract edges edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges align = vec_lvsr( 0, to ); // permute map to misalign data - MSQ = vec_perm(edges,(__vector unsigned char)from,align); // misalign the data (MSQ) - LSQ = vec_perm((__vector unsigned char)from,edges,align); // misalign the data (LSQ) + MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ) + LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ) vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part } - -template<> inline void ei_pstoreu(int* to , const v4i& from ) +template<> EIGEN_STRONG_INLINE void ei_pstoreu(int* to, const Packet4i& from) { + EIGEN_DEBUG_UNALIGNED_STORE // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html // Warning: not thread safe! 
- __vector unsigned char MSQ, LSQ, edges; - __vector unsigned char edgeAlign, align; + Packet16uc MSQ, LSQ, edges; + Packet16uc edgeAlign, align; MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword edgeAlign = vec_lvsl(0, to); // permute map to extract edges - edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges + edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges align = vec_lvsr( 0, to ); // permute map to misalign data - MSQ = vec_perm(edges,(__vector unsigned char)from,align); // misalign the data (MSQ) - LSQ = vec_perm((__vector unsigned char)from,edges,align); // misalign the data (LSQ) + MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) + LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part } -template<> inline float ei_pfirst(const v4f& a) +template<> EIGEN_STRONG_INLINE float ei_pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; } +template<> EIGEN_STRONG_INLINE int ei_pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; } + +template<> EIGEN_STRONG_INLINE Packet4f ei_preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, ei_p16uc_REVERSE); } +template<> EIGEN_STRONG_INLINE Packet4i ei_preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, ei_p16uc_REVERSE); } + +template<> EIGEN_STRONG_INLINE Packet4f ei_pabs(const Packet4f& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet4i ei_pabs(const Packet4i& a) { return vec_abs(a); } + +template<> EIGEN_STRONG_INLINE float ei_predux(const Packet4f& a) { - float EIGEN_ALIGN16 af[4]; - vec_st(a, 0, af); - return af[0]; -} - -template<> inline int ei_pfirst(const v4i& a) -{ - int EIGEN_ALIGN16 ai[4]; - vec_st(a, 0, 
ai); - return ai[0]; -} - -template<> EIGEN_STRONG_INLINE v4f ei_preverse(const v4f& a) -{ - static const __vector unsigned char reverse_mask = - {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3}; - return (v4f)vec_perm((__vector unsigned char)a,(__vector unsigned char)a,reverse_mask); -} -template<> EIGEN_STRONG_INLINE v4i ei_preverse(const v4i& a) -{ - static const __vector unsigned char __attribute__(aligned(16)) reverse_mask = - {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3}; - return (v4i)vec_perm((__vector unsigned char)a,(__vector unsigned char)a,reverse_mask); -} - -inline v4f ei_preduxp(const v4f* vecs) -{ - v4f v[4], sum[4]; - - // It's easier and faster to transpose then add as columns - // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation - // Do the transpose, first set of moves - v[0] = vec_mergeh(vecs[0], vecs[2]); - v[1] = vec_mergel(vecs[0], vecs[2]); - v[2] = vec_mergeh(vecs[1], vecs[3]); - v[3] = vec_mergel(vecs[1], vecs[3]); - // Get the resulting vectors - sum[0] = vec_mergeh(v[0], v[2]); - sum[1] = vec_mergel(v[0], v[2]); - sum[2] = vec_mergeh(v[1], v[3]); - sum[3] = vec_mergel(v[1], v[3]); - - // Now do the summation: - // Lines 0+1 - sum[0] = vec_add(sum[0], sum[1]); - // Lines 2+3 - sum[1] = vec_add(sum[2], sum[3]); - // Add the results - sum[0] = vec_add(sum[0], sum[1]); - return sum[0]; -} - -inline v4i ei_preduxp(const v4i* vecs) -{ - v4i v[4], sum[4]; - - // It's easier and faster to transpose then add as columns - // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation - // Do the transpose, first set of moves - v[0] = vec_mergeh(vecs[0], vecs[2]); - v[1] = vec_mergel(vecs[0], vecs[2]); - v[2] = vec_mergeh(vecs[1], vecs[3]); - v[3] = vec_mergel(vecs[1], vecs[3]); - // Get the resulting vectors - sum[0] = vec_mergeh(v[0], v[2]); - sum[1] = vec_mergel(v[0], v[2]); - sum[2] = vec_mergeh(v[1], v[3]); - sum[3] = vec_mergel(v[1], v[3]); - - // Now do the summation: - // Lines 0+1 - sum[0] = 
vec_add(sum[0], sum[1]); - // Lines 2+3 - sum[1] = vec_add(sum[2], sum[3]); - // Add the results - sum[0] = vec_add(sum[0], sum[1]); - return sum[0]; -} - -inline float ei_predux(const v4f& a) -{ - v4f b, sum; - b = (v4f)vec_sld(a, a, 8); + Packet4f b, sum; + b = (Packet4f) vec_sld(a, a, 8); sum = vec_add(a, b); - b = (v4f)vec_sld(sum, sum, 4); + b = (Packet4f) vec_sld(sum, sum, 4); sum = vec_add(sum, b); return ei_pfirst(sum); } -inline int ei_predux(const v4i& a) +template<> EIGEN_STRONG_INLINE Packet4f ei_preduxp(const Packet4f* vecs) { - USE_CONST_v0i; - v4i sum; - sum = vec_sums(a, v0i); - sum = vec_sld(sum, v0i, 12); + Packet4f v[4], sum[4]; + + // It's easier and faster to transpose then add as columns + // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation + // Do the transpose, first set of moves + v[0] = vec_mergeh(vecs[0], vecs[2]); + v[1] = vec_mergel(vecs[0], vecs[2]); + v[2] = vec_mergeh(vecs[1], vecs[3]); + v[3] = vec_mergel(vecs[1], vecs[3]); + // Get the resulting vectors + sum[0] = vec_mergeh(v[0], v[2]); + sum[1] = vec_mergel(v[0], v[2]); + sum[2] = vec_mergeh(v[1], v[3]); + sum[3] = vec_mergel(v[1], v[3]); + + // Now do the summation: + // Lines 0+1 + sum[0] = vec_add(sum[0], sum[1]); + // Lines 2+3 + sum[1] = vec_add(sum[2], sum[3]); + // Add the results + sum[0] = vec_add(sum[0], sum[1]); + + return sum[0]; +} + +template<> EIGEN_STRONG_INLINE int ei_predux(const Packet4i& a) +{ + Packet4i sum; + sum = vec_sums(a, ei_p4i_ZERO); + sum = vec_sld(sum, ei_p4i_ZERO, 12); return ei_pfirst(sum); } -// implement other reductions operators -inline float ei_predux_mul(const v4f& a) +template<> EIGEN_STRONG_INLINE Packet4i ei_preduxp(const Packet4i* vecs) { - v4f prod; - prod = ei_pmul(a, (v4f)vec_sld(a, a, 8)); - return ei_pfirst(ei_pmul(prod, (v4f)vec_sld(prod, prod, 4))); + Packet4i v[4], sum[4]; + + // It's easier and faster to transpose then add as columns + // Check: 
http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation + // Do the transpose, first set of moves + v[0] = vec_mergeh(vecs[0], vecs[2]); + v[1] = vec_mergel(vecs[0], vecs[2]); + v[2] = vec_mergeh(vecs[1], vecs[3]); + v[3] = vec_mergel(vecs[1], vecs[3]); + // Get the resulting vectors + sum[0] = vec_mergeh(v[0], v[2]); + sum[1] = vec_mergel(v[0], v[2]); + sum[2] = vec_mergeh(v[1], v[3]); + sum[3] = vec_mergel(v[1], v[3]); + + // Now do the summation: + // Lines 0+1 + sum[0] = vec_add(sum[0], sum[1]); + // Lines 2+3 + sum[1] = vec_add(sum[2], sum[3]); + // Add the results + sum[0] = vec_add(sum[0], sum[1]); + + return sum[0]; } -inline int ei_predux_mul(const v4i& a) +// Other reduction functions: +// mul +template<> EIGEN_STRONG_INLINE float ei_predux_mul(const Packet4f& a) +{ + Packet4f prod; + prod = ei_pmul(a, (Packet4f)vec_sld(a, a, 8)); + return ei_pfirst(ei_pmul(prod, (Packet4f)vec_sld(prod, prod, 4))); +} + +template<> EIGEN_STRONG_INLINE int ei_predux_mul(const Packet4i& a) { EIGEN_ALIGN16 int aux[4]; ei_pstore(aux, a); return aux[0] * aux[1] * aux[2] * aux[3]; } -inline float ei_predux_min(const v4f& a) +// min +template<> EIGEN_STRONG_INLINE float ei_predux_min(const Packet4f& a) { - v4f b, res; + Packet4f b, res; b = vec_min(a, vec_sld(a, a, 8)); res = vec_min(b, vec_sld(b, b, 4)); return ei_pfirst(res); } -inline int ei_predux_min(const v4i& a) +template<> EIGEN_STRONG_INLINE int ei_predux_min(const Packet4i& a) { - v4i b, res; + Packet4i b, res; b = vec_min(a, vec_sld(a, a, 8)); res = vec_min(b, vec_sld(b, b, 4)); return ei_pfirst(res); } -inline float ei_predux_max(const v4f& a) +// max +template<> EIGEN_STRONG_INLINE float ei_predux_max(const Packet4f& a) { - v4f b, res; + Packet4f b, res; b = vec_max(a, vec_sld(a, a, 8)); res = vec_max(b, vec_sld(b, b, 4)); return ei_pfirst(res); } -inline int ei_predux_max(const v4i& a) +template<> EIGEN_STRONG_INLINE int ei_predux_max(const Packet4i& a) { - v4i b, res; + Packet4i b, res; b = 
vec_max(a, vec_sld(a, a, 8)); res = vec_max(b, vec_sld(b, b, 4)); return ei_pfirst(res); } - - template -struct ei_palign_impl +struct ei_palign_impl { - inline static void run(v4f& first, const v4f& second) + EIGEN_STRONG_INLINE static void run(Packet4f& first, const Packet4f& second) { - first = vec_sld(first, second, Offset*4); + if (Offset!=0) + first = vec_sld(first, second, Offset*4); } }; template -struct ei_palign_impl +struct ei_palign_impl { - inline static void run(v4i& first, const v4i& second) + EIGEN_STRONG_INLINE static void run(Packet4i& first, const Packet4i& second) { - first = vec_sld(first, second, Offset*4); + if (Offset!=0) + first = vec_sld(first, second, Offset*4); } }; - #endif // EIGEN_PACKET_MATH_ALTIVEC_H From afd7ee759b4b332391f3c1a865f46eed0d3e32ea Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 5 Mar 2010 21:35:11 +0100 Subject: [PATCH 108/122] fix copy pasted comment --- Eigen/src/Core/arch/AltiVec/PacketMath.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index f5fbeb5d8..c6fc670d8 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -37,8 +37,7 @@ #define EIGEN_TUNE_FOR_CPU_CACHE_SIZE 8*128*128 #endif -// FIXME NEON has 16 quad registers, but since the current register allocator -// is so bad, it is much better to reduce it to 8 +// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 #endif From f03d95348d6b1b283b772ceb9988f5cb230e63ea Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sat, 6 Mar 2010 02:17:37 -0500 Subject: [PATCH 109/122] introduce EIGEN_DONT_ALIGN_STACK (disables alignment attributes) and EIGEN_DONT_ALIGN_HEAP (disables aligned malloc)... you can still use EIGEN_DONT_ALIGN to do both at once. 
--- CMakeLists.txt | 8 +++- Eigen/src/Core/MatrixStorage.h | 6 --- Eigen/src/Core/util/Macros.h | 53 +++++++++++++++------- Eigen/src/Core/util/Memory.h | 8 ++-- Eigen/src/Core/util/XprHelper.h | 11 +++-- cmake/EigenTesting.cmake | 80 +++++++++++++++++---------------- test/dynalloc.cpp | 2 +- test/unalignedassert.cpp | 6 +-- 8 files changed, 100 insertions(+), 74 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e9c4533d..a85bbf222 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -140,7 +140,13 @@ option(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION "Disable explicit vectorization in t if(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION) add_definitions(-DEIGEN_DONT_VECTORIZE=1) message("Disabling vectorization in tests/examples") -endif(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION) +endif() + +option(EIGEN_TEST_NO_EXPLICIT_ALIGNMENT "Disable explicit alignment (hence vectorization) in tests/examples" OFF) +if(EIGEN_TEST_NO_EXPLICIT_ALIGNMENT) + add_definitions(-DEIGEN_DONT_ALIGN=1) + message("Disabling alignment in tests/examples") +endif() option(EIGEN_TEST_C++0x "Enables all C++0x features." OFF) diff --git a/Eigen/src/Core/MatrixStorage.h b/Eigen/src/Core/MatrixStorage.h index ece603ffa..3303b2663 100644 --- a/Eigen/src/Core/MatrixStorage.h +++ b/Eigen/src/Core/MatrixStorage.h @@ -50,12 +50,6 @@ struct ei_matrix_array ei_matrix_array(ei_constructor_without_unaligned_array_assert) {} }; -// FIXME!!! 
This is a hack because ARM gcc does not honour __attribute__((aligned(16))) properly -#ifdef __ARM_NEON__ - #ifndef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT - #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT - #endif -#endif #ifdef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) #else diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 7968d6604..7236b42f2 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -36,13 +36,17 @@ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ EIGEN_MINOR_VERSION>=z)))) -// 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable 16 byte alignment on all -// platforms where vectorization might be enabled. In theory we could always enable alignment, but it can be a cause of problems -// on some platforms, so we just disable it in certain common platform (compiler+architecture combinations) to avoid these problems. -#if defined(__GNUC__) && !(defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__ppc__) || defined(__ia64__) || defined(__ARM_NEON__)) -#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_ALIGNMENT 1 +// 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable +// 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always +// enable alignment, but it can be a cause of problems on some platforms, so we just disable it in +// certain common platform (compiler+architecture combinations) to avoid these problems. +// Only stack alignment is really problematic (relies on nonstandard compiler extensions that don't +// work everywhere, for example don't work on GCC/ARM), try to keep heap alignment even +// when we have to disable stack alignment. 
+#if defined(__GNUC__) && !(defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__ppc__) || defined(__ia64__)) +#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 #else -#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_ALIGNMENT 0 +#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0 #endif #if defined(__GNUC__) && (__GNUC__ <= 3) @@ -51,27 +55,42 @@ #define EIGEN_GCC3_OR_OLDER 0 #endif -// FIXME vectorization + alignment is completely disabled with sun studio -#if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_ALIGNMENT && !EIGEN_GCC3_OR_OLDER && !defined(__SUNPRO_CC) - #define EIGEN_ARCH_WANTS_ALIGNMENT 1 +// FIXME vectorization + stack alignment is completely disabled with sun studio +#if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT && !EIGEN_GCC3_OR_OLDER && !defined(__SUNPRO_CC) + #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1 #else - #define EIGEN_ARCH_WANTS_ALIGNMENT 0 + #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 #endif -// EIGEN_ALIGN is the true test whether we want to align or not. It takes into account both the user choice to explicitly disable -// alignment (EIGEN_DONT_ALIGN) and the architecture config (EIGEN_ARCH_WANTS_ALIGNMENT). Henceforth, only EIGEN_ALIGN should be used. -#if EIGEN_ARCH_WANTS_ALIGNMENT && !defined(EIGEN_DONT_ALIGN) - #define EIGEN_ALIGN 1 +#ifdef EIGEN_DONT_ALIGN + #ifndef EIGEN_DONT_ALIGN_STACK + #define EIGEN_DONT_ALIGN_STACK + #endif + #ifndef EIGEN_DONT_ALIGN_HEAP + #define EIGEN_DONT_ALIGN_HEAP + #endif +#endif + +// EIGEN_ALIGN_STACK is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable +// alignment (EIGEN_DONT_ALIGN_STACK) and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). Henceforth, only EIGEN_ALIGN_STACK should be used. 
+#if EIGEN_ARCH_WANTS_STACK_ALIGNMENT && !defined(EIGEN_DONT_ALIGN_STACK) + #define EIGEN_ALIGN_STACK 1 #else - #define EIGEN_ALIGN 0 + #define EIGEN_ALIGN_STACK 0 #ifdef EIGEN_VECTORIZE - #error "Vectorization enabled, but our platform checks say that we don't do 16 byte alignment on this platform. If you added vectorization for another architecture, you also need to edit this platform check." + #error "Vectorization enabled, but our platform checks say that we don't do 16 byte stack alignment on this platform. If you added vectorization for another architecture, you also need to edit this platform check." #endif #ifndef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT #endif #endif +#ifndef EIGEN_DONT_ALIGN_HEAP + #define EIGEN_ALIGN_HEAP 1 +#else + #define EIGEN_ALIGN_HEAP 0 +#endif + #ifdef EIGEN_DEFAULT_TO_ROW_MAJOR #define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION RowMajor #else @@ -185,7 +204,7 @@ using Eigen::ei_cos; * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link * vectorized and non-vectorized code. */ -#if !EIGEN_ALIGN +#if !EIGEN_ALIGN_STACK #define EIGEN_ALIGN_TO_BOUNDARY(n) #elif (defined __GNUC__) || (defined __PGI) #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index aa0073d44..9442dffb6 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -172,7 +172,7 @@ inline void* ei_aligned_malloc(size_t size) #endif void *result; - #if !EIGEN_ALIGN + #if !EIGEN_ALIGN_HEAP result = std::malloc(size); #elif EIGEN_MALLOC_ALREADY_ALIGNED result = std::malloc(size); @@ -196,7 +196,7 @@ inline void* ei_aligned_malloc(size_t size) /** \internal Frees memory allocated with ei_aligned_malloc. 
*/ inline void ei_aligned_free(void *ptr) { - #if !EIGEN_ALIGN + #if !EIGEN_ALIGN_HEAP std::free(ptr); #elif EIGEN_MALLOC_ALREADY_ALIGNED std::free(ptr); @@ -221,7 +221,7 @@ inline void* ei_aligned_realloc(void *ptr, size_t new_size, size_t old_size) (void)old_size; // Suppress 'unused variable' warning. Seen in boost tee. void *result; -#if !EIGEN_ALIGN +#if !EIGEN_ALIGN_HEAP result = std::realloc(ptr,new_size); #elif EIGEN_MALLOC_ALREADY_ALIGNED result = std::realloc(ptr,new_size); @@ -443,7 +443,7 @@ inline static Integer ei_first_aligned(const Scalar* array, Integer size) *** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF] *** *****************************************************************************/ -#if EIGEN_ALIGN +#if EIGEN_ALIGN_HEAP #ifdef EIGEN_EXCEPTIONS #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ void* operator new(size_t size, const std::nothrow_t&) throw() { \ diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index eff055b04..b4f72b62c 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -88,15 +88,20 @@ class ei_compute_matrix_flags enum { row_major_bit = Options&RowMajor ? RowMajorBit : 0, is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic, -#if !defined(__ARM_NEON__) +#if EIGEN_ALIGN_STACK is_fixed_size_aligned = (!is_dynamic_size_storage) && (((MaxCols*MaxRows) % ei_packet_traits::size) == 0), #else -// FIXME!!! This is a hack because ARM gcc does not honour __attribute__((aligned(16))) properly is_fixed_size_aligned = 0, #endif +#if EIGEN_ALIGN_HEAP + is_dynamic_size_aligned = is_dynamic_size_storage, +#else + is_dynamic_size_aligned = 0, +#endif + aligned_bit = (((Options&DontAlign)==0) - && (is_dynamic_size_storage || is_fixed_size_aligned)) + && (is_dynamic_size_aligned || is_fixed_size_aligned)) ? AlignedBit : 0, packet_access_bit = ei_packet_traits::size > 1 && aligned_bit ? 
PacketAccessBit : 0 }; diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index b08f8c340..3bb9aed2b 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -154,53 +154,55 @@ macro(ei_testing_print_summary) message("Default order: Column-major") endif() - if(EIGEN_TEST_SSE2) - message("SSE2: ON") + if(EIGEN_TEST_NO_EXPLICIT_ALIGNMENT) + message("Explicit alignment (hence vectorization) disabled") + elseif(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION) + message("Explicit vectorization disabled (alignment kept enabled)") else() - message("SSE2: Using architecture defaults") - endif() - if(EIGEN_TEST_SSE3) - message("SSE3: ON") - else() - message("SSE3: Using architecture defaults") - endif() + if(EIGEN_TEST_SSE2) + message("SSE2: ON") + else() + message("SSE2: Using architecture defaults") + endif() - if(EIGEN_TEST_SSSE3) - message("SSSE3: ON") - else() - message("SSSE3: Using architecture defaults") - endif() + if(EIGEN_TEST_SSE3) + message("SSE3: ON") + else() + message("SSE3: Using architecture defaults") + endif() - if(EIGEN_TEST_SSE4_1) - message("SSE4.1: ON") - else() - message("SSE4.1: Using architecture defaults") - endif() + if(EIGEN_TEST_SSSE3) + message("SSSE3: ON") + else() + message("SSSE3: Using architecture defaults") + endif() - if(EIGEN_TEST_SSE4_2) - message("SSE4.2: ON") - else() - message("SSE4.2: Using architecture defaults") - endif() + if(EIGEN_TEST_SSE4_1) + message("SSE4.1: ON") + else() + message("SSE4.1: Using architecture defaults") + endif() - if(EIGEN_TEST_ALTIVEC) - message("Altivec: ON") - else() - message("Altivec: Using architecture defaults") - endif() + if(EIGEN_TEST_SSE4_2) + message("SSE4.2: ON") + else() + message("SSE4.2: Using architecture defaults") + endif() - if(EIGEN_TEST_NEON) - message("ARM NEON: ON") - else() - message("ARM NEON: Using architecture defaults") - endif() + if(EIGEN_TEST_ALTIVEC) + message("Altivec: ON") + else() + message("Altivec: Using architecture defaults") + endif() - 
if(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION) - message("Explicit vec: OFF") - else() - message("Explicit vec: Using architecture defaults") - endif() + if(EIGEN_TEST_NEON) + message("ARM NEON: ON") + else() + message("ARM NEON: Using architecture defaults") + endif() + + endif() # vectorization / alignment options message("\n${EIGEN_TESTING_SUMMARY}") # message("CXX: ${CMAKE_CXX_COMPILER}") diff --git a/test/dynalloc.cpp b/test/dynalloc.cpp index e0a9f9f86..85a39ec83 100644 --- a/test/dynalloc.cpp +++ b/test/dynalloc.cpp @@ -24,7 +24,7 @@ #include "main.h" -#if EIGEN_ALIGN +#if EIGEN_ALIGN_HEAP #define ALIGNMENT 16 #else #define ALIGNMENT 1 diff --git a/test/unalignedassert.cpp b/test/unalignedassert.cpp index 85a83b7b5..497c5a5aa 100644 --- a/test/unalignedassert.cpp +++ b/test/unalignedassert.cpp @@ -78,7 +78,7 @@ void check_unalignedassert_good() delete[] y; } -#if EIGEN_ALIGN +#if EIGEN_ALIGN_STACK template void construct_at_boundary(int boundary) { @@ -94,7 +94,7 @@ void construct_at_boundary(int boundary) void unalignedassert() { - #if EIGEN_ALIGN + #if EIGEN_ALIGN_STACK construct_at_boundary(4); construct_at_boundary(4); construct_at_boundary(16); @@ -124,7 +124,7 @@ void unalignedassert() check_unalignedassert_good(); check_unalignedassert_good >(); -#if EIGEN_ALIGN +#if EIGEN_ALIGN_STACK VERIFY_RAISES_ASSERT(construct_at_boundary(8)); VERIFY_RAISES_ASSERT(construct_at_boundary(8)); VERIFY_RAISES_ASSERT(construct_at_boundary(8)); From 2bd31d3fbccbb2ab325c2ce86da1b0a948ff0281 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sat, 6 Mar 2010 09:05:15 -0500 Subject: [PATCH 110/122] * include Macros.h much earlier: since it takes care of the alignment platform detection, it is needed before we do the vectorization stuff in Eigen/Core !! * kill EIGEN_DONT_ALIGN_HEAP option (one should use EIGEN_DONT_ALIGN) * rename EIGEN_DONT_ALIGN_STACK to EIGEN_DONT_ALIGN_STATICALLY. hope it's a better name. 
--- Eigen/Core | 36 ++++++++++++++----------- Eigen/src/Core/util/Macros.h | 45 ++++++++++++++++---------------- Eigen/src/Core/util/Memory.h | 8 +++--- Eigen/src/Core/util/XprHelper.h | 27 +++++++++++++++---- doc/D11_UnalignedArrayAssert.dox | 7 +++-- test/dynalloc.cpp | 2 +- test/unalignedassert.cpp | 6 ++--- 7 files changed, 78 insertions(+), 53 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 075f95e5a..ce27d10b0 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -29,6 +29,26 @@ // first thing Eigen does: prevent MSVC from committing suicide #include "src/Core/util/DisableMSVCWarnings.h" +// then include this file where all our macros are defined. It's really important to do it first because +// it's where we do all the alignment settings (platform detection and honoring the user's will if he +// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization. +#include "src/Core/util/Macros.h" + +// if alignment is disabled, then disable vectorization. Note: EIGEN_ALIGN is the proper check, it takes into +// account both the user's will (EIGEN_DONT_ALIGN) and our own platform checks +#if !EIGEN_ALIGN + #ifndef EIGEN_DONT_VECTORIZE + #define EIGEN_DONT_VECTORIZE + #endif +#endif + +// disable vectorization on LLVM: it's not yet ready for that. 
+#ifdef __clang__ + #ifndef EIGEN_DONT_VECTORIZE + #define EIGEN_DONT_VECTORIZE + #endif +#endif + #ifdef _MSC_VER #include // for _aligned_malloc -- need it regardless of whether vectorization is enabled #if (_MSC_VER >= 1500) // 2008 or later @@ -40,26 +60,13 @@ #endif #endif -#ifdef __GNUC__ - #define EIGEN_GNUC_AT_LEAST(x,y) ((__GNUC__>=x && __GNUC_MINOR__>=y) || __GNUC__>x) -#else - #define EIGEN_GNUC_AT_LEAST(x,y) 0 -#endif - // Remember that usage of defined() in a #define is undefined by the standard #if (defined __SSE2__) && ( (!defined __GNUC__) || EIGEN_GNUC_AT_LEAST(4,2) ) #define EIGEN_SSE2_BUT_NOT_OLD_GCC #endif -#ifdef EIGEN_DONT_ALIGN - #define EIGEN_DONT_VECTORIZE -#endif - -#ifdef __clang__ -#define EIGEN_DONT_VECTORIZE -#endif - #ifndef EIGEN_DONT_VECTORIZE + #if defined (EIGEN_SSE2_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER) // Defines symbols for compile-time detection of which instructions are @@ -201,7 +208,6 @@ using std::size_t; /** The type used to identify a dense storage. */ struct Dense {}; -#include "src/Core/util/Macros.h" #include "src/Core/util/Constants.h" #include "src/Core/util/ForwardDeclarations.h" #include "src/Core/util/Meta.h" diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 7236b42f2..df4ee5b35 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -35,6 +35,17 @@ #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ EIGEN_MINOR_VERSION>=z)))) +#ifdef __GNUC__ + #define EIGEN_GNUC_AT_LEAST(x,y) ((__GNUC__>=x && __GNUC_MINOR__>=y) || __GNUC__>x) +#else + #define EIGEN_GNUC_AT_LEAST(x,y) 0 +#endif + +#if defined(__GNUC__) && (__GNUC__ <= 3) +#define EIGEN_GCC3_OR_OLDER 1 +#else +#define EIGEN_GCC3_OR_OLDER 0 +#endif // 16 byte alignment is only useful for vectorization. 
Since it affects the ABI, we need to enable // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always @@ -49,12 +60,6 @@ #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0 #endif -#if defined(__GNUC__) && (__GNUC__ <= 3) -#define EIGEN_GCC3_OR_OLDER 1 -#else -#define EIGEN_GCC3_OR_OLDER 0 -#endif - // FIXME vectorization + stack alignment is completely disabled with sun studio #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT && !EIGEN_GCC3_OR_OLDER && !defined(__SUNPRO_CC) #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1 @@ -63,20 +68,20 @@ #endif #ifdef EIGEN_DONT_ALIGN - #ifndef EIGEN_DONT_ALIGN_STACK - #define EIGEN_DONT_ALIGN_STACK - #endif - #ifndef EIGEN_DONT_ALIGN_HEAP - #define EIGEN_DONT_ALIGN_HEAP + #ifndef EIGEN_DONT_ALIGN_STATICALLY + #define EIGEN_DONT_ALIGN_STATICALLY #endif + #define EIGEN_ALIGN 0 +#else + #define EIGEN_ALIGN 1 #endif -// EIGEN_ALIGN_STACK is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable -// alignment (EIGEN_DONT_ALIGN_STACK) and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). Henceforth, only EIGEN_ALIGN_STACK should be used. -#if EIGEN_ARCH_WANTS_STACK_ALIGNMENT && !defined(EIGEN_DONT_ALIGN_STACK) - #define EIGEN_ALIGN_STACK 1 +// EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable +// alignment (EIGEN_DONT_ALIGN_STATICALLY) and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). Henceforth, only EIGEN_ALIGN_STATICALLY should be used. +#if EIGEN_ARCH_WANTS_STACK_ALIGNMENT && !defined(EIGEN_DONT_ALIGN_STATICALLY) + #define EIGEN_ALIGN_STATICALLY 1 #else - #define EIGEN_ALIGN_STACK 0 + #define EIGEN_ALIGN_STATICALLY 0 #ifdef EIGEN_VECTORIZE #error "Vectorization enabled, but our platform checks say that we don't do 16 byte stack alignment on this platform. 
If you added vectorization for another architecture, you also need to edit this platform check." #endif @@ -85,12 +90,6 @@ #endif #endif -#ifndef EIGEN_DONT_ALIGN_HEAP - #define EIGEN_ALIGN_HEAP 1 -#else - #define EIGEN_ALIGN_HEAP 0 -#endif - #ifdef EIGEN_DEFAULT_TO_ROW_MAJOR #define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION RowMajor #else @@ -204,7 +203,7 @@ using Eigen::ei_cos; * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link * vectorized and non-vectorized code. */ -#if !EIGEN_ALIGN_STACK +#if !EIGEN_ALIGN_STATICALLY #define EIGEN_ALIGN_TO_BOUNDARY(n) #elif (defined __GNUC__) || (defined __PGI) #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 9442dffb6..aa0073d44 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -172,7 +172,7 @@ inline void* ei_aligned_malloc(size_t size) #endif void *result; - #if !EIGEN_ALIGN_HEAP + #if !EIGEN_ALIGN result = std::malloc(size); #elif EIGEN_MALLOC_ALREADY_ALIGNED result = std::malloc(size); @@ -196,7 +196,7 @@ inline void* ei_aligned_malloc(size_t size) /** \internal Frees memory allocated with ei_aligned_malloc. */ inline void ei_aligned_free(void *ptr) { - #if !EIGEN_ALIGN_HEAP + #if !EIGEN_ALIGN std::free(ptr); #elif EIGEN_MALLOC_ALREADY_ALIGNED std::free(ptr); @@ -221,7 +221,7 @@ inline void* ei_aligned_realloc(void *ptr, size_t new_size, size_t old_size) (void)old_size; // Suppress 'unused variable' warning. Seen in boost tee. 
void *result; -#if !EIGEN_ALIGN_HEAP +#if !EIGEN_ALIGN result = std::realloc(ptr,new_size); #elif EIGEN_MALLOC_ALREADY_ALIGNED result = std::realloc(ptr,new_size); @@ -443,7 +443,7 @@ inline static Integer ei_first_aligned(const Scalar* array, Integer size) *** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF] *** *****************************************************************************/ -#if EIGEN_ALIGN_HEAP +#if EIGEN_ALIGN #ifdef EIGEN_EXCEPTIONS #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ void* operator new(size_t size, const std::nothrow_t&) throw() { \ diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index b4f72b62c..4259ebb92 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -88,21 +88,38 @@ class ei_compute_matrix_flags enum { row_major_bit = Options&RowMajor ? RowMajorBit : 0, is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic, -#if EIGEN_ALIGN_STACK +#if EIGEN_ALIGN_STATICALLY is_fixed_size_aligned = (!is_dynamic_size_storage) && (((MaxCols*MaxRows) % ei_packet_traits::size) == 0), #else is_fixed_size_aligned = 0, #endif -#if EIGEN_ALIGN_HEAP +#if EIGEN_ALIGN is_dynamic_size_aligned = is_dynamic_size_storage, #else is_dynamic_size_aligned = 0, #endif - aligned_bit = (((Options&DontAlign)==0) - && (is_dynamic_size_aligned || is_fixed_size_aligned)) - ? AlignedBit : 0, + aligned_bit = + ( + ((Options&DontAlign)==0) + && ( +#if EIGEN_ALIGN_STATICALLY + ((!is_dynamic_size_storage) && (((MaxCols*MaxRows) % ei_packet_traits::size) == 0)) +#else + 0 +#endif + + || + +#if EIGEN_ALIGN + is_dynamic_size_storage +#else + 0 +#endif + + ) + ) ? AlignedBit : 0, packet_access_bit = ei_packet_traits::size > 1 && aligned_bit ? 
PacketAccessBit : 0 }; diff --git a/doc/D11_UnalignedArrayAssert.dox b/doc/D11_UnalignedArrayAssert.dox index e9fb2a69f..23ab3a94d 100644 --- a/doc/D11_UnalignedArrayAssert.dox +++ b/doc/D11_UnalignedArrayAssert.dox @@ -107,8 +107,11 @@ However there are a few corner cases where these alignment settings get overridd Two possibilities:
    -
  • Define EIGEN_DONT_ALIGN. That disables all 128-bit alignment code, and in particular everything vectorization-related. But do note that this in particular breaks ABI compatibility with vectorized code.
  • -
  • Or define both EIGEN_DONT_VECTORIZE and EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT. This keeps the 128-bit alignment code and thus preserves ABI compatibility.
  • +
  • Define EIGEN_DONT_ALIGN_STATICALLY. That disables all 128-bit static alignment code, while keeping 128-bit heap alignment. This has the effect of + disabling vectorization for fixed-size objects (like Matrix4d) while keeping vectorization of dynamic-size objects + (like MatrixXd). But do note that this breaks ABI compatibility with the default behavior of 128-bit static alignment.
  • +
  • Or define both EIGEN_DONT_VECTORIZE and EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT. This keeps the + 128-bit alignment code and thus preserves ABI compatibility, but completely disables vectorization.
For more information, see this FAQ. diff --git a/test/dynalloc.cpp b/test/dynalloc.cpp index 85a39ec83..e0a9f9f86 100644 --- a/test/dynalloc.cpp +++ b/test/dynalloc.cpp @@ -24,7 +24,7 @@ #include "main.h" -#if EIGEN_ALIGN_HEAP +#if EIGEN_ALIGN #define ALIGNMENT 16 #else #define ALIGNMENT 1 diff --git a/test/unalignedassert.cpp b/test/unalignedassert.cpp index 497c5a5aa..5782ab20e 100644 --- a/test/unalignedassert.cpp +++ b/test/unalignedassert.cpp @@ -78,7 +78,7 @@ void check_unalignedassert_good() delete[] y; } -#if EIGEN_ALIGN_STACK +#if EIGEN_ALIGN_STATICALLY template void construct_at_boundary(int boundary) { @@ -94,7 +94,7 @@ void construct_at_boundary(int boundary) void unalignedassert() { - #if EIGEN_ALIGN_STACK + #if EIGEN_ALIGN_STATICALLY construct_at_boundary(4); construct_at_boundary(4); construct_at_boundary(16); @@ -124,7 +124,7 @@ void unalignedassert() check_unalignedassert_good(); check_unalignedassert_good >(); -#if EIGEN_ALIGN_STACK +#if EIGEN_ALIGN_STATICALLY VERIFY_RAISES_ASSERT(construct_at_boundary(8)); VERIFY_RAISES_ASSERT(construct_at_boundary(8)); VERIFY_RAISES_ASSERT(construct_at_boundary(8)); From bf0a21a6955c5789fc381c0c2a628838c354ce4b Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sat, 6 Mar 2010 09:28:58 -0500 Subject: [PATCH 111/122] * disable static alignment on QCC * remove obsolete #error --- Eigen/src/Core/util/Macros.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index df4ee5b35..77da6884f 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -51,17 +51,20 @@ // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in // certain common platform (compiler+architecture combinations) to avoid these problems. 
-// Only stack alignment is really problematic (relies on nonstandard compiler extensions that don't +// Only static alignment is really problematic (relies on nonstandard compiler extensions that don't // work everywhere, for example don't work on GCC/ARM), try to keep heap alignment even -// when we have to disable stack alignment. +// when we have to disable static alignment. #if defined(__GNUC__) && !(defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__ppc__) || defined(__ia64__)) #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 #else #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0 #endif -// FIXME vectorization + stack alignment is completely disabled with sun studio -#if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT && !EIGEN_GCC3_OR_OLDER && !defined(__SUNPRO_CC) +// static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX +#if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \ + && !EIGEN_GCC3_OR_OLDER \ + && !defined(__SUNPRO_CC) \ + && !defined(__QNXNTO__) #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1 #else #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 @@ -82,9 +85,6 @@ #define EIGEN_ALIGN_STATICALLY 1 #else #define EIGEN_ALIGN_STATICALLY 0 - #ifdef EIGEN_VECTORIZE - #error "Vectorization enabled, but our platform checks say that we don't do 16 byte stack alignment on this platform. If you added vectorization for another architecture, you also need to edit this platform check." 
- #endif #ifndef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT #endif From 7e2afe7e9599e7f77be3739175aaff848a3f9708 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sat, 6 Mar 2010 12:11:08 -0500 Subject: [PATCH 112/122] remove the __ARM_NEON__ check there since Konstantinos said he removed it but apparently didn't commit :) --- Eigen/src/Core/util/Memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index aa0073d44..1cd231329 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -424,7 +424,7 @@ inline static Integer ei_first_aligned(const Scalar* array, Integer size) * ei_aligned_stack_free(data,float,array.size()); * \endcode */ -#if (defined __linux__) && !(defined __ARM_NEON__) +#if (defined __linux__) #define ei_aligned_stack_alloc(SIZE) (SIZE<=EIGEN_STACK_ALLOCATION_LIMIT) \ ? alloca(SIZE) \ : ei_aligned_malloc(SIZE) From c4f8afdf496779e2ca8862614a48a1c3518a223e Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Sat, 6 Mar 2010 14:44:57 -0500 Subject: [PATCH 113/122] #undef minor at the right place --- Eigen/Core | 3 +++ Eigen/src/Core/util/Macros.h | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index ce27d10b0..f124aff09 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -169,6 +169,9 @@ // defined in bits/termios.h #undef B0 +// defined in some GNU standard header +#undef minor + namespace Eigen { inline static const char *SimdInstructionSetsInUse(void) { diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 77da6884f..94c15f0a7 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -26,8 +26,6 @@ #ifndef EIGEN_MACROS_H #define EIGEN_MACROS_H -#undef minor - #define EIGEN_WORLD_VERSION 2 #define EIGEN_MAJOR_VERSION 91 #define EIGEN_MINOR_VERSION 0 From 271fc84e4786e19316422c075e4317d3b2e4e972 Mon Sep 17 00:00:00 2001 From: Gael 
Guennebaud Date: Sat, 6 Mar 2010 20:52:20 +0100 Subject: [PATCH 114/122] bugfix in gebp for 32bits x86 --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 8ac5afb05..5e219e077 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -478,7 +478,7 @@ struct ei_gebp_kernel { Scalar B0, T0, A0; - A0 = blA[0*PacketSize]; + A0 = blA[k]; B0 = blB[0*PacketSize]; CJMADD(A0,B0,C0,T0); B0 = blB[1*PacketSize]; From 6f0b96dcf43aaa50edbe1894aeeaab2749c5ab96 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 6 Mar 2010 21:16:43 +0100 Subject: [PATCH 115/122] fix issue #100 (fix syrk) --- Eigen/src/Core/products/SelfadjointProduct.h | 36 ++++++++++++-------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h index 8967f62be..b10b009e8 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -74,47 +74,51 @@ struct ei_selfadjoint_product int mc = std::min(Blocking::Max_mc,size); // cache block size along the M direction Scalar* blockA = ei_aligned_stack_new(Scalar, kc*mc); - Scalar* blockB = ei_aligned_stack_new(Scalar, kc*size*Blocking::PacketSize); - + std::size_t sizeB = kc*Blocking::PacketSize*Blocking::nr + kc*size; + Scalar* allocatedBlockB = ei_aligned_stack_new(Scalar, sizeB); + Scalar* blockB = allocatedBlockB + kc*Blocking::PacketSize*Blocking::nr; + // note that the actual rhs is the transpose/adjoint of mat typedef ei_conj_helper::IsComplex && !AAT, NumTraits::IsComplex && AAT> Conj; ei_gebp_kernel gebp_kernel; + ei_gemm_pack_rhs pack_rhs; + ei_gemm_pack_lhs pack_lhs; + ei_sybb_kernel sybb; for(int k2=0; k2() - (blockB, &mat(0,k2), matStride, alpha, actual_kc, size); + 
pack_rhs(blockB, &mat(0,k2), matStride, alpha, actual_kc, size); for(int i2=0; i2() - (blockA, &mat(i2, k2), matStride, actual_kc, actual_mc); + pack_lhs(blockA, &mat(i2, k2), matStride, actual_kc, actual_mc); // the selected actual_mc * size panel of res is split into three different part: // 1 - before the diagonal => processed with gebp or skipped // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel // 3 - after the diagonal => processed with gebp or skipped if (UpLo==Lower) - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2)); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2), + -1, -1, 0, 0, allocatedBlockB); - ei_sybb_kernel() - (res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*Blocking::PacketSize*i2, actual_mc, actual_kc); + sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, allocatedBlockB); if (UpLo==Upper) { int j2 = i2+actual_mc; - gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*Blocking::PacketSize*j2, actual_mc, actual_kc, std::max(0,size-j2)); + gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, std::max(0,size-j2), + -1, -1, 0, 0, allocatedBlockB); } } } ei_aligned_stack_delete(Scalar, blockA, kc*mc); - ei_aligned_stack_delete(Scalar, blockB, kc*size*Blocking::PacketSize); + ei_aligned_stack_delete(Scalar, allocatedBlockB, sizeB); } }; @@ -161,7 +165,7 @@ struct ei_sybb_kernel PacketSize = ei_packet_traits::size, BlockSize = EIGEN_ENUM_MAX(mr,nr) }; - void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int size, int depth) + void operator()(Scalar* res, int resStride, const Scalar* blockA, const Scalar* blockB, int size, int depth, Scalar* workspace) { ei_gebp_kernel gebp_kernel; Matrix buffer; @@ -171,7 +175,7 @@ struct ei_sybb_kernel for (int j=0; j(BlockSize,size - j); - const Scalar* 
actual_b = blockB+j*depth*PacketSize; + const Scalar* actual_b = blockB+j*depth; if(UpLo==Upper) gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize); @@ -181,7 +185,8 @@ struct ei_sybb_kernel int i = j; buffer.setZero(); // 1 - apply the kernel on the temporary buffer - gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize); + gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, + -1, -1, 0, 0, workspace); // 2 - triangular accumulation for(int j1=0; j1 Date: Sat, 6 Mar 2010 21:37:14 +0100 Subject: [PATCH 116/122] fix trsolve --- Eigen/src/Core/products/TriangularSolverMatrix.h | 10 ++++++---- test/product_symm.cpp | 8 +++++--- test/product_syrk.cpp | 4 +++- test/product_trsolve.cpp | 11 ++++++----- 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index e32a9929c..1774081a2 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -74,6 +74,7 @@ struct ei_triangular_solve_matrix conj; ei_gebp_kernel > gebp_kernel; ei_gemm_pack_lhs pack_lhs; + ei_gemm_pack_rhs pack_rhs; for(int k2=IsLower ? 0 : size; IsLower ? 
k20; @@ -137,8 +138,7 @@ struct ei_triangular_solve_matrix() - (blockB, _other+startBlock, otherStride, -1, actualPanelWidth, cols, actual_kc, blockBOffset); + pack_rhs(blockB, _other+startBlock, otherStride, -1, actualPanelWidth, cols, actual_kc, blockBOffset); // GEBP if (lengthTarget>0) @@ -267,7 +267,8 @@ struct ei_triangular_solve_matrix0) gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, - actual_mc, actual_kc, rs); + actual_mc, actual_kc, rs, + -1, -1, 0, 0, allocatedBlockB); } } diff --git a/test/product_symm.cpp b/test/product_symm.cpp index a0d80080f..4ff1735d6 100644 --- a/test/product_symm.cpp +++ b/test/product_symm.cpp @@ -109,9 +109,11 @@ void test_product_symm() for(int i = 0; i < g_repeat ; i++) { CALL_SUBTEST_1(( symm(ei_random(10,320),ei_random(10,320)) )); - CALL_SUBTEST_2(( symm,Dynamic,Dynamic>(ei_random(10,320),ei_random(10,320)) )); + CALL_SUBTEST_2(( symm(ei_random(10,320),ei_random(10,320)) )); + CALL_SUBTEST_3(( symm,Dynamic,Dynamic>(ei_random(10,320),ei_random(10,320)) )); - CALL_SUBTEST_3(( symm(ei_random(10,320)) )); - CALL_SUBTEST_4(( symm,Dynamic,1>(ei_random(10,320)) )); + CALL_SUBTEST_4(( symm(ei_random(10,320)) )); + CALL_SUBTEST_5(( symm(ei_random(10,320)) )); + CALL_SUBTEST_6(( symm,Dynamic,1>(ei_random(10,320)) )); } } diff --git a/test/product_syrk.cpp b/test/product_syrk.cpp index e597ac88a..ec93056a9 100644 --- a/test/product_syrk.cpp +++ b/test/product_syrk.cpp @@ -77,6 +77,8 @@ void test_product_syrk() s = ei_random(10,320); CALL_SUBTEST_1( syrk(MatrixXf(s, s)) ); s = ei_random(10,320); - CALL_SUBTEST_2( syrk(MatrixXcd(s, s)) ); + CALL_SUBTEST_2( syrk(MatrixXd(s, s)) ); + s = ei_random(10,320); + CALL_SUBTEST_3( syrk(MatrixXcd(s, s)) ); } } diff --git a/test/product_trsolve.cpp b/test/product_trsolve.cpp index 6e916230e..7a8068c35 100644 --- a/test/product_trsolve.cpp +++ b/test/product_trsolve.cpp @@ -79,12 +79,13 @@ void test_product_trsolve() { // matrices 
CALL_SUBTEST_1((trsolve(ei_random(1,320),ei_random(1,320)))); - CALL_SUBTEST_2((trsolve,Dynamic,Dynamic>(ei_random(1,320),ei_random(1,320)))); + CALL_SUBTEST_2((trsolve(ei_random(1,320),ei_random(1,320)))); + CALL_SUBTEST_3((trsolve,Dynamic,Dynamic>(ei_random(1,320),ei_random(1,320)))); // vectors - CALL_SUBTEST_3((trsolve,Dynamic,1>(ei_random(1,320)))); - CALL_SUBTEST_4((trsolve())); - CALL_SUBTEST_5((trsolve())); - CALL_SUBTEST_6((trsolve,4,1>())); + CALL_SUBTEST_4((trsolve,Dynamic,1>(ei_random(1,320)))); + CALL_SUBTEST_5((trsolve())); + CALL_SUBTEST_6((trsolve())); + CALL_SUBTEST_7((trsolve,4,1>())); } } From 61ce1de048c3d7365303f5c2b395a92a1b693f3e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 6 Mar 2010 22:15:59 +0100 Subject: [PATCH 117/122] fix symm --- .../Core/products/SelfadjointMatrixMatrix.h | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 2e71b5fd4..280ebe512 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -185,11 +185,13 @@ struct ei_symm_pack_rhs count += 1; } - if(half==j2) + if(half==j2 && half > gebp_kernel; + ei_symm_pack_lhs pack_lhs; + ei_gemm_pack_rhs pack_rhs; + ei_gemm_pack_lhs pack_lhs_transposed; for(int k2=0; k2() - (blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols); + pack_rhs(blockB, &rhs(k2,0), rhsStride, alpha, actual_kc, cols); // the select lhs's panel has to be split in three different parts: // 1 - the transposed panel above the diagonal block => transposed packed copy @@ -284,8 +288,7 @@ struct ei_product_selfadjoint_matrix() - (blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc); + pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc); gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); } @@ -293,8 +296,7 @@ struct ei_product_selfadjoint_matrix() - 
(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc); + pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc); gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols); } @@ -346,20 +348,20 @@ struct ei_product_selfadjoint_matrix > gebp_kernel; + ei_gemm_pack_lhs pack_lhs; + ei_symm_pack_rhs pack_rhs; for(int k2=0; k2() - (blockB, _rhs, rhsStride, alpha, actual_kc, cols, k2); + pack_rhs(blockB, _rhs, rhsStride, alpha, actual_kc, cols, k2); // => GEPP for(int i2=0; i2() - (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); } From 44020346783a10fde04cce5ab8653815699b477b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 6 Mar 2010 22:24:50 +0100 Subject: [PATCH 118/122] pff I introduced much too many bugs latey, count-- --- Eigen/src/Core/Product.h | 44 ++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index f814382a9..92ee1043d 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -84,28 +84,28 @@ public: * based on the three dimensions of the product. * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */ // FIXME I'm not sure the current mapping is the ideal one. 
-template<> struct ei_product_type_selector { enum { ret = OuterProduct }; }; -template struct ei_product_type_selector<1, 1, Depth> { enum { ret = InnerProduct }; }; -template<> struct ei_product_type_selector<1, 1, 1> { enum { ret = InnerProduct }; }; -template<> struct ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; -template<> struct ei_product_type_selector<1, Small,Small> { enum { ret = CoeffBasedProductMode }; }; -template<> struct ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; -template<> struct ei_product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; -template<> struct ei_product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; -template<> struct ei_product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; -template<> struct ei_product_type_selector<1, Large,Small> { enum { ret = CoeffBasedProductMode }; }; -template<> struct ei_product_type_selector<1, Large,Large> { enum { ret = GemvProduct }; }; -template<> struct ei_product_type_selector<1, Small,Large> { enum { ret = CoeffBasedProductMode }; }; -template<> struct ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; -template<> struct ei_product_type_selector { enum { ret = GemvProduct }; }; -template<> struct ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; -template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; -template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; -template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; -template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; -template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; -template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; -template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; +template struct ei_product_type_selector { enum { ret = OuterProduct }; }; +template struct 
ei_product_type_selector<1, 1, Depth> { enum { ret = InnerProduct }; }; +template<> struct ei_product_type_selector<1, 1, 1> { enum { ret = InnerProduct }; }; +template<> struct ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; +template<> struct ei_product_type_selector<1, Small,Small> { enum { ret = CoeffBasedProductMode }; }; +template<> struct ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; +template<> struct ei_product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; +template<> struct ei_product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; +template<> struct ei_product_type_selector { enum { ret = LazyCoeffBasedProductMode }; }; +template<> struct ei_product_type_selector<1, Large,Small> { enum { ret = CoeffBasedProductMode }; }; +template<> struct ei_product_type_selector<1, Large,Large> { enum { ret = GemvProduct }; }; +template<> struct ei_product_type_selector<1, Small,Large> { enum { ret = CoeffBasedProductMode }; }; +template<> struct ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; +template<> struct ei_product_type_selector { enum { ret = GemvProduct }; }; +template<> struct ei_product_type_selector { enum { ret = CoeffBasedProductMode }; }; +template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; +template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; +template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; +template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; +template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; +template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; +template<> struct ei_product_type_selector { enum { ret = GemmProduct }; }; /** \class ProductReturnType * From 1958b7ecccab2ccb2a409265c0f9a763f9432f24 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 6 Mar 2010 22:39:15 +0100 Subject: [PATCH 119/122] stride() => 
inner/outerStride() --- Eigen/src/Core/Product.h | 4 ++-- Eigen/src/Core/SolveTriangular.h | 6 +++--- Eigen/src/Core/products/GeneralMatrixMatrix.h | 6 +++--- Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 10 +++++----- Eigen/src/Core/products/SelfadjointMatrixVector.h | 12 ++++++------ Eigen/src/Core/products/SelfadjointProduct.h | 4 ++-- Eigen/src/Core/products/SelfadjointRank2Update.h | 2 +- Eigen/src/Core/products/TriangularMatrixMatrix.h | 10 +++++----- Eigen/src/Core/products/TriangularMatrixVector.h | 4 ++-- 9 files changed, 29 insertions(+), 29 deletions(-) diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 92ee1043d..ea3c8b589 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -336,7 +336,7 @@ template<> struct ei_gemv_selector ei_cache_friendly_product_colmajor_times_vector ( dest.size(), - &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.stride(), + &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(), actualRhs, actualDest, actualAlpha); if (!EvalToDest) @@ -381,7 +381,7 @@ template<> struct ei_gemv_selector ei_cache_friendly_product_rowmajor_times_vector ( - &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.stride(), + &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(), rhs_data, prod.rhs().size(), dest, actualAlpha); if (!DirectlyUseRhs) ei_aligned_stack_delete(Scalar, rhs_data, prod.rhs().size()); diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h index cac1e2554..fcd9edfa0 100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h @@ -82,7 +82,7 @@ struct ei_triangular_solver_selector target(other,startRow,actualPanelWidth); ei_cache_friendly_product_rowmajor_times_vector( - &(actualLhs.const_cast_derived().coeffRef(startRow,startCol)), actualLhs.stride(), + &(actualLhs.const_cast_derived().coeffRef(startRow,startCol)), actualLhs.outerStride(), &(other.coeffRef(startCol)), r, target, Scalar(-1)); 
} @@ -147,7 +147,7 @@ struct ei_triangular_solver_selector( r, - &(actualLhs.const_cast_derived().coeffRef(endBlock,startBlock)), actualLhs.stride(), + &(actualLhs.const_cast_derived().coeffRef(endBlock,startBlock)), actualLhs.outerStride(), other.segment(startBlock, actualPanelWidth), &(other.coeffRef(endBlock, 0)), Scalar(-1)); @@ -183,7 +183,7 @@ struct ei_triangular_solver_selector - ::run(lhs.rows(), Side==OnTheLeft? rhs.cols() : rhs.rows(), &actualLhs.coeff(0,0), actualLhs.stride(), &rhs.coeffRef(0,0), rhs.stride()); + ::run(lhs.rows(), Side==OnTheLeft? rhs.cols() : rhs.rows(), &actualLhs.coeff(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.outerStride()); } }; diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index c1d42d387..2ab773e64 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -229,9 +229,9 @@ struct ei_gemm_functor if(cols==-1) cols = m_rhs.cols(); Gemm::run(rows, cols, m_lhs.cols(), - (const Scalar*)&(m_lhs.const_cast_derived().coeffRef(row,0)), m_lhs.stride(), - (const Scalar*)&(m_rhs.const_cast_derived().coeffRef(0,col)), m_rhs.stride(), - (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.stride(), + (const Scalar*)&(m_lhs.const_cast_derived().coeffRef(row,0)), m_lhs.outerStride(), + (const Scalar*)&(m_rhs.const_cast_derived().coeffRef(0,col)), m_rhs.outerStride(), + (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(), m_actualAlpha, info); } diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 280ebe512..b23876dc7 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -415,11 +415,11 @@ struct SelfadjointProductMatrix NumTraits::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)), ei_traits::Flags&RowMajorBit ? 
RowMajor : ColMajor> ::run( - lhs.rows(), rhs.cols(), // sizes - &lhs.coeff(0,0), lhs.stride(), // lhs info - &rhs.coeff(0,0), rhs.stride(), // rhs info - &dst.coeffRef(0,0), dst.stride(), // result info - actualAlpha // alpha + lhs.rows(), rhs.cols(), // sizes + &lhs.coeff(0,0), lhs.outerStride(), // lhs info + &rhs.coeff(0,0), rhs.outerStride(), // rhs info + &dst.coeffRef(0,0), dst.outerStride(), // result info + actualAlpha // alpha ); } }; diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index 1c48208b3..627c06801 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -185,14 +185,14 @@ struct SelfadjointProductMatrix Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) * RhsBlasTraits::extractScalarFactor(m_rhs); - ei_assert(dst.stride()==1 && "not implemented yet"); + ei_assert(dst.innerStride()==1 && "not implemented yet"); ei_product_selfadjoint_vector::Flags&RowMajorBit) ? RowMajor : ColMajor, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)> ( - lhs.rows(), // size - &lhs.coeff(0,0), lhs.stride(), // lhs info - &rhs.coeff(0), rhs.stride(), // rhs info - &dst.coeffRef(0), // result info - actualAlpha // scale factor + lhs.rows(), // size + &lhs.coeff(0,0), lhs.innerStride(), // lhs info + &rhs.coeff(0), rhs.innerStride(), // rhs info + &dst.coeffRef(0), // result info + actualAlpha // scale factor ); } }; diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h index b10b009e8..01cd33d57 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -142,8 +142,8 @@ SelfAdjointView& SelfAdjointView _ActualUType::Flags&RowMajorBit ? RowMajor : ColMajor, ei_traits::Flags&RowMajorBit ? 
RowMajor : ColMajor, !UBlasTraits::NeedToConjugate, UpLo> - ::run(_expression().cols(), actualU.cols(), &actualU.coeff(0,0), actualU.stride(), - const_cast(_expression().data()), _expression().stride(), actualAlpha); + ::run(_expression().cols(), actualU.cols(), &actualU.coeff(0,0), actualU.outerStride(), + const_cast(_expression().data()), _expression().outerStride(), actualAlpha); return *this; } diff --git a/Eigen/src/Core/products/SelfadjointRank2Update.h b/Eigen/src/Core/products/SelfadjointRank2Update.h index 856049e02..9b52d5fe9 100644 --- a/Eigen/src/Core/products/SelfadjointRank2Update.h +++ b/Eigen/src/Core/products/SelfadjointRank2Update.h @@ -88,7 +88,7 @@ SelfAdjointView& SelfAdjointView typename ei_cleantype::ret>::type, typename ei_cleantype::ret>::type, (IsRowMajor ? int(UpLo==Upper ? Lower : Upper) : UpLo)> - ::run(const_cast(_expression().data()),_expression().stride(),actualU,actualV,actualAlpha); + ::run(const_cast(_expression().data()),_expression().outerStride(),actualU,actualV,actualAlpha); return *this; } diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 27c7caf17..040b9d5cd 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -166,7 +166,7 @@ struct ei_product_triangular_matrix_matrix (ei_traits::Flags&RowMajorBit) ? RowMajor : ColMajor> ::run( lhs.rows(), LhsIsTriangular ? 
rhs.cols() : lhs.rows(), // sizes - &lhs.coeff(0,0), lhs.stride(), // lhs info - &rhs.coeff(0,0), rhs.stride(), // rhs info - &dst.coeffRef(0,0), dst.stride(), // result info + &lhs.coeff(0,0), lhs.outerStride(), // lhs info + &rhs.coeff(0,0), rhs.outerStride(), // rhs info + &dst.coeffRef(0,0), dst.outerStride(), // result info actualAlpha // alpha ); } diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h index 2cad48eb9..ee4c45c35 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector.h +++ b/Eigen/src/Core/products/TriangularMatrixVector.h @@ -63,7 +63,7 @@ struct ei_product_triangular_vector_selector( r, - &(lhs.const_cast_derived().coeffRef(s,pi)), lhs.stride(), + &(lhs.const_cast_derived().coeffRef(s,pi)), lhs.outerStride(), rhs.segment(pi, actualPanelWidth), &(res.coeffRef(s)), alpha); @@ -105,7 +105,7 @@ struct ei_product_triangular_vector_selector target(res,pi,0,actualPanelWidth,1); ei_cache_friendly_product_rowmajor_times_vector( - &(lhs.const_cast_derived().coeffRef(pi,s)), lhs.stride(), + &(lhs.const_cast_derived().coeffRef(pi,s)), lhs.outerStride(), &(rhs.const_cast_derived().coeffRef(s)), r, target, alpha); } From 3130b7a72218299d4cac5efa6562d0e2e327d71d Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 6 Mar 2010 22:57:50 +0100 Subject: [PATCH 120/122] bugcount--, this time trmm --- Eigen/src/Core/products/TriangularMatrixMatrix.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 040b9d5cd..53e7876c1 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -258,7 +258,7 @@ struct ei_product_triangular_matrix_matrix Date: Sun, 7 Mar 2010 12:32:24 +0100 Subject: [PATCH 121/122] fix compilation --- test/mixingtypes.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/test/mixingtypes.cpp b/test/mixingtypes.cpp index 8b8e8302e..c6cf00d28 100644 --- a/test/mixingtypes.cpp +++ b/test/mixingtypes.cpp @@ -127,8 +127,8 @@ void mixingtypes_large(int size) VERIFY_RAISES_ASSERT(mcd*md); VERIFY_RAISES_ASSERT(mf*vcf); VERIFY_RAISES_ASSERT(mcf*vf); - VERIFY_RAISES_ASSERT(mcf *= mf); - // VERIFY_RAISES_ASSERT(vcd = md*vcd); // does not even compile (cannot convert complex to double) +// VERIFY_RAISES_ASSERT(mcf *= mf); // does not even compile +// VERIFY_RAISES_ASSERT(vcd = md*vcd); // does not even compile (cannot convert complex to double) VERIFY_RAISES_ASSERT(vcf = mcf*vf); // VERIFY_RAISES_ASSERT(mf*md); // does not even compile From 9fe040ad29400f152b392fff9dc1493a6b9c14aa Mon Sep 17 00:00:00 2001 From: Hauke Heibel Date: Sun, 7 Mar 2010 14:05:26 +0100 Subject: [PATCH 122/122] Reintroduced the if-clause for MSVC ei_ploadu via _loadu_. --- Eigen/src/Core/arch/SSE/PacketMath.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 282a1971c..77f15d982 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -184,17 +184,17 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pload(const float* from) { template<> EIGEN_STRONG_INLINE Packet2d ei_pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i ei_pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } -// #if (!defined __GNUC__) && (!defined __ICC) -// template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); } -// template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } -// template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return 
_mm_loadu_si128(reinterpret_cast(from)); } -// #else - +#if defined(_MSC_VER) + template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); } + template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } + template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast(from)); } +#else // Fast unaligned loads. Note that here we cannot directly use intrinsics: this would // require pointer casting to incompatible pointer types and leads to invalid code // because of the strict aliasing rule. The "dummy" stuff are required to enforce // a correct instruction dependency. // TODO: do the same for MSVC (ICC is compatible) +// NOTE: with the code below, MSVC's compiler crashes! template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD @@ -219,6 +219,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from) res = _mm_loadh_pd(res, (const double*)(from+2)) ; return _mm_castpd_si128(res); } +#endif template<> EIGEN_STRONG_INLINE void ei_pstore(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } template<> EIGEN_STRONG_INLINE void ei_pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }