commit b29b81a1f4
merge with default branch
@@ -196,6 +196,18 @@ if(NOT MSVC)
     message(STATUS "Enabling SSE4.2 in tests/examples")
   endif()
 
+  option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" OFF)
+  if(EIGEN_TEST_AVX)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
+    message(STATUS "Enabling AVX in tests/examples")
+  endif()
+
+  option(EIGEN_TEST_FMA "Enable/Disable FMA in tests/examples" OFF)
+  if(EIGEN_TEST_FMA)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
+    message(STATUS "Enabling FMA in tests/examples")
+  endif()
+
   option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF)
   if(EIGEN_TEST_ALTIVEC)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
@@ -437,12 +449,12 @@ set ( EIGEN_INCLUDE_DIR ${INCLUDE_INSTALL_DIR} )
 set ( EIGEN_INCLUDE_DIRS ${EIGEN_INCLUDE_DIR} )
 set ( EIGEN_ROOT_DIR ${CMAKE_INSTALL_PREFIX} )
 
-configure_file ( ${CMAKE_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
+configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in
                  ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
                  @ONLY ESCAPE_QUOTES
                )
 
-install ( FILES ${CMAKE_SOURCE_DIR}/cmake/UseEigen3.cmake
+install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
           ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
           DESTINATION ${EIGEN_CONFIG_CMAKE_PATH}
         )
@@ -10,9 +10,11 @@
  *
  *
  * This module provides two variants of the Cholesky decomposition for selfadjoint (hermitian) matrices.
- * Those decompositions are accessible via the following MatrixBase methods:
- *  - MatrixBase::llt(),
+ * Those decompositions are also accessible via the following methods:
+ *  - MatrixBase::llt()
  *  - MatrixBase::ldlt()
+ *  - SelfAdjointView::llt()
+ *  - SelfAdjointView::ldlt()
  *
  * \code
  * #include <Eigen/Cholesky>
Eigen/Core: 28 changed lines
@@ -117,7 +117,16 @@
 #ifdef __SSE4_2__
 #define EIGEN_VECTORIZE_SSE4_2
 #endif
+#ifdef __AVX__
+#define EIGEN_VECTORIZE_AVX
+#define EIGEN_VECTORIZE_SSE3
+#define EIGEN_VECTORIZE_SSSE3
+#define EIGEN_VECTORIZE_SSE4_1
+#define EIGEN_VECTORIZE_SSE4_2
+#endif
+#ifdef __FMA__
+#define EIGEN_VECTORIZE_FMA
+#endif
 
 // include files
 
 // This extern "C" works around a MINGW-w64 compilation issue
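Note: -mavx and -mfma (which the EIGEN_TEST_AVX/EIGEN_TEST_FMA options above pass to the compiler) make GCC, Clang, and ICC predefine __AVX__ and __FMA__, and AVX-capable targets support every SSE level, which is why the new block defines the whole EIGEN_VECTORIZE_SSE* chain at once. A minimal standalone probe of those predefined macros (assumed toolchain behavior, not Eigen code):

    // Probe which SIMD-related macros the current compiler flags predefine.
    #include <cstdio>

    int main() {
    #ifdef __AVX__
      std::puts("__AVX__ defined (e.g. -mavx): AVX implies SSE..SSE4.2");
    #endif
    #ifdef __FMA__
      std::puts("__FMA__ defined (e.g. -mfma)");
    #endif
    #ifdef __SSE4_2__
      std::puts("__SSE4_2__ defined");
    #endif
      return 0;
    }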
@@ -130,7 +139,7 @@
 extern "C" {
 // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
 // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
-#ifdef __INTEL_COMPILER
+#if defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1110
 #include <immintrin.h>
 #else
 #include <emmintrin.h>
@@ -147,6 +156,9 @@
 #ifdef EIGEN_VECTORIZE_SSE4_2
 #include <nmmintrin.h>
 #endif
+#ifdef EIGEN_VECTORIZE_AVX
+#include <immintrin.h>
+#endif
 #endif
 } // end extern "C"
 #elif defined __ALTIVEC__
@@ -216,7 +228,9 @@
 namespace Eigen {
 
 inline static const char *SimdInstructionSetsInUse(void) {
-#if defined(EIGEN_VECTORIZE_SSE4_2)
+#if defined(EIGEN_VECTORIZE_AVX)
+  return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_2)
   return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
 #elif defined(EIGEN_VECTORIZE_SSE4_1)
   return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
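SimdInstructionSetsInUse() is Eigen's public query for which of these branches was compiled in, so the effect of the new AVX case can be checked directly; a small usage sketch:

    // Prints the instruction sets Eigen was built to use; with -mavx this
    // now reports the AVX branch added above.
    #include <Eigen/Core>
    #include <iostream>

    int main() {
      std::cout << Eigen::SimdInstructionSetsInUse() << std::endl;
    }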
@@ -294,7 +308,13 @@ using std::ptrdiff_t;
 #include "src/Core/MathFunctions.h"
 #include "src/Core/GenericPacketMath.h"
 
-#if defined EIGEN_VECTORIZE_SSE
+#if defined EIGEN_VECTORIZE_AVX
+  // Use AVX for floats and doubles, SSE for integers
+  #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/Complex.h"
+  #include "src/Core/arch/AVX/PacketMath.h"
+  #include "src/Core/arch/AVX/Complex.h"
+#elif defined EIGEN_VECTORIZE_SSE
   #include "src/Core/arch/SSE/PacketMath.h"
   #include "src/Core/arch/SSE/MathFunctions.h"
   #include "src/Core/arch/SSE/Complex.h"
@@ -47,7 +47,9 @@
 #include "src/Geometry/AlignedBox.h"
 #include "src/Geometry/Umeyama.h"
 
-#if defined EIGEN_VECTORIZE_SSE
+// Use the SSE optimized version whenever possible. At the moment the
+// SSE version doesn't compile when AVX is enabled
+#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
   #include "src/Geometry/arch/Geometry_SSE.h"
 #endif
 #endif
Eigen/LU: 4 changed lines
@@ -27,7 +27,9 @@
 #include "src/LU/Determinant.h"
 #include "src/LU/InverseImpl.h"
 
-#if defined EIGEN_VECTORIZE_SSE
+// Use the SSE optimized version whenever possible. At the moment the
+// SSE version doesn't compile when AVX is enabled
+#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
   #include "src/LU/arch/Inverse_SSE.h"
 #endif
 
@@ -43,7 +43,7 @@ namespace internal {
   * Remember that Cholesky decompositions are not rank-revealing. Also, do not use a Cholesky
   * decomposition to determine whether a system of equations has a solution.
   *
-  * \sa MatrixBase::ldlt(), class LLT
+  * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
   */
 template<typename _MatrixType, int _UpLo> class LDLT
 {
@@ -179,7 +179,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
   * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function
   * computes the least-square solution of \f$ A x = b \f$ is \f$ A \f$ is singular.
   *
-  * \sa MatrixBase::ldlt()
+  * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt()
   */
 #ifdef EIGEN_TEST_EVALUATORS
     template<typename Rhs>
@@ -610,6 +610,7 @@ MatrixType LDLT<MatrixType,_UpLo>::reconstructedMatrix() const
 #ifndef __CUDACC__
 /** \cholesky_module
   * \returns the Cholesky decomposition with full pivoting without square root of \c *this
+  * \sa MatrixBase::ldlt()
   */
 template<typename MatrixType, unsigned int UpLo>
 inline const LDLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
@@ -620,6 +621,7 @@ SelfAdjointView<MatrixType, UpLo>::ldlt() const
 
 /** \cholesky_module
   * \returns the Cholesky decomposition with full pivoting without square root of \c *this
+  * \sa SelfAdjointView::ldlt()
   */
 template<typename Derived>
 inline const LDLT<typename MatrixBase<Derived>::PlainObject>
@@ -41,7 +41,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
   * Example: \include LLT_example.cpp
   * Output: \verbinclude LLT_example.out
   *
-  * \sa MatrixBase::llt(), class LDLT
+  * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
   */
 /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
   * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
@@ -115,7 +115,7 @@ template<typename _MatrixType, int _UpLo> class LLT
       * Example: \include LLT_solve.cpp
       * Output: \verbinclude LLT_solve.out
       *
-      * \sa solveInPlace(), MatrixBase::llt()
+      * \sa solveInPlace(), MatrixBase::llt(), SelfAdjointView::llt()
       */
 #ifdef EIGEN_TEST_EVALUATORS
     template<typename Rhs>
@@ -498,6 +498,7 @@ MatrixType LLT<MatrixType,_UpLo>::reconstructedMatrix() const
 #ifndef __CUDACC__
 /** \cholesky_module
   * \returns the LLT decomposition of \c *this
+  * \sa SelfAdjointView::llt()
   */
 template<typename Derived>
 inline const LLT<typename MatrixBase<Derived>::PlainObject>
@@ -508,6 +509,7 @@ MatrixBase<Derived>::llt() const
 
 /** \cholesky_module
   * \returns the LLT decomposition of \c *this
+  * \sa SelfAdjointView::llt()
   */
 template<typename MatrixType, unsigned int UpLo>
 inline const LLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
|
|||||||
EIGEN_DEBUG_VAR(DstIsAligned)
|
EIGEN_DEBUG_VAR(DstIsAligned)
|
||||||
EIGEN_DEBUG_VAR(SrcIsAligned)
|
EIGEN_DEBUG_VAR(SrcIsAligned)
|
||||||
EIGEN_DEBUG_VAR(JointAlignment)
|
EIGEN_DEBUG_VAR(JointAlignment)
|
||||||
|
EIGEN_DEBUG_VAR(Derived::SizeAtCompileTime)
|
||||||
|
EIGEN_DEBUG_VAR(OtherDerived::CoeffReadCost)
|
||||||
EIGEN_DEBUG_VAR(InnerSize)
|
EIGEN_DEBUG_VAR(InnerSize)
|
||||||
EIGEN_DEBUG_VAR(InnerMaxSize)
|
EIGEN_DEBUG_VAR(InnerMaxSize)
|
||||||
EIGEN_DEBUG_VAR(PacketSize)
|
EIGEN_DEBUG_VAR(PacketSize)
|
||||||
|
@@ -202,6 +202,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(asin, Asin)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(cos, Cos)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(acos, Acos)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(tan, Tan)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(atan, Atan)
 //EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs, Abs)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(exp, Exp)
 EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(log, Ln)
@@ -88,8 +88,8 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprType>
     MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
                        && (InnerStrideAtCompileTime == 1)
                         ? PacketAccessBit : 0,
-    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0,
-    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) ? AlignedBit : 0,
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (traits<XprType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
     FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
     FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
     Flags0 = traits<XprType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
@@ -40,7 +40,7 @@ void check_static_allocation_size()
   */
 template <typename T, int Size, int MatrixOrArrayOptions,
           int Alignment = (MatrixOrArrayOptions&DontAlign) ? 0
-                        : (((Size*sizeof(T))%16)==0) ? 16
+                        : (((Size*sizeof(T))%EIGEN_ALIGN_BYTES)==0) ? EIGEN_ALIGN_BYTES
                         : 0 >
 struct plain_array
 {
@@ -68,27 +68,27 @@ struct plain_array
 template<typename PtrType>
 EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; }
 #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-  eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & sizemask) == 0 \
+  eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \
               && "this assertion is explained here: " \
               "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
               " **** READ THIS WEB PAGE !!! ****");
 #else
 #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-  eigen_assert((reinterpret_cast<size_t>(array) & sizemask) == 0 \
+  eigen_assert((reinterpret_cast<size_t>(array) & (sizemask)) == 0 \
               && "this assertion is explained here: " \
               "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
               " **** READ THIS WEB PAGE !!! ****");
 #endif
 
 template <typename T, int Size, int MatrixOrArrayOptions>
-struct plain_array<T, Size, MatrixOrArrayOptions, 16>
+struct plain_array<T, Size, MatrixOrArrayOptions, EIGEN_ALIGN_BYTES>
 {
-  EIGEN_USER_ALIGN16 T array[Size];
+  EIGEN_USER_ALIGN_DEFAULT T array[Size];
 
   EIGEN_DEVICE_FUNC
   plain_array()
   {
-    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(0xf);
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(EIGEN_ALIGN_BYTES-1);
     check_static_allocation_size<T,Size>();
   }
 
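The rewritten assert generalizes the old hard-coded 0xf mask: a pointer is N-byte aligned, for N a power of two, exactly when addr & (N-1) == 0, so passing EIGEN_ALIGN_BYTES-1 as sizemask keeps the same test working when the default alignment grows to 32 for AVX. A standalone sketch of that arithmetic (the helper name is illustrative, not Eigen's):

    #include <cassert>
    #include <cstdint>

    // True when p is n-byte aligned; n must be a power of two.
    static bool is_aligned(const void* p, std::size_t n) {
      return (reinterpret_cast<std::uintptr_t>(p) & (n - 1)) == 0;
    }

    int main() {
      alignas(32) float buf[8];
      assert(is_aligned(buf, 32));  // 32 == EIGEN_ALIGN_BYTES under AVX
    }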
@@ -102,7 +102,7 @@ struct plain_array<T, Size, MatrixOrArrayOptions, 16>
 template <typename T, int MatrixOrArrayOptions, int Alignment>
 struct plain_array<T, 0, MatrixOrArrayOptions, Alignment>
 {
-  EIGEN_USER_ALIGN16 T array[1];
+  EIGEN_USER_ALIGN_DEFAULT T array[1];
   EIGEN_DEVICE_FUNC plain_array() {}
   EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {}
 };
@@ -69,8 +69,7 @@ template<typename Lhs, typename Rhs> struct product_type
   MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::MaxColsAtCompileTime,
                                          _Rhs::MaxRowsAtCompileTime),
   Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime,
-                                      _Rhs::RowsAtCompileTime),
-  LargeThreshold = EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+                                      _Rhs::RowsAtCompileTime)
 };
 
 // the splitting into different lines of code here, introducing the _select enums and the typedef below,
@@ -424,7 +423,7 @@ struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
   internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
   EIGEN_STRONG_INLINE Scalar* data() {
     return ForceAlignment
-            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(15))) + 16)
+            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES)
            : m_data.array;
   }
 #endif
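data() above rounds the scratch buffer up past the next alignment boundary with (addr & ~(A-1)) + A, which always advances by at least one byte and at most A, so over-allocating by PacketSize scalars is enough headroom. An illustrative round-up helper under that assumption (hypothetical name, not Eigen API):

    #include <cstdint>

    // Returns the first A-byte boundary strictly above p; A is a power of two.
    template <std::uintptr_t A>
    void* align_up_past(void* p) {
      std::uintptr_t u = reinterpret_cast<std::uintptr_t>(p);
      return reinterpret_cast<void*>((u & ~(A - 1)) + A);
    }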
Eigen/src/Core/GenericPacketMath.h: 106 changed lines (mode changed: normal file → executable file)
@@ -42,6 +42,8 @@ namespace internal {
 struct default_packet_traits
 {
   enum {
+    HasHalfPacket = 0,
+
     HasAdd = 1,
     HasSub = 1,
     HasMul = 1,
@@ -71,10 +73,12 @@ struct default_packet_traits
 template<typename T> struct packet_traits : default_packet_traits
 {
   typedef T type;
+  typedef T half;
   enum {
     Vectorizable = 0,
     size = 1,
-    AlignedOnScalar = 0
+    AlignedOnScalar = 0,
+    HasHalfPacket = 0
   };
   enum {
     HasAdd = 0,
@@ -157,18 +161,66 @@ pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
-/** \internal \returns a packet with elements of \a *from duplicated.
- * For instance, for a packet of 8 elements, 4 scalar will be read from \a *from and
- * duplicated to form: {from[0],from[0],from[1],from[1],,from[2],from[2],,from[3],from[3]}
- * Currently, this function is only used for scalar * complex products.
- */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
-
 /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
 
+/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(*a); }
+
+/** \internal \returns a packet with elements of \a *from duplicated.
+ * For instance, for a packet of 8 elements, 4 scalars will be read from \a *from and
+ * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
+ * Currently, this function is only used for scalar * complex products.
+ */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
+
+/** \internal \returns a packet with elements of \a *from quadrupled.
+ * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and
+ * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]}
+ * Currently, this function is only used in matrix products.
+ * For packet-size smaller or equal to 4, this function is equivalent to pload1
+ */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ploadquad(const typename unpacket_traits<Packet>::type* from)
+{ return pload1<Packet>(from); }
+
+/** \internal equivalent to
+ * \code
+ * a0 = pload1(a+0);
+ * a1 = pload1(a+1);
+ * a2 = pload1(a+2);
+ * a3 = pload1(a+3);
+ * \endcode
+ * \sa pset1, pload1, ploaddup, pbroadcast2
+ */
+template<typename Packet> EIGEN_DEVICE_FUNC
+inline void pbroadcast4(const typename unpacket_traits<Packet>::type *a,
+                        Packet& a0, Packet& a1, Packet& a2, Packet& a3)
+{
+  a0 = pload1<Packet>(a+0);
+  a1 = pload1<Packet>(a+1);
+  a2 = pload1<Packet>(a+2);
+  a3 = pload1<Packet>(a+3);
+}
+
+/** \internal equivalent to
+ * \code
+ * a0 = pload1(a+0);
+ * a1 = pload1(a+1);
+ * \endcode
+ * \sa pset1, pload1, ploaddup, pbroadcast4
+ */
+template<typename Packet> EIGEN_DEVICE_FUNC
+inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
+                        Packet& a0, Packet& a1)
+{
+  a0 = pload1<Packet>(a+0);
+  a1 = pload1<Packet>(a+1);
+}
+
 /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
 template<typename Scalar> inline typename packet_traits<Scalar>::type
 plset(const Scalar& a) { return a; }
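The new load variants differ only in their replication pattern across lanes. A scalar model of the three patterns, assuming a hypothetical 8-lane float packet stored as a plain array (not Eigen's real packet types):

    #include <array>
    using Packet8 = std::array<float, 8>;

    Packet8 load1_model(const float* a) {     // pload1: {a0,a0,a0,a0,a0,a0,a0,a0}
      Packet8 p; p.fill(a[0]); return p;
    }
    Packet8 loaddup_model(const float* f) {   // ploaddup: {f0,f0,f1,f1,f2,f2,f3,f3}
      Packet8 p;
      for (int i = 0; i < 8; ++i) p[i] = f[i / 2];
      return p;
    }
    Packet8 loadquad_model(const float* f) {  // ploadquad: {f0,f0,f0,f0,f1,f1,f1,f1}
      Packet8 p;
      for (int i = 0; i < 8; ++i) p[i] = f[i / 4];
      return p;
    }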
@@ -179,7 +231,13 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(
 
 /** \internal copy the packet \a from to \a *to, (un-aligned store) */
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
 { (*to) = from; }
 
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, int /*stride*/)
+{ return ploadu<Packet>(from); }
+
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, int /*stride*/)
+{ pstore(to, from); }
+
 /** \internal tries to do cache prefetching of \a addr */
 template<typename Scalar> inline void prefetch(const Scalar* addr)
@@ -201,6 +259,15 @@ preduxp(const Packet* vecs) { return vecs[0]; }
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
 { return a; }
 
+/** \internal \returns the sum of the elements of \a a by block of 4 elements.
+  * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
+  * For packet-size smaller or equal to 4, this boils down to a noop.
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline
+typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
+predux4(const Packet& a)
+{ return a; }
+
 /** \internal \returns the product of the elements of \a a*/
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
 { return a; }
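predux4 reduces across the two 4-element halves of a packet and returns a half packet; in scalar terms, for the same hypothetical 8-lane model as above:

    #include <array>

    // predux4 model: {a0..a7} -> {a0+a4, a1+a5, a2+a6, a3+a7};
    // for packets of size <= 4 the real function is a no-op.
    std::array<float, 4> predux4_model(const std::array<float, 8>& a) {
      std::array<float, 4> h;
      for (int i = 0; i < 4; ++i) h[i] = a[i] + a[i + 4];
      return h;
    }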
@@ -251,6 +318,10 @@ Packet pasin(const Packet& a) { using std::asin; return asin(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pacos(const Packet& a) { using std::acos; return acos(a); }
 
+/** \internal \returns the atan of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet patan(const Packet& a) { using std::atan; return atan(a); }
+
 /** \internal \returns the exp of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pexp(const Packet& a) { using std::exp; return exp(a); }
@@ -348,9 +419,22 @@ template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
 
 #endif
 
+
+/***************************************************************************
+ * PacketBlock, that is a collection of N packets where the number of words
+ * in the packet is a multiple of N.
+***************************************************************************/
+template <typename Packet,int N=unpacket_traits<Packet>::size> struct PacketBlock {
+  Packet packet[N];
+};
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet,1>& /*kernel*/) {
+  // Nothing to do in the scalar case, i.e. a 1x1 matrix.
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
 
 #endif // EIGEN_GENERIC_PACKET_MATH_H
 
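PacketBlock bundles N packets so that architecture kernels can transpose them entirely in registers; with an N-lane packet per row the block is an NxN matrix. A scalar model of what a ptranspose specialization computes (illustrative types, not the SIMD implementation):

    #include <array>
    #include <utility>

    template <typename T, int N>
    struct PacketBlockModel { std::array<std::array<T, N>, N> packet; };

    // Swap element (i,j) with (j,i): rows become columns, which the AVX
    // kernels later in this commit do with shuffles instead of scalar swaps.
    template <typename T, int N>
    void ptranspose_model(PacketBlockModel<T, N>& k) {
      for (int i = 0; i < N; ++i)
        for (int j = i + 1; j < N; ++j)
          std::swap(k.packet[i][j], k.packet[j][i]);
    }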
@@ -45,6 +45,7 @@ namespace Eigen
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op)
@@ -89,7 +89,7 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
                 && ( bool(IsDynamicSize)
                    || HasNoOuterStride
                    || ( OuterStrideAtCompileTime!=Dynamic
-                     && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%16)==0 ) ),
+                     && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%EIGEN_ALIGN_BYTES)==0 ) ),
     Flags0 = TraitsBase::Flags & (~NestByRefBit),
     Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
     Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
|
|||||||
EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits<Derived>::Flags&PacketAccessBit,
|
EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits<Derived>::Flags&PacketAccessBit,
|
||||||
internal::inner_stride_at_compile_time<Derived>::ret==1),
|
internal::inner_stride_at_compile_time<Derived>::ret==1),
|
||||||
PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
|
PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
|
||||||
eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % 16) == 0) && "data is not aligned");
|
eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned");
|
||||||
#else
|
#else
|
||||||
eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::IsAligned, (size_t(m_data) % 16) == 0) && "data is not aligned");
|
eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::IsAligned, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PointerType m_data;
|
PointerType m_data;
|
||||||
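User-visible consequence of the assert change: buffers handed to an aligned Map must now meet EIGEN_ALIGN_BYTES (32 in AVX builds) rather than 16. A hedged usage sketch:

    #include <Eigen/Core>

    int main() {
      alignas(32) float data[8] = {0};  // 32-byte aligned for AVX builds
      Eigen::Map<Eigen::Matrix<float, 8, 1>, Eigen::Aligned> v(data);
      v.setConstant(1.f);
    }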
@@ -255,6 +254,8 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
     using Base::Base::operator=;
 };
 
+#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS
+
 } // end namespace Eigen
 
 #endif // EIGEN_MAPBASE_H
@@ -669,6 +669,15 @@ bool (isfinite)(const T& x)
   return x<NumTraits<T>::highest() && x>NumTraits<T>::lowest();
 }
 
+template<typename T>
+EIGEN_DEVICE_FUNC
+bool (isfinite)(const std::complex<T>& x)
+{
+  using std::real;
+  using std::imag;
+  return isfinite(real(x)) && isfinite(imag(x));
+}
+
 } // end namespace numext
 
 namespace internal {
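The new overload just requires both components to be finite; the same check expressed with the standard library, for comparison (numext::isfinite itself is Eigen-internal):

    #include <cmath>
    #include <complex>

    template <typename T>
    bool complex_isfinite(const std::complex<T>& x) {
      return std::isfinite(x.real()) && std::isfinite(x.imag());
    }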
@@ -325,10 +325,16 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
   };
   static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
   {
-    Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
-    if (VectorizedSize != Size)
-      res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
-    return res;
+    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    if (VectorizedSize > 0) {
+      Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+      if (VectorizedSize != Size)
+        res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
+      return res;
+    }
+    else {
+      return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
+    }
   }
 };
 
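The guard matters once packets get wider: with AVX, PacketSize is 8 for floats, so a fixed-size reduction smaller than one packet must take the scalar-only branch. A worked instance of the arithmetic, assuming VectorizedSize is computed as (Size/PacketSize)*PacketSize, which is what the unrollers rely on:

    // A 3-element float reduction with 8-lane AVX packets: no full packet fits.
    constexpr int Size = 3, PacketSize = 8;
    constexpr int VectorizedSize = (Size / PacketSize) * PacketSize;  // == 0
    static_assert(VectorizedSize == 0, "scalar-only branch is taken");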
@@ -97,7 +97,7 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
   template<typename Derived> struct match {
     enum {
       HasDirectAccess = internal::has_direct_access<Derived>::ret,
-      StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
+      StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
       InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
                       || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
                       || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
@@ -168,8 +168,12 @@ protected:
     }
     else
       ::new (static_cast<Base*>(this)) Base(expr.data(), expr.rows(), expr.cols());
-    ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(),
-                                 StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride());
+
+    if(Expression::IsVectorAtCompileTime && (!PlainObjectType::IsVectorAtCompileTime) && ((Expression::Flags&RowMajorBit)!=(PlainObjectType::Flags&RowMajorBit)))
+      ::new (&m_stride) StrideBase(expr.innerStride(), StrideType::InnerStrideAtCompileTime==0?0:1);
+    else
+      ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(),
+                                   StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride());
   }
 
   StrideBase m_stride;
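With Derived::IsVectorAtCompileTime added to StorageOrderMatch, a vector expression may now bind to a Ref of the opposite orientation, and the new stride branch swaps inner/outer strides accordingly. A plausible use this enables (a matrix row, i.e. a strided row-vector, viewed through a column-vector Ref; sketch, not taken from the commit):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::Matrix4f A = Eigen::Matrix4f::Random();
      // A.row(1) is a row-vector with non-unit inner stride; mapping it as a
      // column vector requires a run-time inner stride.
      Eigen::Ref<const Eigen::VectorXf, 0, Eigen::InnerStride<> > r(A.row(1));
      std::cout << r.transpose() << std::endl;
    }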
Eigen/src/Core/arch/AVX/CMakeLists.txt: new file, 6 lines
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_Core_arch_AVX_SRCS "*.h")
+
+INSTALL(FILES
+  ${Eigen_Core_arch_AVX_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AVX COMPONENT Devel
+)
Eigen/src/Core/arch/AVX/Complex.h: new file, 463 lines
@@ -0,0 +1,463 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_AVX_H
+#define EIGEN_COMPLEX_AVX_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet4cf
+{
+  EIGEN_STRONG_INLINE Packet4cf() {}
+  EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {}
+  __m256 v;
+};
+
+template<> struct packet_traits<std::complex<float> > : default_packet_traits
+{
+  typedef Packet4cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasHalfPacket = 1,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4}; typedef Packet2cf half; };
+
+template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a)
+{
+  return Packet4cf(pnegate(a.v));
+}
+template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a)
+{
+  const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
+  return Packet4cf(_mm256_xor_ps(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
+{
+  __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v);
+  __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
+  __m256 result = _mm256_addsub_ps(tmp1, tmp2);
+  return Packet4cf(result);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pand   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf por    <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pxor   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
+
+template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
+{
+  return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
+{
+  // FIXME The following might be optimized using _mm256_movedup_pd
+  Packet2cf a = ploaddup<Packet2cf>(from);
+  Packet2cf b = ploaddup<Packet2cf>(from+1);
+  return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, int stride)
+{
+  return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]),
+                                 std::imag(from[2*stride]), std::real(from[2*stride]),
+                                 std::imag(from[1*stride]), std::real(from[1*stride]),
+                                 std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, int stride)
+{
+  __m128 low = _mm256_extractf128_ps(from.v, 0);
+  to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
+  to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
+
+  __m128 high = _mm256_extractf128_ps(from.v, 1);
+  to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
+  to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
+
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a)
+{
+  return pfirst(Packet2cf(_mm256_castps256_ps128(a.v)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
+  __m128 low  = _mm256_extractf128_ps(a.v, 0);
+  __m128 high = _mm256_extractf128_ps(a.v, 1);
+  __m128d lowd  = _mm_castps_pd(low);
+  __m128d highd = _mm_castps_pd(high);
+  low  = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1));
+  high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1));
+  __m256 result = _mm256_setzero_ps();
+  result = _mm256_insertf128_ps(result, low, 1);
+  result = _mm256_insertf128_ps(result, high, 0);
+  return Packet4cf(result);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a)
+{
+  return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)),
+                     Packet2cf(_mm256_extractf128_ps(a.v,1))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs)
+{
+  Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  t0 = _mm256_hadd_ps(t0,t1);
+  Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  t2 = _mm256_hadd_ps(t2,t3);
+
+  t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4));
+  t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4));
+
+  return Packet4cf(_mm256_add_ps(t1,t3));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
+{
+  return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)),
+                         Packet2cf(_mm256_extractf128_ps(a.v, 1))));
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet4cf>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second)
+  {
+    if (Offset==0) return;
+    palign_impl<Offset*2,Packet8f>::run(first.v, second.v);
+  }
+};
+
+template<> struct conj_helper<Packet4cf, Packet4cf, false,true>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template<> struct conj_helper<Packet4cf, Packet4cf, true,false>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+
+template<> struct conj_helper<Packet8f, Packet4cf, false,false>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
+  { return Packet4cf(Eigen::internal::pmul(x, y.v)); }
+};
+
+template<> struct conj_helper<Packet4cf, Packet8f, false,false>
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
+  { return Packet4cf(Eigen::internal::pmul(x.v, y)); }
+};
+
+template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
+{
+  Packet4cf num = pmul(a, pconj(b));
+  __m256 tmp = _mm256_mul_ps(b.v, b.v);
+  __m256 tmp2  = _mm256_shuffle_ps(tmp,tmp,0xB1);
+  __m256 denom = _mm256_add_ps(tmp, tmp2);
+  return Packet4cf(_mm256_div_ps(num.v, denom));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x)
+{
+  return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
+}
+
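pmul above computes (are*bre - aim*bim, are*bim + aim*bre) for four complex floats at once: _mm256_moveldup_ps/_mm256_movehdup_ps broadcast the real resp. imaginary part of a across each lane pair, the permute swaps b's re/im, and _mm256_addsub_ps subtracts in even lanes and adds in odd lanes. A scalar model of that lane arithmetic, assuming the interleaved {re,im} layout:

    #include <array>

    // Lanes hold {re0,im0,re1,im1,...}; mirrors pmul<Packet4cf> above.
    std::array<float, 8> cplx_mul_model(const std::array<float, 8>& a,
                                        const std::array<float, 8>& b) {
      std::array<float, 8> r;
      for (int i = 0; i < 8; i += 2) {
        r[i]     = a[i] * b[i]     - a[i + 1] * b[i + 1];  // addsub even: subtract
        r[i + 1] = a[i] * b[i + 1] + a[i + 1] * b[i];      // addsub odd: add
      }
      return r;
    }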
|
//---------- double ----------
|
||||||
|
struct Packet2cd
|
||||||
|
{
|
||||||
|
EIGEN_STRONG_INLINE Packet2cd() {}
|
||||||
|
EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {}
|
||||||
|
__m256d v;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<> struct packet_traits<std::complex<double> > : default_packet_traits
|
||||||
|
{
|
||||||
|
typedef Packet2cd type;
|
||||||
|
typedef Packet1cd half;
|
||||||
|
enum {
|
||||||
|
Vectorizable = 1,
|
||||||
|
AlignedOnScalar = 0,
|
||||||
|
size = 2,
|
||||||
|
HasHalfPacket = 1,
|
||||||
|
|
||||||
|
HasAdd = 1,
|
||||||
|
HasSub = 1,
|
||||||
|
HasMul = 1,
|
||||||
|
HasDiv = 1,
|
||||||
|
HasNegate = 1,
|
||||||
|
HasAbs = 0,
|
||||||
|
HasAbs2 = 0,
|
||||||
|
HasMin = 0,
|
||||||
|
HasMax = 0,
|
||||||
|
HasSetLinear = 0
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2}; typedef Packet1cd half; };
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a)
|
||||||
|
{
|
||||||
|
const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
|
||||||
|
return Packet2cd(_mm256_xor_pd(a.v,mask));
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
|
||||||
|
{
|
||||||
|
__m256d tmp1 = _mm256_shuffle_pd(a.v,a.v,0x0);
|
||||||
|
__m256d even = _mm256_mul_pd(tmp1, b.v);
|
||||||
|
__m256d tmp2 = _mm256_shuffle_pd(a.v,a.v,0xF);
|
||||||
|
__m256d tmp3 = _mm256_shuffle_pd(b.v,b.v,0x5);
|
||||||
|
__m256d odd = _mm256_mul_pd(tmp2, tmp3);
|
||||||
|
return Packet2cd(_mm256_addsub_pd(even, odd));
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd pand <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd por <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd pxor <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); }
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
|
||||||
|
{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from)
|
||||||
|
{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from)); }
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from)
|
||||||
|
{
|
||||||
|
// in case casting to a __m128d* is really not safe, then we can still fallback to this version: (much slower though)
|
||||||
|
// return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from));
|
||||||
|
return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from));
|
||||||
|
}

template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); }

template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }

template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, int stride)
{
  return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]),
                                 std::imag(from[0*stride]), std::real(from[0*stride])));
}

template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, int stride)
{
  __m128d low = _mm256_extractf128_pd(from.v, 0);
  to[stride*0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
  __m128d high = _mm256_extractf128_pd(from.v, 1);
  to[stride*1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
}

template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a)
{
  __m128d low = _mm256_extractf128_pd(a.v, 0);
  EIGEN_ALIGN16 double res[2];
  _mm_store_pd(res, low);
  return std::complex<double>(res[0],res[1]);
}

template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
  __m256d result = _mm256_permute2f128_pd(a.v, a.v, 1);
  return Packet2cd(result);
}

template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a)
{
  return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)),
                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
}

template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs)
{
  Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4));
  Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4));

  return Packet2cd(_mm256_add_pd(t0,t1));
}

template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
{
  return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
}

template<int Offset>
struct palign_impl<Offset,Packet2cd>
{
  static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
  {
    if (Offset==0) return;
    palign_impl<Offset*2,Packet4d>::run(first.v, second.v);
  }
};

template<> struct conj_helper<Packet2cd, Packet2cd, false,true>
{
  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
  { return padd(pmul(x,y),c); }

  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
  {
    return internal::pmul(a, pconj(b));
  }
};

template<> struct conj_helper<Packet2cd, Packet2cd, true,false>
{
  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
  { return padd(pmul(x,y),c); }

  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
  {
    return internal::pmul(pconj(a), b);
  }
};

template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
{
  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
  { return padd(pmul(x,y),c); }

  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
  {
    return pconj(internal::pmul(a, b));
  }
};

template<> struct conj_helper<Packet4d, Packet2cd, false,false>
{
  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
  { return Packet2cd(Eigen::internal::pmul(x, y.v)); }
};

template<> struct conj_helper<Packet2cd, Packet4d, false,false>
{
  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
  { return padd(c, pmul(x,y)); }

  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
  { return Packet2cd(Eigen::internal::pmul(x.v, y)); }
};

template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
  Packet2cd num = pmul(a, pconj(b));
  __m256d tmp = _mm256_mul_pd(b.v, b.v);
  __m256d denom = _mm256_hadd_pd(tmp, tmp);
  return Packet2cd(_mm256_div_pd(num.v, denom));
}
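// The above evaluates a/b as (a*conj(b)) / |b|^2: pmul(a,pconj(b)) forms the
// numerator, and _mm256_hadd_pd(b*b, b*b) places re(b)^2+im(b)^2 in both the
// real and imaginary slots of each complex lane before the final division.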

template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
{
  return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
}
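// pcplxflip swaps the real and imaginary parts of each complex number:
// {re0,im0, re1,im1} becomes {im0,re0, im1,re1} (shuffle mask 0x5 swaps the
// two doubles within each 128-bit lane).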

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4cf,4>& kernel) {
  __m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
  __m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
  __m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
  __m256d P3 = _mm256_castps_pd(kernel.packet[3].v);

  __m256d T0 = _mm256_shuffle_pd(P0, P1, 15);
  __m256d T1 = _mm256_shuffle_pd(P0, P1, 0);
  __m256d T2 = _mm256_shuffle_pd(P2, P3, 15);
  __m256d T3 = _mm256_shuffle_pd(P2, P3, 0);

  kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32));
  kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49));
  kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32));
  kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
}
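// Reinterpreting the float packets as __m256d lets each std::complex<float>
// (one 64-bit re/im pair) travel as a single double lane, so this 4x4 complex
// transpose reduces to the same shuffle/permute pattern used for Packet4d.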

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2cd,2>& kernel) {
  __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4));
  kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4));
  kernel.packet[0].v = tmp;
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_COMPLEX_AVX_H
564
Eigen/src/Core/arch/AVX/PacketMath.h
Normal file
@ -0,0 +1,564 @@

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_PACKET_MATH_AVX_H
#define EIGEN_PACKET_MATH_AVX_H

namespace Eigen {

namespace internal {

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif

#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_FUSED_MADD
#define EIGEN_HAS_FUSED_MADD 1
#endif
#endif

typedef __m256  Packet8f;
typedef __m256i Packet8i;
typedef __m256d Packet4d;

template<> struct is_arithmetic<__m256>  { enum { value = true }; };
template<> struct is_arithmetic<__m256i> { enum { value = true }; };
template<> struct is_arithmetic<__m256d> { enum { value = true }; };

#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
  const Packet8f p8f_##NAME = pset1<Packet8f>(X)

#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
  const Packet4d p4d_##NAME = pset1<Packet4d>(X)

template<> struct packet_traits<float> : default_packet_traits
{
  typedef Packet8f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=8,
    HasHalfPacket = 1,

    HasDiv  = 1,
    HasSin  = 0,
    HasCos  = 0,
    HasLog  = 0,
    HasExp  = 0,
    HasSqrt = 0
  };
};
template<> struct packet_traits<double> : default_packet_traits
{
  typedef Packet4d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,
    HasHalfPacket = 1,

    HasDiv = 1,
    HasExp = 0
  };
};

/* Proper support for integers is only provided by AVX2. In the meantime, we'll
   use SSE instructions and packets to deal with integers.
template<> struct packet_traits<int> : default_packet_traits
{
  typedef Packet8i type;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=8
  };
};
*/

template<> struct unpacket_traits<Packet8f> { typedef float  type; typedef Packet4f half; enum {size=8}; };
template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4}; };
template<> struct unpacket_traits<Packet8i> { typedef int    type; typedef Packet4i half; enum {size=8}; };

template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float&  from) { return _mm256_set1_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int&    from) { return _mm256_set1_epi32(from); }

template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float*  from) { return _mm256_broadcast_ss(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }

template<> EIGEN_STRONG_INLINE Packet8f plset<float>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
template<> EIGEN_STRONG_INLINE Packet4d plset<double>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }

template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
{
  return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
}
template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
{
  return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
}

template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }

template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/)
{ eigen_assert(false && "packet integer division is not supported by AVX");
  return pset1<Packet8i>(0);
}

#ifdef EIGEN_VECTORIZE_FMA
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
#if defined(__clang__) || defined(__GNUC__)
  // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
  // and gcc stupidly generates a vfmadd132ps instruction,
  // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate
  // the result of the product.
  Packet8f res = c;
  asm("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  return _mm256_fmadd_ps(a,b,c);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
#if defined(__clang__) || defined(__GNUC__)
  // see above
  Packet4d res = c;
  asm("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  return _mm256_fmadd_pd(a,b,c);
#endif
}
#endif
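// When EIGEN_VECTORIZE_FMA is not defined, the generic pmadd falls back to
// padd(pmul(a,b),c); the fused versions above compute a*b+c with a single
// rounding step, which is both faster and slightly more accurate.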

template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }

template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float*  from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int*    from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }

// Loads 4 floats from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
{
  // TODO try to find a way to avoid the need of a temporary register
  // Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from));
  // tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
  // return _mm256_unpacklo_ps(tmp,tmp);

  // _mm256_insertf128_ps is very slow on Haswell, thus:
  Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
  // mimic an "in-place" permutation of the lower 128 bits using a blend
  tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
  // then we can perform a consistent permutation on the global register to get everything in shape:
  return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
}
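// For example, with from = {1,2,3,4} the three steps above produce:
//   broadcast: {1,2,3,4, 1,2,3,4}
//   blend:     {1,2,1,2, 1,2,3,4}
//   permute:   {1,1,2,2, 3,3,4,4}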
// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}
template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
{
  Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
  return _mm256_permute_pd(tmp, 3<<2);
}

// Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
{
  Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
  return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
}

template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from); }

template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }

// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, int stride)
{
  return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
                       from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, int stride)
{
  return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
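// Hedged sketch of the AVX2 alternative mentioned in the NOTE above (hypothetical
// helper, not part of this patch): build per-lane indices 0..7 scaled by the
// stride and issue a single hardware gather. Assumes __AVX2__ for
// _mm256_mullo_epi32 and _mm256_i32gather_ps.
#ifdef __AVX2__
inline Packet8f pgather_avx2_sketch(const float* from, int stride)
{
  __m256i idx = _mm256_mullo_epi32(_mm256_set1_epi32(stride),
                                   _mm256_setr_epi32(0,1,2,3,4,5,6,7));
  return _mm256_i32gather_ps(from, idx, 4); // scale of 4 bytes == sizeof(float)
}
#endif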

template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, int stride)
{
  __m128 low = _mm256_extractf128_ps(from, 0);
  to[stride*0] = _mm_cvtss_f32(low);
  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));

  __m128 high = _mm256_extractf128_ps(from, 1);
  to[stride*4] = _mm_cvtss_f32(high);
  to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
  to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
  to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, int stride)
{
  __m128d low = _mm256_extractf128_pd(from, 0);
  to[stride*0] = _mm_cvtsd_f64(low);
  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
  __m128d high = _mm256_extractf128_pd(from, 1);
  to[stride*2] = _mm_cvtsd_f64(high);
  to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
}

template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
{
  Packet8f pa = pset1<Packet8f>(a);
  pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
{
  Packet4d pa = pset1<Packet4d>(a);
  pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
{
  Packet8i pa = pset1<Packet8i>(a);
  pstore(to, pa);
}

template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }

template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
  return _mm_cvtss_f32(_mm256_castps256_ps128(a));
}
template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
  return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
}
template<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) {
  return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
}

template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
{
  __m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
  return _mm256_permute2f128_ps(tmp, tmp, 1);
}
template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
{
  __m256d tmp = _mm256_shuffle_pd(a,a,5);
  return _mm256_permute2f128_pd(tmp, tmp, 1);
  // equivalent alternative:
  // __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
  // return _mm256_permute_pd(swap_halves,5);
}

// pabs should be ok
template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
{
  const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
  return _mm256_and_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
{
  const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
  return _mm256_and_pd(a,mask);
}
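// Both masks clear only the IEEE sign bit: 0x7FFFFFFF for each float, while the
// alternating 0xFFFFFFFF,0x7FFFFFFF pairs keep the low word of each double
// intact and clear bit 63, so pabs is a single bitwise AND.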

// preduxp should be ok
// FIXME: why is this ok? why isn't the simple implementation working as expected?
template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
{
  __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]);
  __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]);
  __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]);
  __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]);

  __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
  __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
  __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
  __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);

  __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
  __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
  __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
  __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);

  __m256 sum1 = _mm256_add_ps(perm1, hsum5);
  __m256 sum2 = _mm256_add_ps(perm2, hsum6);
  __m256 sum3 = _mm256_add_ps(perm3, hsum7);
  __m256 sum4 = _mm256_add_ps(perm4, hsum8);

  __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
  __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);

  __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
  return final;
}
template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
{
  Packet4d tmp0, tmp1;

  tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]);
  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));

  tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]);
  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));

  return _mm256_blend_pd(tmp0, tmp1, 0xC);
}

template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
{
  Packet8f tmp0 = _mm256_hadd_ps(a,_mm256_permute2f128_ps(a,a,1));
  tmp0 = _mm256_hadd_ps(tmp0,tmp0);
  return pfirst(_mm256_hadd_ps(tmp0, tmp0));
}
template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
{
  Packet4d tmp0 = _mm256_hadd_pd(a,_mm256_permute2f128_pd(a,a,1));
  return pfirst(_mm256_hadd_pd(tmp0,tmp0));
}
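// Minimal sketch (illustrative only, not part of this patch): the reductions
// above combine naturally with pmadd; dot8 is a hypothetical helper computing
// an 8-element dot product from unaligned inputs.
inline float dot8(const float* x, const float* y)
{
  Packet8f acc = pmadd(ploadu<Packet8f>(x), ploadu<Packet8f>(y), pset1<Packet8f>(0.0f));
  return predux<Packet8f>(acc); // horizontal sum of the 8 products
}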

template<> EIGEN_STRONG_INLINE Packet4f predux4<Packet8f>(const Packet8f& a)
{
  return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
}

template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
{
  Packet8f tmp;
  tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
  tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
  return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
{
  Packet4d tmp;
  tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
  return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1)));
}

template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
{
  Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1));
  tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
  return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
{
  Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1));
  return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}

template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
{
  Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1));
  tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
  return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}

template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
{
  Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1));
  return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
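// palign_impl shifts the concatenation [first, second] left by Offset scalars
// and keeps the leading packet-size elements in `first`; since AVX has no
// cross-lane byte shift, each offset is emulated below with blends and permutes.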

template<int Offset>
struct palign_impl<Offset,Packet8f>
{
  static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
  {
    if (Offset==1)
    {
      first = _mm256_blend_ps(first, second, 1);
      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
      first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0x88);
    }
    else if (Offset==2)
    {
      first = _mm256_blend_ps(first, second, 3);
      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
      first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xcc);
    }
    else if (Offset==3)
    {
      first = _mm256_blend_ps(first, second, 7);
      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
      first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xee);
    }
    else if (Offset==4)
    {
      first = _mm256_blend_ps(first, second, 15);
      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
      first = _mm256_permute_ps(_mm256_permute2f128_ps (tmp, tmp, 1), _MM_SHUFFLE(3,2,1,0));
    }
    else if (Offset==5)
    {
      first = _mm256_blend_ps(first, second, 31);
      first = _mm256_permute2f128_ps(first, first, 1);
      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
      first = _mm256_permute2f128_ps(tmp, tmp, 1);
      first = _mm256_blend_ps(tmp, first, 0x88);
    }
    else if (Offset==6)
    {
      first = _mm256_blend_ps(first, second, 63);
      first = _mm256_permute2f128_ps(first, first, 1);
      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
      first = _mm256_permute2f128_ps(tmp, tmp, 1);
      first = _mm256_blend_ps(tmp, first, 0xcc);
    }
    else if (Offset==7)
    {
      first = _mm256_blend_ps(first, second, 127);
      first = _mm256_permute2f128_ps(first, first, 1);
      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
      first = _mm256_permute2f128_ps(tmp, tmp, 1);
      first = _mm256_blend_ps(tmp, first, 0xee);
    }
  }
};

template<int Offset>
struct palign_impl<Offset,Packet4d>
{
  static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)
  {
    if (Offset==1)
    {
      first = _mm256_blend_pd(first, second, 1);
      __m256d tmp = _mm256_permute_pd(first, 5);
      first = _mm256_permute2f128_pd(tmp, tmp, 1);
      first = _mm256_blend_pd(tmp, first, 0xA);
    }
    else if (Offset==2)
    {
      first = _mm256_blend_pd(first, second, 3);
      first = _mm256_permute2f128_pd(first, first, 1);
    }
    else if (Offset==3)
    {
      first = _mm256_blend_pd(first, second, 7);
      __m256d tmp = _mm256_permute_pd(first, 5);
      first = _mm256_permute2f128_pd(tmp, tmp, 1);
      first = _mm256_blend_pd(tmp, first, 5);
    }
  }
};

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8f,8>& kernel) {
  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
  __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
  __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
  __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
  __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
  __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0));
  __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2));
  __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0));
  __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2));
  kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
  kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
  kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
  kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
  kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);
  kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
  kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
  kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
}
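// The 8x8 transpose proceeds in three stages: unpacklo/unpackhi interleave
// adjacent rows, _mm256_shuffle_ps gathers 4-wide columns within each 128-bit
// lane, and _mm256_permute2f128_ps finally exchanges the cross-lane halves.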

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8f,4>& kernel) {
  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);

  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));

  kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
  kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
  kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
  kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
}

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4d,4>& kernel) {
  __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
  __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
  __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
  __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);

  kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);
  kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);
  kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
  kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_PACKET_MATH_AVX_H

@ -21,6 +21,8 @@ static Packet16uc p16uc_COMPLEX_REV = vec_sld(p16uc_REVERSE, p16uc_REVERSE, 8);
static Packet16uc p16uc_COMPLEX_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET_HI = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 1));//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET_LO = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 2), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 3));//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
static Packet16uc p16uc_COMPLEX_TRANSPOSE_0 = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
static Packet16uc p16uc_COMPLEX_TRANSPOSE_1 = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};

//---------- float ----------
struct Packet2cf

@ -33,6 +35,7 @@ struct Packet2cf
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
  typedef Packet2cf type;
  typedef Packet2cf half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
@ -51,7 +54,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
  };
};

template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };

template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
@ -65,6 +68,22 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo
  return res;
}

template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride)
{
  std::complex<float> EIGEN_ALIGN16 af[2];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
  return Packet2cf(vec_ld(0, (const float*)af));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride)
{
  std::complex<float> EIGEN_ALIGN16 af[2];
  vec_st(from.v, 0, (float*)af);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
}
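// AltiVec has no strided or unaligned vector load, so gather/scatter bounce
// through a 16-byte-aligned stack buffer (EIGEN_ALIGN16) that vec_ld/vec_st
// can access directly.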

template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
@ -210,6 +229,13 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x
  return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX_REV));
}

EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
{
  Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_0);
  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_COMPLEX_TRANSPOSE_1);
  kernel.packet[0].v = tmp;
}

} // end namespace internal

} // end namespace Eigen

@ -18,6 +18,10 @@ namespace internal {
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
#endif

#ifndef EIGEN_HAS_FUSED_MADD
#define EIGEN_HAS_FUSED_MADD 1
#endif

#ifndef EIGEN_HAS_FUSE_CJMADD
#define EIGEN_HAS_FUSE_CJMADD 1
#endif
@ -73,6 +77,7 @@ static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)
template<> struct packet_traits<float> : default_packet_traits
{
  typedef Packet4f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
@ -89,6 +94,7 @@ template<> struct packet_traits<float> : default_packet_traits
template<> struct packet_traits<int> : default_packet_traits
{
  typedef Packet4i type;
  typedef Packet4i half;
  enum {
    // FIXME check the Has*
    Vectorizable = 1,
@ -97,8 +103,8 @@ template<> struct packet_traits<int> : default_packet_traits
  };
};

template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };
/*
inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
{
@ -144,6 +150,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
  return s;
}
*/

template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
  float EIGEN_ALIGN16 af[4];
@ -161,6 +168,65 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
  return vc;
}

template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
{
  a3 = vec_ld(0,a);
  a0 = vec_splat(a3, 0);
  a1 = vec_splat(a3, 1);
  a2 = vec_splat(a3, 2);
  a3 = vec_splat(a3, 3);
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4i>(const int *a,
                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
{
  a3 = vec_ld(0,a);
  a0 = vec_splat(a3, 0);
  a1 = vec_splat(a3, 1);
  a2 = vec_splat(a3, 2);
  a3 = vec_splat(a3, 3);
}
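// pbroadcast4 performs a single aligned vector load and then splats each of
// the four scalars into its own packet, e.g. a = {x,y,z,w} yields
// a0 = {x,x,x,x}, ..., a3 = {w,w,w,w}.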

template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride)
{
  float EIGEN_ALIGN16 af[4];
  af[0] = from[0*stride];
  af[1] = from[1*stride];
  af[2] = from[2*stride];
  af[3] = from[3*stride];
  return vec_ld(0, af);
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride)
{
  int EIGEN_ALIGN16 ai[4];
  ai[0] = from[0*stride];
  ai[1] = from[1*stride];
  ai[2] = from[2*stride];
  ai[3] = from[3*stride];
  return vec_ld(0, ai);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride)
{
  float EIGEN_ALIGN16 af[4];
  vec_st(from, 0, af);
  to[0*stride] = af[0];
  to[1*stride] = af[1];
  to[2*stride] = af[2];
  to[3*stride] = af[3];
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride)
{
  int EIGEN_ALIGN16 ai[4];
  vec_st(from, 0, ai);
  to[0*stride] = ai[0];
  to[1*stride] = ai[1];
  to[2*stride] = ai[2];
  to[3*stride] = ai[3];
}

template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)     { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }

@ -494,6 +560,32 @@ struct palign_impl<Offset,Packet4i>
  }
};

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
  Packet4f t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}
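// Classic AltiVec 4x4 transpose: one round of vec_mergeh/vec_mergel interleaves
// rows 0/2 and 1/3, a second round interleaves the intermediates, leaving
// column i of the input in kernel.packet[i].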

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
  Packet4i t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}

} // end namespace internal

} // end namespace Eigen

@ -1,4 +1,5 @@
ADD_SUBDIRECTORY(SSE)
ADD_SUBDIRECTORY(AltiVec)
ADD_SUBDIRECTORY(NEON)
ADD_SUBDIRECTORY(AVX)
ADD_SUBDIRECTORY(Default)

@ -28,6 +28,7 @@ struct Packet2cf
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
  typedef Packet2cf type;
  typedef Packet2cf half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
@ -46,7 +47,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
  };
};

template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };

template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
@ -110,6 +111,22 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }

template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride)
{
  Packet4f res;
  res = vsetq_lane_f32(std::real(from[0*stride]), res, 0);
  res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1);
  res = vsetq_lane_f32(std::real(from[1*stride]), res, 2);
  res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3);
  return Packet2cf(res);
}
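// The packet is assembled lane by lane with vsetq_lane_f32; `res` starts
// uninitialized since all four lanes are overwritten before use.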

template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride)
{
  to[stride*0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
  to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
}

template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((float *)addr); }

template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
@ -246,6 +263,14 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
  return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
}

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2cf,2>& kernel) {
  float32x4_t tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
  kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v));
  kernel.packet[1].v = tmp;
}

} // end namespace internal

} // end namespace Eigen

@ -66,6 +66,7 @@ typedef uint32x4_t Packet4ui;
template<> struct packet_traits<float> : default_packet_traits
{
  typedef Packet4f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
@ -83,6 +84,7 @@ template<> struct packet_traits<float> : default_packet_traits
template<> struct packet_traits<int> : default_packet_traits
{
  typedef Packet4i type;
  typedef Packet4i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
@ -95,12 +97,13 @@ template<> struct packet_traits<int> : default_packet_traits
// workaround gcc 4.2, 4.3 and 4.4 compilation issue
EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
#endif

template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; typedef Packet4i half; };

template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from)   { return vdupq_n_s32(from); }
@ -219,6 +222,40 @@ template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& f
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)     { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }

template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride)
{
  Packet4f res;
  res = vsetq_lane_f32(from[0*stride], res, 0);
  res = vsetq_lane_f32(from[1*stride], res, 1);
  res = vsetq_lane_f32(from[2*stride], res, 2);
  res = vsetq_lane_f32(from[3*stride], res, 3);
  return res;
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride)
{
  Packet4i res;
  res = vsetq_lane_s32(from[0*stride], res, 0);
  res = vsetq_lane_s32(from[1*stride], res, 1);
  res = vsetq_lane_s32(from[2*stride], res, 2);
  res = vsetq_lane_s32(from[3*stride], res, 3);
  return res;
}

template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride)
{
  to[stride*0] = vgetq_lane_f32(from, 0);
  to[stride*1] = vgetq_lane_f32(from, 1);
  to[stride*2] = vgetq_lane_f32(from, 2);
  to[stride*3] = vgetq_lane_f32(from, 3);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride)
{
  to[stride*0] = vgetq_lane_s32(from, 0);
  to[stride*1] = vgetq_lane_s32(from, 1);
  to[stride*2] = vgetq_lane_s32(from, 2);
  to[stride*3] = vgetq_lane_s32(from, 3);
}

template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr)     { EIGEN_ARM_PREFETCH(addr); }

@ -385,6 +422,7 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
  a_lo = vget_low_s32(a);
  a_hi = vget_high_s32(a);
  max = vpmax_s32(a_lo, a_hi);
  max = vpmax_s32(max, max);

  return vget_lane_s32(max, 0);
}

@ -413,6 +451,27 @@ PALIGN_NEON(3,Packet4i,vextq_s32)

#undef PALIGN_NEON

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
  float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
  float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);

  kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0]));
  kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0]));
  kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1]));
  kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
}

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
  int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
  int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
  kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0]));
  kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0]));
  kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
  kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
}
|
||||||
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
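The NEON ptranspose added here is the classic zip-and-recombine 4x4 transpose: vzipq interleaves two rows pairwise, and vcombine stitches the low and high halves of the two zip results into columns. A scalar sketch of the same lane movement (illustrative stand-ins for the intrinsics, verified by the asserts):

#include <array>
#include <cassert>

using Vec4 = std::array<float, 4>;
struct Zip { Vec4 val[2]; };

// Scalar model of vzipq_f32: interleave the lanes of a and b.
Zip zip(const Vec4& a, const Vec4& b) {
  return { { {a[0], b[0], a[1], b[1]}, {a[2], b[2], a[3], b[3]} } };
}
// Scalar models of vget_low/vget_high + vcombine.
Vec4 combine_low (const Vec4& x, const Vec4& y) { return {x[0], x[1], y[0], y[1]}; }
Vec4 combine_high(const Vec4& x, const Vec4& y) { return {x[2], x[3], y[2], y[3]}; }

int main() {
  Vec4 r[4] = { {0,1,2,3}, {4,5,6,7}, {8,9,10,11}, {12,13,14,15} };
  Zip t1 = zip(r[0], r[1]);
  Zip t2 = zip(r[2], r[3]);
  Vec4 c0 = combine_low (t1.val[0], t2.val[0]);   // {0,4,8,12}  = column 0
  Vec4 c1 = combine_high(t1.val[0], t2.val[0]);   // {1,5,9,13}  = column 1
  Vec4 c2 = combine_low (t1.val[1], t2.val[1]);   // {2,6,10,14} = column 2
  Vec4 c3 = combine_high(t1.val[1], t2.val[1]);   // {3,7,11,15} = column 3
  assert(c0[1] == 4 && c3[0] == 3 && c2[3] == 14);
  (void)c1;
}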
@@ -22,13 +22,18 @@ struct Packet2cf
   __m128 v;
 };

+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
 template<> struct packet_traits<std::complex<float> > : default_packet_traits
 {
   typedef Packet2cf type;
+  typedef Packet2cf half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
+    HasHalfPacket = 0,

     HasAdd = 1,
     HasSub = 1,
@@ -42,8 +47,9 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
     HasSetLinear = 0
   };
 };
+#endif

-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; typedef Packet2cf half; };

 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
@@ -104,8 +110,23 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo

 template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }

-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); }
+
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, int stride)
+{
+  return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]),
+                              std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, int stride)
+{
+  to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
+  to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
+}

 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
@@ -124,7 +145,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Pack
 #endif
 }

-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(_mm_castps_pd(a.v)))); }
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); }

 template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
 {
@@ -214,7 +235,7 @@ template<> struct conj_helper<Packet4f, Packet2cf, false,false>
   { return padd(c, pmul(x,y)); }

   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
-  { return Packet2cf(Eigen::internal::pmul(x, y.v)); }
+  { return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
 };

 template<> struct conj_helper<Packet2cf, Packet4f, false,false>
@@ -223,7 +244,7 @@ template<> struct conj_helper<Packet2cf, Packet4f, false,false>
   { return padd(c, pmul(x,y)); }

   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
-  { return Packet2cf(Eigen::internal::pmul(x.v, y)); }
+  { return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
 };

 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
@@ -248,13 +269,18 @@ struct Packet1cd
   __m128d v;
 };

+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
 template<> struct packet_traits<std::complex<double> > : default_packet_traits
 {
   typedef Packet1cd type;
+  typedef Packet1cd half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 0,
     size = 1,
+    HasHalfPacket = 0,

     HasAdd = 1,
     HasSub = 1,
@@ -268,12 +294,13 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
     HasSetLinear = 0
   };
 };
+#endif

-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; typedef Packet1cd half; };

 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
 {
   const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
@@ -311,8 +338,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<dou
 template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }

 // FIXME force unaligned store, this is a temporary fix
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }

 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }

@@ -410,7 +437,7 @@ template<> struct conj_helper<Packet2d, Packet1cd, false,false>
   { return padd(c, pmul(x,y)); }

   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
-  { return Packet1cd(Eigen::internal::pmul(x, y.v)); }
+  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
 };

 template<> struct conj_helper<Packet1cd, Packet2d, false,false>
@@ -419,7 +446,7 @@ template<> struct conj_helper<Packet1cd, Packet2d, false,false>
   { return padd(c, pmul(x,y)); }

   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
-  { return Packet1cd(Eigen::internal::pmul(x.v, y)); }
+  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
 };

 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
@@ -432,7 +459,17 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con

 EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
 {
-  return Packet1cd(preverse(x.v));
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2cf,2>& kernel) {
+  __m128d w1 = _mm_castps_pd(kernel.packet[0].v);
+  __m128d w2 = _mm_castps_pd(kernel.packet[1].v);
+
+  __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2));
+  kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2));
+  kernel.packet[1].v = tmp;
 }

 } // end namespace internal
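One subtlety in the complex pgather above: _mm_set_ps lists lanes from highest to lowest, so the imaginary part of the second element is passed first even though the memory layout stays (real, imag) pairs. A small stand-alone check of that lane order (plain SSE, not Eigen code):

#include <complex>
#include <xmmintrin.h>  // SSE

int main() {
  const std::complex<float> z[4] = { {1,2}, {0,0}, {3,4}, {0,0} };
  const int stride = 2;  // gather z[0] and z[2]
  // _mm_set_ps takes lanes high-to-low, so the register holds
  // [imag(z[2]), real(z[2]), imag(z[0]), real(z[0])] from lane 3 down to lane 0.
  __m128 p = _mm_set_ps(std::imag(z[1*stride]), std::real(z[1*stride]),
                        std::imag(z[0*stride]), std::real(z[0*stride]));
  float lanes[4];
  _mm_storeu_ps(lanes, p);  // lanes = {1, 2, 3, 4}: (re0, im0, re1, im1) in memory
  return !(lanes[0] == 1 && lanes[1] == 2 && lanes[2] == 3 && lanes[3] == 4);
}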
@@ -63,7 +63,7 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
   x = _mm_or_ps(x, p4f_half);

   emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
-  Packet4f e = padd(_mm_cvtepi32_ps(emm0), p4f_1);
+  Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1);

   /* part2:
      if( x < SQRTHF ) {
@@ -72,9 +72,9 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
      } else { x = x - 1.0; }
   */
   Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
-  Packet4f tmp = _mm_and_ps(x, mask);
+  Packet4f tmp = pand(x, mask);
   x = psub(x, p4f_1);
-  e = psub(e, _mm_and_ps(p4f_1, mask));
+  e = psub(e, pand(p4f_1, mask));
   x = padd(x, tmp);

   Packet4f x2 = pmul(x,x);
@@ -167,7 +167,7 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
   emm0 = _mm_cvttps_epi32(fx);
   emm0 = _mm_add_epi32(emm0, p4i_0x7f);
   emm0 = _mm_slli_epi32(emm0, 23);
-  return pmul(y, _mm_castsi128_ps(emm0));
+  return pmul(y, Packet4f(_mm_castsi128_ps(emm0)));
 }
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d pexp<Packet2d>(const Packet2d& _x)
@@ -241,7 +241,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
   emm0 = _mm_add_epi32(emm0, p4i_1023_0);
   emm0 = _mm_slli_epi32(emm0, 20);
   emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
-  return pmul(x, _mm_castsi128_pd(emm0));
+  return pmul(x, Packet2d(_mm_castsi128_pd(emm0)));
 }

 /* evaluation of 4 sines at once, using SSE2 intrinsics.
209
Eigen/src/Core/arch/SSE/PacketMath.h
Normal file → Executable file
@@ -22,9 +22,41 @@ namespace internal {
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
 #endif

+#ifdef EIGEN_VECTORIZE_FMA
+#ifndef EIGEN_HAS_FUSED_MADD
+#define EIGEN_HAS_FUSED_MADD 1
+#endif
+#endif
+
+#if defined EIGEN_VECTORIZE_AVX && defined __GNUC__ && !(defined __clang__ || defined __INTEL_COMPILER)
+// With GCC's default ABI version, __m128 and __m256 are the same type and therefore we cannot
+// have overloads for both types without a linking error.
+// One solution is to increase the ABI version using -fabi-version=4 (or greater).
+// To work around this inconvenience, we rather wrap 128-bit types into the following helper
+// structure:
+// TODO disable this wrapper if abi-version>=4, but how to detect that without asking the user to define a macro?
+template<typename T>
+struct eigen_packet_wrapper
+{
+  EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
+  EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
+    m_val = v;
+    return *this;
+  }
+
+  T m_val;
+};
+typedef eigen_packet_wrapper<__m128>  Packet4f;
+typedef eigen_packet_wrapper<__m128i> Packet4i;
+typedef eigen_packet_wrapper<__m128d> Packet2d;
+#else
 typedef __m128  Packet4f;
 typedef __m128i Packet4i;
 typedef __m128d Packet2d;
+#endif

 template<> struct is_arithmetic<__m128>  { enum { value = true }; };
 template<> struct is_arithmetic<__m128i> { enum { value = true }; };
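The motivation for eigen_packet_wrapper is that, under GCC's default ABI, overloads taken on __m128 and __m256 can collide at link time; wrapping each raw SIMD type in a distinct struct with implicit conversions restores distinct C++ types at zero runtime cost. A stripped-down sketch of the idea (the extra tag parameter is a hypothetical generalization, not part of this patch):

// Give each raw SIMD type a distinct C++ type so overloads and template
// specializations on them cannot collide, while converting implicitly back
// and forth so intrinsics remain directly usable.
template <typename T, int UniqueTag>
struct packet_wrapper {
  packet_wrapper() {}
  packet_wrapper(const T& v) : m_val(v) {}   // implicit from the raw type
  operator T&() { return m_val; }            // implicit to the raw type
  operator const T&() const { return m_val; }
  T m_val;
};

// With e.g. typedef packet_wrapper<__m128, 0> Packet4f; an intrinsic call
// such as _mm_add_ps(a, b) still works on Packet4f values through the
// conversion operators, yet the wrapper types mangle distinctly.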
@@ -58,13 +90,18 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; };
   const Packet4i p4i_##NAME = pset1<Packet4i>(X)


+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
 template<> struct packet_traits<float> : default_packet_traits
 {
   typedef Packet4f type;
+  typedef Packet4f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=4,
+    HasHalfPacket = 0,

     HasDiv = 1,
     HasSin = EIGEN_FAST_MATH,
@@ -77,19 +114,23 @@ template<> struct packet_traits<float> : default_packet_traits
 template<> struct packet_traits<double> : default_packet_traits
 {
   typedef Packet2d type;
+  typedef Packet2d half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=2,
+    HasHalfPacket = 0,

     HasDiv = 1,
     HasExp = 1,
     HasSqrt = 1
   };
 };
+#endif
 template<> struct packet_traits<int> : default_packet_traits
 {
   typedef Packet4i type;
+  typedef Packet4i half;
   enum {
     // FIXME check the Has*
     Vectorizable = 1,
@@ -98,9 +139,9 @@ template<> struct packet_traits<int> : default_packet_traits
   };
 };

-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; typedef Packet4i half; };

 #if defined(_MSC_VER) && (_MSC_VER==1500)
 // Workaround MSVC 9 internal compiler error.
@@ -110,13 +151,26 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { re
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set_epi32(from,from,from,from); }
 #else
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set1_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); }
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
 #endif

+// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
+// However, using intrinsics for pset1 makes gcc generate crappy code in some cases (see bug 203)
+// Using inline assembly is also not an option because then gcc fails to reorder properly the instructions.
+// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
+// Also note that with AVX, we want it to generate a vbroadcastss.
+#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__)
+template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
+  return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
+}
+#endif
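The pload1 added above sidesteps gcc's poor code generation for _mm_set1_ps (bug 203) by loading a single scalar and splatting lane 0 with a swizzle. A hedged sketch of the two broadcast routes in plain SSE — _mm_shuffle_ps here is a close stand-in for the vec4f_swizzle1 macro used in the patch:

#include <xmmintrin.h>

// Route 1: what _mm_set1_ps/_mm_load1_ps typically compile to with gcc (shufps).
__m128 broadcast_set1(const float* p) { return _mm_set1_ps(*p); }

// Route 2: the pload1 pattern above -- load one scalar into lane 0,
// then replicate lane 0 with a shuffle (pshufd-style splat).
__m128 broadcast_swizzle(const float* p) {
  __m128 t = _mm_load_ss(p);                          // {*p, 0, 0, 0}
  return _mm_shuffle_ps(t, t, _MM_SHUFFLE(0,0,0,0));  // splat lane 0
}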
+
+#ifndef EIGEN_VECTORIZE_AVX
 template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
 template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
+#endif
 template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }

 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
@@ -139,7 +193,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
 }
 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
 {
-  return psub(_mm_setr_epi32(0,0,0,0), a);
+  return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
 }

 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
@@ -173,6 +227,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co

 // for some weird reasons, it has to be overloaded for packets of integers
 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
+#ifdef EIGEN_VECTORIZE_FMA
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
+#endif

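With EIGEN_VECTORIZE_FMA defined, pmadd now maps to a true fused multiply-add: one instruction and a single rounding step, instead of the padd(pmul(a,b), c) fallback. Side by side (the FMA variant requires compiling with -mfma):

#include <immintrin.h>

// pmadd(a,b,c) = a*b + c. With FMA this is a single instruction computed
// with one rounding; without it, Eigen falls back to a multiply then an add.
__m128 madd_fma(__m128 a, __m128 b, __m128 c) { return _mm_fmadd_ps(a, b, c); }
__m128 madd_sse(__m128 a, __m128 b, __m128 c) { return _mm_add_ps(_mm_mul_ps(a, b), c); }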
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
@@ -218,7 +276,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, con

 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }

 #if defined(_MSC_VER)
 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
@@ -236,7 +294,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { E
 #endif
 }
 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); }
 #else
 // Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
 // require pointer casting to incompatible pointer types and leads to invalid code
@@ -245,14 +303,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { E
 // TODO: do the same for MSVC (ICC is compatible)
 // NOTE: with the code below, MSVC's compiler crashes!

-#if defined(__GNUC__) && defined(__i386__)
+#if defined(__GNUC__) && (defined(__i386__) || (defined(__x86_64) && EIGEN_GNUC_AT_LEAST(4, 8)))
   // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
   #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
+  #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 1
 #elif defined(__clang__)
   // bug 201: Segfaults in __mm_loadh_pd with clang 2.8
   #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
+  #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0
 #else
   #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0
+  #define EIGEN_AVOID_CUSTOM_UNALIGNED_STORES 0
 #endif

 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
@@ -283,7 +344,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
 {
   EIGEN_DEBUG_UNALIGNED_LOAD
 #if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
-  return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from));
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
 #else
   __m128d res;
   res = _mm_load_sd((const double*)(from)) ;
@@ -302,38 +363,77 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
 {
   Packet4i tmp;
-  tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from));
+  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
   return vec4i_swizzle1(tmp, 0, 0, 1, 1);
 }

 template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }

 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
   EIGEN_DEBUG_UNALIGNED_STORE
+#if EIGEN_AVOID_CUSTOM_UNALIGNED_STORES
+  _mm_storeu_pd(to, from);
+#else
   _mm_storel_pd((to), from);
   _mm_storeh_pd((to+1), from);
+#endif
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castps_pd(from))); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), Packet2d(_mm_castsi128_pd(from))); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, int stride)
+{
+  return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, int stride)
+{
+  return _mm_set_pd(from[1*stride], from[0*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, int stride)
+{
+  return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, int stride)
+{
+  to[stride*0] = _mm_cvtss_f32(from);
+  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
+  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
+  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, int stride)
+{
+  to[stride*0] = _mm_cvtsd_f64(from);
+  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, int stride)
+{
+  to[stride*0] = _mm_cvtsi128_si32(from);
+  to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
+  to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
+  to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
 }
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castps_pd(from)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castsi128_pd(from)); }
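The SSE pscatter above extracts lane k by shuffling it into lane 0 and reading lane 0 with _mm_cvtss_f32 (resp. _mm_cvtsi128_si32 for integers); the low two bits of the shuffle immediate select which source lane lands in lane 0. A tiny stand-alone check:

#include <xmmintrin.h>
#include <cstdio>

int main() {
  __m128 v = _mm_set_ps(4.f, 3.f, 2.f, 1.f);  // lanes 0..3 hold 1, 2, 3, 4
  // _mm_shuffle_ps(v, v, k) with k = 1, 2, 3 moves lane k into lane 0
  // (only the low two selector bits matter here); _mm_cvtss_f32 reads lane 0.
  std::printf("%g %g %g %g\n",
              _mm_cvtss_f32(v),
              _mm_cvtss_f32(_mm_shuffle_ps(v, v, 1)),
              _mm_cvtss_f32(_mm_shuffle_ps(v, v, 2)),
              _mm_cvtss_f32(_mm_shuffle_ps(v, v, 3)));  // prints 1 2 3 4
}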

 // some compilers might be tempted to perform multiple moves instead of using a vector path.
 template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
 {
   Packet4f pa = _mm_set_ss(a);
-  pstore(to, vec4f_swizzle1(pa,0,0,0,0));
+  pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
 }
 // some compilers might be tempted to perform multiple moves instead of using a vector path.
 template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
 {
   Packet2d pa = _mm_set_sd(a);
-  pstore(to, vec2d_swizzle1(pa,0,0));
+  pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
 }

+#ifndef EIGEN_VECTORIZE_AVX
 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+#endif

 #if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER)
 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
@@ -380,6 +480,38 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
 #endif
 }

+// with AVX, the default implementations based on pload1 are faster
+#ifndef __AVX__
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);
+  a0 = vec4f_swizzle1(a3, 0,0,0,0);
+  a1 = vec4f_swizzle1(a3, 1,1,1,1);
+  a2 = vec4f_swizzle1(a3, 2,2,2,2);
+  a3 = vec4f_swizzle1(a3, 3,3,3,3);
+}
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet2d>(const double *a,
+                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+{
+#ifdef EIGEN_VECTORIZE_SSE3
+  a0 = _mm_loaddup_pd(a+0);
+  a1 = _mm_loaddup_pd(a+1);
+  a2 = _mm_loaddup_pd(a+2);
+  a3 = _mm_loaddup_pd(a+3);
+#else
+  a1 = pload<Packet2d>(a);
+  a0 = vec2d_swizzle1(a1, 0,0);
+  a1 = vec2d_swizzle1(a1, 1,1);
+  a3 = pload<Packet2d>(a+2);
+  a2 = vec2d_swizzle1(a3, 0,0);
+  a3 = vec2d_swizzle1(a3, 1,1);
+#endif
+}
+#endif
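pbroadcast4 loads four consecutive scalars and yields four packets, each a splat of one of them — the GEBP kernel uses it to broadcast a group of coefficients; on SSE3 the double variant gets this almost for free via _mm_loaddup_pd (movddup). As a scalar model of the contract (illustrative only):

#include <cstdio>

// Scalar model of pbroadcast4: read a[0..3] and produce four packets
// p0..p3 where packet k has every lane equal to a[k].
void broadcast4(const float* a, float p0[4], float p1[4], float p2[4], float p3[4]) {
  for (int l = 0; l < 4; ++l) {
    p0[l] = a[0]; p1[l] = a[1]; p2[l] = a[2]; p3[l] = a[3];
  }
}

int main() {
  const float a[4] = {1, 2, 3, 4};
  float p0[4], p1[4], p2[4], p3[4];
  broadcast4(a, p0, p1, p2, p3);
  std::printf("%g %g %g %g\n", p0[0], p1[0], p2[0], p3[0]);  // 1 2 3 4
}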
 EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
 {
   vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
@@ -407,10 +539,10 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
 {
   Packet4f tmp0 = _mm_hadd_ps(a,a);
-  return pfirst(_mm_hadd_ps(tmp0, tmp0));
+  return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0));
 }

-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); }
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst<Packet2d>(_mm_hadd_pd(a, a)); }

 // SSSE3 version:
 // EIGEN_STRONG_INLINE float predux(const Packet4i& a)
@@ -453,7 +585,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
   Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
-  return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1));
+  return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
 }

 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
@@ -476,11 +608,11 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
 {
   Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+  return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
 }
 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
 {
-  return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
+  return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
 }
 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
 {
@@ -496,17 +628,17 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
 {
   Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+  return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
 }
 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
 {
-  return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
+  return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
 }
 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
 {
 #ifdef EIGEN_VECTORIZE_SSE4_1
   Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
-  return pfirst(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+  return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
 #else
   // after some experiments, it seems this is the fastest way to implement it
   // for GCC (e.g., it does not like using std::min after the pstore !!)
@@ -522,17 +654,17 @@ template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
 {
   Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+  return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
 }
 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
 {
-  return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
+  return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
 }
 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
 {
 #ifdef EIGEN_VECTORIZE_SSE4_1
   Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
-  return pfirst(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+  return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
 #else
   // after some experiments, it seems this is the fastest way to implement it
   // for GCC (e.g., it does not like using std::min after the pstore !!)
@@ -652,6 +784,31 @@ struct palign_impl<Offset,Packet2d>
 };
 #endif

+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[1] = tmp;
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
+  __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
+  __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
+  __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
+
+  kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
+  kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
+  kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
+  kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
+}
+
 } // end namespace internal

 } // end namespace Eigen
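For Packet4f the new SSE ptranspose simply defers to the long-standing _MM_TRANSPOSE4_PS macro, which performs the 4x4 in-place transpose with unpack/shuffle pairs — the same unpacklo/unpackhi ladder written out explicitly for Packet4i above. A minimal check of the float path:

#include <xmmintrin.h>
#include <cstdio>

int main() {
  __m128 r0 = _mm_setr_ps( 0,  1,  2,  3);
  __m128 r1 = _mm_setr_ps( 4,  5,  6,  7);
  __m128 r2 = _mm_setr_ps( 8,  9, 10, 11);
  __m128 r3 = _mm_setr_ps(12, 13, 14, 15);
  _MM_TRANSPOSE4_PS(r0, r1, r2, r3);   // rows become columns, in place
  float c0[4];
  _mm_storeu_ps(c0, r0);
  std::printf("%g %g %g %g\n", c0[0], c0[1], c0[2], c0[3]);  // 0 4 8 12
}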
@@ -320,6 +320,26 @@ struct functor_traits<scalar_asin_op<Scalar> >
   };
 };

+
+/** \internal
+  * \brief Template functor to compute the atan of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::atan()
+  */
+template<typename Scalar> struct scalar_atan_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op)
+  inline const Scalar operator() (const Scalar& a) const { using std::atan; return atan(a); }
+  typedef typename packet_traits<Scalar>::type Packet;
+  inline Packet packetOp(const Packet& a) const { return internal::patan(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_atan_op<Scalar> >
+{
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasATan
+  };
+};
+
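scalar_atan_op follows Eigen's standard unary-functor shape: a scalar operator() plus a packetOp that is used whenever packet_traits<Scalar>::HasATan is set, with Cost feeding the evaluator's unrolling heuristics. Assuming the matching ArrayBase::atan() entry point is wired up elsewhere in this merge, usage is the usual lazy expression:

#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::ArrayXf x = Eigen::ArrayXf::LinSpaced(4, 0.f, 1.f);
  // Element-wise atan via the new functor; the expression stays lazy
  // until it is assigned, where it may be vectorized via packetOp.
  Eigen::ArrayXf y = x.atan();
  std::cout << y.transpose() << std::endl;
}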
 /** \internal
   * \brief Template functor to compute the inverse of a scalar
   * \sa class CwiseUnaryOp, Cwise::inverse()

File diff suppressed because it is too large
@ -23,6 +23,8 @@ template<
|
|||||||
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
|
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
|
||||||
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
|
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
|
||||||
{
|
{
|
||||||
|
typedef gebp_traits<RhsScalar,LhsScalar> Traits;
|
||||||
|
|
||||||
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||||
static EIGEN_STRONG_INLINE void run(
|
static EIGEN_STRONG_INLINE void run(
|
||||||
Index rows, Index cols, Index depth,
|
Index rows, Index cols, Index depth,
|
||||||
@ -51,6 +53,8 @@ template<
|
|||||||
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
|
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
|
||||||
{
|
{
|
||||||
|
|
||||||
|
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
||||||
|
|
||||||
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||||
static void run(Index rows, Index cols, Index depth,
|
static void run(Index rows, Index cols, Index depth,
|
||||||
const LhsScalar* _lhs, Index lhsStride,
|
const LhsScalar* _lhs, Index lhsStride,
|
||||||
@ -63,11 +67,9 @@ static void run(Index rows, Index cols, Index depth,
|
|||||||
const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||||
const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||||
|
|
||||||
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
|
||||||
|
|
||||||
Index kc = blocking.kc(); // cache block size along the K direction
|
Index kc = blocking.kc(); // cache block size along the K direction
|
||||||
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
||||||
//Index nc = blocking.nc(); // cache block size along the N direction
|
Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction
|
||||||
|
|
||||||
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||||
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
|
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
|
||||||
@ -80,13 +82,11 @@ static void run(Index rows, Index cols, Index depth,
|
|||||||
Index tid = omp_get_thread_num();
|
Index tid = omp_get_thread_num();
|
||||||
Index threads = omp_get_num_threads();
|
Index threads = omp_get_num_threads();
|
||||||
|
|
||||||
std::size_t sizeA = kc*mc;
|
LhsScalar* blockA = blocking.blockA();
|
||||||
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
|
eigen_internal_assert(blockA!=0);
|
||||||
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0);
|
|
||||||
ei_declare_aligned_stack_constructed_variable(RhsScalar, w, sizeW, 0);
|
|
||||||
|
|
||||||
RhsScalar* blockB = blocking.blockB();
|
std::size_t sizeB = kc*nc;
|
||||||
eigen_internal_assert(blockB!=0);
|
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0);
|
||||||
|
|
||||||
// For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
|
// For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
|
||||||
for(Index k=0; k<depth; k+=kc)
|
for(Index k=0; k<depth; k+=kc)
|
||||||
@ -94,54 +94,56 @@ static void run(Index rows, Index cols, Index depth,
|
|||||||
const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A'
|
const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A'
|
||||||
|
|
||||||
       // In order to reduce the chance that a thread has to wait for the other,
-      // let's start by packing A'.
-      pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc);
+      // let's start by packing B'.
+      pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc);

-      // Pack B_k to B' in a parallel fashion:
-      // each thread packs the sub block B_k,j to B'_j where j is the thread id.
+      // Pack A_k to A' in a parallel fashion:
+      // each thread packs the sub block A_k,i to A'_i where i is the thread id.

-      // However, before copying to B'_j, we have to make sure that no other thread is still using it,
+      // However, before copying to A'_i, we have to make sure that no other thread is still using it,
       // i.e., we test that info[tid].users equals 0.
       // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
       while(info[tid].users!=0) {}
       info[tid].users += threads;

-      pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length);
+      pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length);

-      // Notify the other threads that the part B'_j is ready to go.
+      // Notify the other threads that the part A'_i is ready to go.
       info[tid].sync = k;

-      // Computes C_i += A' * B' per B'_j
+      // Computes C_i += A' * B' per A'_i
       for(Index shift=0; shift<threads; ++shift)
       {
-        Index j = (tid+shift)%threads;
+        Index i = (tid+shift)%threads;

-        // At this point we have to make sure that B'_j has been updated by the thread j,
+        // At this point we have to make sure that A'_i has been updated by the thread i,
         // we use testAndSetOrdered to mimic a volatile access.
         // However, no need to wait for the B' part which has been updated by the current thread!
         if(shift>0)
-          while(info[j].sync!=k) {}
+          while(info[i].sync!=k) {}

-        gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w);
+        gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
       }

-      // Then keep going as usual with the remaining A'
-      for(Index i=mc; i<rows; i+=mc)
+      // Then keep going as usual with the remaining B'
+      for(Index j=nc; j<cols; j+=nc)
       {
-        const Index actual_mc = (std::min)(i+mc,rows)-i;
+        const Index actual_nc = (std::min)(j+nc,cols)-j;

-        // pack A_i,k to A'
-        pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc);
+        // pack B_k,j to B'
+        pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc);

-        // C_i += A' * B'
-        gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w);
+        // C_j += A' * B'
+        gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
       }

-      // Release all the sub blocks B'_j of B' for the current thread,
+      // Release all the sub blocks A'_i of A' for the current thread,
       // i.e., we simply decrement the number of users by 1
-      for(Index j=0; j<threads; ++j)
+      #pragma omp critical
+      {
+        for(Index i=0; i<threads; ++i)
         #pragma omp atomic
-        --(info[j].users);
+          --(info[i].users);
+      }
     }
   }
   else
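The packing protocol in the hunk above hinges on two volatile counters per thread. A minimal, self-contained sketch of that handshake (simplified, hypothetical types and function; the real state lives in Eigen's GemmParallelInfo):

    struct PackInfo {
      int volatile sync;   // depth index k of the last published pack
      int volatile users;  // number of threads still reading this sub-block
    };

    // Sketch of what each thread tid does for one depth slice k around packing
    // its sub-block of A'. Consumers spin on info[i].sync != k before reading.
    inline void pack_and_publish(PackInfo* info, int tid, int threads, int k)
    {
      while (info[tid].users != 0) {}  // wait until last round's readers are done
      info[tid].users += threads;      // claim the block: every thread reads it once
      // ... pack the sub-block A'_tid here ...
      info[tid].sync = k;              // publish it to the other threads
    }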
@@ -151,38 +153,34 @@ static void run(Index rows, Index cols, Index depth,

     // this is the sequential version!
     std::size_t sizeA = kc*mc;
-    std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+    std::size_t sizeB = kc*nc;

     ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW());

     // For each horizontal panel of the rhs, and corresponding panel of the lhs...
-    // (==GEMM_VAR1)
     for(Index k2=0; k2<depth; k2+=kc)
     {
       const Index actual_kc = (std::min)(k2+kc,depth)-k2;

       // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
-      // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
-      // Note that this panel will be read as many times as the number of blocks in the lhs's
-      // vertical panel which is, in practice, a very low number.
-      pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
+      // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
+      // Note that this panel will be read as many times as the number of blocks in the rhs's
+      // horizontal panel which is, in practice, a very low number.
+      pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows);

-      // For each mc x kc block of the lhs's vertical panel...
-      // (==GEPP_VAR1)
-      for(Index i2=0; i2<rows; i2+=mc)
+      // For each kc x nc block of the rhs's horizontal panel...
+      for(Index j2=0; j2<cols; j2+=nc)
       {
-        const Index actual_mc = (std::min)(i2+mc,rows)-i2;
+        const Index actual_nc = (std::min)(j2+nc,cols)-j2;

-        // We pack the lhs's block into a sequential chunk of memory (L1 caching)
+        // We pack the rhs's block into a sequential chunk of memory (L2 caching)
         // Note that this block will be read a very high number of times, which is equal to the number of
-        // micro vertical panel of the large rhs's panel (e.g., cols/4 times).
-        pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
+        // micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
+        pack_rhs(blockB, &rhs(k2,j2), rhsStride, actual_kc, actual_nc);

-        // Everything is packed, we can now call the block * panel kernel:
-        gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
+        // Everything is packed, we can now call the panel * block kernel:
+        gebp(res+j2*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
       }
     }
   }
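A worked example of the blocking arithmetic in the new loop nest (illustrative sizes, not taken from the source):

    // depth = 1000, kc = 256  ->  lhs panels of actual_kc = 256, 256, 256, 232
    // cols  = 1000, nc = 512  ->  per panel, rhs blocks of actual_nc = 512, 488
    // so pack_lhs runs 4 times and pack_rhs 4*2 = 8 times for the whole product;
    // the many re-reads inside gebp hit the packed, cache-resident copies only.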
@@ -203,14 +201,13 @@ struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
 template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType>
 struct gemm_functor
 {
-  gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha,
-               BlockingType& blocking)
+  gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking)
     : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
   {}

   void initParallelSession() const
   {
-    m_blocking.allocateB();
+    m_blocking.allocateA();
   }

   void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const
@@ -225,6 +222,8 @@ struct gemm_functor
           m_actualAlpha, m_blocking, info);
   }

+  typedef typename Gemm::Traits Traits;
+
   protected:
     const Lhs& m_lhs;
     const Rhs& m_rhs;
@@ -245,7 +244,6 @@ class level3_blocking
   protected:
     LhsScalar* m_blockA;
    RhsScalar* m_blockB;
-    RhsScalar* m_blockW;

     DenseIndex m_mc;
     DenseIndex m_nc;
@@ -254,7 +252,7 @@ class level3_blocking
   public:

     level3_blocking()
-      : m_blockA(0), m_blockB(0), m_blockW(0), m_mc(0), m_nc(0), m_kc(0)
+      : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0)
     {}

     inline DenseIndex mc() const { return m_mc; }
@@ -263,7 +261,6 @@ class level3_blocking

     inline LhsScalar* blockA() { return m_blockA; }
     inline RhsScalar* blockB() { return m_blockB; }
-    inline RhsScalar* blockW() { return m_blockW; }
 };

 template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
@@ -282,29 +279,25 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
     typedef gebp_traits<LhsScalar,RhsScalar> Traits;
     enum {
       SizeA = ActualRows * MaxDepth,
-      SizeB = ActualCols * MaxDepth,
-      SizeW = MaxDepth * Traits::WorkSpaceFactor
+      SizeB = ActualCols * MaxDepth
     };

-    EIGEN_ALIGN16 LhsScalar m_staticA[SizeA];
-    EIGEN_ALIGN16 RhsScalar m_staticB[SizeB];
-    EIGEN_ALIGN16 RhsScalar m_staticW[SizeW];
+    EIGEN_ALIGN_DEFAULT LhsScalar m_staticA[SizeA];
+    EIGEN_ALIGN_DEFAULT RhsScalar m_staticB[SizeB];

   public:

-    gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/)
+    gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, bool /*full_rows*/ = false)
     {
       this->m_mc = ActualRows;
       this->m_nc = ActualCols;
       this->m_kc = MaxDepth;
       this->m_blockA = m_staticA;
       this->m_blockB = m_staticB;
-      this->m_blockW = m_staticW;
     }

     inline void allocateA() {}
     inline void allocateB() {}
-    inline void allocateW() {}
     inline void allocateAll() {}
 };

@@ -323,20 +316,28 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M

     DenseIndex m_sizeA;
     DenseIndex m_sizeB;
-    DenseIndex m_sizeW;

   public:

-    gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth)
+    gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, bool full_rows = false)
     {
       this->m_mc = Transpose ? cols : rows;
       this->m_nc = Transpose ? rows : cols;
       this->m_kc = depth;

-      computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc);
+      if(full_rows)
+      {
+        DenseIndex m = this->m_mc;
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc);
+      }
+      else // full columns
+      {
+        DenseIndex n = this->m_nc;
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n);
+      }

       m_sizeA = this->m_mc * this->m_kc;
       m_sizeB = this->m_kc * this->m_nc;
-      m_sizeW = this->m_kc*Traits::WorkSpaceFactor;
     }

     void allocateA()
@@ -351,24 +352,16 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
         this->m_blockB = aligned_new<RhsScalar>(m_sizeB);
     }

-    void allocateW()
-    {
-      if(this->m_blockW==0)
-        this->m_blockW = aligned_new<RhsScalar>(m_sizeW);
-    }
-
     void allocateAll()
     {
       allocateA();
       allocateB();
-      allocateW();
     }

     ~gemm_blocking_space()
     {
       aligned_delete(this->m_blockA, m_sizeA);
       aligned_delete(this->m_blockB, m_sizeB);
-      aligned_delete(this->m_blockW, m_sizeW);
     }
 };

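A hypothetical sketch of the extended constructor in use (the template argument list and namespace are assumptions for illustration, not a verbatim Eigen call site). With full_rows = true, computeProductBlockingSizes receives a discardable copy of m_mc, so the row blocking stays at the full row count and only kc/nc may shrink; with the default full_rows = false the column blocking stays full instead:

    void make_blocking(Eigen::DenseIndex rows, Eigen::DenseIndex cols, Eigen::DenseIndex depth)
    {
      typedef Eigen::internal::gemm_blocking_space<Eigen::ColMajor, float, float,
          Eigen::Dynamic, Eigen::Dynamic, Eigen::Dynamic, 1> Blocking;  // assumed args
      Blocking blocking(rows, cols, depth, /*full_rows=*/true);  // keeps m_mc == rows
      blocking.allocateA();  // note: allocateW() is gone along with the workspace buffer
      blocking.allocateB();
    }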
@@ -395,6 +388,36 @@ class GeneralProduct<Lhs, Rhs, GemmProduct>
       EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar);
     }

+    template<typename Dest>
+    inline void evalTo(Dest& dst) const
+    {
+      if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
+        dst.noalias() = m_lhs .lazyProduct( m_rhs );
+      else
+      {
+        dst.setZero();
+        scaleAndAddTo(dst,Scalar(1));
+      }
+    }
+
+    template<typename Dest>
+    inline void addTo(Dest& dst) const
+    {
+      if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
+        dst.noalias() += m_lhs .lazyProduct( m_rhs );
+      else
+        scaleAndAddTo(dst,Scalar(1));
+    }
+
+    template<typename Dest>
+    inline void subTo(Dest& dst) const
+    {
+      if((m_rhs.rows()+dst.rows()+dst.cols())<20 && m_rhs.rows()>0)
+        dst.noalias() -= m_lhs .lazyProduct( m_rhs );
+      else
+        scaleAndAddTo(dst,Scalar(-1));
+    }
+
     template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
     {
       eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
@@ -417,7 +440,7 @@ class GeneralProduct<Lhs, Rhs, GemmProduct>
                                                    (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
               _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor;

-      BlockingType blocking(dst.rows(), dst.cols(), lhs.cols());
+      BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true);

       internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit);
     }
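The evalTo/addTo/subTo overloads added above short-circuit very small products to lazyProduct, skipping the packing and blocking machinery. The threshold sums three extents; a usage sketch under the assumption that dynamic-size products take this GemmProduct path:

    #include <Eigen/Dense>
    using Eigen::MatrixXf;

    void small_vs_large()
    {
      MatrixXf A = MatrixXf::Random(5,4), B = MatrixXf::Random(4,6);
      MatrixXf C = A * B;  // m_rhs.rows()+dst.rows()+dst.cols() = 4+5+6 = 15 < 20
                           // -> coefficient-wise lazyProduct path
      MatrixXf D = MatrixXf::Random(50,50) * MatrixXf::Random(50,50);
                           // 50+50+50 = 150 -> blocked GEMM path
    }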
@@ -73,11 +73,8 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
     if(mc > Traits::nr)
       mc = (mc/Traits::nr)*Traits::nr;

-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*size;
     ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0);
-    RhsScalar* blockB = allocatedBlockB + sizeW;
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0);

     gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
     gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
@@ -103,15 +100,15 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
       // 3 - after the diagonal => processed with gebp or skipped
       if (UpLo==Lower)
         gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
-             -1, -1, 0, 0, allocatedBlockB);
+             -1, -1, 0, 0);

-      sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB);
+      sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);

       if (UpLo==Upper)
       {
         Index j2 = i2+actual_mc;
         gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
-             -1, -1, 0, 0, allocatedBlockB);
+             -1, -1, 0, 0);
       }
     }
   }
@@ -136,7 +133,7 @@ struct tribb_kernel
   enum {
     BlockSize  = EIGEN_PLAIN_ENUM_MAX(mr,nr)
   };
-  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha, RhsScalar* workspace)
+  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
   {
     gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
     Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
@@ -150,7 +147,7 @@ struct tribb_kernel

       if(UpLo==Upper)
         gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+                    -1, -1, 0, 0);

       // selfadjoint micro block
       {
@@ -158,7 +155,7 @@ struct tribb_kernel
         buffer.setZero();
         // 1 - apply the kernel on the temporary buffer
         gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+                    -1, -1, 0, 0);
         // 2 - triangular accumulation
         for(Index j1=0; j1<actualBlockSize; ++j1)
         {
@@ -173,7 +170,7 @@ struct tribb_kernel
       {
         Index i = j+actualBlockSize;
         gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+                    -1, -1, 0, 0);
       }
     }
   }
@@ -278,6 +275,8 @@ template<typename MatrixType, unsigned int UpLo>
 template<typename ProductDerived, typename _Lhs, typename _Rhs>
 TriangularView<MatrixType,UpLo>& TriangularView<MatrixType,UpLo>::assignProduct(const ProductBase<ProductDerived, _Lhs,_Rhs>& prod, const Scalar& alpha)
 {
+  eigen_assert(m_matrix.rows() == prod.rows() && m_matrix.cols() == prod.cols());
+
   general_product_to_triangular_selector<MatrixType, ProductDerived, UpLo, (_Lhs::ColsAtCompileTime==1) || (_Rhs::RowsAtCompileTime==1)>::run(m_matrix.const_cast_derived(), prod.derived(), alpha);

   return *this;
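The new assertion guards rank-update style assignments: the destination triangular view must already have the product's dimensions. An illustrative sketch (hypothetical sizes):

    #include <Eigen/Dense>
    using namespace Eigen;

    void rank_update_example()
    {
      MatrixXd B = MatrixXd::Random(5,3);
      MatrixXd A = MatrixXd::Zero(5,5);
      A.triangularView<Lower>() += B * B.transpose();   // 5x5 == 5x5: passes the new assert
      // MatrixXd A4 = MatrixXd::Zero(4,4);
      // A4.triangularView<Lower>() += B * B.transpose(); // 4x4 vs 5x5: would now assert
    }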
@@ -141,6 +141,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
     alignedSize = 0;
     alignedStart = 0;
   }
+  else if(LhsPacketSize > 4)
+  {
+    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+    // Currently, it seems to be better to perform unaligned loads anyway
+    alignmentPattern = NoneAligned;
+  }
   else if (LhsPacketSize>1)
   {
     eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
@@ -405,6 +411,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
     alignedSize = 0;
     alignedStart = 0;
   }
+  else if(LhsPacketSize > 4)
+  {
+    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+    alignmentPattern = NoneAligned;
+  }
   else if (LhsPacketSize>1)
   {
     eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
@@ -442,7 +453,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
   Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
   for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
   {
-    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+    EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
     ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

     // this helps the compiler generating good binary code
@@ -551,7 +562,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
   {
     for (Index i=start; i<end; ++i)
     {
-      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+      EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
       ResPacket ptmp0 = pset1<ResPacket>(tmp0);
       const LhsScalar* lhs0 = lhs + i*lhsStride;
       // process first unaligned result's coeffs
@@ -73,13 +73,13 @@ namespace internal {

 template<typename Index> struct GemmParallelInfo
 {
-  GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {}
+  GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}

   int volatile sync;
   int volatile users;

-  Index rhs_start;
-  Index rhs_length;
+  Index lhs_start;
+  Index lhs_length;
 };

 template<bool Condition, typename Functor, typename Index>
@@ -107,7 +107,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
   if((!Condition) || (omp_get_num_threads()>1))
     return func(0,rows, 0,cols);

-  Index size = transpose ? cols : rows;
+  Index size = transpose ? rows : cols;

   // 2- compute the maximal number of threads from the size of the product:
   // FIXME this has to be fine tuned
@@ -126,26 +126,25 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
     std::swap(rows,cols);

   Index blockCols = (cols / threads) & ~Index(0x3);
-  Index blockRows = (rows / threads) & ~Index(0x7);
+  Index blockRows = (rows / threads);
+  blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;

   GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads];

-  #pragma omp parallel for schedule(static,1) num_threads(threads)
-  for(Index i=0; i<threads; ++i)
+  #pragma omp parallel num_threads(threads)
   {
+    Index i = omp_get_thread_num();
     Index r0 = i*blockRows;
     Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows;

     Index c0 = i*blockCols;
     Index actualBlockCols = (i+1==threads) ? cols-c0 : blockCols;

-    info[i].rhs_start = c0;
-    info[i].rhs_length = actualBlockCols;
+    info[i].lhs_start = r0;
+    info[i].lhs_length = actualBlockRows;

-    if(transpose)
-      func(0, cols, r0, actualBlockRows, info);
-    else
-      func(r0, actualBlockRows, 0,cols, info);
+    if(transpose) func(c0, actualBlockCols, 0, rows, info);
+    else          func(0, rows, c0, actualBlockCols, info);
   }

   delete[] info;
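The row split handed to each thread must now stay a multiple of the packing granularity mr exposed through the functor's Traits, otherwise a thread boundary would cut through a micro-panel of the packed lhs. Worked numbers (illustrative):

    // rows = 1000, threads = 8, mr = 12:
    //   rows/threads = 125  ->  (125/12)*12 = 120 rows for threads 0..6,
    //   and the last thread absorbs the remainder: 1000 - 7*120 = 160 rows.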
@@ -15,7 +15,7 @@ namespace Eigen {
 namespace internal {

 // pack a selfadjoint block diagonal for use with the gebp_kernel
-template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder>
+template<typename Scalar, typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
 struct symm_pack_lhs
 {
   template<int BlockRows> inline
@@ -45,25 +45,32 @@ struct symm_pack_lhs
   }
   void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
   {
+    enum { PacketSize = packet_traits<Scalar>::size };
     const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
     Index count = 0;
-    Index peeled_mc = (rows/Pack1)*Pack1;
-    for(Index i=0; i<peeled_mc; i+=Pack1)
-    {
-      pack<Pack1>(blockA, lhs, cols, i, count);
-    }
-
-    if(rows-peeled_mc>=Pack2)
-    {
-      pack<Pack2>(blockA, lhs, cols, peeled_mc, count);
-      peeled_mc += Pack2;
-    }
+    //Index peeled_mc3 = (rows/Pack1)*Pack1;
+
+    const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
+    const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
+    const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+
+    if(Pack1>=3*PacketSize)
+      for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
+        pack<3*PacketSize>(blockA, lhs, cols, i, count);
+
+    if(Pack1>=2*PacketSize)
+      for(Index i=peeled_mc3; i<peeled_mc2; i+=2*PacketSize)
+        pack<2*PacketSize>(blockA, lhs, cols, i, count);
+
+    if(Pack1>=1*PacketSize)
+      for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
+        pack<1*PacketSize>(blockA, lhs, cols, i, count);

     // do the same with mr==1
-    for(Index i=peeled_mc; i<rows; i++)
+    for(Index i=peeled_mc1; i<rows; i++)
     {
       for(Index k=0; k<i; k++)
         blockA[count++] = lhs(i, k);           // normal

       blockA[count++] = numext::real(lhs(i, i)); // real (diagonal)

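To see how the three peeling tiers above partition the rows, take AVX floats (PacketSize = 8) with Pack1 = 3*PacketSize = 24 and rows = 100 (illustrative numbers):

    // peeled_mc3 = (100/24)*24           = 96  -> rows [0,96) packed 24 at a time
    // peeled_mc2 = 96 + ((100-96)/16)*16 = 96  -> no 16-row blocks left
    // peeled_mc1 = (100/8)*8             = 96  -> no 8-row blocks left either
    // rows [96,100) fall through to the scalar mr==1 loop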
@@ -82,7 +89,8 @@ struct symm_pack_rhs
     Index end_k = k2 + rows;
     Index count = 0;
     const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride);
-    Index packet_cols = (cols/nr)*nr;
+    Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+    Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;

     // first part: normal case
     for(Index j2=0; j2<k2; j2+=nr)
@@ -91,79 +99,151 @@ struct symm_pack_rhs
       {
         blockB[count+0] = rhs(k,j2+0);
         blockB[count+1] = rhs(k,j2+1);
-        if (nr==4)
+        if (nr>=4)
         {
           blockB[count+2] = rhs(k,j2+2);
           blockB[count+3] = rhs(k,j2+3);
         }
+        if (nr>=8)
+        {
+          blockB[count+4] = rhs(k,j2+4);
+          blockB[count+5] = rhs(k,j2+5);
+          blockB[count+6] = rhs(k,j2+6);
+          blockB[count+7] = rhs(k,j2+7);
+        }
         count += nr;
       }
     }

     // second part: diagonal block
-    for(Index j2=k2; j2<(std::min)(k2+rows,packet_cols); j2+=nr)
-    {
-      // again we can split vertically in three different parts (transpose, symmetric, normal)
-      // transpose
-      for(Index k=k2; k<j2; k++)
-      {
-        blockB[count+0] = numext::conj(rhs(j2+0,k));
-        blockB[count+1] = numext::conj(rhs(j2+1,k));
-        if (nr==4)
-        {
-          blockB[count+2] = numext::conj(rhs(j2+2,k));
-          blockB[count+3] = numext::conj(rhs(j2+3,k));
-        }
-        count += nr;
-      }
-      // symmetric
-      Index h = 0;
-      for(Index k=j2; k<j2+nr; k++)
-      {
-        // normal
-        for (Index w=0 ; w<h; ++w)
-          blockB[count+w] = rhs(k,j2+w);
-
-        blockB[count+h] = numext::real(rhs(k,k));
-
-        // transpose
-        for (Index w=h+1 ; w<nr; ++w)
-          blockB[count+w] = numext::conj(rhs(j2+w,k));
-        count += nr;
-        ++h;
-      }
-      // normal
-      for(Index k=j2+nr; k<end_k; k++)
-      {
-        blockB[count+0] = rhs(k,j2+0);
-        blockB[count+1] = rhs(k,j2+1);
-        if (nr==4)
-        {
-          blockB[count+2] = rhs(k,j2+2);
-          blockB[count+3] = rhs(k,j2+3);
-        }
-        count += nr;
-      }
-    }
+    Index end8 = nr>=8 ? (std::min)(k2+rows,packet_cols8) : k2;
+    if(nr>=8)
+    {
+      for(Index j2=k2; j2<end8; j2+=8)
+      {
+        // again we can split vertically in three different parts (transpose, symmetric, normal)
+        // transpose
+        for(Index k=k2; k<j2; k++)
+        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          blockB[count+4] = numext::conj(rhs(j2+4,k));
+          blockB[count+5] = numext::conj(rhs(j2+5,k));
+          blockB[count+6] = numext::conj(rhs(j2+6,k));
+          blockB[count+7] = numext::conj(rhs(j2+7,k));
+          count += 8;
+        }
+        // symmetric
+        Index h = 0;
+        for(Index k=j2; k<j2+8; k++)
+        {
+          // normal
+          for (Index w=0 ; w<h; ++w)
+            blockB[count+w] = rhs(k,j2+w);
+
+          blockB[count+h] = numext::real(rhs(k,k));
+
+          // transpose
+          for (Index w=h+1 ; w<8; ++w)
+            blockB[count+w] = numext::conj(rhs(j2+w,k));
+          count += 8;
+          ++h;
+        }
+        // normal
+        for(Index k=j2+8; k<end_k; k++)
+        {
+          blockB[count+0] = rhs(k,j2+0);
+          blockB[count+1] = rhs(k,j2+1);
+          blockB[count+2] = rhs(k,j2+2);
+          blockB[count+3] = rhs(k,j2+3);
+          blockB[count+4] = rhs(k,j2+4);
+          blockB[count+5] = rhs(k,j2+5);
+          blockB[count+6] = rhs(k,j2+6);
+          blockB[count+7] = rhs(k,j2+7);
+          count += 8;
+        }
+      }
+    }
+    if(nr>=4)
+    {
+      for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4)
+      {
+        // again we can split vertically in three different parts (transpose, symmetric, normal)
+        // transpose
+        for(Index k=k2; k<j2; k++)
+        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          count += 4;
+        }
+        // symmetric
+        Index h = 0;
+        for(Index k=j2; k<j2+4; k++)
+        {
+          // normal
+          for (Index w=0 ; w<h; ++w)
+            blockB[count+w] = rhs(k,j2+w);
+
+          blockB[count+h] = numext::real(rhs(k,k));
+
+          // transpose
+          for (Index w=h+1 ; w<4; ++w)
+            blockB[count+w] = numext::conj(rhs(j2+w,k));
+          count += 4;
+          ++h;
+        }
+        // normal
+        for(Index k=j2+4; k<end_k; k++)
+        {
+          blockB[count+0] = rhs(k,j2+0);
+          blockB[count+1] = rhs(k,j2+1);
+          blockB[count+2] = rhs(k,j2+2);
+          blockB[count+3] = rhs(k,j2+3);
+          count += 4;
+        }
+      }
+    }

     // third part: transposed
-    for(Index j2=k2+rows; j2<packet_cols; j2+=nr)
-    {
-      for(Index k=k2; k<end_k; k++)
-      {
-        blockB[count+0] = numext::conj(rhs(j2+0,k));
-        blockB[count+1] = numext::conj(rhs(j2+1,k));
-        if (nr==4)
-        {
-          blockB[count+2] = numext::conj(rhs(j2+2,k));
-          blockB[count+3] = numext::conj(rhs(j2+3,k));
-        }
-        count += nr;
-      }
-    }
+    if(nr>=8)
+    {
+      for(Index j2=k2+rows; j2<packet_cols8; j2+=8)
+      {
+        for(Index k=k2; k<end_k; k++)
+        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          blockB[count+4] = numext::conj(rhs(j2+4,k));
+          blockB[count+5] = numext::conj(rhs(j2+5,k));
+          blockB[count+6] = numext::conj(rhs(j2+6,k));
+          blockB[count+7] = numext::conj(rhs(j2+7,k));
+          count += 8;
+        }
+      }
+    }
+    if(nr>=4)
+    {
+      for(Index j2=(std::max)(packet_cols8,k2+rows); j2<packet_cols4; j2+=4)
+      {
+        for(Index k=k2; k<end_k; k++)
+        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          count += 4;
+        }
+      }
+    }

     // copy the remaining columns one at a time (=> the same with nr==1)
-    for(Index j2=packet_cols; j2<cols; ++j2)
+    for(Index j2=packet_cols4; j2<cols; ++j2)
     {
       // transpose
       Index half = (std::min)(end_k,j2);
@@ -261,11 +341,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
     // kc must smaller than mc
     kc = (std::min)(kc,mc);

-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*cols;
+    std::size_t sizeB = kc*cols;
     ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
     ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
-    Scalar* blockB = allocatedBlockB + sizeW;
+    Scalar* blockB = allocatedBlockB;

     gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
     symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
@@ -348,11 +427,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
     Index mc = rows;  // cache block size along the M direction
     Index nc = cols;  // cache block size along the N direction
     computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*cols;
+    std::size_t sizeB = kc*cols;
     ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
     ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
-    Scalar* blockB = allocatedBlockB + sizeW;
+    Scalar* blockB = allocatedBlockB;

     gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
     gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
@@ -423,11 +501,11 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
         NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
         internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor>
       ::run(
         lhs.rows(), rhs.cols(),                   // sizes
         &lhs.coeffRef(0,0), lhs.outerStride(),    // lhs info
         &rhs.coeffRef(0,0), rhs.outerStride(),    // rhs info
         &dst.coeffRef(0,0), dst.outerStride(),    // result info
         actualAlpha                               // alpha
       );
   }
 };
@@ -125,11 +125,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,

   std::size_t sizeA = kc*mc;
   std::size_t sizeB = kc*cols;
-  std::size_t sizeW = kc*Traits::WorkSpaceFactor;

   ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
   ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-  ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());

   Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
   triangularBuffer.setZero();
@@ -187,7 +185,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
           pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);

           gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
-                      actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
+                      actualPanelWidth, actual_kc, 0, blockBOffset);

           // GEBP with remaining micro panel
           if (lengthTarget>0)
@@ -197,7 +195,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
             pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);

             gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
-                        actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
+                        actualPanelWidth, actual_kc, 0, blockBOffset);
           }
         }
       }
@@ -211,7 +209,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
         gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
           (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);

-        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
+        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0);
       }
     }
   }
@@ -265,12 +263,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
   Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction

   std::size_t sizeA = kc*mc;
-  std::size_t sizeB = kc*cols;
-  std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+  std::size_t sizeB = kc*cols+EIGEN_ALIGN_BYTES/sizeof(Scalar);

   ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
   ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-  ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());

   Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
   triangularBuffer.setZero();
@@ -304,6 +300,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
     Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;

     Scalar* geb = blockB+ts*ts;
+    geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar));

     pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs);

@@ -357,14 +354,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
                       actual_mc, panelLength, actualPanelWidth,
                       alpha,
                       actual_kc, actual_kc,  // strides
-                      blockOffset, blockOffset,// offsets
-                      blockW); // workspace
+                      blockOffset, blockOffset);// offsets
         }
       }
       gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
                   blockA, geb, actual_mc, actual_kc, rs,
                   alpha,
-                  -1, -1, 0, 0, blockW);
+                  -1, -1, 0, 0);
     }
   }
 }
@@ -129,7 +129,6 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
     MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
   /* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \
     if (size<(std::max)(rows,cols)) { \
-      typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
       if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
       x = x_tmp.data(); \
       if (size<rows) { \
@@ -214,7 +213,6 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
     MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
   /* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \
     if (size<(std::max)(rows,cols)) { \
-      typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
       if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
       x = x_tmp.data(); \
       if (size<rows) { \
@@ -66,11 +66,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju

   std::size_t sizeA = kc*mc;
   std::size_t sizeB = kc*cols;
-  std::size_t sizeW = kc*Traits::WorkSpaceFactor;

   ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
   ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-  ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());

   conj_if<Conjugate> conj;
   gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
@@ -158,7 +156,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
             pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);

             gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
-                        actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
+                        actualPanelWidth, actual_kc, 0, blockBOffset);
           }
         }
       }
@@ -174,7 +172,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
       {
         pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc);

-        gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0, blockW);
+        gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
       }
     }
   }
@@ -215,11 +213,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj

   std::size_t sizeA = kc*mc;
   std::size_t sizeB = kc*size;
-  std::size_t sizeW = kc*Traits::WorkSpaceFactor;

   ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
   ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-  ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());

   conj_if<Conjugate> conj;
   gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
@@ -285,8 +281,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
                         actual_mc, panelLength, actualPanelWidth,
                         Scalar(-1),
                         actual_kc, actual_kc,  // strides
-                        panelOffset, panelOffset, // offsets
-                        blockW); // workspace
+                        panelOffset, panelOffset); // offsets
           }

           // unblocked triangular solve
@@ -317,7 +312,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
         if (rs>0)
           gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
                       actual_mc, actual_kc, rs, Scalar(-1),
-                      -1, -1, 0, 0, blockW);
+                      -1, -1, 0, 0);
       }
     }
   }
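One pattern recurs across all the hunks above: every gebp/gebp_kernel call site drops its trailing workspace pointer, and the matching sizeW = kc*Traits::WorkSpaceFactor stack allocations disappear with it. Schematically, taken from the diff itself:

    // before: a caller-provided scratch buffer was threaded through each call
    gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols,
                alpha, -1, -1, 0, 0, blockW);
    // after: the kernel no longer needs external workspace
    gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols,
                alpha, -1, -1, 0, 0);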
@@ -54,8 +54,25 @@
 #endif

 #if defined EIGEN_USE_MKL
+# include <mkl.h>
+/*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/
+# ifndef INTEL_MKL_VERSION
+#  undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */
+# elif INTEL_MKL_VERSION < 100305 /* the intel-mkl-103-release-notes say this was when the lapacke.h interface was added*/
+#  undef EIGEN_USE_MKL
+# endif
+# ifndef EIGEN_USE_MKL
+  /*If the MKL version is too old, undef everything*/
+#  undef EIGEN_USE_MKL_ALL
+#  undef EIGEN_USE_BLAS
+#  undef EIGEN_USE_LAPACKE
+#  undef EIGEN_USE_MKL_VML
+#  undef EIGEN_USE_LAPACKE_STRICT
+#  undef EIGEN_USE_LAPACKE
+# endif
+#endif

-#include <mkl.h>
+#if defined EIGEN_USE_MKL
 #include <mkl_lapacke.h>
 #define EIGEN_MKL_VML_THRESHOLD 128

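Effect of the new guard from the user's side: requesting MKL against a pre-10.3 installation no longer fails on a missing lapacke.h; the EIGEN_USE_* switches are undefined and Eigen falls back to its built-in kernels. A usage sketch:

    #define EIGEN_USE_MKL_ALL   // ask for MKL everywhere
    #include <Eigen/Dense>      // with INTEL_MKL_VERSION < 100305 the guard above
                                // undefines EIGEN_USE_MKL* and the built-in
                                // Eigen code paths are used instead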
@@ -66,13 +66,24 @@
 #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
 #endif

+// Defined the boundary (in bytes) on which the data needs to be aligned. Note
+// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
+// aligned at all regardless of the value of this #define.
+#define EIGEN_ALIGN_BYTES 16
+
 #ifdef EIGEN_DONT_ALIGN
   #ifndef EIGEN_DONT_ALIGN_STATICALLY
     #define EIGEN_DONT_ALIGN_STATICALLY
   #endif
   #define EIGEN_ALIGN 0
-#else
+#elif !defined(EIGEN_DONT_VECTORIZE)
+  #if defined(__AVX__)
+    #undef EIGEN_ALIGN_BYTES
+    #define EIGEN_ALIGN_BYTES 32
+  #endif
   #define EIGEN_ALIGN 1
+#else
+  #define EIGEN_ALIGN 0
 #endif

 // EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable
@@ -286,13 +297,19 @@ namespace Eigen {
 #endif

 #define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
+#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
+#define EIGEN_ALIGN_DEFAULT EIGEN_ALIGN_TO_BOUNDARY(EIGEN_ALIGN_BYTES)

 #if EIGEN_ALIGN_STATICALLY
 #define EIGEN_USER_ALIGN_TO_BOUNDARY(n) EIGEN_ALIGN_TO_BOUNDARY(n)
 #define EIGEN_USER_ALIGN16 EIGEN_ALIGN16
+#define EIGEN_USER_ALIGN32 EIGEN_ALIGN32
+#define EIGEN_USER_ALIGN_DEFAULT EIGEN_ALIGN_DEFAULT
 #else
 #define EIGEN_USER_ALIGN_TO_BOUNDARY(n)
 #define EIGEN_USER_ALIGN16
+#define EIGEN_USER_ALIGN32
+#define EIGEN_USER_ALIGN_DEFAULT
 #endif

 #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD
@@ -326,13 +343,13 @@ namespace Eigen {
 #elif defined(__clang__) // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
 #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
     using Base::operator =; \
-    EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \
     template <typename OtherDerived> \
-    EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other) { Base::operator=(other.derived()); return *this; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other) { Base::operator=(other.derived()); return *this; }
 #else
 #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
     using Base::operator =; \
-    EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \
     { \
       Base::operator=(other); \
       return *this; \
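The new EIGEN_ALIGN_DEFAULT follows EIGEN_ALIGN_BYTES, so a single declaration is correctly aligned for both SSE and AVX builds:

    EIGEN_ALIGN_DEFAULT float buffer[64];  // 16-byte aligned in SSE builds,
                                           // 32-byte aligned when __AVX__ is defined
    EIGEN_ALIGN32 double wide[8];          // explicit 32-byte boundary, build-independent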
@ -32,7 +32,7 @@
 // page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed
 // quite safe, at least within the context of glibc, to equate 64-bit with LP64.
 #if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \
- && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ )
+ && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && (EIGEN_ALIGN_BYTES == 16)
   #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1
 #else
   #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0
@ -42,14 +42,14 @@
 // See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup
 // FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures
 // See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup
-#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__)
+#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) && (EIGEN_ALIGN_BYTES == 16)
   #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1
 #else
   #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0
 #endif

-#if defined(__APPLE__) \
- || defined(_WIN64) \
+#if (defined(__APPLE__) && (EIGEN_ALIGN_BYTES == 16)) \
+ || (defined(_WIN64) && (EIGEN_ALIGN_BYTES == 16)) \
  || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \
  || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED
   #define EIGEN_MALLOC_ALREADY_ALIGNED 1
@ -73,7 +73,7 @@
   #define EIGEN_HAS_POSIX_MEMALIGN 0
 #endif

-#ifdef EIGEN_VECTORIZE_SSE
+#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_AVX
   #define EIGEN_HAS_MM_MALLOC 1
 #else
   #define EIGEN_HAS_MM_MALLOC 0
@ -89,7 +89,7 @@ inline void throw_std_bad_alloc()
 #ifdef EIGEN_EXCEPTIONS
   throw std::bad_alloc();
 #else
-  std::size_t huge = -1;
+  std::size_t huge = static_cast<std::size_t>(-1);
   new int[huge];
 #endif
 }
@ -105,9 +105,9 @@ inline void throw_std_bad_alloc()
   */
 inline void* handmade_aligned_malloc(std::size_t size)
 {
-  void *original = std::malloc(size+16);
+  void *original = std::malloc(size+EIGEN_ALIGN_BYTES);
   if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16);
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES);
   *(reinterpret_cast<void**>(aligned) - 1) = original;
   return aligned;
 }
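The helper always over-allocates by one full alignment quantum so there is room both to round the pointer up and to stash the original pointer one slot below the returned address. A self-contained sketch of the same arithmetic (16-byte boundary hard-coded as a stand-in for EIGEN_ALIGN_BYTES):

    #include <cstdio>
    #include <cstdlib>

    int main()
    {
      const std::size_t kAlign = 16;               // stand-in for EIGEN_ALIGN_BYTES
      void* original = std::malloc(100 + kAlign);
      // Round up to the next multiple of kAlign; the +kAlign guarantees at
      // least one pointer-sized slot below 'aligned' to store 'original'.
      void* aligned = reinterpret_cast<void*>(
          (reinterpret_cast<std::size_t>(original) & ~(kAlign - 1)) + kAlign);
      *(reinterpret_cast<void**>(aligned) - 1) = original;   // bookkeeping slot
      std::printf("aligned %% 16 == %zu\n",
                  reinterpret_cast<std::size_t>(aligned) % kAlign);  // prints 0
      std::free(*(reinterpret_cast<void**>(aligned) - 1));   // free via the slot
      return 0;
    }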
@ -128,9 +128,9 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t =
   if (ptr == 0) return handmade_aligned_malloc(size);
   void *original = *(reinterpret_cast<void**>(ptr) - 1);
   std::ptrdiff_t previous_offset = static_cast<char *>(ptr)-static_cast<char *>(original);
-  original = std::realloc(original,size+16);
+  original = std::realloc(original,size+EIGEN_ALIGN_BYTES);
   if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16);
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES);
   void *previous_aligned = static_cast<char *>(original)+previous_offset;
   if(aligned!=previous_aligned)
     std::memmove(aligned, previous_aligned, size);
@ -208,7 +208,7 @@ inline void check_that_malloc_is_allowed()
 {}
 #endif

-/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment.
+/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements.
   * On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
   */
 inline void* aligned_malloc(size_t size)
@ -221,11 +221,11 @@ inline void* aligned_malloc(size_t size)
 #elif EIGEN_MALLOC_ALREADY_ALIGNED
   result = std::malloc(size);
 #elif EIGEN_HAS_POSIX_MEMALIGN
-  if(posix_memalign(&result, 16, size)) result = 0;
+  if(posix_memalign(&result, EIGEN_ALIGN_BYTES, size)) result = 0;
 #elif EIGEN_HAS_MM_MALLOC
-  result = _mm_malloc(size, 16);
+  result = _mm_malloc(size, EIGEN_ALIGN_BYTES);
 #elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
-  result = _aligned_malloc(size, 16);
+  result = _aligned_malloc(size, EIGEN_ALIGN_BYTES);
 #else
   result = handmade_aligned_malloc(size);
 #endif
@ -275,12 +275,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
   // implements _mm_malloc/_mm_free based on the corresponding _aligned_
   // functions. This may not always be the case and we just try to be safe.
   #if defined(_MSC_VER) && (!defined(_WIN32_WCE)) && defined(_mm_free)
-    result = _aligned_realloc(ptr,new_size,16);
+    result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES);
   #else
     result = generic_aligned_realloc(ptr,new_size,old_size);
   #endif
 #elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
-  result = _aligned_realloc(ptr,new_size,16);
+  result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES);
 #else
   result = handmade_aligned_realloc(ptr,new_size,old_size);
 #endif
@ -552,7 +552,7 @@ template<typename T> struct smart_memmove_helper<T,false> {
 // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
 // to the appropriate stack allocation function
 #ifndef EIGEN_ALLOCA
-  #if (defined __linux__)
+  #if (defined __linux__) || (defined __APPLE__)
     #define EIGEN_ALLOCA alloca
   #elif defined(_MSC_VER)
     #define EIGEN_ALLOCA _alloca
@ -607,9 +607,9 @@ template<typename T> class aligned_stack_memory_handler
   * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token.
   */
 #ifdef EIGEN_ALLOCA
+  // The native alloca() that comes with llvm aligns buffer on 16 bytes even when AVX is enabled.
-  #if defined(__arm__) || defined(_WIN32)
-    #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+16)) & ~(size_t(15))) + 16)
+  #if defined(__arm__) || defined(_WIN32) || EIGEN_ALIGN_BYTES > 16
+    #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+EIGEN_ALIGN_BYTES)) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES)
   #else
     #define EIGEN_ALIGNED_ALLOCA EIGEN_ALLOCA
   #endif
@ -679,7 +679,7 @@ template<typename T> class aligned_stack_memory_handler

 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%16==0)))
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_ALIGN_BYTES==0)))

 /****************************************************************************/
@ -700,98 +700,42 @@ template<typename T> class aligned_stack_memory_handler
   * \sa \ref TopicStlContainers.
   */
 template<class T>
-class aligned_allocator
+class aligned_allocator : public std::allocator<T>
 {
 public:
   typedef size_t          size_type;
   typedef std::ptrdiff_t  difference_type;
   typedef T*              pointer;
   typedef const T*        const_pointer;
   typedef T&              reference;
   typedef const T&        const_reference;
   typedef T               value_type;

   template<class U>
   struct rebind
   {
     typedef aligned_allocator<U> other;
   };

-  pointer address( reference value ) const
-  {
-    return &value;
-  }
-
-  const_pointer address( const_reference value ) const
-  {
-    return &value;
-  }
-
-  aligned_allocator()
-  {
-  }
-
-  aligned_allocator( const aligned_allocator& )
-  {
-  }
-
-  template<class U>
-  aligned_allocator( const aligned_allocator<U>& )
-  {
-  }
-
-  ~aligned_allocator()
-  {
-  }
-
-  size_type max_size() const
-  {
-    return (std::numeric_limits<size_type>::max)();
-  }
-
-  pointer allocate( size_type num, const void* hint = 0 )
-  {
-    EIGEN_UNUSED_VARIABLE(hint);
-    internal::check_size_for_overflow<T>(num);
-    return static_cast<pointer>( internal::aligned_malloc( num * sizeof(T) ) );
-  }
-
-  void construct( pointer p, const T& value )
-  {
-    ::new( p ) T( value );
-  }
-
-#if (__cplusplus >= 201103L)
-  template <typename U, typename... Args>
-  void construct( U* u, Args&&... args)
-  {
-    ::new( static_cast<void*>(u) ) U( std::forward<Args>( args )... );
-  }
-#endif
-
-  void destroy( pointer p )
-  {
-    p->~T();
-  }
-
-#if (__cplusplus >= 201103L)
-  template <typename U>
-  void destroy( U* u )
-  {
-    u->~U();
-  }
-#endif
-
-  void deallocate( pointer p, size_type /*num*/ )
-  {
-    internal::aligned_free( p );
-  }
-
-  bool operator!=(const aligned_allocator<T>& ) const
-  { return false; }
-
-  bool operator==(const aligned_allocator<T>& ) const
-  { return true; }
+  aligned_allocator() : std::allocator<T>() {}
+
+  aligned_allocator(const aligned_allocator& other) : std::allocator<T>(other) {}
+
+  template<class U>
+  aligned_allocator(const aligned_allocator<U>& other) : std::allocator<T>(other) {}
+
+  ~aligned_allocator() {}
+
+  pointer allocate(size_type num, const void* /*hint*/ = 0)
+  {
+    internal::check_size_for_overflow<T>(num);
+    return static_cast<pointer>( internal::aligned_malloc(num * sizeof(T)) );
+  }
+
+  void deallocate(pointer p, size_type /*num*/)
+  {
+    internal::aligned_free(p);
+  }
 };

 //---------- Cache sizes ----------
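Because the allocator now inherits everything else (address, construct, destroy, max_size, comparisons) from std::allocator, typical STL usage is unchanged; only allocate/deallocate route through Eigen's aligned heap functions. A usage sketch:

    #include <vector>
    #include <Eigen/Core>

    int main()
    {
      // Fixed-size vectorizable Eigen types need the aligned allocator when
      // stored in std containers, otherwise the 16/32-byte guarantee is lost.
      std::vector<Eigen::Vector4f, Eigen::aligned_allocator<Eigen::Vector4f> > v;
      v.resize(1000);
      v[0] = Eigen::Vector4f::Ones();
      return 0;
    }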
@ -823,9 +767,9 @@ namespace internal {

 #ifdef EIGEN_CPUID

-inline bool cpuid_is_vendor(int abcd[4], const char* vendor)
+inline bool cpuid_is_vendor(int abcd[4], const int vendor[3])
 {
-  return abcd[1]==(reinterpret_cast<const int*>(vendor))[0] && abcd[3]==(reinterpret_cast<const int*>(vendor))[1] && abcd[2]==(reinterpret_cast<const int*>(vendor))[2];
+  return abcd[1]==vendor[0] && abcd[3]==vendor[1] && abcd[2]==vendor[2];
 }

 inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3)
@ -967,13 +911,16 @@ inline void queryCacheSizes(int& l1, int& l2, int& l3)
 {
 #ifdef EIGEN_CPUID
   int abcd[4];
+  const int GenuineIntel[] = {0x756e6547, 0x49656e69, 0x6c65746e};
+  const int AuthenticAMD[] = {0x68747541, 0x69746e65, 0x444d4163};
+  const int AMDisbetter_[] = {0x69444d41, 0x74656273, 0x21726574}; // "AMDisbetter!"

   // identify the CPU vendor
   EIGEN_CPUID(abcd,0x0,0);
   int max_std_funcs = abcd[1];
-  if(cpuid_is_vendor(abcd,"GenuineIntel"))
+  if(cpuid_is_vendor(abcd,GenuineIntel))
     queryCacheSizes_intel(l1,l2,l3,max_std_funcs);
-  else if(cpuid_is_vendor(abcd,"AuthenticAMD") || cpuid_is_vendor(abcd,"AMDisbetter!"))
+  else if(cpuid_is_vendor(abcd,AuthenticAMD) || cpuid_is_vendor(abcd,AMDisbetter_))
     queryCacheSizes_amd(l1,l2,l3);
   else
     // by default let's use Intel's API
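The integer constants are just the vendor strings re-read as little-endian 32-bit words, which sidesteps the strict-aliasing cast of the old string-based version. A quick check of the packing (assumes a little-endian target, which is the case on x86 where CPUID exists):

    #include <cassert>
    #include <cstring>

    int main()
    {
      const char vendor[12] = {'G','e','n','u','i','n','e','I','n','t','e','l'};
      int words[3];
      std::memcpy(words, vendor, 12);   // well-defined, unlike the old cast
      assert(words[0] == 0x756e6547);   // "Genu" -> EBX
      assert(words[1] == 0x49656e69);   // "ineI" -> EDX
      assert(words[2] == 0x6c65746e);   // "ntel" -> ECX
      return 0;
    }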
@ -101,6 +101,7 @@ template<typename T> struct packet_traits;
 template<typename T> struct unpacket_traits
 {
   typedef T type;
+  typedef T half;
   enum {size=1};
 };
@ -137,7 +138,7 @@ class compute_matrix_flags
       ((Options&DontAlign)==0)
       && (
 #if EIGEN_ALIGN_STATICALLY
-          ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % 16) == 0))
+          ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0))
 #else
           0
 #endif
@ -275,10 +275,11 @@ template<typename _MatrixType> class EigenSolver
       */
     EigenSolver& compute(const MatrixType& matrix, bool computeEigenvectors = true);

+    /** \returns NumericalIssue if the input contains INF or NaN values or overflow occurred. Returns Success otherwise. */
     ComputationInfo info() const
     {
       eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
-      return m_realSchur.info();
+      return m_info;
     }

     /** \brief Sets the maximum number of iterations allowed. */
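With info() now backed by a dedicated member, callers can distinguish a clean decomposition from one that hit INF/NaN input. A usage sketch:

    #include <iostream>
    #include <Eigen/Eigenvalues>

    int main()
    {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(4,4);
      Eigen::EigenSolver<Eigen::MatrixXd> es(A);   // computes on construction
      if (es.info() == Eigen::Success)
        std::cout << es.eigenvalues().transpose() << "\n";
      else
        std::cout << "decomposition failed (NumericalIssue)\n";
      return 0;
    }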
@ -302,6 +303,7 @@ template<typename _MatrixType> class EigenSolver
     EigenvalueType m_eivalues;
     bool m_isInitialized;
     bool m_eigenvectorsOk;
+    ComputationInfo m_info;
     RealSchur<MatrixType> m_realSchur;
     MatrixType m_matT;
@ -366,12 +368,16 @@ EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvect
 {
   using std::sqrt;
   using std::abs;
+  using std::max;
+  using numext::isfinite;
   eigen_assert(matrix.cols() == matrix.rows());

   // Reduce to real Schur form.
   m_realSchur.compute(matrix, computeEigenvectors);

-  if (m_realSchur.info() == Success)
+  m_info = m_realSchur.info();
+
+  if (m_info == Success)
   {
     m_matT = m_realSchur.matrixT();
     if (computeEigenvectors)
@ -385,14 +391,40 @@ EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvect
       if (i == matrix.cols() - 1 || m_matT.coeff(i+1, i) == Scalar(0))
       {
         m_eivalues.coeffRef(i) = m_matT.coeff(i, i);
+        if(!isfinite(m_eivalues.coeffRef(i)))
+        {
+          m_isInitialized = true;
+          m_eigenvectorsOk = false;
+          m_info = NumericalIssue;
+          return *this;
+        }
         ++i;
       }
       else
       {
         Scalar p = Scalar(0.5) * (m_matT.coeff(i, i) - m_matT.coeff(i+1, i+1));
-        Scalar z = sqrt(abs(p * p + m_matT.coeff(i+1, i) * m_matT.coeff(i, i+1)));
+        Scalar z;
+        // Compute z = sqrt(abs(p * p + m_matT.coeff(i+1, i) * m_matT.coeff(i, i+1)));
+        // without overflow
+        {
+          Scalar t0 = m_matT.coeff(i+1, i);
+          Scalar t1 = m_matT.coeff(i, i+1);
+          Scalar maxval = (max)(abs(p),(max)(abs(t0),abs(t1)));
+          t0 /= maxval;
+          t1 /= maxval;
+          Scalar p0 = p/maxval;
+          z = maxval * sqrt(abs(p0 * p0 + t0 * t1));
+        }

         m_eivalues.coeffRef(i)   = ComplexScalar(m_matT.coeff(i+1, i+1) + p, z);
         m_eivalues.coeffRef(i+1) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, -z);
+        if(!(isfinite(m_eivalues.coeffRef(i)) && isfinite(m_eivalues.coeffRef(i+1))))
+        {
+          m_isInitialized = true;
+          m_eigenvectorsOk = false;
+          m_info = NumericalIssue;
+          return *this;
+        }
         i += 2;
       }
     }
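The rescaling works because z = sqrt(|p^2 + t0*t1|) = maxval * sqrt(|(p/maxval)^2 + (t0/maxval)(t1/maxval)|); dividing by the largest magnitude first keeps the intermediate squares representable. A small numeric illustration (values chosen so the naive p*p would overflow a double):

    #include <cassert>
    #include <cmath>
    #include <cstdio>

    int main()
    {
      double p = 1e200, t0 = 1e200, t1 = -1e200;  // naive p*p = 1e400 -> inf
      double maxval = std::max(std::fabs(p), std::max(std::fabs(t0), std::fabs(t1)));
      double z = maxval * std::sqrt(std::fabs((p/maxval)*(p/maxval)
                                              + (t0/maxval)*(t1/maxval)));
      std::printf("z = %g\n", z);   // finite (0 here, since p^2 + t0*t1 == 0)
      assert(z == z);               // not NaN, unlike the unscaled formula
      return 0;
    }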
@ -581,7 +613,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
   }
   else
   {
-    eigen_assert(0 && "Internal bug in EigenSolver"); // this should not happen
+    eigen_assert(0 && "Internal bug in EigenSolver (INF or NaN has not been detected)"); // this should not happen
   }
 }
@ -77,7 +77,9 @@ public:
     * represents an invalid rotation. */
   template<typename Derived>
   inline AngleAxis(const Scalar& angle, const MatrixBase<Derived>& axis) : m_axis(axis), m_angle(angle) {}
-  /** Constructs and initialize the angle-axis rotation from a quaternion \a q. */
+  /** Constructs and initialize the angle-axis rotation from a quaternion \a q.
+    * This function implicitly normalizes the quaternion \a q.
+    */
   template<typename QuatDerived> inline explicit AngleAxis(const QuaternionBase<QuatDerived>& q) { *this = q; }
   /** Constructs and initialize the angle-axis rotation from a 3x3 rotation matrix. */
   template<typename Derived>
@ -149,29 +151,27 @@ typedef AngleAxis<float> AngleAxisf;
 typedef AngleAxis<double> AngleAxisd;

 /** Set \c *this from a \b unit quaternion.
-  * The axis is normalized.
+  * The resulting axis is normalized.
   *
-  * \warning As any other method dealing with quaternion, if the input quaternion
-  *          is not normalized then the result is undefined.
+  * This function implicitly normalizes the quaternion \a q.
   */
 template<typename Scalar>
 template<typename QuatDerived>
 AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q)
 {
-  using std::acos;
-  EIGEN_USING_STD_MATH(min);
-  EIGEN_USING_STD_MATH(max);
-  using std::sqrt;
-  Scalar n2 = q.vec().squaredNorm();
-  if (n2 < NumTraits<Scalar>::dummy_precision()*NumTraits<Scalar>::dummy_precision())
+  using std::atan2;
+  Scalar n = q.vec().norm();
+  if(n<NumTraits<Scalar>::epsilon())
+    n = q.vec().stableNorm();
+  if (n > Scalar(0))
   {
-    m_angle = 0;
-    m_axis << 1, 0, 0;
+    m_angle = Scalar(2)*atan2(n, q.w());
+    m_axis  = q.vec() / n;
   }
   else
   {
-    m_angle = Scalar(2)*acos((min)((max)(Scalar(-1),q.w()),Scalar(1)));
-    m_axis = q.vec() / sqrt(n2);
+    m_angle = 0;
+    m_axis << 1, 0, 0;
   }
   return *this;
 }
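atan2(|vec|, w) is well-conditioned everywhere, whereas acos(w) loses roughly half the significant digits near w = +/-1 (small rotations); the ratio |vec|/w is also scale-invariant, which is why non-unit quaternions are handled implicitly. A sketch of the difference for a tiny rotation:

    #include <cmath>
    #include <cstdio>

    int main()
    {
      // Quaternion of a rotation by 1e-8 rad about x: w = cos(a/2), |vec| = sin(a/2)
      double a = 1e-8;
      double w = std::cos(a/2), n = std::sin(a/2);
      double via_acos  = 2*std::acos(w);     // cos(a/2) rounds to 1.0 -> angle 0
      double via_atan2 = 2*std::atan2(n, w); // recovers 1e-8 accurately
      std::printf("acos: %.3e   atan2: %.3e\n", via_acos, via_atan2);
      return 0;
    }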
@ -587,7 +587,7 @@ inline Derived& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Deri
   // which yields a singular value problem
   if (c < Scalar(-1)+NumTraits<Scalar>::dummy_precision())
   {
-    c = max<Scalar>(c,-1);
+    c = (max)(c,Scalar(-1));
     Matrix<Scalar,2,3> m; m << v0.transpose(), v1.transpose();
     JacobiSVD<Matrix<Scalar,2,3> > svd(m, ComputeFullV);
     Vector3 axis = svd.matrixV().col(2);
@ -194,9 +194,9 @@ public:
   /** type of the matrix used to represent the linear part of the transformation */
   typedef Matrix<Scalar,Dim,Dim,Options> LinearMatrixType;
   /** type of read/write reference to the linear part of the transformation */
-  typedef Block<MatrixType,Dim,Dim,int(Mode)==(AffineCompact)> LinearPart;
+  typedef Block<MatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (Options&RowMajor)==0> LinearPart;
   /** type of read reference to the linear part of the transformation */
-  typedef const Block<ConstMatrixType,Dim,Dim,int(Mode)==(AffineCompact)> ConstLinearPart;
+  typedef const Block<ConstMatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (Options&RowMajor)==0> ConstLinearPart;
   /** type of read/write reference to the affine part of the transformation */
   typedef typename internal::conditional<int(Mode)==int(AffineCompact),
                              MatrixType&,
@ -113,7 +113,7 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo
   const Index n = src.cols(); // number of measurements

   // required for demeaning ...
-  const RealScalar one_over_n = 1 / static_cast<RealScalar>(n);
+  const RealScalar one_over_n = RealScalar(1) / static_cast<RealScalar>(n);

   // computation of mean
   const VectorType src_mean = src.rowwise().sum() * one_over_n;
@ -136,16 +136,16 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo

   // Eq. (39)
   VectorType S = VectorType::Ones(m);
-  if (sigma.determinant()<0) S(m-1) = -1;
+  if (sigma.determinant()<Scalar(0)) S(m-1) = Scalar(-1);

   // Eq. (40) and (43)
   const VectorType& d = svd.singularValues();
   Index rank = 0; for (Index i=0; i<m; ++i) if (!internal::isMuchSmallerThan(d.coeff(i),d.coeff(0))) ++rank;
   if (rank == m-1) {
-    if ( svd.matrixU().determinant() * svd.matrixV().determinant() > 0 ) {
+    if ( svd.matrixU().determinant() * svd.matrixV().determinant() > Scalar(0) ) {
       Rt.block(0,0,m,m).noalias() = svd.matrixU()*svd.matrixV().transpose();
     } else {
-      const Scalar s = S(m-1); S(m-1) = -1;
+      const Scalar s = S(m-1); S(m-1) = Scalar(-1);
       Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose();
       S(m-1) = s;
     }
@ -156,7 +156,7 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo
   if (with_scaling)
   {
     // Eq. (42)
-    const Scalar c = 1/src_var * svd.singularValues().dot(S);
+    const Scalar c = Scalar(1)/src_var * svd.singularValues().dot(S);

     // Eq. (41)
     Rt.col(m).head(m) = dst_mean;
@ -48,7 +48,7 @@ void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vec
   typedef typename MatrixType::Index Index;
   enum { TFactorSize = MatrixType::ColsAtCompileTime };
   Index nbVecs = vectors.cols();
-  Matrix<typename MatrixType::Scalar, TFactorSize, TFactorSize> T(nbVecs,nbVecs);
+  Matrix<typename MatrixType::Scalar, TFactorSize, TFactorSize, ColMajor> T(nbVecs,nbVecs);
   make_block_householder_triangular_factor(T, vectors, hCoeffs);

   const TriangularView<const VectorsType, UnitLower>& V(vectors);
@ -65,10 +65,10 @@ class DiagonalPreconditioner
       {
         typename MatType::InnerIterator it(mat,j);
         while(it && it.index()!=j) ++it;
-        if(it && it.index()==j)
+        if(it && it.index()==j && it.value()!=Scalar(0))
           m_invdiag(j) = Scalar(1)/it.value();
         else
-          m_invdiag(j) = 0;
+          m_invdiag(j) = Scalar(1);
       }
       m_isInitialized = true;
       return *this;
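Falling back to Scalar(1) for missing or zero diagonal entries makes the Jacobi preconditioner act as the identity on those rows instead of injecting an infinity into the iterative solver. Applying it then remains a cheap per-coefficient scaling; a minimal sketch, assuming invdiag holds 1/a_jj (or 1):

    #include <Eigen/Dense>

    // Solve step of a Jacobi (diagonal) preconditioner: z = M^{-1} r, M = diag(A).
    Eigen::VectorXd jacobi_apply(const Eigen::VectorXd& invdiag,
                                 const Eigen::VectorXd& r)
    {
      return invdiag.cwiseProduct(r);   // component-wise, O(n)
    }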
@ -29,10 +29,11 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
   *
   * \param MatrixType the type of the matrix of which we are computing the LU decomposition
   *
-  * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A
-  * is decomposed as A = PLUQ where L is unit-lower-triangular, U is upper-triangular, and P and Q
-  * are permutation matrices. This is a rank-revealing LU decomposition. The eigenvalues (diagonal
-  * coefficients) of U are sorted in such a way that any zeros are at the end.
+  * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A is
+  * decomposed as \f$ A = P^{-1} L U Q^{-1} \f$ where L is unit-lower-triangular, U is
+  * upper-triangular, and P and Q are permutation matrices. This is a rank-revealing LU
+  * decomposition. The eigenvalues (diagonal coefficients) of U are sorted in such a way that any
+  * zeros are at the end.
   *
   * This decomposition provides the generic approach to solving systems of linear equations, computing
   * the rank, invertibility, inverse, kernel, and determinant.
@ -546,8 +547,8 @@ typename internal::traits<MatrixType>::Scalar FullPivLU<MatrixType>::determinant
 }

 /** \returns the matrix represented by the decomposition,
-  * i.e., it returns the product: P^{-1} L U Q^{-1}.
-  * This function is provided for debug purpose. */
+  * i.e., it returns the product: \f$ P^{-1} L U Q^{-1} \f$.
+  * This function is provided for debug purposes. */
 template<typename MatrixType>
 MatrixType FullPivLU<MatrixType>::reconstructedMatrix() const
 {
@ -109,7 +109,7 @@ class NaturalOrdering
   * \class COLAMDOrdering
   *
   * Functor computing the \em column \em approximate \em minimum \em degree ordering
-  * The matrix should be in column-major format
+  * The matrix should be in column-major and \b compressed format (see SparseMatrix::makeCompressed()).
   */
 template<typename Index>
 class COLAMDOrdering
@ -118,10 +118,14 @@ class COLAMDOrdering
     typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
     typedef Matrix<Index, Dynamic, 1> IndexVector;

-    /** Compute the permutation vector form a sparse matrix */
+    /** Compute the permutation vector \a perm from the sparse matrix \a mat
+      * \warning The input sparse matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
+      */
     template <typename MatrixType>
     void operator() (const MatrixType& mat, PermutationType& perm)
     {
+      eigen_assert(mat.isCompressed() && "COLAMDOrdering requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to COLAMDOrdering");
+
       Index m = mat.rows();
       Index n = mat.cols();
       Index nnz = mat.nonZeros();
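A sparse matrix built by insertion typically carries reserve slack, so it must be compressed before the ordering runs. A usage sketch:

    #include <Eigen/OrderingMethods>
    #include <Eigen/SparseCore>

    int main()
    {
      Eigen::SparseMatrix<double> A(100,100);
      A.insert(0,0) = 1.0;      // ... fill ...
      A.makeCompressed();       // required by the assertion above
      Eigen::PermutationMatrix<Eigen::Dynamic,Eigen::Dynamic,int> perm;
      Eigen::COLAMDOrdering<int> ordering;
      ordering(A, perm);        // column fill-reducing permutation
      return 0;
    }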
@ -415,6 +415,7 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
                          JacobiRotation<RealScalar> *j_right)
 {
   using std::sqrt;
+  using std::abs;
   Matrix<RealScalar,2,2> m;
   m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)),
        numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q));
@ -428,9 +429,11 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
   }
   else
   {
-    RealScalar u = d / t;
-    rot1.c() = RealScalar(1) / sqrt(RealScalar(1) + numext::abs2(u));
-    rot1.s() = rot1.c() * u;
+    RealScalar t2d2 = numext::hypot(t,d);
+    rot1.c() = abs(t)/t2d2;
+    rot1.s() = d/t2d2;
+    if(t<RealScalar(0))
+      rot1.s() = -rot1.s();
   }
   m.applyOnTheLeft(0,1,rot1);
   j_right->makeJacobi(m,0,1);
@ -338,7 +338,10 @@ const Block<const Derived,Dynamic,Dynamic,true> SparseMatrixBase<Derived>::inner
 namespace internal {

 template< typename XprType, int BlockRows, int BlockCols, bool InnerPanel,
-          bool OuterVector = (BlockCols==1 && XprType::IsRowMajor) || (BlockRows==1 && !XprType::IsRowMajor)>
+          bool OuterVector = (BlockCols==1 && XprType::IsRowMajor)
+                             | // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
+                               // revert to || as soon as not needed anymore.
+                             (BlockRows==1 && !XprType::IsRowMajor)>
 class GenericSparseBlockInnerIteratorImpl;

 }
@ -1205,7 +1205,7 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& Sparse
       m_data.value(p) = m_data.value(p-1);
       --p;
     }
-    eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exist, you must call coeffRef to this end");
+    eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exists, you must call coeffRef to this end");

     m_innerNonZeros[outer]++;
@ -58,6 +58,7 @@ namespace internal {
   * \tparam _OrderingType The fill-reducing ordering method. See the \link OrderingMethods_Module
   * OrderingMethods \endlink module for the list of built-in and external ordering methods.
   *
+  * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()).
   *
   */
 template<typename _MatrixType, typename _OrderingType>
@ -77,10 +78,23 @@ class SparseQR
     SparseQR () : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false)
     { }

+    /** Construct a QR factorization of the matrix \a mat.
+      *
+      * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
+      *
+      * \sa compute()
+      */
     SparseQR(const MatrixType& mat) : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false)
     {
       compute(mat);
     }

+    /** Computes the QR factorization of the sparse matrix \a mat.
+      *
+      * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
+      *
+      * \sa analyzePattern(), factorize()
+      */
     void compute(const MatrixType& mat)
     {
       analyzePattern(mat);
@ -255,6 +269,8 @@ class SparseQR
 };

 /** \brief Preprocessing step of a QR factorization
+  *
+  * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
   *
   * In this step, the fill-reducing permutation is computed and applied to the columns of A
   * and the column elimination tree is computed as well. Only the sparsity pattern of \a mat is exploited.
@ -264,6 +280,7 @@ class SparseQR
 template <typename MatrixType, typename OrderingType>
 void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat)
 {
+  eigen_assert(mat.isCompressed() && "SparseQR requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to SparseQR");
   // Compute the column fill reducing ordering
   OrderingType ord;
   ord(mat, m_perm_c);
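End-to-end, the requirement reads like this; a sketch of a sparse least-squares solve with SparseQR (matrix content elided):

    #include <Eigen/OrderingMethods>
    #include <Eigen/SparseQR>

    int main()
    {
      Eigen::SparseMatrix<double> A(100,50);
      // ... fill A via insert()/setFromTriplets() ...
      A.makeCompressed();   // mandatory before handing A to SparseQR
      Eigen::SparseQR<Eigen::SparseMatrix<double>,
                      Eigen::COLAMDOrdering<int> > qr(A);
      Eigen::VectorXd b = Eigen::VectorXd::Ones(100);
      Eigen::VectorXd x = qr.solve(b);   // min ||Ax - b||
      return (qr.info() == Eigen::Success) ? 0 : 1;
    }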
@ -11,7 +11,7 @@
 #ifndef EIGEN_STDDEQUE_H
 #define EIGEN_STDDEQUE_H

-#include "Eigen/src/StlSupport/details.h"
+#include "details.h"

 // Define the explicit instantiation (e.g. necessary for the Intel compiler)
 #if defined(__INTEL_COMPILER) || defined(__GNUC__)
@ -10,7 +10,7 @@
 #ifndef EIGEN_STDLIST_H
 #define EIGEN_STDLIST_H

-#include "Eigen/src/StlSupport/details.h"
+#include "details.h"

 // Define the explicit instantiation (e.g. necessary for the Intel compiler)
 #if defined(__INTEL_COMPILER) || defined(__GNUC__)
@ -11,7 +11,7 @@
 #ifndef EIGEN_STDVECTOR_H
 #define EIGEN_STDVECTOR_H

-#include "Eigen/src/StlSupport/details.h"
+#include "details.h"

 /**
  * This section contains a convenience MACRO which allows an easy specialization of
@ -141,6 +141,18 @@ tan() const
   return derived();
 }

+/** \returns an expression of the coefficient-wise arc tan of *this.
+  *
+  * Example: \include Cwise_atan.cpp
+  * Output: \verbinclude Cwise_atan.out
+  *
+  * \sa cos(), sin(), tan()
+  */
+inline const CwiseUnaryOp<internal::scalar_atan_op<Scalar>, Derived>
+atan() const
+{
+  return derived();
+}

 /** \returns an expression of the coefficient-wise power of *this to the given exponent.
  *
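Once the method is in place, atan() composes like the other coefficient-wise functions on arrays. A usage sketch:

    #include <iostream>
    #include <Eigen/Core>

    int main()
    {
      Eigen::ArrayXd a = Eigen::ArrayXd::LinSpaced(5, -1.0, 1.0);
      std::cout << a.atan() << "\n";                  // element-wise arctangent
      std::cout << (a.sin()/a.cos()).atan() << "\n";  // composes with sin/cos
      return 0;
    }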
README.md (new file)
@ -0,0 +1,3 @@
+**Eigen is a C++ template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms.**
+
+For more information go to http://eigen.tuxfamily.org/.
@ -2,6 +2,14 @@
 // g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2 ./a.out
 // icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out

+// Compilation options:
+//
+// -DSCALAR=std::complex<double>
+// -DSCALARA=double or -DSCALARB=double
+// -DHAVE_BLAS
+// -DDECOUPLED
+//
+
 #include <iostream>
 #include <Eigen/Core>
 #include <bench/BenchTimer.h>
@ -14,10 +22,18 @@ using namespace Eigen;
 #define SCALAR float
 #endif

+#ifndef SCALARA
+#define SCALARA SCALAR
+#endif
+
+#ifndef SCALARB
+#define SCALARB SCALAR
+#endif
+
 typedef SCALAR Scalar;
 typedef NumTraits<Scalar>::Real RealScalar;
-typedef Matrix<RealScalar,Dynamic,Dynamic> A;
-typedef Matrix</*Real*/Scalar,Dynamic,Dynamic> B;
+typedef Matrix<SCALARA,Dynamic,Dynamic> A;
+typedef Matrix<SCALARB,Dynamic,Dynamic> B;
 typedef Matrix<Scalar,Dynamic,Dynamic> C;
 typedef Matrix<RealScalar,Dynamic,Dynamic> M;
@ -129,35 +145,62 @@ int main(int argc, char ** argv)
   int tries = 2;    // number of tries, we keep the best

   int s = 2048;
+  int m = s;
+  int n = s;
+  int p = s;
   int cache_size = -1;

   bool need_help = false;
-  for (int i=1; i<argc; ++i)
+  for (int i=1; i<argc;)
   {
-    if(argv[i][0]=='s')
-      s = atoi(argv[i]+1);
-    else if(argv[i][0]=='c')
-      cache_size = atoi(argv[i]+1);
-    else if(argv[i][0]=='t')
-      tries = atoi(argv[i]+1);
-    else if(argv[i][0]=='p')
-      rep = atoi(argv[i]+1);
+    if(argv[i][0]=='-')
+    {
+      if(argv[i][1]=='s')
+      {
+        ++i;
+        s = atoi(argv[i++]);
+        m = n = p = s;
+        if(argv[i][0]!='-')
+        {
+          n = atoi(argv[i++]);
+          p = atoi(argv[i++]);
+        }
+      }
+      else if(argv[i][1]=='c')
+      {
+        ++i;
+        cache_size = atoi(argv[i++]);
+      }
+      else if(argv[i][1]=='t')
+      {
+        ++i;
+        tries = atoi(argv[i++]);
+      }
+      else if(argv[i][1]=='p')
+      {
+        ++i;
+        rep = atoi(argv[i++]);
+      }
+    }
     else
+    {
       need_help = true;
+      break;
+    }
   }

   if(need_help)
   {
-    std::cout << argv[0] << " s<matrix size> c<cache size> t<nb tries> p<nb repeats>\n";
+    std::cout << argv[0] << " -s <matrix sizes> -c <cache size> -t <nb tries> -p <nb repeats>\n";
+    std::cout << "   <matrix sizes> : size\n";
+    std::cout << "   <matrix sizes> : rows columns depth\n";
     return 1;
   }

   if(cache_size>0)
     setCpuCacheSizes(cache_size,96*cache_size);

-  int m = s;
-  int n = s;
-  int p = s;
   A a(m,p); a.setRandom();
   B b(p,n); b.setRandom();
   C c(m,n); c.setOnes();
|
|||||||
|
|
||||||
// check the parallel product is correct
|
// check the parallel product is correct
|
||||||
#if defined EIGEN_HAS_OPENMP
|
#if defined EIGEN_HAS_OPENMP
|
||||||
|
Eigen::initParallel();
|
||||||
int procs = omp_get_max_threads();
|
int procs = omp_get_max_threads();
|
||||||
if(procs>1)
|
if(procs>1)
|
||||||
{
|
{
|
||||||
@ -188,11 +232,20 @@ int main(int argc, char ** argv)
|
|||||||
#elif defined HAVE_BLAS
|
#elif defined HAVE_BLAS
|
||||||
blas_gemm(a,b,r);
|
blas_gemm(a,b,r);
|
||||||
c.noalias() += a * b;
|
c.noalias() += a * b;
|
||||||
if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n";
|
if(!r.isApprox(c)) {
|
||||||
|
std::cout << r - c << "\n";
|
||||||
|
std::cerr << "Warning, your product is crap!\n\n";
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
gemm(a,b,c);
|
if(1.*m*n*p<2000.*2000*2000)
|
||||||
r.noalias() += a.cast<Scalar>() * b.cast<Scalar>();
|
{
|
||||||
if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n";
|
gemm(a,b,c);
|
||||||
|
r.noalias() += a.cast<Scalar>() .lazyProduct( b.cast<Scalar>() );
|
||||||
|
if(!r.isApprox(c)) {
|
||||||
|
std::cout << r - c << "\n";
|
||||||
|
std::cerr << "Warning, your product is crap!\n\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef HAVE_BLAS
|
#ifdef HAVE_BLAS
|
||||||
@ -214,7 +267,7 @@ int main(int argc, char ** argv)
|
|||||||
{
|
{
|
||||||
BenchTimer tmono;
|
BenchTimer tmono;
|
||||||
omp_set_num_threads(1);
|
omp_set_num_threads(1);
|
||||||
Eigen::internal::setNbThreads(1);
|
Eigen::setNbThreads(1);
|
||||||
c = rc;
|
c = rc;
|
||||||
BENCH(tmono, tries, rep, gemm(a,b,c));
|
BENCH(tmono, tries, rep, gemm(a,b,c));
|
||||||
std::cout << "eigen mono cpu " << tmono.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmono.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmono.total(CPU_TIMER) << "s)\n";
|
std::cout << "eigen mono cpu " << tmono.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmono.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmono.total(CPU_TIMER) << "s)\n";
|
||||||
@ -223,6 +276,15 @@ int main(int argc, char ** argv)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
if(1.*m*n*p<30*30*30)
|
||||||
|
{
|
||||||
|
BenchTimer tmt;
|
||||||
|
c = rc;
|
||||||
|
BENCH(tmt, tries, rep, c.noalias()+=a.lazyProduct(b));
|
||||||
|
std::cout << "lazy cpu " << tmt.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(CPU_TIMER) << "s)\n";
|
||||||
|
std::cout << "lazy real " << tmt.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n";
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef DECOUPLED
|
#ifdef DECOUPLED
|
||||||
if((NumTraits<A::Scalar>::IsComplex) && (NumTraits<B::Scalar>::IsComplex))
|
if((NumTraits<A::Scalar>::IsComplex) && (NumTraits<B::Scalar>::IsComplex))
|
||||||
{
|
{
|
||||||
|
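The GFLOPS figures printed here follow from counting 2*m*n*p floating-point operations per GEMM (one multiply and one add per inner-product term). A tiny helper reproducing the arithmetic:

    #include <cstdio>

    // flops for C += A*B with A (m x p), B (p x n): 2*m*n*p
    double gemm_gflops(int m, int n, int p, int rep, double seconds)
    {
      return (double(m)*n*p*rep*2 / seconds) * 1e-9;
    }

    int main()
    {
      // e.g. one 2048^3 product in 0.5 s is about 34 GFLOPS
      std::printf("%.2f GFLOPS\n", gemm_gflops(2048,2048,2048, 1, 0.5));
      return 0;
    }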
@ -104,6 +104,7 @@ add_subdirectory(libs/mtl4)
 add_subdirectory(libs/blitz)
 add_subdirectory(libs/tvmet)
 add_subdirectory(libs/STL)
+add_subdirectory(libs/blaze)

 add_subdirectory(data)
@ -33,7 +33,7 @@ class Action_axpby {
 public :

   // Ctor
-  Action_axpby( int size ):_size(size),_alpha(0.5),_beta(0.95)
+  Action_axpby( int size ):_alpha(0.5),_beta(0.95),_size(size)
   {
     MESSAGE("Action_axpby Ctor");
@ -35,7 +35,7 @@ public :

   // Ctor
-  Action_axpy( int size ):_size(size),_coef(1.0)
+  Action_axpy( int size ):_coef(1.0),_size(size)
   {
     MESSAGE("Action_axpy Ctor");
@ -4,10 +4,7 @@ if (ATLAS_LIBRARIES)
|
|||||||
endif (ATLAS_LIBRARIES)
|
endif (ATLAS_LIBRARIES)
|
||||||
|
|
||||||
find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
find_file(ATLAS_LIB libatlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
||||||
find_library(ATLAS_LIB atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
find_library(ATLAS_LIB satlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
||||||
|
|
||||||
find_file(ATLAS_CBLAS libcblas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
|
||||||
find_library(ATLAS_CBLAS cblas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
|
||||||
|
|
||||||
find_file(ATLAS_LAPACK liblapack_atlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
find_file(ATLAS_LAPACK liblapack_atlas.so.3 PATHS /usr/lib $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
||||||
find_library(ATLAS_LAPACK lapack_atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
find_library(ATLAS_LAPACK lapack_atlas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
||||||
@ -22,14 +19,14 @@ find_library(ATLAS_F77BLAS f77blas PATHS $ENV{ATLASDIR} ${LIB_INSTALL_DIR})
|
|||||||
|
|
||||||
if(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)
|
if(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)
|
||||||
|
|
||||||
set(ATLAS_LIBRARIES ${ATLAS_LAPACK} ${ATLAS_CBLAS} ${ATLAS_F77BLAS} ${ATLAS_LIB})
|
set(ATLAS_LIBRARIES ${ATLAS_LAPACK} ${ATLAS_LIB})
|
||||||
|
|
||||||
# search the default lapack lib link to it
|
# search the default lapack lib link to it
|
||||||
find_file(ATLAS_REFERENCE_LAPACK liblapack.so.3 PATHS /usr/lib /usr/lib64)
|
find_file(ATLAS_REFERENCE_LAPACK liblapack.so.3 PATHS /usr/lib /usr/lib64)
|
||||||
find_library(ATLAS_REFERENCE_LAPACK NAMES lapack)
|
find_library(ATLAS_REFERENCE_LAPACK NAMES lapack)
|
||||||
if(ATLAS_REFERENCE_LAPACK)
|
# if(ATLAS_REFERENCE_LAPACK)
|
||||||
set(ATLAS_LIBRARIES ${ATLAS_LIBRARIES} ${ATLAS_REFERENCE_LAPACK})
|
# set(ATLAS_LIBRARIES ${ATLAS_LIBRARIES} ${ATLAS_REFERENCE_LAPACK})
|
||||||
endif()
|
# endif()
|
||||||
|
|
||||||
endif(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)
|
endif(ATLAS_LIB AND ATLAS_CBLAS AND ATLAS_LAPACK AND ATLAS_F77BLAS)
|
||||||
|
|
||||||
|
31
bench/btl/cmake/FindBLAZE.cmake
Normal file
31
bench/btl/cmake/FindBLAZE.cmake
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
# - Try to find eigen2 headers
|
||||||
|
# Once done this will define
|
||||||
|
#
|
||||||
|
# BLAZE_FOUND - system has blaze lib
|
||||||
|
# BLAZE_INCLUDE_DIR - the blaze include directory
|
||||||
|
#
|
||||||
|
# Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||||
|
# Adapted from FindEigen.cmake:
|
||||||
|
# Copyright (c) 2006, 2007 Montel Laurent, <montel@kde.org>
|
||||||
|
# Redistribution and use is allowed according to the terms of the BSD license.
|
||||||
|
# For details see the accompanying COPYING-CMAKE-SCRIPTS file.
|
||||||
|
|
||||||
|
if (BLAZE_INCLUDE_DIR)
|
||||||
|
|
||||||
|
# in cache already
|
||||||
|
set(BLAZE_FOUND TRUE)
|
||||||
|
|
||||||
|
else (BLAZE_INCLUDE_DIR)
|
||||||
|
|
||||||
|
find_path(BLAZE_INCLUDE_DIR NAMES blaze/Blaze.h
|
||||||
|
PATHS
|
||||||
|
${INCLUDE_INSTALL_DIR}
|
||||||
|
)
|
||||||
|
|
||||||
|
include(FindPackageHandleStandardArgs)
|
||||||
|
find_package_handle_standard_args(BLAZE DEFAULT_MSG BLAZE_INCLUDE_DIR)
|
||||||
|
|
||||||
|
mark_as_advanced(BLAZE_INCLUDE_DIR)
|
||||||
|
|
||||||
|
endif(BLAZE_INCLUDE_DIR)
|
||||||
|
|
@ -1,15 +0,0 @@
|
|||||||
|
|
||||||
if (GOTO_LIBRARIES)
|
|
||||||
set(GOTO_FIND_QUIETLY TRUE)
|
|
||||||
endif (GOTO_LIBRARIES)
|
|
||||||
|
|
||||||
find_library(GOTO_LIBRARIES goto PATHS $ENV{GOTODIR} ${LIB_INSTALL_DIR})
|
|
||||||
|
|
||||||
if(GOTO_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX)
|
|
||||||
set(GOTO_LIBRARIES ${GOTO_LIBRARIES} "-lpthread -lgfortran")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
include(FindPackageHandleStandardArgs)
|
|
||||||
find_package_handle_standard_args(GOTO DEFAULT_MSG GOTO_LIBRARIES)
|
|
||||||
|
|
||||||
mark_as_advanced(GOTO_LIBRARIES)
|
|
@ -1,25 +0,0 @@
|
|||||||
|
|
||||||
if (GOTO2_LIBRARIES)
|
|
||||||
set(GOTO2_FIND_QUIETLY TRUE)
|
|
||||||
endif (GOTO2_LIBRARIES)
|
|
||||||
#
|
|
||||||
# find_path(GOTO_INCLUDES
|
|
||||||
# NAMES
|
|
||||||
# cblas.h
|
|
||||||
# PATHS
|
|
||||||
# $ENV{GOTODIR}/include
|
|
||||||
# ${INCLUDE_INSTALL_DIR}
|
|
||||||
# )
|
|
||||||
|
|
||||||
find_file(GOTO2_LIBRARIES libgoto2.so PATHS /usr/lib $ENV{GOTO2DIR} ${LIB_INSTALL_DIR})
|
|
||||||
find_library(GOTO2_LIBRARIES goto2 PATHS $ENV{GOTO2DIR} ${LIB_INSTALL_DIR})
|
|
||||||
|
|
||||||
if(GOTO2_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX)
|
|
||||||
set(GOTO2_LIBRARIES ${GOTO2_LIBRARIES} "-lpthread -lgfortran")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
include(FindPackageHandleStandardArgs)
|
|
||||||
find_package_handle_standard_args(GOTO2 DEFAULT_MSG
|
|
||||||
GOTO2_LIBRARIES)
|
|
||||||
|
|
||||||
mark_as_advanced(GOTO2_LIBRARIES)
|
|
bench/btl/cmake/FindOPENBLAS.cmake (new file, 17 lines)
@@ -0,0 +1,17 @@
+
+if (OPENBLAS_LIBRARIES)
+  set(OPENBLAS_FIND_QUIETLY TRUE)
+endif (OPENBLAS_LIBRARIES)
+
+find_file(OPENBLAS_LIBRARIES libopenblas.so PATHS /usr/lib $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR})
+find_library(OPENBLAS_LIBRARIES openblas PATHS $ENV{OPENBLASDIR} ${LIB_INSTALL_DIR})
+
+if(OPENBLAS_LIBRARIES AND CMAKE_COMPILER_IS_GNUCXX)
+  set(OPENBLAS_LIBRARIES ${OPENBLAS_LIBRARIES} "-lpthread -lgfortran")
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(OPENBLAS DEFAULT_MSG
+                                  OPENBLAS_LIBRARIES)
+
+mark_as_advanced(OPENBLAS_LIBRARIES)
@@ -1,19 +1,19 @@
-aat ; "{/*1.5 A x A^T}" ; "matrix size" ; 4:3000
+aat ; "{/*1.5 A x A^T}" ; "matrix size" ; 4:5000
-ata ; "{/*1.5 A^T x A}" ; "matrix size" ; 4:3000
+ata ; "{/*1.5 A^T x A}" ; "matrix size" ; 4:5000
-atv ; "{/*1.5 matrix^T x vector}" ; "matrix size" ; 4:3000
+atv ; "{/*1.5 matrix^T x vector}" ; "matrix size" ; 4:5000
 axpby ; "{/*1.5 Y = alpha X + beta Y}" ; "vector size" ; 5:1000000
 axpy ; "{/*1.5 Y += alpha X}" ; "vector size" ; 5:1000000
-matrix_matrix ; "{/*1.5 matrix matrix product}" ; "matrix size" ; 4:3000
+matrix_matrix ; "{/*1.5 matrix matrix product}" ; "matrix size" ; 4:5000
-matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:3000
+matrix_vector ; "{/*1.5 matrix vector product}" ; "matrix size" ; 4:5000
-trmm ; "{/*1.5 triangular matrix matrix product}" ; "matrix size" ; 4:3000
+trmm ; "{/*1.5 triangular matrix matrix product}" ; "matrix size" ; 4:5000
-trisolve_vector ; "{/*1.5 triangular solver - vector (X = inv(L) X)}" ; "size" ; 4:3000
+trisolve_vector ; "{/*1.5 triangular solver - vector (X = inv(L) X)}" ; "size" ; 4:5000
-trisolve_matrix ; "{/*1.5 triangular solver - matrix (M = inv(L) M)}" ; "size" ; 4:3000
+trisolve_matrix ; "{/*1.5 triangular solver - matrix (M = inv(L) M)}" ; "size" ; 4:5000
-cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:3000
+cholesky ; "{/*1.5 Cholesky decomposition}" ; "matrix size" ; 4:5000
-complete_lu_decomp ; "{/*1.5 Complete LU decomposition}" ; "matrix size" ; 4:3000
+complete_lu_decomp ; "{/*1.5 Complete LU decomposition}" ; "matrix size" ; 4:5000
-partial_lu_decomp ; "{/*1.5 Partial LU decomposition}" ; "matrix size" ; 4:3000
+partial_lu_decomp ; "{/*1.5 Partial LU decomposition}" ; "matrix size" ; 4:5000
-tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:3000
+tridiagonalization ; "{/*1.5 Tridiagonalization}" ; "matrix size" ; 4:5000
-hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:3000
+hessenberg ; "{/*1.5 Hessenberg decomposition}" ; "matrix size" ; 4:5000
-symv ; "{/*1.5 symmetric matrix vector product}" ; "matrix size" ; 4:3000
+symv ; "{/*1.5 symmetric matrix vector product}" ; "matrix size" ; 4:5000
-syr2 ; "{/*1.5 symmretric rank-2 update (A += u^T v + u v^T)}" ; "matrix size" ; 4:3000
+syr2 ; "{/*1.5 symmetric rank-2 update (A += u^T v + u v^T)}" ; "matrix size" ; 4:5000
-ger ; "{/*1.5 general rank-1 update (A += u v^T)}" ; "matrix size" ; 4:3000
+ger ; "{/*1.5 general rank-1 update (A += u v^T)}" ; "matrix size" ; 4:5000
 rot ; "{/*1.5 apply rotation in the plane}" ; "vector size" ; 4:1000000
@@ -10,7 +10,7 @@ ublas ; with lines lw 3 lt 1 lc rgbcolor "#00b7ff"
 mtl4 ; with lines lw 3 lt 1 lc rgbcolor "#d18847"
 blitz ; with lines lw 3 lt 1 lc rgbcolor "#ff00ff"
 F77 ; with lines lw 3 lt 3 lc rgbcolor "#e6e64c"
-GOTO ; with lines lw 3 lt 3 lc rgbcolor "#C05600"
-GOTO2 ; with lines lw 3 lt 1 lc rgbcolor "#C05600"
+OPENBLAS ; with lines lw 3 lt 1 lc rgbcolor "#C05600"
 C ; with lines lw 3 lt 3 lc rgbcolor "#e6bd96"
 ACML ; with lines lw 2 lt 3 lc rgbcolor "#e6e64c"
+blaze ; with lines lw 3 lt 1 lc rgbcolor "#ff00ff"
@@ -33,7 +33,7 @@
 // min matrix size for matrix vector product bench
 #define MIN_MV 5
 // max matrix size for matrix vector product bench
-#define MAX_MV 3000
+#define MAX_MV 5000
 // min matrix size for matrix matrix product bench
 #define MIN_MM 5
 // max matrix size for matrix matrix product bench
@@ -176,7 +176,7 @@ public:
     if (_config!=NULL)
     {
       std::vector<BtlString> config = BtlString(_config).split(" \t\n");
-      for (int i = 0; i<config.size(); i++)
+      for (unsigned int i = 0; i<config.size(); i++)
       {
         if (config[i].beginsWith("-a"))
         {
@@ -224,7 +224,7 @@ public:
       return false;

     BtlString name(_name);
-    for (int i=0; i<Instance.m_selectedActionNames.size(); ++i)
+    for (unsigned int i=0; i<Instance.m_selectedActionNames.size(); ++i)
       if (name.contains(Instance.m_selectedActionNames[i]))
         return false;

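
Note on the two hunks above: std::vector::size() returns the unsigned type std::size_t, so comparing it against a plain int loop index draws -Wsign-compare warnings from GCC and Clang; switching the counter to an unsigned type silences them without changing behavior for any realistic container size. A minimal illustration, assuming nothing beyond the standard library:

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> v(3, 7);
      // for (int i = 0; i < v.size(); ++i)        // -Wsign-compare: int vs std::size_t
      for (unsigned int i = 0; i < v.size(); ++i)  // same fix as applied in the hunks above
        std::printf("%d\n", v[i]);
      return 0;
    }
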
@@ -30,23 +30,23 @@ double simple_function(int index_i, int index_j)
   return index_i+index_j;
 }

-double pseudo_random(int index)
+double pseudo_random(int /*index*/)
 {
   return std::rand()/double(RAND_MAX);
 }

-double pseudo_random(int index_i, int index_j)
+double pseudo_random(int /*index_i*/, int /*index_j*/)
 {
   return std::rand()/double(RAND_MAX);
 }


-double null_function(int index)
+double null_function(int /*index*/)
 {
   return 0.0;
 }

-double null_function(int index_i, int index_j)
+double null_function(int /*index_i*/, int /*index_j*/)
 {
   return 0.0;
 }
@@ -23,7 +23,7 @@
 #include "size_log.hh"

 template<class Vector>
-void size_lin_log(const int nb_point, const int size_min, const int size_max, Vector & X)
+void size_lin_log(const int nb_point, const int /*size_min*/, const int size_max, Vector & X)
 {
   int ten=10;
   int nine=9;
@@ -18,27 +18,14 @@ if (MKL_FOUND)
 endif (MKL_FOUND)


-find_package(GOTO2)
-if (GOTO2_FOUND)
-  btl_add_bench(btl_goto2 main.cpp)
-  if(BUILD_btl_goto2)
-    target_link_libraries(btl_goto2 ${GOTO_LIBRARIES} )
-    set_target_properties(btl_goto2 PROPERTIES COMPILE_FLAGS "-DCBLASNAME=GOTO2")
-  endif(BUILD_btl_goto2)
-endif (GOTO2_FOUND)
-
-find_package(GOTO)
-if (GOTO_FOUND)
-  if(GOTO2_FOUND)
-    btl_add_bench(btl_goto main.cpp OFF)
-  else()
-    btl_add_bench(btl_goto main.cpp)
-  endif()
-  if(BUILD_btl_goto)
-    target_link_libraries(btl_goto ${GOTO_LIBRARIES} )
-    set_target_properties(btl_goto PROPERTIES COMPILE_FLAGS "-DCBLASNAME=GOTO")
-  endif(BUILD_btl_goto)
-endif (GOTO_FOUND)
+find_package(OPENBLAS)
+if (OPENBLAS_FOUND)
+  btl_add_bench(btl_openblas main.cpp)
+  if(BUILD_btl_openblas)
+    target_link_libraries(btl_openblas ${OPENBLAS_LIBRARIES} )
+    set_target_properties(btl_openblas PROPERTIES COMPILE_FLAGS "-DCBLASNAME=OPENBLAS")
+  endif(BUILD_btl_openblas)
+endif (OPENBLAS_FOUND)

 find_package(ACML)
 if (ACML_FOUND)
@@ -75,7 +75,6 @@ public :
   static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){
     int N2 = N*N;
     BLAS_FUNC(copy)(&N2, X, &intone, C, &intone);
-    char uplo = 'L';
     int info = 0;
     int * ipiv = (int*)alloca(sizeof(int)*N);
     BLAS_FUNC(getrf)(&N, &N, C, &N, ipiv, &info);
@@ -92,7 +91,7 @@ public :
     BLAS_FUNC(trsm)(&right, &lower, &notrans, &nonunit, &N, &N, &fone, L, &N, X, &N);
   }

-  static inline void trmm(gene_matrix & A, gene_matrix & B, gene_matrix & X, int N){
+  static inline void trmm(gene_matrix & A, gene_matrix & B, gene_matrix & /*X*/, int N){
     BLAS_FUNC(trmm)(&left, &lower, &notrans,&nonunit, &N,&N,&fone,A,&N,B,&N);
   }

@@ -101,7 +100,6 @@ public :
   static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int N){
     int N2 = N*N;
     BLAS_FUNC(copy)(&N2, X, &intone, C, &intone);
-    char uplo = 'L';
     int info = 0;
     int * ipiv = (int*)alloca(sizeof(int)*N);
     int * jpiv = (int*)alloca(sizeof(int)*N);
@@ -134,8 +132,6 @@ public :
   }
   char uplo = 'U';
   int info = 0;
-  int ilo = 1;
-  int ihi = N;
   int bsize = 64;
   int worksize = N*bsize;
   SCALAR* d = new SCALAR[3*N+worksize];
@@ -17,12 +17,12 @@ public:
   typedef real* gene_matrix;
   typedef real* gene_vector;

-  static void free_matrix(gene_matrix & A, int N){
-    delete A;
+  static void free_matrix(gene_matrix & A, int /*N*/){
+    delete[] A;
   }

   static void free_vector(gene_vector & B){
-    delete B;
+    delete[] B;
   }

   static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
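
The delete to delete[] change just above is a genuine bug fix, not a cleanup: in this interface the matrices and vectors are raw arrays obtained from array new, and releasing such a buffer with scalar delete is undefined behavior. The rule in isolation:

    int main() {
      double* a = new double[16]; // array form of new...
      delete[] a;                 // ...must be matched by the array form of delete;
                                  // `delete a;` here would be undefined behavior

      double* s = new double(1.0); // scalar new...
      delete s;                    // ...pairs with scalar delete
      return 0;
    }
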
@@ -56,13 +56,13 @@ int main()
   bench<Action_trmm<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);

-  bench<Action_cholesky<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_partial_lu<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_cholesky<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_partial_lu<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);

 #ifdef HAS_LAPACK
-  bench<Action_lu_decomp<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_hessenberg<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_tridiagonalization<blas_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  // bench<Action_lu_decomp<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_hessenberg<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_tridiagonalization<blas_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
 #endif

   //bench<Action_lu_solve<blas_LU_solve_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
@@ -44,9 +44,9 @@ public :
     return "STL";
   }

-  static void free_matrix(gene_matrix & A, int N){}
+  static void free_matrix(gene_matrix & /*A*/, int /*N*/){}

-  static void free_vector(gene_vector & B){}
+  static void free_vector(gene_vector & /*B*/){}

   static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
     A = A_stl;
bench/btl/libs/blaze/CMakeLists.txt (new file, 10 lines)
@@ -0,0 +1,10 @@
+
+find_package(BLAZE)
+find_package(Boost)
+if (BLAZE_FOUND AND Boost_FOUND)
+  include_directories(${BLAZE_INCLUDE_DIR} ${Boost_INCLUDE_DIRS})
+  btl_add_bench(btl_blaze main.cpp)
+  if(BUILD_btl_blaze)
+    target_link_libraries(btl_blaze ${Boost_LIBRARIES} ${Boost_system_LIBRARY} /opt/local/lib/libboost_system-mt.a )
+  endif()
+endif ()
bench/btl/libs/blaze/blaze_interface.hh (new file, 140 lines)
@@ -0,0 +1,140 @@
+//=====================================================
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//=====================================================
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+//
+#ifndef BLAZE_INTERFACE_HH
+#define BLAZE_INTERFACE_HH
+
+#include <blaze/Math.h>
+#include <blaze/Blaze.h>
+// using namespace blaze;
+
+#include <vector>
+
+template<class real>
+class blaze_interface {
+
+public :
+
+  typedef real real_type;
+
+  typedef std::vector<real> stl_vector;
+  typedef std::vector<stl_vector> stl_matrix;
+
+  typedef blaze::DynamicMatrix<real,blaze::columnMajor> gene_matrix;
+  typedef blaze::DynamicVector<real> gene_vector;
+
+  static inline std::string name() { return "blaze"; }
+
+  static void free_matrix(gene_matrix & A, int N){
+    return;
+  }
+
+  static void free_vector(gene_vector & B){
+    return;
+  }
+
+  static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
+    A.resize(A_stl[0].size(), A_stl.size());
+
+    for (int j=0; j<A_stl.size(); j++){
+      for (int i=0; i<A_stl[j].size(); i++){
+        A(i,j) = A_stl[j][i];
+      }
+    }
+  }
+
+  static inline void vector_from_stl(gene_vector & B, stl_vector & B_stl){
+    B.resize(B_stl.size());
+    for (int i=0; i<B_stl.size(); i++){
+      B[i] = B_stl[i];
+    }
+  }
+
+  static inline void vector_to_stl(gene_vector & B, stl_vector & B_stl){
+    for (int i=0; i<B_stl.size(); i++){
+      B_stl[i] = B[i];
+    }
+  }
+
+  static inline void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){
+    int N = A_stl.size();
+    for (int j=0; j<N; j++){
+      A_stl[j].resize(N);
+      for (int i=0; i<N; i++){
+        A_stl[j][i] = A(i,j);
+      }
+    }
+  }
+
+  static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
+    X = (A*B);
+  }
+
+  static inline void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
+    X = A.transpose()*B.transpose();
+  }
+
+  static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N){
+    X = (A.transpose()*A);
+  }
+
+  static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N){
+    X = (A*A.transpose());
+  }
+
+  static inline void matrix_vector_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
+    X = (A*B);
+  }
+
+  static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
+    X = (A.transpose()*B);
+  }
+
+  static inline void axpy(const real coef, const gene_vector & X, gene_vector & Y, int N){
+    Y += coef * X;
+  }
+
+  static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int N){
+    Y = a*X + b*Y;
+  }
+
+  // static inline void cholesky(const gene_matrix & X, gene_matrix & C, int N){
+  //   C = X;
+  //   recursive_cholesky(C);
+  // }
+
+  // static inline void lu_decomp(const gene_matrix & X, gene_matrix & R, int N){
+  //   R = X;
+  //   std::vector<int> ipvt(N);
+  //   lu_factor(R, ipvt);
+  // }
+
+  // static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector & X, int N){
+  //   X = lower_trisolve(L, B);
+  // }
+
+  static inline void copy_matrix(const gene_matrix & source, gene_matrix & cible, int N){
+    cible = source;
+  }
+
+  static inline void copy_vector(const gene_vector & source, gene_vector & cible, int N){
+    cible = source;
+  }
+
+};
+
+#endif
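
For orientation (a reading of the interface above, not extra API): BTL drives every library through the same set of static functions. matrix_from_stl imports the reference data into the library's native type, and each timed action then calls exactly one operation such as matrix_matrix_product. Stripped of the harness, the blaze calls reduce to ordinary expression-template code, as in this sketch:

    #include <blaze/Math.h>
    #include <iostream>

    int main() {
      typedef blaze::DynamicMatrix<double, blaze::columnMajor> Mat;
      Mat A(3, 3, 1.0), B(3, 3, 2.0), X;  // 3x3 matrices, constant-filled
      X = A * B;                          // what matrix_matrix_product(A, B, X, N) evaluates
      std::cout << X(0, 0) << "\n";       // each entry is 3 * (1*2) = 6
      return 0;
    }
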
bench/btl/libs/blaze/main.cpp (new file, 40 lines)
@@ -0,0 +1,40 @@
+//=====================================================
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//=====================================================
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+//
+#include "utilities.h"
+#include "blaze_interface.hh"
+#include "bench.hh"
+#include "basic_actions.hh"
+
+BTL_MAIN;
+
+int main()
+{
+  bench<Action_axpy<blaze_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
+  bench<Action_axpby<blaze_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
+
+  bench<Action_matrix_vector_product<blaze_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
+  // bench<Action_atv_product<blaze_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
+  // bench<Action_matrix_matrix_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  // bench<Action_ata_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  // bench<Action_aat_product<blaze_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+
+  return 0;
+}
@@ -45,9 +45,9 @@ public :
     return EIGEN_MAKESTRING(BTL_PREFIX);
   }

-  static void free_matrix(gene_matrix & A, int N) {}
+  static void free_matrix(gene_matrix & /*A*/, int /*N*/) {}

-  static void free_vector(gene_vector & B) {}
+  static void free_vector(gene_vector & /*B*/) {}

   static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
     A.resize(A_stl[0].size(), A_stl.size());
@@ -74,7 +74,7 @@ public :
   }

   static BTL_DONT_INLINE void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){
     int N=A_stl.size();

     for (int j=0;j<N;j++){
       A_stl[j].resize(N);
@@ -84,28 +84,28 @@ public :
     }
   }

-  static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
+  static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int /*N*/){
     X.noalias() = A*B;
   }

-  static inline void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int N){
+  static inline void transposed_matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int /*N*/){
     X.noalias() = A.transpose()*B.transpose();
   }

-  // static inline void ata_product(const gene_matrix & A, gene_matrix & X, int N){
+  // static inline void ata_product(const gene_matrix & A, gene_matrix & X, int /*N*/){
   //   X.noalias() = A.transpose()*A;
   // }

-  static inline void aat_product(const gene_matrix & A, gene_matrix & X, int N){
+  static inline void aat_product(const gene_matrix & A, gene_matrix & X, int /*N*/){
     X.template triangularView<Lower>().setZero();
     X.template selfadjointView<Lower>().rankUpdate(A);
   }

-  static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){
+  static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){
     X.noalias() = A*B;
   }

-  static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N){
+  static inline void symv(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){
     X.noalias() = (A.template selfadjointView<Lower>() * B);
     // internal::product_selfadjoint_vector<real,0,LowerTriangularBit,false,false>(N,A.data(),N, B.data(), 1, X.data(), 1);
   }
@@ -155,54 +155,54 @@ public :
     }
   }

   static EIGEN_DONT_INLINE void syr2(gene_matrix & A, gene_vector & X, gene_vector & Y, int N){
     // internal::product_selfadjoint_rank2_update<real,0,LowerTriangularBit>(N,A.data(),N, X.data(), 1, Y.data(), 1, -1);
     for(int j=0; j<N; ++j)
       A.col(j).tail(N-j) += X[j] * Y.tail(N-j) + Y[j] * X.tail(N-j);
   }

   static EIGEN_DONT_INLINE void ger(gene_matrix & A, gene_vector & X, gene_vector & Y, int N){
     for(int j=0; j<N; ++j)
       A.col(j) += X * Y[j];
   }

-  static EIGEN_DONT_INLINE void rot(gene_vector & A, gene_vector & B, real c, real s, int N){
+  static EIGEN_DONT_INLINE void rot(gene_vector & A, gene_vector & B, real c, real s, int /*N*/){
     internal::apply_rotation_in_the_plane(A, B, JacobiRotation<real>(c,s));
   }

-  static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
+  static inline void atv_product(gene_matrix & A, gene_vector & B, gene_vector & X, int /*N*/){
     X.noalias() = (A.transpose()*B);
   }

-  static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int N){
+  static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int /*N*/){
     Y += coef * X;
   }

-  static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int N){
+  static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int /*N*/){
     Y = a*X + b*Y;
   }

-  static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int N){
+  static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int /*N*/){
     cible = source;
   }

-  static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int N){
+  static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int /*N*/){
     cible = source;
   }

-  static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector& X, int N){
+  static inline void trisolve_lower(const gene_matrix & L, const gene_vector& B, gene_vector& X, int /*N*/){
     X = L.template triangularView<Lower>().solve(B);
   }

-  static inline void trisolve_lower_matrix(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int N){
+  static inline void trisolve_lower_matrix(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int /*N*/){
     X = L.template triangularView<Upper>().solve(B);
   }

-  static inline void trmm(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int N){
+  static inline void trmm(const gene_matrix & L, const gene_matrix& B, gene_matrix& X, int /*N*/){
     X.noalias() = L.template triangularView<Lower>() * B;
   }

-  static inline void cholesky(const gene_matrix & X, gene_matrix & C, int N){
+  static inline void cholesky(const gene_matrix & X, gene_matrix & C, int /*N*/){
     C = X;
     internal::llt_inplace<real,Lower>::blocked(C);
     //C = X.llt().matrixL();
@@ -211,11 +211,11 @@ public :
     // Cholesky<gene_matrix>::computeInPlaceBlock(C);
   }

-  static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int N){
+  static inline void lu_decomp(const gene_matrix & X, gene_matrix & C, int /*N*/){
     C = X.fullPivLu().matrixLU();
   }

   static inline void partial_lu_decomp(const gene_matrix & X, gene_matrix & C, int N){
     Matrix<DenseIndex,1,Dynamic> piv(N);
     DenseIndex nb;
     C = X;
@@ -223,13 +223,13 @@ public :
     // C = X.partialPivLu().matrixLU();
   }

   static inline void tridiagonalization(const gene_matrix & X, gene_matrix & C, int N){
     typename Tridiagonalization<gene_matrix>::CoeffVectorType aux(N-1);
     C = X;
     internal::tridiagonalization_inplace(C, aux);
   }

-  static inline void hessenberg(const gene_matrix & X, gene_matrix & C, int N){
+  static inline void hessenberg(const gene_matrix & X, gene_matrix & C, int /*N*/){
     C = HessenbergDecomposition<gene_matrix>(X).packedMatrix();
   }

@@ -29,14 +29,14 @@ BTL_MAIN;

 int main()
 {
-  bench<Action_trisolve<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_trisolve_matrix<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_cholesky<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_lu_decomp<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_partial_lu<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  bench<Action_trisolve<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_trisolve_matrix<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_cholesky<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  // bench<Action_lu_decomp<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_partial_lu<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);

-  bench<Action_hessenberg<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
-  bench<Action_tridiagonalization<eigen3_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
+  // bench<Action_hessenberg<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);
+  bench<Action_tridiagonalization<eigen3_interface<REAL_TYPE> > >(MIN_LU,MAX_LU,NB_POINT);

   return 0;
 }
bench/dense_solvers.cpp (new file, 76 lines)
@@ -0,0 +1,76 @@
+#include <iostream>
+#include "BenchTimer.h"
+#include <Eigen/Dense>
+#include <map>
+#include <string>
+using namespace Eigen;
+
+std::map<std::string,Array<float,1,4> > results;
+
+template<typename Scalar,int Size>
+void bench(int id, int size = Size)
+{
+  typedef Matrix<Scalar,Size,Size> Mat;
+  Mat A(size,size);
+  A.setRandom();
+  A = A*A.adjoint();
+  BenchTimer t_llt, t_ldlt, t_lu, t_fplu, t_qr, t_cpqr, t_fpqr, t_jsvd;
+
+  int tries = 3;
+  int rep = 1000/size;
+  if(rep==0) rep = 1;
+  rep = rep*rep;
+
+  LLT<Mat> llt(A);
+  LDLT<Mat> ldlt(A);
+  PartialPivLU<Mat> lu(A);
+  FullPivLU<Mat> fplu(A);
+  HouseholderQR<Mat> qr(A);
+  ColPivHouseholderQR<Mat> cpqr(A);
+  FullPivHouseholderQR<Mat> fpqr(A);
+  JacobiSVD<Mat> jsvd(A.rows(),A.cols());
+
+  BENCH(t_llt, tries, rep, llt.compute(A));
+  BENCH(t_ldlt, tries, rep, ldlt.compute(A));
+  BENCH(t_lu, tries, rep, lu.compute(A));
+  BENCH(t_fplu, tries, rep, fplu.compute(A));
+  BENCH(t_qr, tries, rep, qr.compute(A));
+  BENCH(t_cpqr, tries, rep, cpqr.compute(A));
+  BENCH(t_fpqr, tries, rep, fpqr.compute(A));
+  if(size<500) // JacobiSVD is really too slow for too large matrices
+    BENCH(t_jsvd, tries, rep, jsvd.compute(A,ComputeFullU|ComputeFullV));
+
+  results["LLT"][id] = t_llt.best();
+  results["LDLT"][id] = t_ldlt.best();
+  results["PartialPivLU"][id] = t_lu.best();
+  results["FullPivLU"][id] = t_fplu.best();
+  results["HouseholderQR"][id] = t_qr.best();
+  results["ColPivHouseholderQR"][id] = t_cpqr.best();
+  results["FullPivHouseholderQR"][id] = t_fpqr.best();
+  results["JacobiSVD"][id] = size<500 ? t_jsvd.best() : 0;
+}
+
+int main()
+{
+  const int small = 8;
+  const int medium = 100;
+  const int large = 1000;
+  const int xl = 4000;
+
+  bench<float,small>(0);
+  bench<float,Dynamic>(1,medium);
+  bench<float,Dynamic>(2,large);
+  bench<float,Dynamic>(3,xl);
+
+  IOFormat fmt(3, 0, " \t", "\n", "", "");
+
+  std::cout << "solver/size " << small << "\t" << medium << "\t" << large << "\t" << xl << "\n";
+  std::cout << "LLT (ms) " << (results["LLT"]/1000.).format(fmt) << "\n";
+  std::cout << "LDLT (%) " << (results["LDLT"]/results["LLT"]).format(fmt) << "\n";
+  std::cout << "PartialPivLU (%) " << (results["PartialPivLU"]/results["LLT"]).format(fmt) << "\n";
+  std::cout << "FullPivLU (%) " << (results["FullPivLU"]/results["LLT"]).format(fmt) << "\n";
+  std::cout << "HouseholderQR (%) " << (results["HouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+  std::cout << "ColPivHouseholderQR (%) " << (results["ColPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+  std::cout << "FullPivHouseholderQR (%) " << (results["FullPivHouseholderQR"]/results["LLT"]).format(fmt) << "\n";
+  std::cout << "JacobiSVD (%) " << (results["JacobiSVD"]/results["LLT"]).format(fmt) << "\n";
+}
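
A note on reading the output of this new benchmark: only the LLT row is an absolute time, while every following row is the ratio of that solver's best time to the LLT best time for the same matrix size. So if LLT prints 2.0 and FullPivLU prints 3.5 in the same column, the full-pivoting LU took 3.5 times as long, about 7.0 in the same units; a value below 1 would mean faster than Cholesky. The JacobiSVD entry is forced to 0 for sizes of 500 and above, where the decomposition is skipped as too slow.
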
@@ -91,4 +91,5 @@ void printBenchStyle(std::ofstream& out)
 </xsl:stylesheet>\n\n";

 }
+
 #endif
@@ -1,9 +1,6 @@

 This directory contains a BLAS library built on top of Eigen.

-This is currently a work in progress which is far to be ready for use,
-but feel free to contribute to it if you wish.
-
 This module is not built by default. In order to compile it, you need to
 type 'make blas' from within your build dir.

@@ -106,13 +106,13 @@ matrix(T* data, int rows, int cols, int stride)
 }

 template<typename T>
-Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> > vector(T* data, int size, int incr)
+Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> > make_vector(T* data, int size, int incr)
 {
   return Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> >(data, size, InnerStride<Dynamic>(incr));
 }

 template<typename T>
-Map<Matrix<T,Dynamic,1> > vector(T* data, int size)
+Map<Matrix<T,Dynamic,1> > make_vector(T* data, int size)
 {
   return Map<Matrix<T,Dynamic,1> >(data, size);
 }
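
A plausible reason for this rename (an inference; the patch itself gives no rationale): a free function named vector is one `using namespace std;` away from ambiguity with std::vector, whereas make_vector cannot collide. Functionally these factories just wrap a raw BLAS array in an Eigen::Map view, strided or contiguous. A self-contained sketch of the strided variant shown above (the original sits in the BLAS wrapper's common header):

    #include <Eigen/Core>
    #include <iostream>
    using namespace Eigen;

    // Stand-alone copy of the strided factory shown in the hunk above.
    template<typename T>
    Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> > make_vector(T* data, int size, int incr)
    {
      return Map<Matrix<T,Dynamic,1>, 0, InnerStride<Dynamic> >(data, size, InnerStride<Dynamic>(incr));
    }

    int main() {
      double raw[6] = {1, 10, 2, 20, 3, 30};
      // View every second entry as a length-3 vector: prints "1 2 3".
      std::cout << make_vector(raw, 3, 2).transpose() << std::endl;
      return 0;
    }
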
@@ -124,8 +124,8 @@ T* get_compact_vector(T* x, int n, int incx)
     return x;

   T* ret = new Scalar[n];
-  if(incx<0) vector(ret,n) = vector(x,n,-incx).reverse();
-  else       vector(ret,n) = vector(x,n, incx);
+  if(incx<0) make_vector(ret,n) = make_vector(x,n,-incx).reverse();
+  else       make_vector(ret,n) = make_vector(x,n, incx);
   return ret;
 }

@@ -135,8 +135,8 @@ T* copy_back(T* x_cpy, T* x, int n, int incx)
   if(x_cpy==x)
     return 0;

-  if(incx<0) vector(x,n,-incx).reverse() = vector(x_cpy,n);
-  else       vector(x,n, incx) = vector(x_cpy,n);
+  if(incx<0) make_vector(x,n,-incx).reverse() = make_vector(x_cpy,n);
+  else       make_vector(x,n, incx) = make_vector(x_cpy,n);
   return x_cpy;
 }

@@ -23,11 +23,10 @@ double BLASFUNC(dsdot)(int* n, float* x, int* incx, float* y, int* incy)
 {
   if(*n<=0) return 0;

-  if(*incx==1 && *incy==1)    return (vector(x,*n).cast<double>().cwiseProduct(vector(y,*n).cast<double>())).sum();
-  else if(*incx>0 && *incy>0) return (vector(x,*n,*incx).cast<double>().cwiseProduct(vector(y,*n,*incy).cast<double>())).sum();
-  else if(*incx<0 && *incy>0) return (vector(x,*n,-*incx).reverse().cast<double>().cwiseProduct(vector(y,*n,*incy).cast<double>())).sum();
-  else if(*incx>0 && *incy<0) return (vector(x,*n,*incx).cast<double>().cwiseProduct(vector(y,*n,-*incy).reverse().cast<double>())).sum();
-  else if(*incx<0 && *incy<0) return (vector(x,*n,-*incx).reverse().cast<double>().cwiseProduct(vector(y,*n,-*incy).reverse().cast<double>())).sum();
+  if(*incx==1 && *incy==1)    return (make_vector(x,*n).cast<double>().cwiseProduct(make_vector(y,*n).cast<double>())).sum();
+  else if(*incx>0 && *incy>0) return (make_vector(x,*n,*incx).cast<double>().cwiseProduct(make_vector(y,*n,*incy).cast<double>())).sum();
+  else if(*incx<0 && *incy>0) return (make_vector(x,*n,-*incx).reverse().cast<double>().cwiseProduct(make_vector(y,*n,*incy).cast<double>())).sum();
+  else if(*incx>0 && *incy<0) return (make_vector(x,*n,*incx).cast<double>().cwiseProduct(make_vector(y,*n,-*incy).reverse().cast<double>())).sum();
+  else if(*incx<0 && *incy<0) return (make_vector(x,*n,-*incx).reverse().cast<double>().cwiseProduct(make_vector(y,*n,-*incy).reverse().cast<double>())).sum();
   else return 0;
 }

@@ -32,13 +32,14 @@ RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),asum_)(int *n,

   if(*n<=0) return 0;

-  if(*incx==1)  return vector(x,*n).unaryExpr<scalar_norm1_op>().sum();
-  else          return vector(x,*n,std::abs(*incx)).unaryExpr<scalar_norm1_op>().sum();
+  if(*incx==1)  return make_vector(x,*n).unaryExpr<scalar_norm1_op>().sum();
+  else          return make_vector(x,*n,std::abs(*incx)).unaryExpr<scalar_norm1_op>().sum();
 }

 // computes a dot product of a conjugated vector with another vector.
 int EIGEN_BLAS_FUNC(dotcw)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar* pres)
 {
+  //   std::cerr << "_dotc " << *n << " " << *incx << " " << *incy << "\n";
   Scalar* res = reinterpret_cast<Scalar*>(pres);

   if(*n<=0)
@@ -50,11 +51,11 @@ int EIGEN_BLAS_FUNC(dotcw)(int *n, RealScalar *px, int *incx, RealScalar *py, in
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);

-  if(*incx==1 && *incy==1)    *res = (vector(x,*n).dot(vector(y,*n)));
-  else if(*incx>0 && *incy>0) *res = (vector(x,*n,*incx).dot(vector(y,*n,*incy)));
-  else if(*incx<0 && *incy>0) *res = (vector(x,*n,-*incx).reverse().dot(vector(y,*n,*incy)));
-  else if(*incx>0 && *incy<0) *res = (vector(x,*n,*incx).dot(vector(y,*n,-*incy).reverse()));
-  else if(*incx<0 && *incy<0) *res = (vector(x,*n,-*incx).reverse().dot(vector(y,*n,-*incy).reverse()));
+  if(*incx==1 && *incy==1)    *res = (make_vector(x,*n).dot(make_vector(y,*n)));
+  else if(*incx>0 && *incy>0) *res = (make_vector(x,*n,*incx).dot(make_vector(y,*n,*incy)));
+  else if(*incx<0 && *incy>0) *res = (make_vector(x,*n,-*incx).reverse().dot(make_vector(y,*n,*incy)));
+  else if(*incx>0 && *incy<0) *res = (make_vector(x,*n,*incx).dot(make_vector(y,*n,-*incy).reverse()));
+  else if(*incx<0 && *incy<0) *res = (make_vector(x,*n,-*incx).reverse().dot(make_vector(y,*n,-*incy).reverse()));
   return 0;
 }

@@ -72,11 +73,11 @@ int EIGEN_BLAS_FUNC(dotuw)(int *n, RealScalar *px, int *incx, RealScalar *py, in
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);

-  if(*incx==1 && *incy==1)    *res = (vector(x,*n).cwiseProduct(vector(y,*n))).sum();
-  else if(*incx>0 && *incy>0) *res = (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum();
-  else if(*incx<0 && *incy>0) *res = (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,*incy))).sum();
-  else if(*incx>0 && *incy<0) *res = (vector(x,*n,*incx).cwiseProduct(vector(y,*n,-*incy).reverse())).sum();
-  else if(*incx<0 && *incy<0) *res = (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,-*incy).reverse())).sum();
+  if(*incx==1 && *incy==1)    *res = (make_vector(x,*n).cwiseProduct(make_vector(y,*n))).sum();
+  else if(*incx>0 && *incy>0) *res = (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,*incy))).sum();
+  else if(*incx<0 && *incy>0) *res = (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,*incy))).sum();
+  else if(*incx>0 && *incy<0) *res = (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
+  else if(*incx<0 && *incy<0) *res = (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
   return 0;
 }

@@ -88,9 +89,9 @@ RealScalar EIGEN_CAT(EIGEN_CAT(REAL_SCALAR_SUFFIX,SCALAR_SUFFIX),nrm2_)(int *n,
   Scalar* x = reinterpret_cast<Scalar*>(px);

   if(*incx==1)
-    return vector(x,*n).stableNorm();
+    return make_vector(x,*n).stableNorm();

-  return vector(x,*n,*incx).stableNorm();
+  return make_vector(x,*n,*incx).stableNorm();
 }

 int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),rot_)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps)
@@ -102,8 +103,8 @@ int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),rot_)(int *n, RealScal
   RealScalar c = *pc;
   RealScalar s = *ps;

-  StridedVectorType vx(vector(x,*n,std::abs(*incx)));
-  StridedVectorType vy(vector(y,*n,std::abs(*incy)));
+  StridedVectorType vx(make_vector(x,*n,std::abs(*incx)));
+  StridedVectorType vy(make_vector(y,*n,std::abs(*incy)));

   Reverse<StridedVectorType> rvx(vx);
   Reverse<StridedVectorType> rvy(vy);
@@ -125,9 +126,8 @@ int EIGEN_CAT(EIGEN_CAT(SCALAR_SUFFIX,REAL_SCALAR_SUFFIX),scal_)(int *n, RealSca

   // std::cerr << "__scal " << *n << " " << alpha << " " << *incx << "\n";

-  if(*incx==1)  vector(x,*n) *= alpha;
-  else          vector(x,*n,std::abs(*incx)) *= alpha;
+  if(*incx==1)  make_vector(x,*n) *= alpha;
+  else          make_vector(x,*n,std::abs(*incx)) *= alpha;

   return 0;
 }
@@ -17,11 +17,11 @@ int EIGEN_BLAS_FUNC(axpy)(int *n, RealScalar *palpha, RealScalar *px, int *incx,

   if(*n<=0) return 0;

-  if(*incx==1 && *incy==1)    vector(y,*n) += alpha * vector(x,*n);
-  else if(*incx>0 && *incy>0) vector(y,*n,*incy) += alpha * vector(x,*n,*incx);
-  else if(*incx>0 && *incy<0) vector(y,*n,-*incy).reverse() += alpha * vector(x,*n,*incx);
-  else if(*incx<0 && *incy>0) vector(y,*n,*incy) += alpha * vector(x,*n,-*incx).reverse();
-  else if(*incx<0 && *incy<0) vector(y,*n,-*incy).reverse() += alpha * vector(x,*n,-*incx).reverse();
+  if(*incx==1 && *incy==1)    make_vector(y,*n) += alpha * make_vector(x,*n);
+  else if(*incx>0 && *incy>0) make_vector(y,*n,*incy) += alpha * make_vector(x,*n,*incx);
+  else if(*incx>0 && *incy<0) make_vector(y,*n,-*incy).reverse() += alpha * make_vector(x,*n,*incx);
+  else if(*incx<0 && *incy>0) make_vector(y,*n,*incy) += alpha * make_vector(x,*n,-*incx).reverse();
+  else if(*incx<0 && *incy<0) make_vector(y,*n,-*incy).reverse() += alpha * make_vector(x,*n,-*incx).reverse();

   return 0;
 }
@@ -35,7 +35,7 @@ int EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int

   // be careful, *incx==0 is allowed !!
   if(*incx==1 && *incy==1)
-    vector(y,*n) = vector(x,*n);
+    make_vector(y,*n) = make_vector(x,*n);
   else
   {
     if(*incx<0) x = x - (*n-1)*(*incx);
@@ -57,8 +57,8 @@ int EIGEN_CAT(EIGEN_CAT(i,SCALAR_SUFFIX),amax_)(int *n, RealScalar *px, int *inc
   Scalar* x = reinterpret_cast<Scalar*>(px);

   DenseIndex ret;
-  if(*incx==1) vector(x,*n).cwiseAbs().maxCoeff(&ret);
-  else         vector(x,*n,std::abs(*incx)).cwiseAbs().maxCoeff(&ret);
+  if(*incx==1) make_vector(x,*n).cwiseAbs().maxCoeff(&ret);
+  else         make_vector(x,*n,std::abs(*incx)).cwiseAbs().maxCoeff(&ret);
   return ret+1;
 }

@@ -68,8 +68,8 @@ int EIGEN_CAT(EIGEN_CAT(i,SCALAR_SUFFIX),amin_)(int *n, RealScalar *px, int *inc
   Scalar* x = reinterpret_cast<Scalar*>(px);

   DenseIndex ret;
-  if(*incx==1) vector(x,*n).cwiseAbs().minCoeff(&ret);
-  else         vector(x,*n,std::abs(*incx)).cwiseAbs().minCoeff(&ret);
+  if(*incx==1) make_vector(x,*n).cwiseAbs().minCoeff(&ret);
+  else         make_vector(x,*n,std::abs(*incx)).cwiseAbs().minCoeff(&ret);
   return ret+1;
 }

@@ -143,8 +143,8 @@ int EIGEN_BLAS_FUNC(scal)(int *n, RealScalar *palpha, RealScalar *px, int *incx)
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar alpha = *reinterpret_cast<Scalar*>(palpha);

-  if(*incx==1) vector(x,*n) *= alpha;
-  else         vector(x,*n,std::abs(*incx)) *= alpha;
+  if(*incx==1) make_vector(x,*n) *= alpha;
+  else         make_vector(x,*n,std::abs(*incx)) *= alpha;

   return 0;
 }
@@ -156,12 +156,11 @@ int EIGEN_BLAS_FUNC(swap)(int *n, RealScalar *px, int *incx, RealScalar *py, int
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);

-  if(*incx==1 && *incy==1)    vector(y,*n).swap(vector(x,*n));
-  else if(*incx>0 && *incy>0) vector(y,*n,*incy).swap(vector(x,*n,*incx));
-  else if(*incx>0 && *incy<0) vector(y,*n,-*incy).reverse().swap(vector(x,*n,*incx));
-  else if(*incx<0 && *incy>0) vector(y,*n,*incy).swap(vector(x,*n,-*incx).reverse());
-  else if(*incx<0 && *incy<0) vector(y,*n,-*incy).reverse().swap(vector(x,*n,-*incx).reverse());
+  if(*incx==1 && *incy==1)    make_vector(y,*n).swap(make_vector(x,*n));
+  else if(*incx>0 && *incy>0) make_vector(y,*n,*incy).swap(make_vector(x,*n,*incx));
+  else if(*incx>0 && *incy<0) make_vector(y,*n,-*incy).reverse().swap(make_vector(x,*n,*incx));
+  else if(*incx<0 && *incy>0) make_vector(y,*n,*incy).swap(make_vector(x,*n,-*incx).reverse());
+  else if(*incx<0 && *incy<0) make_vector(y,*n,-*incy).reverse().swap(make_vector(x,*n,-*incx).reverse());

   return 1;
 }
@@ -19,8 +19,8 @@ RealScalar EIGEN_BLAS_FUNC(asum)(int *n, RealScalar *px, int *incx)

   if(*n<=0) return 0;

-  if(*incx==1) return vector(x,*n).cwiseAbs().sum();
-  else         return vector(x,*n,std::abs(*incx)).cwiseAbs().sum();
+  if(*incx==1) return make_vector(x,*n).cwiseAbs().sum();
+  else         return make_vector(x,*n,std::abs(*incx)).cwiseAbs().sum();
 }

 // computes a vector-vector dot product.
@@ -33,11 +33,11 @@ Scalar EIGEN_BLAS_FUNC(dot)(int *n, RealScalar *px, int *incx, RealScalar *py, i
   Scalar* x = reinterpret_cast<Scalar*>(px);
   Scalar* y = reinterpret_cast<Scalar*>(py);

-  if(*incx==1 && *incy==1)    return (vector(x,*n).cwiseProduct(vector(y,*n))).sum();
-  else if(*incx>0 && *incy>0) return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,*incy))).sum();
-  else if(*incx<0 && *incy>0) return (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,*incy))).sum();
-  else if(*incx>0 && *incy<0) return (vector(x,*n,*incx).cwiseProduct(vector(y,*n,-*incy).reverse())).sum();
-  else if(*incx<0 && *incy<0) return (vector(x,*n,-*incx).reverse().cwiseProduct(vector(y,*n,-*incy).reverse())).sum();
+  if(*incx==1 && *incy==1)    return (make_vector(x,*n).cwiseProduct(make_vector(y,*n))).sum();
+  else if(*incx>0 && *incy>0) return (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,*incy))).sum();
+  else if(*incx<0 && *incy>0) return (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,*incy))).sum();
+  else if(*incx>0 && *incy<0) return (make_vector(x,*n,*incx).cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
+  else if(*incx<0 && *incy<0) return (make_vector(x,*n,-*incx).reverse().cwiseProduct(make_vector(y,*n,-*incy).reverse())).sum();
   else return 0;
 }

@@ -50,8 +50,8 @@ Scalar EIGEN_BLAS_FUNC(nrm2)(int *n, RealScalar *px, int *incx)

   Scalar* x = reinterpret_cast<Scalar*>(px);

-  if(*incx==1) return vector(x,*n).stableNorm();
-  else         return vector(x,*n,std::abs(*incx)).stableNorm();
+  if(*incx==1) return make_vector(x,*n).stableNorm();
+  else         return make_vector(x,*n,std::abs(*incx)).stableNorm();
 }

 int EIGEN_BLAS_FUNC(rot)(int *n, RealScalar *px, int *incx, RealScalar *py, int *incy, RealScalar *pc, RealScalar *ps)
@@ -64,8 +64,8 @@ int EIGEN_BLAS_FUNC(rot)(int *n, RealScalar *px, int *incx, RealScalar *py, int
   Scalar c = *reinterpret_cast<Scalar*>(pc);
   Scalar s = *reinterpret_cast<Scalar*>(ps);

-  StridedVectorType vx(vector(x,*n,std::abs(*incx)));
-  StridedVectorType vy(vector(y,*n,std::abs(*incy)));
+  StridedVectorType vx(make_vector(x,*n,std::abs(*incx)));
+  StridedVectorType vy(make_vector(y,*n,std::abs(*incy)));

   Reverse<StridedVectorType> rvx(vx);
   Reverse<StridedVectorType> rvy(vy);
@@ -57,8 +57,8 @@ int EIGEN_BLAS_FUNC(hemv)(char *uplo, int *n, RealScalar *palpha, RealScalar *pa

   if(beta!=Scalar(1))
   {
-    if(beta==Scalar(0)) vector(actual_y, *n).setZero();
-    else                vector(actual_y, *n) *= beta;
+    if(beta==Scalar(0)) make_vector(actual_y, *n).setZero();
+    else                make_vector(actual_y, *n) *= beta;
   }

   if(alpha!=Scalar(0))
@@ -58,8 +58,8 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca

   if(beta!=Scalar(1))
   {
-    if(beta==Scalar(0)) vector(actual_c, actual_m).setZero();
-    else                vector(actual_c, actual_m) *= beta;
+    if(beta==Scalar(0)) make_vector(actual_c, actual_m).setZero();
+    else                make_vector(actual_c, actual_m) *= beta;
   }

   if(code>=4 || func[code]==0)
@ -232,8 +232,8 @@ int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealSca
|
|||||||
|
|
||||||
if(beta!=Scalar(1))
|
if(beta!=Scalar(1))
|
||||||
{
|
{
|
||||||
if(beta==Scalar(0)) vector(actual_y, actual_m).setZero();
|
if(beta==Scalar(0)) make_vector(actual_y, actual_m).setZero();
|
||||||
else vector(actual_y, actual_m) *= beta;
|
else make_vector(actual_y, actual_m) *= beta;
|
||||||
}
|
}
|
||||||
|
|
||||||
MatrixType mat_coeffs(a,coeff_rows,*n,*lda);
|
MatrixType mat_coeffs(a,coeff_rows,*n,*lda);
|
||||||
@ -246,11 +246,11 @@ int EIGEN_BLAS_FUNC(gbmv)(char *trans, int *m, int *n, int *kl, int *ku, RealSca
|
|||||||
int len = end - start + 1;
|
int len = end - start + 1;
|
||||||
int offset = (*ku) - j + start;
|
int offset = (*ku) - j + start;
|
||||||
if(OP(*trans)==NOTR)
|
if(OP(*trans)==NOTR)
|
||||||
vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
|
make_vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
|
||||||
else if(OP(*trans)==TR)
|
else if(OP(*trans)==TR)
|
||||||
actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * vector(actual_x+start,len) ).value();
|
actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * make_vector(actual_x+start,len) ).value();
|
||||||
else
|
else
|
||||||
actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint() * vector(actual_x+start,len) ).value();
|
actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint() * make_vector(actual_x+start,len) ).value();
|
||||||
}
|
}
|
||||||
|
|
||||||
if(actual_x!=x) delete[] actual_x;
|
if(actual_x!=x) delete[] actual_x;
|
||||||
@@ -304,11 +304,11 @@ int EIGEN_BLAS_FUNC(tbmv)(char *uplo, char *opa, char *diag, int *n, int *k, Rea
     int offset = (ku) - j + start;
 
     if(OP(*trans)==NOTR)
-      vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
+      make_vector(actual_y+start,len) += (alpha*actual_x[j]) * mat_coeffs.col(j).segment(offset,len);
     else if(OP(*trans)==TR)
-      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * vector(actual_x+start,len) ).value();
+      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).transpose() * make_vector(actual_x+start,len) ).value();
     else
-      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint() * vector(actual_x+start,len) ).value();
+      actual_y[j] += alpha * ( mat_coeffs.col(j).segment(offset,len).adjoint() * make_vector(actual_x+start,len) ).value();
   }
 
   if(actual_x!=x) delete[] actual_x;
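The three branches map the BLAS trans argument onto Eigen expressions: 'N' uses the stored column segment directly, 'T' a plain transpose, and the remaining case ('C', conjugate transpose) an adjoint, which differs from transpose only for complex Scalar. A tiny illustration of that difference:

    #include <Eigen/Core>
    #include <complex>
    #include <iostream>

    int main()
    {
      Eigen::Vector2cd a(std::complex<double>(1,2), std::complex<double>(3,4));
      Eigen::Vector2cd b(1.0, 1.0);
      // transpose: no conjugation -> (1+2i) + (3+4i) = 4+6i
      std::cout << (a.transpose() * b).value() << "\n";
      // adjoint: conjugates a     -> (1-2i) + (3-4i) = 4-6i
      std::cout << (a.adjoint() * b).value() << "\n";
    }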
@@ -521,4 +521,3 @@ int EIGEN_BLAS_FUNC(tpsv)(char *uplo, char *opa, char *diag, int *n, RealScalar
 
   return 1;
 }
-
@@ -51,8 +51,8 @@ int EIGEN_BLAS_FUNC(symv) (char *uplo, int *n, RealScalar *palpha, RealScalar *p
 
   if(beta!=Scalar(1))
   {
-    if(beta==Scalar(0)) vector(actual_y, *n).setZero();
-    else vector(actual_y, *n) *= beta;
+    if(beta==Scalar(0)) make_vector(actual_y, *n).setZero();
+    else make_vector(actual_y, *n) *= beta;
   }
 
   int code = UPLO(*uplo);
@@ -366,5 +366,3 @@ int EIGEN_BLAS_FUNC(ger)(int *m, int *n, Scalar *palpha, Scalar *px, int *incx,
 
   return 1;
 }
-
-
@@ -1,7 +1,7 @@
 
 #include <iostream>
 
-#if (defined __GNUC__)
+#if (defined __GNUC__) && (!defined __MINGW32__)
 #define EIGEN_WEAK_LINKING __attribute__ ((weak))
 #else
 #define EIGEN_WEAK_LINKING
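`__attribute__((weak))` marks a definition that the linker discards whenever a strong definition of the same symbol exists, so an application can supply its own replacement for the library default. MinGW is excluded here presumably because weak symbols are unreliable in its PE/COFF toolchain; that rationale is an inference, not stated in the diff. A sketch of the override mechanism, using a made-up function name:

    #include <iostream>

    // Library default: weak, so any strong definition elsewhere wins at
    // link time. 'report_error' is hypothetical, for illustration only.
    EIGEN_WEAK_LINKING int report_error(int code)
    {
      std::cerr << "default handler, error " << code << std::endl;
      return 0;
    }
    // An application that defines its own non-weak report_error(int) in
    // another object file silently replaces this default.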
@@ -3,26 +3,26 @@
 # Eigen3Config.cmake(.in)
 
 # Use the following variables to compile and link against Eigen:
-# EIGEN_FOUND - True if Eigen was found on your system
-# EIGEN_USE_FILE - The file making Eigen usable
-# EIGEN_DEFINITIONS - Definitions needed to build with Eigen
-# EIGEN_INCLUDE_DIR - Directory where signature_of_eigen3_matrix_library can be found
-# EIGEN_INCLUDE_DIRS - List of directories of Eigen and it's dependencies
-# EIGEN_ROOT_DIR - The base directory of Eigen
-# EIGEN_VERSION_STRING - A human-readable string containing the version
-# EIGEN_VERSION_MAJOR - The major version of Eigen
-# EIGEN_VERSION_MINOR - The minor version of Eigen
-# EIGEN_VERSION_PATCH - The patch version of Eigen
+# EIGEN3_FOUND - True if Eigen was found on your system
+# EIGEN3_USE_FILE - The file making Eigen usable
+# EIGEN3_DEFINITIONS - Definitions needed to build with Eigen
+# EIGEN3_INCLUDE_DIR - Directory where signature_of_eigen3_matrix_library can be found
+# EIGEN3_INCLUDE_DIRS - List of directories of Eigen and it's dependencies
+# EIGEN3_ROOT_DIR - The base directory of Eigen
+# EIGEN3_VERSION_STRING - A human-readable string containing the version
+# EIGEN3_VERSION_MAJOR - The major version of Eigen
+# EIGEN3_VERSION_MINOR - The minor version of Eigen
+# EIGEN3_VERSION_PATCH - The patch version of Eigen
 
-set ( EIGEN_FOUND 1 )
-set ( EIGEN_USE_FILE "@EIGEN_USE_FILE@" )
+set ( EIGEN3_FOUND 1 )
+set ( EIGEN3_USE_FILE "@EIGEN_USE_FILE@" )
 
-set ( EIGEN_DEFINITIONS "@EIGEN_DEFINITIONS@" )
-set ( EIGEN_INCLUDE_DIR "@EIGEN_INCLUDE_DIR@" )
-set ( EIGEN_INCLUDE_DIRS "@EIGEN_INCLUDE_DIRS@" )
-set ( EIGEN_ROOT_DIR "@EIGEN_ROOT_DIR@" )
+set ( EIGEN3_DEFINITIONS "@EIGEN_DEFINITIONS@" )
+set ( EIGEN3_INCLUDE_DIR "@EIGEN_INCLUDE_DIR@" )
+set ( EIGEN3_INCLUDE_DIRS "@EIGEN_INCLUDE_DIRS@" )
+set ( EIGEN3_ROOT_DIR "@EIGEN_ROOT_DIR@" )
 
-set ( EIGEN_VERSION_STRING "@EIGEN_VERSION_STRING@" )
-set ( EIGEN_VERSION_MAJOR "@EIGEN_VERSION_MAJOR@" )
-set ( EIGEN_VERSION_MINOR "@EIGEN_VERSION_MINOR@" )
-set ( EIGEN_VERSION_PATCH "@EIGEN_VERSION_PATCH@" )
+set ( EIGEN3_VERSION_STRING "@EIGEN_VERSION_STRING@" )
+set ( EIGEN3_VERSION_MAJOR "@EIGEN_VERSION_MAJOR@" )
+set ( EIGEN3_VERSION_MINOR "@EIGEN_VERSION_MINOR@" )
+set ( EIGEN3_VERSION_PATCH "@EIGEN_VERSION_PATCH@" )
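With these renames the generated config file exports the same EIGEN3_-prefixed variables as the long-standing FindEigen3.cmake module, so both discovery paths present a uniform interface. A sketch of typical downstream usage, in CMake since that is the language of the file being patched (project and target names are placeholders):

    find_package( Eigen3 REQUIRED )
    include_directories( ${EIGEN3_INCLUDE_DIR} )
    add_definitions( ${EIGEN3_DEFINITIONS} )
    add_executable( demo main.cpp )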
Some files were not shown because too many files have changed in this diff.