From 3930c9b0869ae2244fbacaba0e83217accfac6c6 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud
Date: Thu, 28 Feb 2013 19:31:03 +0100
Subject: [PATCH] Fix "routine is both "inline" and "noinline"" warnings

---
 Eigen/src/Core/GeneralProduct.h                |  62 ++-
 .../Core/products/GeneralBlockPanelKernel.h    | 322 +++++++-------
 Eigen/src/Core/products/GeneralMatrixMatrix.h  |   3 +-
 Eigen/src/Core/products/GeneralMatrixVector.h  |  23 +-
 .../Core/products/GeneralMatrixVector_MKL.h    |   6 +-
 .../Core/products/SelfadjointMatrixMatrix.h    |  24 +-
 .../products/SelfadjointMatrixMatrix_MKL.h     |  10 +-
 .../Core/products/SelfadjointMatrixVector.h    |  10 +-
 .../products/SelfadjointMatrixVector_MKL.h     |   4 +-
 .../Core/products/TriangularMatrixMatrix.h     |  32 +-
 .../products/TriangularMatrixMatrix_MKL.h      |   4 +-
 .../Core/products/TriangularMatrixVector.h     |  20 +-
 .../products/TriangularMatrixVector_MKL.h      |  12 +-
 .../Core/products/TriangularSolverMatrix.h     |  18 +-
 .../products/TriangularSolverMatrix_MKL.h      |   4 +-
 Eigen/src/SparseCore/SparseMatrix.h            | 396 +++++++++---------
 Eigen/src/SparseCore/SparseVector.h            |  52 +--
 17 files changed, 560 insertions(+), 442 deletions(-)

diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index 086eac32d..4e6448353 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -222,7 +222,29 @@ class GeneralProduct
 ***********************************************************************/

 namespace internal {
-template<int StorageOrder> struct outer_product_selector;
+
+// Column major
+template<typename ProductType, typename Dest, typename Func>
+EIGEN_DONT_INLINE void outer_product_selector_run(const ProductType& prod, Dest& dest, const Func& func, const false_type&)
+{
+  typedef typename Dest::Index Index;
+  // FIXME make sure lhs is sequentially stored
+  // FIXME not very good if rhs is real and lhs complex while alpha is real too
+  const Index cols = dest.cols();
+  for (Index j=0; j<cols; ++j)
+    func(dest.col(j), prod.rhs().coeff(j) * prod.lhs());
+}
+
+// Row major
+template<typename ProductType, typename Dest, typename Func>
+EIGEN_DONT_INLINE void outer_product_selector_run(const ProductType& prod, Dest& dest, const Func& func, const true_type&) {
+  typedef typename Dest::Index Index;
+  // FIXME make sure rhs is sequentially stored
+  // FIXME not very good if lhs is real and rhs complex while alpha is real too
+  const Index rows = dest.rows();
+  for (Index i=0; i<rows; ++i)
+    func(dest.row(i), prod.lhs().coeff(i) * prod.rhs());
+}

 template<typename Lhs, typename Rhs>
 struct traits<GeneralProduct<Lhs,Rhs,OuterProduct> >
@@ -235,6 +257,8 @@ template<typename Lhs, typename Rhs>
 class GeneralProduct<Lhs,Rhs,OuterProduct>
   : public ProductBase<GeneralProduct<Lhs,Rhs,OuterProduct>, Lhs, Rhs>
 {
+    template<typename T> struct IsRowMajor : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
+
   public:
     EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)

@@ -257,53 +281,25 @@ class GeneralProduct

     template<typename Dest>
     inline void evalTo(Dest& dest) const {
-      internal::outer_product_selector<(int(Dest::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(*this, dest, set());
+      internal::outer_product_selector_run(*this, dest, set(), IsRowMajor<Dest>());
     }

     template<typename Dest>
     inline void addTo(Dest& dest) const {
-      internal::outer_product_selector<(int(Dest::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(*this, dest, add());
+      internal::outer_product_selector_run(*this, dest, add(), IsRowMajor<Dest>());
     }

     template<typename Dest>
     inline void subTo(Dest& dest) const {
-      internal::outer_product_selector<(int(Dest::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(*this, dest, sub());
+      internal::outer_product_selector_run(*this, dest, sub(), IsRowMajor<Dest>());
     }

     template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
     {
-      internal::outer_product_selector<(int(Dest::Flags)&RowMajorBit) ?
RowMajor : ColMajor>::run(*this, dest, adds(alpha)); + internal::outer_product_selector_run(*this, dest, adds(alpha), IsRowMajor()); } }; -namespace internal { - -template<> struct outer_product_selector { - template - static EIGEN_DONT_INLINE void run(const ProductType& prod, Dest& dest, const Func& func) { - typedef typename Dest::Index Index; - // FIXME make sure lhs is sequentially stored - // FIXME not very good if rhs is real and lhs complex while alpha is real too - const Index cols = dest.cols(); - for (Index j=0; j struct outer_product_selector { - template - static EIGEN_DONT_INLINE void run(const ProductType& prod, Dest& dest, const Func& func) { - typedef typename Dest::Index Index; - // FIXME make sure rhs is sequentially stored - // FIXME not very good if lhs is real and rhs complex while alpha is real too - const Index rows = dest.rows(); - for (Index i=0; i +EIGEN_DONT_INLINE +void gebp_kernel + ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB) { Traits traits; @@ -1089,7 +1096,7 @@ EIGEN_ASM_COMMENT("mybegin4"); } } } -}; + #undef CJMADD @@ -1110,81 +1117,84 @@ EIGEN_ASM_COMMENT("mybegin4"); template struct gemm_pack_lhs { - EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, - Index stride=0, Index offset=0) - { - typedef typename packet_traits::type Packet; - enum { PacketSize = packet_traits::size }; - - EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); - eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); - eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) ); - conj_if::IsComplex && Conjugate> cj; - const_blas_data_mapper lhs(_lhs,lhsStride); - Index count = 0; - Index peeled_mc = (rows/Pack1)*Pack1; - for(Index i=0; i=1*PacketSize) A = ploadu(&lhs(i+0*PacketSize, k)); - if(Pack1>=2*PacketSize) B = ploadu(&lhs(i+1*PacketSize, k)); - if(Pack1>=3*PacketSize) C = ploadu(&lhs(i+2*PacketSize, k)); - if(Pack1>=4*PacketSize) D = ploadu(&lhs(i+3*PacketSize, k)); - if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } - if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } - if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } - if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; } - } - } - else - { - for(Index k=0; k=Pack2) - { - if(PanelMode) count += Pack2*offset; - for(Index k=0; k +EIGEN_DONT_INLINE void gemm_pack_lhs + ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) +{ + typedef typename packet_traits::type Packet; + enum { PacketSize = packet_traits::size }; + + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) ); + conj_if::IsComplex && Conjugate> cj; + const_blas_data_mapper lhs(_lhs,lhsStride); + Index count = 0; + Index peeled_mc = (rows/Pack1)*Pack1; + for(Index i=0; i=1*PacketSize) A = ploadu(&lhs(i+0*PacketSize, k)); + if(Pack1>=2*PacketSize) B = ploadu(&lhs(i+1*PacketSize, k)); + if(Pack1>=3*PacketSize) C = ploadu(&lhs(i+2*PacketSize, 
k)); + if(Pack1>=4*PacketSize) D = ploadu(&lhs(i+3*PacketSize, k)); + if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } + if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } + if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } + if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; } + } + } + else + { + for(Index k=0; k=Pack2) + { + if(PanelMode) count += Pack2*offset; + for(Index k=0; k { typedef typename packet_traits::type Packet; enum { PacketSize = packet_traits::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, - Index stride=0, Index offset=0) - { - EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR"); - eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); - conj_if::IsComplex && Conjugate> cj; - Index packet_cols = (cols/nr) * nr; - Index count = 0; - for(Index j2=0; j2 +EIGEN_DONT_INLINE void gemm_pack_rhs + ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) +{ + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR"); + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + conj_if::IsComplex && Conjugate> cj; + Index packet_cols = (cols/nr) * nr; + Index count = 0; + for(Index j2=0; j2 struct gemm_pack_rhs { enum { PacketSize = packet_traits::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, - Index stride=0, Index offset=0) - { - EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); - eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); - conj_if::IsComplex && Conjugate> cj; - Index packet_cols = (cols/nr) * nr; - Index count = 0; - for(Index j2=0; j2 +EIGEN_DONT_INLINE void gemm_pack_rhs + ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) +{ + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + conj_if::IsComplex && Conjugate> cj; + Index packet_cols = (cols/nr) * nr; + Index count = 0; + for(Index j2=0; j2 struct general_matrix_matrix_product { + typedef typename scalar_product_traits::ReturnType ResScalar; static void run(Index rows, Index cols, Index depth, const LhsScalar* _lhs, Index lhsStride, @@ -169,7 +170,6 @@ static void run(Index rows, Index cols, Index depth, // vertical panel which is, in practice, a very low number. pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); - // For each mc x kc block of the lhs's vertical panel... 
// (==GEPP_VAR1) for(Index i2=0; i2::type RhsPacket; typedef typename conditional::type ResPacket; EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsScalar* lhs, Index lhsStride, + const RhsScalar* rhs, Index rhsIncr, + ResScalar* res, Index + #ifdef EIGEN_INTERNAL_DEBUGGING + resIncr + #endif + , RhsScalar alpha); +}; + +template +EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, const LhsScalar* lhs, Index lhsStride, const RhsScalar* rhs, Index rhsIncr, @@ -274,7 +286,6 @@ EIGEN_DONT_INLINE static void run( } while(Vectorizable); #undef _EIGEN_ACCUMULATE_PACKETS } -}; /* Optimized row-major matrix * vector product: * This algorithm processes 4 rows at onces that allows to both reduce @@ -308,6 +319,15 @@ typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsScalar* lhs, Index lhsStride, + const RhsScalar* rhs, Index rhsIncr, + ResScalar* res, Index resIncr, + ResScalar alpha); +}; + +template +EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, const LhsScalar* lhs, Index lhsStride, const RhsScalar* rhs, Index rhsIncr, @@ -545,7 +565,6 @@ EIGEN_DONT_INLINE static void run( #undef _EIGEN_ACCUMULATE_PACKETS } -}; } // end namespace internal diff --git a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/Eigen/src/Core/products/GeneralMatrixVector_MKL.h index e9de6af3e..1cb9fe6b5 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h +++ b/Eigen/src/Core/products/GeneralMatrixVector_MKL.h @@ -53,7 +53,7 @@ struct general_matrix_vector_product_gemv : #define EIGEN_MKL_GEMV_SPECIALIZE(Scalar) \ template \ struct general_matrix_vector_product { \ -static EIGEN_DONT_INLINE void run( \ +static void run( \ Index rows, Index cols, \ const Scalar* lhs, Index lhsStride, \ const Scalar* rhs, Index rhsIncr, \ @@ -70,7 +70,7 @@ static EIGEN_DONT_INLINE void run( \ }; \ template \ struct general_matrix_vector_product { \ -static EIGEN_DONT_INLINE void run( \ +static void run( \ Index rows, Index cols, \ const Scalar* lhs, Index lhsStride, \ const Scalar* rhs, Index rhsIncr, \ @@ -92,7 +92,7 @@ struct general_matrix_vector_product_gemv GEMVVector;\ \ -static EIGEN_DONT_INLINE void run( \ +static void run( \ Index rows, Index cols, \ const EIGTYPE* lhs, Index lhsStride, \ const EIGTYPE* rhs, Index rhsIncr, \ diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index e4b21dedd..ee619df99 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -230,6 +230,17 @@ struct product_selfadjoint_matrix +EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, @@ -301,7 +312,6 @@ struct product_selfadjoint_matrix +EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, @@ -353,7 +374,6 @@ struct product_selfadjoint_matrix \ {\ \ - static EIGEN_DONT_INLINE void run( \ + static void run( \ Index rows, Index cols, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ @@ -98,7 +98,7 @@ template \ struct product_selfadjoint_matrix \ {\ - static EIGEN_DONT_INLINE void run( \ + static void run( \ Index rows, Index cols, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* 
_rhs, Index rhsStride, \ @@ -174,7 +174,7 @@ template \ {\ \ - static EIGEN_DONT_INLINE void run( \ + static void run( \ Index rows, Index cols, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ @@ -224,7 +224,7 @@ template \ struct product_selfadjoint_matrix \ {\ - static EIGEN_DONT_INLINE void run( \ + static void run( \ Index rows, Index cols, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index 6d736f01c..7c9e3fc98 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -28,6 +28,15 @@ struct selfadjoint_matrix_vector_product { static EIGEN_DONT_INLINE void run( + Index size, + const Scalar* lhs, Index lhsStride, + const Scalar* _rhs, Index rhsIncr, + Scalar* res, + Scalar alpha); +}; + +template +EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product::run( Index size, const Scalar* lhs, Index lhsStride, const Scalar* _rhs, Index rhsIncr, @@ -153,7 +162,6 @@ static EIGEN_DONT_INLINE void run( res[j] += alpha * t2; } } -}; } // end namespace internal diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h index f88d483b6..86684b66d 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h @@ -50,7 +50,7 @@ struct selfadjoint_matrix_vector_product_symv : #define EIGEN_MKL_SYMV_SPECIALIZE(Scalar) \ template \ struct selfadjoint_matrix_vector_product { \ -static EIGEN_DONT_INLINE void run( \ +static void run( \ Index size, const Scalar* lhs, Index lhsStride, \ const Scalar* _rhs, Index rhsIncr, Scalar* res, Scalar alpha) { \ enum {\ @@ -77,7 +77,7 @@ struct selfadjoint_matrix_vector_product_symv SYMVVector;\ \ -static EIGEN_DONT_INLINE void run( \ +static void run( \ Index size, const EIGTYPE* lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* res, EIGTYPE alpha) \ { \ diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index c4bc2aa6f..8110507b5 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -96,6 +96,19 @@ struct product_triangular_matrix_matrix& blocking); +}; + +template +EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( + Index _rows, Index _cols, Index _depth, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + Scalar* res, Index resStride, const Scalar& alpha, level3_blocking& blocking) { // strip zeros @@ -203,15 +216,14 @@ struct product_triangular_matrix_matrix struct product_triangular_matrix_matrix + LhsStorageOrder,ConjugateLhs, + RhsStorageOrder,ConjugateRhs,ColMajor,Version> { typedef gebp_traits Traits; enum { @@ -225,6 +237,19 @@ struct product_triangular_matrix_matrix& blocking); +}; + +template +EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( + Index _rows, Index _cols, Index _depth, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + Scalar* res, Index resStride, const Scalar& alpha, level3_blocking& blocking) { // strip zeros @@ -343,7 +368,6 @@ struct product_triangular_matrix_matrix +EIGEN_DONT_INLINE void triangular_matrix_vector_product + ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsIncr, 
ResScalar* _res, Index resIncr, const ResScalar& alpha) { static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; Index size = (std::min)(_rows,_cols); @@ -78,7 +84,6 @@ struct triangular_matrix_vector_product struct triangular_matrix_vector_product @@ -89,8 +94,14 @@ struct triangular_matrix_vector_product +EIGEN_DONT_INLINE void triangular_matrix_vector_product + ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha) { static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; Index diagSize = (std::min)(_rows,_cols); @@ -141,7 +152,6 @@ struct triangular_matrix_vector_product \ struct triangular_matrix_vector_product { \ - static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ + static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \ triangular_matrix_vector_product_trmv::run( \ _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ @@ -58,7 +58,7 @@ struct triangular_matrix_vector_product \ struct triangular_matrix_vector_product { \ - static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ + static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \ triangular_matrix_vector_product_trmv::run( \ _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ @@ -81,8 +81,8 @@ struct triangular_matrix_vector_product_trmv::run( \ @@ -166,8 +166,8 @@ struct triangular_matrix_vector_product_trmv::run( \ diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index a49ea3183..f103eae72 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -18,7 +18,7 @@ namespace internal { template struct triangular_solve_matrix { - static EIGEN_DONT_INLINE void run( + static void run( Index size, Index cols, const Scalar* tri, Index triStride, Scalar* _other, Index otherStride, @@ -39,6 +39,13 @@ template { static EIGEN_DONT_INLINE void run( + Index size, Index otherSize, + const Scalar* _tri, Index triStride, + Scalar* _other, Index otherStride, + level3_blocking& blocking); +}; +template +EIGEN_DONT_INLINE void triangular_solve_matrix::run( Index size, Index otherSize, const Scalar* _tri, Index triStride, Scalar* _other, Index otherStride, @@ -173,7 +180,6 @@ struct triangular_solve_matrix { static EIGEN_DONT_INLINE void run( + Index size, Index otherSize, + const Scalar* _tri, Index triStride, + Scalar* _other, Index otherStride, + level3_blocking& blocking); +}; +template +EIGEN_DONT_INLINE void triangular_solve_matrix::run( Index size, Index otherSize, const Scalar* _tri, Index triStride, Scalar* _other, Index otherStride, @@ -308,7 +321,6 @@ struct triangular_solve_matrix& /*blocking*/) \ @@ -103,7 +103,7 @@ struct triangular_solve_matrix& /*blocking*/) \ diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 5ff01da28..be134d3d3 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -213,7 +213,7 @@ class SparseMatrix * inserted in increasing inner index order, and in O(nnz_j) for a random insertion. 
* */ - EIGEN_DONT_INLINE Scalar& insert(Index row, Index col) + Scalar& insert(Index row, Index col) { if(isCompressed()) { @@ -434,7 +434,7 @@ class SparseMatrix /** \internal * same as insert(Index,Index) except that the indices are given relative to the storage order */ - EIGEN_DONT_INLINE Scalar& insertByOuterInner(Index j, Index i) + Scalar& insertByOuterInner(Index j, Index i) { return insert(IsRowMajor ? j : i, IsRowMajor ? i : j); } @@ -711,62 +711,7 @@ class SparseMatrix #endif template - EIGEN_DONT_INLINE SparseMatrix& operator=(const SparseMatrixBase& other) - { - const bool needToTranspose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit); - if (needToTranspose) - { - // two passes algorithm: - // 1 - compute the number of coeffs per dest inner vector - // 2 - do the actual copy/eval - // Since each coeff of the rhs has to be evaluated twice, let's evaluate it if needed - typedef typename internal::nested::type OtherCopy; - typedef typename internal::remove_all::type _OtherCopy; - OtherCopy otherCopy(other.derived()); - - SparseMatrix dest(other.rows(),other.cols()); - Eigen::Map > (dest.m_outerIndex,dest.outerSize()).setZero(); - - // pass 1 - // FIXME the above copy could be merged with that pass - for (Index j=0; jswap(dest); - return *this; - } - else - { - if(other.isRValue()) - initAssignment(other.derived()); - // there is no special optimization - return Base::operator=(other.derived()); - } - } + EIGEN_DONT_INLINE SparseMatrix& operator=(const SparseMatrixBase& other); friend std::ostream & operator << (std::ostream & s, const SparseMatrix& m) { @@ -836,111 +781,7 @@ protected: /** \internal * \sa insert(Index,Index) */ - EIGEN_DONT_INLINE Scalar& insertCompressed(Index row, Index col) - { - eigen_assert(isCompressed()); - - const Index outer = IsRowMajor ? row : col; - const Index inner = IsRowMajor ? col : row; - - Index previousOuter = outer; - if (m_outerIndex[outer+1]==0) - { - // we start a new inner vector - while (previousOuter>=0 && m_outerIndex[previousOuter]==0) - { - m_outerIndex[previousOuter] = static_cast(m_data.size()); - --previousOuter; - } - m_outerIndex[outer+1] = m_outerIndex[outer]; - } - - // here we have to handle the tricky case where the outerIndex array - // starts with: [ 0 0 0 0 0 1 ...] and we are inserted in, e.g., - // the 2nd inner vector... 
- bool isLastVec = (!(previousOuter==-1 && m_data.size()!=0)) - && (size_t(m_outerIndex[outer+1]) == m_data.size()); - - size_t startId = m_outerIndex[outer]; - // FIXME let's make sure sizeof(long int) == sizeof(size_t) - size_t p = m_outerIndex[outer+1]; - ++m_outerIndex[outer+1]; - - float reallocRatio = 1; - if (m_data.allocatedSize()<=m_data.size()) - { - // if there is no preallocated memory, let's reserve a minimum of 32 elements - if (m_data.size()==0) - { - m_data.reserve(32); - } - else - { - // we need to reallocate the data, to reduce multiple reallocations - // we use a smart resize algorithm based on the current filling ratio - // in addition, we use float to avoid integers overflows - float nnzEstimate = float(m_outerIndex[outer])*float(m_outerSize)/float(outer+1); - reallocRatio = (nnzEstimate-float(m_data.size()))/float(m_data.size()); - // furthermore we bound the realloc ratio to: - // 1) reduce multiple minor realloc when the matrix is almost filled - // 2) avoid to allocate too much memory when the matrix is almost empty - reallocRatio = (std::min)((std::max)(reallocRatio,1.5f),8.f); - } - } - m_data.resize(m_data.size()+1,reallocRatio); - - if (!isLastVec) - { - if (previousOuter==-1) - { - // oops wrong guess. - // let's correct the outer offsets - for (Index k=0; k<=(outer+1); ++k) - m_outerIndex[k] = 0; - Index k=outer+1; - while(m_outerIndex[k]==0) - m_outerIndex[k++] = 1; - while (k<=m_outerSize && m_outerIndex[k]!=0) - m_outerIndex[k++]++; - p = 0; - --k; - k = m_outerIndex[k]-1; - while (k>0) - { - m_data.index(k) = m_data.index(k-1); - m_data.value(k) = m_data.value(k-1); - k--; - } - } - else - { - // we are not inserting into the last inner vec - // update outer indices: - Index j = outer+2; - while (j<=m_outerSize && m_outerIndex[j]!=0) - m_outerIndex[j++]++; - --j; - // shift data of last vecs: - Index k = m_outerIndex[j]-1; - while (k>=Index(p)) - { - m_data.index(k) = m_data.index(k-1); - m_data.value(k) = m_data.value(k-1); - k--; - } - } - } - - while ( (p > startId) && (m_data.index(p-1) > inner) ) - { - m_data.index(p) = m_data.index(p-1); - m_data.value(p) = m_data.value(p-1); - --p; - } - - m_data.index(p) = inner; - return (m_data.value(p) = 0); - } + EIGEN_DONT_INLINE Scalar& insertCompressed(Index row, Index col); /** \internal * A vector object that is equal to 0 everywhere but v at the position i */ @@ -959,36 +800,7 @@ protected: /** \internal * \sa insert(Index,Index) */ - EIGEN_DONT_INLINE Scalar& insertUncompressed(Index row, Index col) - { - eigen_assert(!isCompressed()); - - const Index outer = IsRowMajor ? row : col; - const Index inner = IsRowMajor ? 
col : row; - - std::ptrdiff_t room = m_outerIndex[outer+1] - m_outerIndex[outer]; - std::ptrdiff_t innerNNZ = m_innerNonZeros[outer]; - if(innerNNZ>=room) - { - // this inner vector is full, we need to reallocate the whole buffer :( - reserve(SingletonVector(outer,std::max(2,innerNNZ))); - } - - Index startId = m_outerIndex[outer]; - Index p = startId + m_innerNonZeros[outer]; - while ( (p > startId) && (m_data.index(p-1) > inner) ) - { - m_data.index(p) = m_data.index(p-1); - m_data.value(p) = m_data.value(p-1); - --p; - } - eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exist, you must call coeffRef to this end"); - - m_innerNonZeros[outer]++; - - m_data.index(p) = inner; - return (m_data.value(p) = 0); - } + EIGEN_DONT_INLINE Scalar& insertUncompressed(Index row, Index col); public: /** \internal @@ -1205,6 +1017,204 @@ void SparseMatrix::sumupDuplicates() m_data.resize(m_outerIndex[m_outerSize]); } +template +template +EIGEN_DONT_INLINE SparseMatrix& SparseMatrix::operator=(const SparseMatrixBase& other) +{ + const bool needToTranspose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit); + if (needToTranspose) + { + // two passes algorithm: + // 1 - compute the number of coeffs per dest inner vector + // 2 - do the actual copy/eval + // Since each coeff of the rhs has to be evaluated twice, let's evaluate it if needed + typedef typename internal::nested::type OtherCopy; + typedef typename internal::remove_all::type _OtherCopy; + OtherCopy otherCopy(other.derived()); + + SparseMatrix dest(other.rows(),other.cols()); + Eigen::Map > (dest.m_outerIndex,dest.outerSize()).setZero(); + + // pass 1 + // FIXME the above copy could be merged with that pass + for (Index j=0; jswap(dest); + return *this; + } + else + { + if(other.isRValue()) + initAssignment(other.derived()); + // there is no special optimization + return Base::operator=(other.derived()); + } +} + +template +EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insertUncompressed(Index row, Index col) +{ + eigen_assert(!isCompressed()); + + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + std::ptrdiff_t room = m_outerIndex[outer+1] - m_outerIndex[outer]; + std::ptrdiff_t innerNNZ = m_innerNonZeros[outer]; + if(innerNNZ>=room) + { + // this inner vector is full, we need to reallocate the whole buffer :( + reserve(SingletonVector(outer,std::max(2,innerNNZ))); + } + + Index startId = m_outerIndex[outer]; + Index p = startId + m_innerNonZeros[outer]; + while ( (p > startId) && (m_data.index(p-1) > inner) ) + { + m_data.index(p) = m_data.index(p-1); + m_data.value(p) = m_data.value(p-1); + --p; + } + eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exist, you must call coeffRef to this end"); + + m_innerNonZeros[outer]++; + + m_data.index(p) = inner; + return (m_data.value(p) = 0); +} + +template +EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insertCompressed(Index row, Index col) +{ + eigen_assert(isCompressed()); + + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? 
col : row; + + Index previousOuter = outer; + if (m_outerIndex[outer+1]==0) + { + // we start a new inner vector + while (previousOuter>=0 && m_outerIndex[previousOuter]==0) + { + m_outerIndex[previousOuter] = static_cast(m_data.size()); + --previousOuter; + } + m_outerIndex[outer+1] = m_outerIndex[outer]; + } + + // here we have to handle the tricky case where the outerIndex array + // starts with: [ 0 0 0 0 0 1 ...] and we are inserted in, e.g., + // the 2nd inner vector... + bool isLastVec = (!(previousOuter==-1 && m_data.size()!=0)) + && (size_t(m_outerIndex[outer+1]) == m_data.size()); + + size_t startId = m_outerIndex[outer]; + // FIXME let's make sure sizeof(long int) == sizeof(size_t) + size_t p = m_outerIndex[outer+1]; + ++m_outerIndex[outer+1]; + + float reallocRatio = 1; + if (m_data.allocatedSize()<=m_data.size()) + { + // if there is no preallocated memory, let's reserve a minimum of 32 elements + if (m_data.size()==0) + { + m_data.reserve(32); + } + else + { + // we need to reallocate the data, to reduce multiple reallocations + // we use a smart resize algorithm based on the current filling ratio + // in addition, we use float to avoid integers overflows + float nnzEstimate = float(m_outerIndex[outer])*float(m_outerSize)/float(outer+1); + reallocRatio = (nnzEstimate-float(m_data.size()))/float(m_data.size()); + // furthermore we bound the realloc ratio to: + // 1) reduce multiple minor realloc when the matrix is almost filled + // 2) avoid to allocate too much memory when the matrix is almost empty + reallocRatio = (std::min)((std::max)(reallocRatio,1.5f),8.f); + } + } + m_data.resize(m_data.size()+1,reallocRatio); + + if (!isLastVec) + { + if (previousOuter==-1) + { + // oops wrong guess. + // let's correct the outer offsets + for (Index k=0; k<=(outer+1); ++k) + m_outerIndex[k] = 0; + Index k=outer+1; + while(m_outerIndex[k]==0) + m_outerIndex[k++] = 1; + while (k<=m_outerSize && m_outerIndex[k]!=0) + m_outerIndex[k++]++; + p = 0; + --k; + k = m_outerIndex[k]-1; + while (k>0) + { + m_data.index(k) = m_data.index(k-1); + m_data.value(k) = m_data.value(k-1); + k--; + } + } + else + { + // we are not inserting into the last inner vec + // update outer indices: + Index j = outer+2; + while (j<=m_outerSize && m_outerIndex[j]!=0) + m_outerIndex[j++]++; + --j; + // shift data of last vecs: + Index k = m_outerIndex[j]-1; + while (k>=Index(p)) + { + m_data.index(k) = m_data.index(k-1); + m_data.value(k) = m_data.value(k-1); + k--; + } + } + } + + while ( (p > startId) && (m_data.index(p-1) > inner) ) + { + m_data.index(p) = m_data.index(p-1); + m_data.value(p) = m_data.value(p-1); + --p; + } + + m_data.index(p) = inner; + return (m_data.value(p) = 0); +} + } // end namespace Eigen #endif // EIGEN_SPARSEMATRIX_H diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h index a9c8979cf..5bfd041ed 100644 --- a/Eigen/src/SparseCore/SparseVector.h +++ b/Eigen/src/SparseCore/SparseVector.h @@ -309,30 +309,7 @@ class SparseVector protected: template - EIGEN_DONT_INLINE SparseVector& assign(const SparseMatrixBase& _other) - { - const OtherDerived& other(_other.derived()); - const bool needToTranspose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit); - if(needToTranspose) - { - Index size = other.size(); - Index nnz = other.nonZeros(); - resize(size); - reserve(nnz); - for(Index i=0; i& _other); Storage m_data; Index m_size; @@ -402,6 +379,33 @@ class SparseVector::ReverseInnerIterator const Index m_start; }; +template +template +EIGEN_DONT_INLINE 
SparseVector<Scalar,_Options,_Index>& SparseVector<Scalar,_Options,_Index>::assign(const SparseMatrixBase<OtherDerived>& _other)
+{
+  const OtherDerived& other(_other.derived());
+  const bool needToTranspose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
+  if(needToTranspose)
+  {
+    Index size = other.size();
+    Index nnz = other.nonZeros();
+    resize(size);
+    reserve(nnz);
+    for(Index i=0; i<size; ++i)
+    {
+      typename OtherDerived::InnerIterator it(other, i);
+      if(it)
+        insert(i) = it.value();
+    }
+    return *this;
+  }
+  else
+  {
+    // there is no special optimization
+    return Base::operator=(other.derived());
+  }
+}
+
 } // end namespace Eigen

 #endif // EIGEN_SPARSEVECTOR_H
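
Note on the pattern applied across all of the files above: a member function defined inside its class body is implicitly inline, so marking such a definition with EIGEN_DONT_INLINE (which expands to a noinline attribute, e.g. __attribute__((noinline)) on GCC/Clang or __declspec(noinline) on MSVC) requests both behaviors at once, which ICC reports as "routine is both inline and noinline". Splitting each routine into an in-class declaration plus an out-of-class definition removes the implicit inline, so the noinline request no longer conflicts; an attribute on a mere in-class declaration is harmless, because only in-class definitions are implicitly inline. A minimal before/after sketch with hypothetical names, not Eigen's actual kernels:

    // Before: the in-class definition is implicitly inline, and the
    // noinline attribute then contradicts it (the warning above).
    template<typename Scalar>
    struct pack_kernel_before {
      __attribute__((noinline)) void operator()(Scalar* dst, const Scalar* src, int n)
      {
        for (int i = 0; i < n; ++i) dst[i] = src[i];
      }
    };

    // After: only declare the routine in the class body...
    template<typename Scalar>
    struct pack_kernel_after {
      void operator()(Scalar* dst, const Scalar* src, int n);
    };

    // ...and attach the noinline request to the out-of-class definition,
    // which is not implicitly inline. Since this is a template, the
    // definition can stay in the same header without ODR problems.
    template<typename Scalar>
    __attribute__((noinline)) void pack_kernel_after<Scalar>::operator()(Scalar* dst, const Scalar* src, int n)
    {
      for (int i = 0; i < n; ++i) dst[i] = src[i];
    }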
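
GeneralProduct.h additionally replaces the two explicit specializations of the outer_product_selector class template, whose static run members were again in-class (hence implicitly inline) definitions, with two overloads of a free function selected by a true_type/false_type tag. A compilable sketch of that tag-dispatch idiom, using std:: type traits where the patch uses Eigen's internal C++03 equivalents (internal::true_type, internal::conditional):

    #include <cstdio>
    #include <type_traits>

    // Toy stand-ins for a destination expression; IsRowMajor mirrors what
    // the patch's IsRowMajor<T> helper derives from T::Flags & RowMajorBit.
    struct ColMajorDest { static const bool IsRowMajor = false; };
    struct RowMajorDest { static const bool IsRowMajor = true;  };

    // One overload per storage order instead of one class specialization each.
    template<typename Dest>
    void outer_product_run(Dest&, std::false_type) { std::puts("column-major loop"); }

    template<typename Dest>
    void outer_product_run(Dest&, std::true_type)  { std::puts("row-major loop"); }

    template<typename Dest>
    void outer_product(Dest& dest)
    {
      // The empty tag object resolves the overload at compile time.
      outer_product_run(dest, std::integral_constant<bool, Dest::IsRowMajor>());
    }

    int main()
    {
      ColMajorDest c; RowMajorDest r;
      outer_product(c);  // prints "column-major loop"
      outer_product(r);  // prints "row-major loop"
      return 0;
    }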