* added innerSize / outerSize functions to MatrixBase

* added complete implementation of sparse matrix product (with a little glue in Eigen/Core) * added an exhaustive bench of sparse products including GMM++ and MTL4 => Eigen outperforms in all transposed/density configurations !
2025-09-17 20:03:17 +08:00 · 2008-06-28 23:07:14 +00:00 · 2008-06-28 23:07:14 +00:00 · 027818d739
commit 027818d739
parent 6917be9113
16 changed files with 622 additions and 380 deletions
--- a/Eigen/Sparse
+++ b/Eigen/Sparse
@ -11,13 +11,13 @@ namespace Eigen {

 #include "src/Sparse/SparseUtil.h"
 #include "src/Sparse/SparseMatrixBase.h"
-#include "src/Sparse/SparseProduct.h"
 #include "src/Sparse/SparseArray.h"
 #include "src/Sparse/SparseMatrix.h"
 #include "src/Sparse/HashMatrix.h"
 #include "src/Sparse/LinkedVectorMatrix.h"
 #include "src/Sparse/CoreIterators.h"
 #include "src/Sparse/SparseSetter.h"
+#include "src/Sparse/SparseProduct.h"

 } // namespace Eigen

--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@ -204,8 +204,8 @@ struct ei_assign_impl<Derived1, Derived2, NoVectorization, NoUnrolling>
  static void run(Derived1 &dst, const Derived2 &src)
  {
    const bool rowMajor = int(Derived1::Flags)&RowMajorBit;
-    const int innerSize = rowMajor ? dst.cols() : dst.rows();
-    const int outerSize = rowMajor ? dst.rows() : dst.cols();
+    const int innerSize = dst.innerSize();
+    const int outerSize = dst.outerSize();
    for(int j = 0; j < outerSize; j++)
      for(int i = 0; i < innerSize; i++)
      {
@ -233,7 +233,7 @@ struct ei_assign_impl<Derived1, Derived2, NoVectorization, InnerUnrolling>
  {
    const bool rowMajor = int(Derived1::Flags)&RowMajorBit;
    const int innerSize = rowMajor ? Derived1::ColsAtCompileTime : Derived1::RowsAtCompileTime;
-    const int outerSize = rowMajor ? dst.rows() : dst.cols();
+    const int outerSize = dst.outerSize();
    for(int j = 0; j < outerSize; j++)
      ei_assign_novec_InnerUnrolling<Derived1, Derived2, 0, innerSize>
        ::run(dst, src, j);
@ -250,8 +250,8 @@ struct ei_assign_impl<Derived1, Derived2, InnerVectorization, NoUnrolling>
  static void run(Derived1 &dst, const Derived2 &src)
  {
    const bool rowMajor = int(Derived1::Flags)&RowMajorBit;
-    const int innerSize = rowMajor ? dst.cols() : dst.rows();
-    const int outerSize = rowMajor ? dst.rows() : dst.cols();
+    const int innerSize = dst.innerSize();
+    const int outerSize = dst.outerSize();
    const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
    for(int j = 0; j < outerSize; j++)
    {
@ -282,7 +282,7 @@ struct ei_assign_impl<Derived1, Derived2, InnerVectorization, InnerUnrolling>
  {
    const bool rowMajor = int(Derived1::Flags)&RowMajorBit;
    const int innerSize = rowMajor ? Derived1::ColsAtCompileTime : Derived1::RowsAtCompileTime;
-    const int outerSize = rowMajor ? dst.rows() : dst.cols();
+    const int outerSize = dst.outerSize();
    for(int j = 0; j < outerSize; j++)
      ei_assign_innervec_InnerUnrolling<Derived1, Derived2, 0, innerSize>
        ::run(dst, src, j);
@ -337,8 +337,8 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorization, NoUnrolling>
  {
    const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
    const bool rowMajor = Derived1::Flags&RowMajorBit;
-    const int innerSize = rowMajor ? dst.cols() : dst.rows();
-    const int outerSize = rowMajor ? dst.rows() : dst.cols();
+    const int innerSize = dst.innerSize();
+    const int outerSize = dst.outerSize();
    const int alignedInnerSize = (innerSize/packetSize)*packetSize;

    for(int i = 0; i < outerSize; i++)
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@ -154,11 +154,20 @@ template<typename Derived> class MatrixBase
    /** \returns the number of coefficients, which is \a rows()*cols().
      * \sa rows(), cols(), SizeAtCompileTime. */
    inline int size() const { return rows() * cols(); }
+    /** \returns the number of nonzero coefficients which is in practice the number
+      * of stored coefficients. */
+    inline int nonZeros() const { return derived.nonZeros(); }
    /** \returns true if either the number of rows or the number of columns is equal to 1.
      * In other words, this function returns
      * \code rows()==1 || cols()==1 \endcode
      * \sa rows(), cols(), IsVectorAtCompileTime. */
    inline bool isVector() const { return rows()==1 || cols()==1; }
+    /** \returns the size of the storage major dimension,
+      * i.e., the number of columns for a columns major matrix, and the number of rows otherwise */
+    int outerSize() const { return (int(Flags)&RowMajorBit) ? this->rows() : this->cols(); }
+    /** \returns the size of the inner dimension according to the storage order,
+      * i.e., the number of rows for a columns major matrix, and the number of cols otherwise */
+    int innerSize() const { return (int(Flags)&RowMajorBit) ? this->cols() : this->rows(); }

    /** Represents a constant matrix */
    typedef CwiseNullaryOp<ei_scalar_constant_op<Scalar>,Derived> ConstantReturnType;
@ -206,6 +215,10 @@ template<typename Derived> class MatrixBase
    template<typename Derived1, typename Derived2>
    Derived& lazyAssign(const Product<Derived1,Derived2,CacheFriendlyProduct>& product);

+    /** Overloaded for sparse product evaluation */
+    template<typename Derived1, typename Derived2>
+    Derived& lazyAssign(const Product<Derived1,Derived2,SparseProduct>& product);
+
    CommaInitializer operator<< (const Scalar& s);

    template<typename OtherDerived>
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@ -92,6 +92,8 @@ template<typename Lhs, typename Rhs> struct ei_product_mode
 {
  enum{ value = ((Rhs::Flags&Diagonal)==Diagonal) || ((Lhs::Flags&Diagonal)==Diagonal)
              ? DiagonalProduct
+              : (Rhs::Flags & Lhs::Flags & SparseBit)
+              ? SparseProduct
              :    Lhs::MaxRowsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
                && Rhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
                && Lhs::MaxColsAtCompileTime >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
@ -147,8 +149,7 @@ struct ei_traits<Product<LhsNested, RhsNested, ProductMode> >
    CanVectorizeInner = LhsRowMajor && (!RhsRowMajor) && (LhsFlags & PacketAccessBit) && (RhsFlags & PacketAccessBit)
                      && (InnerSize!=Dynamic) && (InnerSize % ei_packet_traits<Scalar>::size == 0),

-    EvalToRowMajor = (RhsFlags & RowMajorBit)
-                   && (ProductMode==(int)CacheFriendlyProduct ? (int)LhsFlags & RowMajorBit : (!CanVectorizeLhs)),
+    EvalToRowMajor = RhsRowMajor && (ProductMode==(int)CacheFriendlyProduct ? LhsRowMajor : (!CanVectorizeLhs)),

    RemovedBits = ~((EvalToRowMajor ? 0 : RowMajorBit)
                | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@ -71,7 +71,7 @@ template<typename MatrixType> class Transpose

    inline int rows() const { return m_matrix.cols(); }
    inline int cols() const { return m_matrix.rows(); }
-
+    inline int nonZeros() const { return m_matrix.nonZeros(); }
    inline int stride(void) const { return m_matrix.stride(); }

    inline Scalar& coeffRef(int row, int col)
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@ -211,6 +211,11 @@ template<typename T> struct ei_unref<T&> { typedef T type; };
 template<typename T> struct ei_unconst { typedef T type; };
 template<typename T> struct ei_unconst<const T> { typedef T type; };

+template<typename T> struct ei_cleantype { typedef T type; };
+template<typename T> struct ei_cleantype<const T> { typedef T type; };
+template<typename T> struct ei_cleantype<const T&> { typedef T type; };
+template<typename T> struct ei_cleantype<T&> { typedef T type; };
+
 template<typename T> struct ei_must_nest_by_value { enum { ret = false }; };
 template<typename T> struct ei_must_nest_by_value<NestByValue<T> > { enum { ret = true }; };

--- a/Eigen/src/Sparse/HashMatrix.h
+++ b/Eigen/src/Sparse/HashMatrix.h
@ -34,7 +34,7 @@ struct ei_traits<HashMatrix<_Scalar, _Flags> >
    ColsAtCompileTime = Dynamic,
    MaxRowsAtCompileTime = Dynamic,
    MaxColsAtCompileTime = Dynamic,
-    Flags = _Flags,
+    Flags = SparseBit | _Flags,
    CoeffReadCost = NumTraits<Scalar>::ReadCost,
    SupportedAccessPatterns = RandomAccessPattern
  };
--- a/Eigen/src/Sparse/LinkedVectorMatrix.h
+++ b/Eigen/src/Sparse/LinkedVectorMatrix.h
@ -34,7 +34,7 @@ struct ei_traits<LinkedVectorMatrix<_Scalar,_Flags> >
    ColsAtCompileTime = Dynamic,
    MaxRowsAtCompileTime = Dynamic,
    MaxColsAtCompileTime = Dynamic,
-    Flags = _Flags,
+    Flags = SparseBit | _Flags,
    CoeffReadCost = NumTraits<Scalar>::ReadCost,
    SupportedAccessPatterns = InnerCoherentAccessPattern
  };
--- a/Eigen/src/Sparse/SparseMatrix.h
+++ b/Eigen/src/Sparse/SparseMatrix.h
@ -43,7 +43,7 @@ struct ei_traits<SparseMatrix<_Scalar, _Flags> >
    ColsAtCompileTime = Dynamic,
    MaxRowsAtCompileTime = Dynamic,
    MaxColsAtCompileTime = Dynamic,
-    Flags = _Flags,
+    Flags = SparseBit | _Flags,
    CoeffReadCost = NumTraits<Scalar>::ReadCost,
    SupportedAccessPatterns = FullyCoherentAccessPattern
  };
@ -68,8 +68,9 @@ class SparseMatrix : public SparseMatrixBase<SparseMatrix<_Scalar, _Flags> >

    inline int rows() const { return m_rows; }
    inline int cols() const { return m_cols; }
+    inline int innerNonZeros(int j) const { return m_colPtrs[j+1]-m_colPtrs[j]; }

-    inline const Scalar& coeff(int row, int col) const
+    inline Scalar coeff(int row, int col) const
    {
      int id = m_colPtrs[col];
      int end = m_colPtrs[col+1];
@ -161,6 +162,13 @@ class SparseMatrix : public SparseMatrixBase<SparseMatrix<_Scalar, _Flags> >
      resize(rows, cols);
    }

+    template<typename OtherDerived>
+    inline SparseMatrix(const MatrixBase<OtherDerived>& other)
+      : m_rows(0), m_cols(0), m_colPtrs(0)
+    {
+      *this = other.derived();
+    }
+
    inline void shallowCopy(const SparseMatrix& other)
    {
      EIGEN_DBG_SPARSE(std::cout << "SparseMatrix:: shallowCopy\n");
@ -192,15 +200,7 @@ class SparseMatrix : public SparseMatrixBase<SparseMatrix<_Scalar, _Flags> >
    template<typename OtherDerived>
    inline SparseMatrix& operator=(const MatrixBase<OtherDerived>& other)
    {
-      return SparseMatrixBase<SparseMatrix>::operator=(other);
-    }
-
-    template<typename OtherDerived>
-    SparseMatrix<Scalar> operator*(const MatrixBase<OtherDerived>& other)
-    {
-      SparseMatrix<Scalar> res(rows(), other.cols());
-      ei_sparse_product<SparseMatrix,OtherDerived>(*this,other.derived(),res);
-      return res;
+      return SparseMatrixBase<SparseMatrix>::operator=(other.derived());
    }

    friend std::ostream & operator << (std::ostream & s, const SparseMatrix& m)
--- a/Eigen/src/Sparse/SparseMatrixBase.h
+++ b/Eigen/src/Sparse/SparseMatrixBase.h
@ -49,9 +49,6 @@ class SparseMatrixBase : public MatrixBase<Derived>
    bool isRValue() const { return m_isRValue; }
    Derived& temporary() { m_isRValue = true; return derived(); }

-    int outerSize() const { return RowMajor ? this->rows() : this->cols(); }
-    int innerSize() const { return RowMajor ? this->cols() : this->rows(); }
-
    inline Derived& operator=(const Derived& other)
    {
      if (other.isRValue())
@ -64,31 +61,27 @@ class SparseMatrixBase : public MatrixBase<Derived>
      return derived();
    }

-
-
    template<typename OtherDerived>
    inline Derived& operator=(const MatrixBase<OtherDerived>& other)
    {
      const bool transpose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
-      const int outerSize = (int(OtherDerived::Flags) & RowMajorBit) ? other.rows() : other.cols();
-      // eval to a temporary and then do a shallow copy
-      typename ei_meta_if<transpose, LinkedVectorMatrix<Scalar,Flags&RowMajorBit>, Derived>::ret temp(other.rows(), other.cols());
+      const int outerSize = other.outerSize();
+      typedef typename ei_meta_if<transpose, LinkedVectorMatrix<Scalar,Flags&RowMajorBit>, Derived>::ret TempType;
+      TempType temp(other.rows(), other.cols());

      temp.startFill(std::max(this->rows(),this->cols())*2);
-//       std::cout << other.rows() << " xm " << other.cols() << "\n";
      for (int j=0; j<outerSize; ++j)
      {
        for (typename OtherDerived::InnerIterator it(other.derived(), j); it; ++it)
        {
-//           std::cout << other.rows() << " x " << other.cols() << "\n";
-//           std::cout << it.m_matrix.rows() << "\n";
          Scalar v = it.value();
          if (v!=Scalar(0))
-            if (RowMajor) temp.fill(j,it.index()) = v;
+            if (OtherDerived::Flags & RowMajorBit) temp.fill(j,it.index()) = v;
            else temp.fill(it.index(),j) = v;
        }
      }
      temp.endFill();
+
      derived() = temp.temporary();
      return derived();
    }
@ -97,6 +90,7 @@ class SparseMatrixBase : public MatrixBase<Derived>
    inline Derived& operator=(const SparseMatrixBase<OtherDerived>& other)
    {
      const bool transpose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
+//       std::cout << "eval transpose = " << transpose << "\n";
      const int outerSize = (int(OtherDerived::Flags) & RowMajorBit) ? other.rows() : other.cols();
      if ((!transpose) && other.isRValue())
      {
@ -120,12 +114,15 @@ class SparseMatrixBase : public MatrixBase<Derived>
      return derived();
    }

+    template<typename Lhs, typename Rhs>
+    inline Derived& operator=(const Product<Lhs,Rhs,SparseProduct>& product);
+
    friend std::ostream & operator << (std::ostream & s, const SparseMatrixBase& m)
    {

      if (Flags&RowMajorBit)
      {
-        for (int row=0; row<m.rows(); ++row)
+        for (int row=0; row<m.outerSize(); ++row)
        {
          int col = 0;
          for (typename Derived::InnerIterator it(m.derived(), row); it; ++it)
@ -158,6 +155,4 @@ class SparseMatrixBase : public MatrixBase<Derived>
    mutable bool m_hasBeenCopied;
 };

-
-
 #endif // EIGEN_SPARSEMATRIXBASE_H
--- a/Eigen/src/Sparse/SparseProduct.h
+++ b/Eigen/src/Sparse/SparseProduct.h
@ -25,145 +25,308 @@
 #ifndef EIGEN_SPARSEPRODUCT_H
 #define EIGEN_SPARSEPRODUCT_H

-#define DENSE_TMP 1
-#define MAP_TMP 2
-#define LIST_TMP 3
-
-#define TMP_TMP 3
-
-template<typename Scalar>
-struct ListEl
+// sparse product return type specialization
+template<typename Lhs, typename Rhs>
+struct ProductReturnType<Lhs,Rhs,SparseProduct>
 {
-  int next;
-  int index;
-  Scalar value;
+  typedef typename ei_traits<Lhs>::Scalar Scalar;
+  enum {
+    LhsRowMajor = ei_traits<Lhs>::Flags & RowMajorBit,
+    RhsRowMajor = ei_traits<Rhs>::Flags & RowMajorBit,
+    TransposeRhs = (!LhsRowMajor) && RhsRowMajor,
+    TransposeLhs = LhsRowMajor && (!RhsRowMajor)
+  };
+
+  // FIXME if we transpose let's evaluate to a LinkedVectorMatrix since it is the
+  // type of the temporary to perform the transpose op
+  typedef typename ei_meta_if<TransposeLhs,
+    SparseMatrix<Scalar,0>,
+    typename ei_nested<Lhs,Rhs::RowsAtCompileTime>::type>::ret LhsNested;
+
+  typedef typename ei_meta_if<TransposeRhs,
+    SparseMatrix<Scalar,0>,
+    typename ei_nested<Rhs,Lhs::RowsAtCompileTime>::type>::ret RhsNested;
+
+  typedef Product<typename ei_unconst<LhsNested>::type,
+                  typename ei_unconst<RhsNested>::type, SparseProduct> Type;
 };

-template<typename Lhs, typename Rhs>
-static void ei_sparse_product(const Lhs& lhs, const Rhs& rhs, SparseMatrix<typename ei_traits<Lhs>::Scalar>& res)
+template<typename LhsNested, typename RhsNested>
+struct ei_traits<Product<LhsNested, RhsNested, SparseProduct> >
 {
-  int rows = lhs.rows();
-  int cols = rhs.rows();
-  int size = lhs.cols();
+  // clean the nested types:
+  typedef typename ei_unconst<typename ei_unref<LhsNested>::type>::type _LhsNested;
+  typedef typename ei_unconst<typename ei_unref<RhsNested>::type>::type _RhsNested;
+  typedef typename _LhsNested::Scalar Scalar;

-  float ratio = std::max(float(lhs.nonZeros())/float(lhs.rows()*lhs.cols()), float(rhs.nonZeros())/float(rhs.rows()*rhs.cols()));
-  std::cout << ratio << "\n";
+  enum {
+    LhsCoeffReadCost = _LhsNested::CoeffReadCost,
+    RhsCoeffReadCost = _RhsNested::CoeffReadCost,
+    LhsFlags = _LhsNested::Flags,
+    RhsFlags = _RhsNested::Flags,

-  ei_assert(size == rhs.rows());
-  typedef typename ei_traits<Lhs>::Scalar Scalar;
-  #if (TMP_TMP == MAP_TMP)
-  std::map<int,Scalar> tmp;
-  #elif (TMP_TMP == LIST_TMP)
-  std::vector<ListEl<Scalar> > tmp(2*rows);
-  #else
-  std::vector<Scalar> tmp(rows);
-  #endif
-  res.resize(rows, cols);
-  res.startFill(2*std::max(rows, cols));
-  for (int j=0; j<cols; ++j)
-  {
-    #if (TMP_TMP == MAP_TMP)
-    tmp.clear();
-    #elif (TMP_TMP == LIST_TMP)
-    int tmp_size = 0;
-    int tmp_start = -1;
-    #else
-    for (int k=0; k<rows; ++k)
-      tmp[k] = 0;
-    #endif
-    for (typename Rhs::InnerIterator rhsIt(rhs, j); rhsIt; ++rhsIt)
+    RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
+    ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
+    InnerSize = EIGEN_ENUM_MIN(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),
+
+    MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
+
+    LhsRowMajor = LhsFlags & RowMajorBit,
+    RhsRowMajor = RhsFlags & RowMajorBit,
+
+    EvalToRowMajor = (RhsFlags & LhsFlags & RowMajorBit),
+
+    RemovedBits = ~((EvalToRowMajor ? 0 : RowMajorBit)
+                | ((RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic) ? 0 : LargeBit)),
+
+    Flags = (int(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
+          | EvalBeforeAssigningBit
+          | EvalBeforeNestingBit,
+
+    CoeffReadCost = Dynamic
+  };
+};
+
+template<typename LhsNested, typename RhsNested> class Product<LhsNested,RhsNested,SparseProduct> : ei_no_assignment_operator,
+  public MatrixBase<Product<LhsNested, RhsNested, SparseProduct> >
+{
+  public:
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Product)
+
+  private:
+
+    typedef typename ei_traits<Product>::_LhsNested _LhsNested;
+    typedef typename ei_traits<Product>::_RhsNested _RhsNested;
+
+  public:
+
+    template<typename Lhs, typename Rhs>
+    inline Product(const Lhs& lhs, const Rhs& rhs)
+      : m_lhs(lhs), m_rhs(rhs)
    {
-      #if (TMP_TMP == MAP_TMP)
-      typename std::map<int,Scalar>::iterator hint = tmp.begin();
-      typename std::map<int,Scalar>::iterator r;
-      #elif (TMP_TMP == LIST_TMP)
-      int tmp_el = tmp_start;
-      #endif
-      for (typename Lhs::InnerIterator lhsIt(lhs, rhsIt.index()); lhsIt; ++lhsIt)
-      {
-        #if (TMP_TMP == MAP_TMP)
-        r = hint;
-        Scalar v = lhsIt.value() * rhsIt.value();
-        int id = lhsIt.index();
-        while (r!=tmp.end() && r->first < id)
-          ++r;
-        if (r!=tmp.end() && r->first==id)
-        {
-          r->second += v;
-          hint = r;
-        }
-        else
-          hint = tmp.insert(r, std::pair<int,Scalar>(id, v));
-        ++hint;
-        #elif (TMP_TMP == LIST_TMP)
-        Scalar v = lhsIt.value() * rhsIt.value();
-        int id = lhsIt.index();
-        if (tmp_size==0)
-        {
-          tmp_start = 0;
-          tmp_el = 0;
-          tmp_size++;
-          tmp[0].value = v;
-          tmp[0].index = id;
-          tmp[0].next = -1;
-        }
-        else if (id<tmp[tmp_start].index)
-        {
-          tmp[tmp_size].value = v;
-          tmp[tmp_size].index = id;
-          tmp[tmp_size].next = tmp_start;
-          tmp_start = tmp_size;
-          tmp_size++;
-        }
-        else
-        {
-          int nextel = tmp[tmp_el].next;
-          while (nextel >= 0 && tmp[nextel].index<=id)
-          {
-            tmp_el = nextel;
-            nextel = tmp[nextel].next;
-          }
+      ei_assert(lhs.cols() == rhs.rows());
+    }

-          if (tmp[tmp_el].index==id)
+    Scalar coeff(int, int) const { ei_assert(false && "eigen internal error"); }
+    Scalar& coeffRef(int, int) { ei_assert(false && "eigen internal error"); }
+
+    inline int rows() const { return m_lhs.rows(); }
+    inline int cols() const { return m_rhs.cols(); }
+
+    const _LhsNested& lhs() const { return m_lhs; }
+    const _LhsNested& rhs() const { return m_rhs; }
+
+  protected:
+    const LhsNested m_lhs;
+    const RhsNested m_rhs;
+};
+
+const int RowMajor = RowMajorBit;
+const int ColMajor = 0;
+
+template<typename Lhs, typename Rhs, typename ResultType,
+  int LhsStorageOrder = ei_traits<Lhs>::Flags&RowMajorBit,
+  int RhsStorageOrder = ei_traits<Rhs>::Flags&RowMajorBit,
+  int ResStorageOrder = ei_traits<ResultType>::Flags&RowMajorBit>
+struct ei_sparse_product_selector;
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct ei_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor,ColMajor>
+{
+  typedef typename ei_traits<typename ei_cleantype<Lhs>::type>::Scalar Scalar;
+
+  struct ListEl
+  {
+    int next;
+    int index;
+    Scalar value;
+  };
+
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    // make sure to call innerSize/outerSize since we fake the storage order.
+    int rows = lhs.innerSize();
+    int cols = rhs.outerSize();
+    int size = lhs.outerSize();
+    ei_assert(size == rhs.rows());
+
+    // allocate a temporary buffer
+    Scalar* buffer = new Scalar[rows];
+
+    // estimate the number of non zero entries
+    float ratioLhs = float(lhs.nonZeros())/float(lhs.rows()*lhs.cols());
+    float avgNnzPerRhsColumn = float(rhs.nonZeros())/float(cols);
+    float ratioRes = std::min(ratioLhs * avgNnzPerRhsColumn, 1.f);
+
+    res.resize(rows, cols);
+    res.startFill(ratioRes*rows*cols);
+    for (int j=0; j<cols; ++j)
+    {
+      // let's do a more accurate determination of the nnz ratio for the current column j of res
+      //float ratioColRes = std::min(ratioLhs * rhs.innerNonZeros(j), 1.f);
+      // FIXME find a nice way to get the number of nonzeros of a sub matrix (here an inner vector)
+      float ratioColRes = ratioRes;
+      if (ratioColRes>0.1)
+      {
+        // dense path, the scalar * columns products are accumulated into a dense column
+        Scalar* __restrict__ tmp = buffer;
+        // set to zero
+        for (int k=0; k<rows; ++k)
+          tmp[k] = 0;
+        for (typename Rhs::InnerIterator rhsIt(rhs, j); rhsIt; ++rhsIt)
+        {
+          // FIXME should be written like this: tmp += rhsIt.value() * lhs.col(rhsIt.index())
+          Scalar x = rhsIt.value();
+          for (typename Lhs::InnerIterator lhsIt(lhs, rhsIt.index()); lhsIt; ++lhsIt)
          {
-            tmp[tmp_el].value += v;
-          }
-          else
-          {
-            tmp[tmp_size].value = v;
-            tmp[tmp_size].index = id;
-            tmp[tmp_size].next = tmp[tmp_el].next;
-            tmp[tmp_el].next = tmp_size;
-            tmp_size++;
+            tmp[lhsIt.index()] += lhsIt.value() * x;
          }
        }
-        #else
-        tmp[lhsIt.index()] += lhsIt.value() * rhsIt.value();
-        #endif
-        //res.coeffRef(lhsIt.index(), j) += lhsIt.value() * rhsIt.value();
+        // copy the temporary to the respective res.col()
+        for (int k=0; k<rows; ++k)
+          if (tmp[k]!=0)
+            res.fill(k, j) = tmp[k];
+      }
+      else
+      {
+        ListEl* __restrict__ tmp = reinterpret_cast<ListEl*>(buffer);
+        // sparse path, the scalar * columns products are accumulated into a linked list
+        int tmp_size = 0;
+        int tmp_start = -1;
+        for (typename Rhs::InnerIterator rhsIt(rhs, j); rhsIt; ++rhsIt)
+        {
+          int tmp_el = tmp_start;
+          for (typename Lhs::InnerIterator lhsIt(lhs, rhsIt.index()); lhsIt; ++lhsIt)
+          {
+            Scalar v = lhsIt.value() * rhsIt.value();
+            int id = lhsIt.index();
+            if (tmp_size==0)
+            {
+              tmp_start = 0;
+              tmp_el = 0;
+              tmp_size++;
+              tmp[0].value = v;
+              tmp[0].index = id;
+              tmp[0].next = -1;
+            }
+            else if (id<tmp[tmp_start].index)
+            {
+              tmp[tmp_size].value = v;
+              tmp[tmp_size].index = id;
+              tmp[tmp_size].next = tmp_start;
+              tmp_start = tmp_size;
+              tmp_size++;
+            }
+            else
+            {
+              int nextel = tmp[tmp_el].next;
+              while (nextel >= 0 && tmp[nextel].index<=id)
+              {
+                tmp_el = nextel;
+                nextel = tmp[nextel].next;
+              }
+
+              if (tmp[tmp_el].index==id)
+              {
+                tmp[tmp_el].value += v;
+              }
+              else
+              {
+                tmp[tmp_size].value = v;
+                tmp[tmp_size].index = id;
+                tmp[tmp_size].next = tmp[tmp_el].next;
+                tmp[tmp_el].next = tmp_size;
+                tmp_size++;
+              }
+            }
+          }
+        }
+        int k = tmp_start;
+        while (k>=0)
+        {
+          if (tmp[k].value!=0)
+            res.fill(tmp[k].index, j) = tmp[k].value;
+          k = tmp[k].next;
+        }
      }
    }
-    #if (TMP_TMP == MAP_TMP)
-    for (typename std::map<int,Scalar>::const_iterator k=tmp.begin(); k!=tmp.end(); ++k)
-      if (k->second!=0)
-        res.fill(k->first, j) = k->second;
-    #elif (TMP_TMP == LIST_TMP)
-    int k = tmp_start;
-    while (k>=0)
-    {
-      if (tmp[k].value!=0)
-        res.fill(tmp[k].index, j) = tmp[k].value;
-      k = tmp[k].next;
-    }
-    #else
-    for (int k=0; k<rows; ++k)
-      if (tmp[k]!=0)
-        res.fill(k, j) = tmp[k];
-    #endif
+    res.endFill();
  }
-  res.endFill();
+};

-  std::cout << "  => " << float(res.nonZeros())/float(res.rows()*res.cols()) << "\n";
+template<typename Lhs, typename Rhs, typename ResultType>
+struct ei_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor,RowMajor>
+{
+  typedef SparseMatrix<typename ResultType::Scalar> SparseTemporaryType;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    SparseTemporaryType _res(res.rows(), res.cols());
+    ei_sparse_product_selector<Lhs,Rhs,SparseTemporaryType,ColMajor,ColMajor,ColMajor>::run(lhs, rhs, _res);
+    res = _res;
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct ei_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,RowMajor,RowMajor>
+{
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    // let's transpose the product and fake the matrices are column major
+    ei_sparse_product_selector<Rhs,Lhs,ResultType,ColMajor,ColMajor,ColMajor>::run(rhs, lhs, res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct ei_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,RowMajor,ColMajor>
+{
+  typedef SparseMatrix<typename ResultType::Scalar> SparseTemporaryType;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    // let's transpose the product and fake the matrices are column major
+    ei_sparse_product_selector<Rhs,Lhs,ResultType,ColMajor,ColMajor,RowMajor>::run(rhs, lhs, res);
+  }
+};
+
+// NOTE eventually let's transpose one argument even in this case since it might be expensive if
+// the result is not dense.
+// template<typename Lhs, typename Rhs, typename ResultType, int ResStorageOrder>
+// struct ei_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,ColMajor,ResStorageOrder>
+// {
+//   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+//   {
+//     // trivial product as lhs.row/rhs.col dot products
+//     // loop over the prefered order of the result
+//   }
+// };
+
+// NOTE the 2 others cases (col row *) must never occurs since they are catched
+// by ProductReturnType which transform it to (col col *) by evaluating rhs.
+
+
+template<typename Derived>
+template<typename Lhs, typename Rhs>
+inline Derived& MatrixBase<Derived>::lazyAssign(const Product<Lhs,Rhs,SparseProduct>& product)
+{
+//   std::cout << "sparse product to dense\n";
+  ei_sparse_product_selector<
+    typename ei_cleantype<Lhs>::type,
+    typename ei_cleantype<Rhs>::type,
+    typename ei_cleantype<Derived>::type>::run(product.lhs(),product.rhs(),derived());
+  return derived();
+}
+
+template<typename Derived>
+template<typename Lhs, typename Rhs>
+inline Derived& SparseMatrixBase<Derived>::operator=(const Product<Lhs,Rhs,SparseProduct>& product)
+{
+//   std::cout << "sparse product to sparse\n";
+  ei_sparse_product_selector<
+    typename ei_cleantype<Lhs>::type,
+    typename ei_cleantype<Rhs>::type,
+    Derived>::run(product.lhs(),product.rhs(),derived());
+  return derived();
 }

 #endif // EIGEN_SPARSEPRODUCT_H
--- a/Eigen/src/Sparse/SparseUtil.h
+++ b/Eigen/src/Sparse/SparseUtil.h
@ -31,6 +31,7 @@
 #define EIGEN_DBG_SPARSE(X) X
 #endif

+template<typename Derived> class SparseMatrixBase;
 template<typename _Scalar, int _Flags = 0> class SparseMatrix;
 template<typename _Scalar, int _Flags = 0> class HashMatrix;
 template<typename _Scalar, int _Flags = 0> class LinkedVectorMatrix;
--- a/bench/BenchSparseUtil.h
+++ b/bench/BenchSparseUtil.h
@ -0,0 +1,75 @@
+
+#include <Eigen/Array>
+#include <Eigen/Sparse>
+#include <bench/BenchTimer.h>
+
+
+
+using namespace std;
+using namespace Eigen;
+USING_PART_OF_NAMESPACE_EIGEN
+
+#ifndef SIZE
+#define SIZE 1024
+#endif
+
+#ifndef DENSITY
+#define DENSITY 0.01
+#endif
+
+#ifndef SCALAR
+#define SCALAR float
+#endif
+
+typedef SCALAR Scalar;
+typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+typedef SparseMatrix<Scalar> EigenSparseMatrix;
+
+void fillMatrix(float density, int rows, int cols,  EigenSparseMatrix& dst)
+{
+  dst.startFill(rows*cols*density);
+  for(int j = 0; j < cols; j++)
+  {
+    for(int i = 0; i < rows; i++)
+    {
+      Scalar v = (ei_random<float>(0,1) < density) ? ei_random<Scalar>() : 0;
+      if (v!=0)
+        dst.fill(i,j) = v;
+    }
+  }
+  dst.endFill();
+}
+
+void eiToDense(const EigenSparseMatrix& src, DenseMatrix& dst)
+{
+  dst.setZero();
+  for (int j=0; j<src.cols(); ++j)
+    for (EigenSparseMatrix::InnerIterator it(src.derived(), j); it; ++it)
+      dst(it.index(),j) = it.value();
+}
+
+#ifndef NOGMM
+#include "gmm/gmm.h"
+typedef gmm::csc_matrix<Scalar> GmmSparse;
+typedef gmm::col_matrix< gmm::wsvector<Scalar> > GmmDynSparse;
+void eiToGmm(const EigenSparseMatrix& src, GmmSparse& dst)
+{
+  GmmDynSparse tmp(src.rows(), src.cols());
+  for (int j=0; j<src.cols(); ++j)
+    for (EigenSparseMatrix::InnerIterator it(src.derived(), j); it; ++it)
+      tmp(it.index(),j) = it.value();
+  gmm::copy(tmp, dst);
+}
+#endif
+
+#ifndef NOMTL
+#include <boost/numeric/mtl/mtl.hpp>
+typedef mtl::compressed2D<Scalar> MtlSparse;
+void eiToMtl(const EigenSparseMatrix& src, MtlSparse& dst)
+{
+  mtl::matrix::inserter<MtlSparse> ins(dst);
+  for (int j=0; j<src.cols(); ++j)
+    for (EigenSparseMatrix::InnerIterator it(src.derived(), j); it; ++it)
+      ins[it.index()][j] = it.value();
+}
+#endif
--- a/bench/sparse_01.cpp
+++ b/bench/sparse_01.cpp
@ -1,211 +0,0 @@
-
-// g++ -O3 -DNDEBUG sparse_01.cpp -I .. -o sparse_01 && ./sparse_01
-
-#include <Eigen/Array>
-#include <Eigen/Sparse>
-#include <bench/BenchTimer.h>
-
-#include "gmm/gmm.h"
-
-using namespace std;
-using namespace Eigen;
-USING_PART_OF_NAMESPACE_EIGEN
-
-#ifndef REPEAT
-#define REPEAT 10
-#endif
-
-#define REPEATPRODUCT 1
-
-#define SIZE 10
-#define DENSITY 0.2
-
-// #define NODENSEMATRIX
-
-typedef MatrixXf DenseMatrix;
-// typedef Matrix<float,SIZE,SIZE> DenseMatrix;
-typedef SparseMatrix<float> EigenSparseMatrix;
-typedef gmm::csc_matrix<float> GmmSparse;
-typedef gmm::col_matrix< gmm::wsvector<float> > GmmDynSparse;
-
-void fillMatrix(float density, int rows, int cols, DenseMatrix* pDenseMatrix, EigenSparseMatrix* pSparseMatrix, GmmSparse* pGmmMatrix=0)
-{
-  GmmDynSparse gmmT(rows, cols);
-  if (pSparseMatrix)
-    pSparseMatrix->startFill(rows*cols*density);
-  for(int j = 0; j < cols; j++)
-  {
-    for(int i = 0; i < rows; i++)
-    {
-      float v = (ei_random<float>(0,1) < density) ? ei_random<float>() : 0;
-      if (pDenseMatrix)
-        (*pDenseMatrix)(i,j) = v;
-      if (v!=0)
-      {
-        if (pSparseMatrix)
-          pSparseMatrix->fill(i,j) = v;
-        if (pGmmMatrix)
-          gmmT(i,j) = v;
-      }
-    }
-  }
-  if (pSparseMatrix)
-    pSparseMatrix->endFill();
-  if (pGmmMatrix)
-    gmm::copy(gmmT, *pGmmMatrix);
-}
-
-int main(int argc, char *argv[])
-{
-  int rows = SIZE;
-  int cols = SIZE;
-  float density = DENSITY;
-
-  // dense matrices
-  #ifndef NODENSEMATRIX
-  DenseMatrix m1(rows,cols), m2(rows,cols), m3(rows,cols), m4(rows,cols);
-  #endif
-
-  // sparse matrices
-  EigenSparseMatrix sm1(rows,cols), sm2(rows,cols), sm3(rows,cols), sm4(rows,cols);
-
-  HashMatrix<float> hm4(rows,cols);
-
-  // GMM++ matrices
-
-  GmmDynSparse  gmmT4(rows,cols);
-  GmmSparse gmmM1(rows,cols), gmmM2(rows,cols), gmmM3(rows,cols), gmmM4(rows,cols);
-
-  #ifndef NODENSEMATRIX
-  fillMatrix(density, rows, cols, &m1, &sm1, &gmmM1);
-  fillMatrix(density, rows, cols, &m2, &sm2, &gmmM2);
-  fillMatrix(density, rows, cols, &m3, &sm3, &gmmM3);
-  #else
-  fillMatrix(density, rows, cols, 0, &sm1, &gmmM1);
-  fillMatrix(density, rows, cols, 0, &sm2, &gmmM2);
-  fillMatrix(density, rows, cols, 0, &sm3, &gmmM3);
-  #endif
-
-  BenchTimer timer;
-
-  //--------------------------------------------------------------------------------
-  //  COEFF WISE OPERATORS
-  //--------------------------------------------------------------------------------
-#if 1
-  std::cout << "\n\n\"m4 = m1 + m2 + 2 * m3\":\n\n";
-
-  timer.reset();
-  timer.start();
-  asm("#begin");
-  for (int k=0; k<REPEAT; ++k)
-    m4 = m1 + m2 + 2 * m3;
-  asm("#end");
-  timer.stop();
-  std::cout << "Eigen dense = " << timer.value() << endl;
-
-  timer.reset();
-  timer.start();
-  for (int k=0; k<REPEAT; ++k)
-    sm4 = sm1 + sm2 + 2 * sm3;
-  timer.stop();
-  std::cout << "Eigen sparse = " << timer.value() << endl;
-
-  timer.reset();
-  timer.start();
-  for (int k=0; k<REPEAT; ++k)
-    hm4 = sm1 + sm2 + 2 * sm3;
-  timer.stop();
-  std::cout << "Eigen hash = " << timer.value() << endl;
-
-  LinkedVectorMatrix<float> lm4(rows, cols);
-  timer.reset();
-  timer.start();
-  for (int k=0; k<REPEAT; ++k)
-    lm4 = sm1 + sm2 + 2 * sm3;
-  timer.stop();
-  std::cout << "Eigen linked vector = " << timer.value() << endl;
-
-  timer.reset();
-  timer.start();
-  for (int k=0; k<REPEAT; ++k)
-  {
-    gmm::add(gmmM1, gmmM2, gmmT4);
-    gmm::add(gmm::scaled(gmmM3,2), gmmT4);
-  }
-  timer.stop();
-  std::cout << "GMM++ sparse = " << timer.value() << endl;
-#endif
-  //--------------------------------------------------------------------------------
-  //  PRODUCT
-  //--------------------------------------------------------------------------------
-#if 0
-  std::cout << "\n\nProduct:\n\n";
-
-  #ifndef NODENSEMATRIX
-  timer.reset();
-  timer.start();
-  asm("#begin");
-  for (int k=0; k<REPEATPRODUCT; ++k)
-    m1 = m1 * m2;
-  asm("#end");
-  timer.stop();
-  std::cout << "Eigen dense = " << timer.value() << endl;
-  #endif
-
-  timer.reset();
-  timer.start();
-  for (int k=0; k<REPEATPRODUCT; ++k)
-    sm4 = sm1 * sm2;
-  timer.stop();
-  std::cout << "Eigen sparse = " << timer.value() << endl;
-
-//   timer.reset();
-//   timer.start();
-//   for (int k=0; k<REPEATPRODUCT; ++k)
-//     hm4 = sm1 * sm2;
-//   timer.stop();
-//   std::cout << "Eigen hash = " << timer.value() << endl;
-
-  timer.reset();
-  timer.start();
-  for (int k=0; k<REPEATPRODUCT; ++k)
-  {
-    gmm::csr_matrix<float> R(rows,cols);
-    gmm::copy(gmmM1, R);
-    //gmm::mult(gmmM1, gmmM2, gmmT4);
-  }
-  timer.stop();
-  std::cout << "GMM++ sparse = " << timer.value() << endl;
-#endif
-  //--------------------------------------------------------------------------------
-  //  VARIOUS
-  //--------------------------------------------------------------------------------
-#if 1
-//   sm3 = sm1 + m2;
-  cout << m4.transpose() << "\n\n";
-//   sm4 = sm1+sm2;
-  cout << sm4 << "\n\n";
-  cout << lm4 << endl;
-
-  LinkedVectorMatrix<float,RowMajorBit> lm5(rows, cols);
-  lm5 = lm4;
-  lm5 = sm4;
-  cout << endl << lm5 << endl;
-
-  sm3 = sm4.transpose();
-  cout << endl << lm5 << endl;
-
-  cout << endl << "SM1 before random editing: " << endl << sm1 << endl;
-  {
-    SparseSetter<EigenSparseMatrix,RandomAccessPattern> w1(sm1);
-    w1->coeffRef(4,2) = ei_random<float>();
-    w1->coeffRef(2,6) = ei_random<float>();
-    w1->coeffRef(0,4) = ei_random<float>();
-    w1->coeffRef(9,3) = ei_random<float>();
-  }
-  cout << endl << "SM1 after random editing: " << endl << sm1 << endl;
-#endif
-
-  return 0;
-}
-
--- a/bench/sparse_product.cpp
+++ b/bench/sparse_product.cpp
@ -0,0 +1,199 @@
+
+//g++ -O3 -g0 -DNDEBUG  sparse_product.cpp -I.. -I/home/gael/Coding/LinearAlgebra/mtl4/ -DDENSITY=0.005 -DSIZE=10000 && ./a.out
+//g++ -O3 -g0 -DNDEBUG  sparse_product.cpp -I.. -I/home/gael/Coding/LinearAlgebra/mtl4/ -DDENSITY=0.05 -DSIZE=2000 && ./a.out
+// -DNOGMM -DNOMTL
+
+#ifndef SIZE
+#define SIZE 10000
+#endif
+
+#ifndef DENSITY
+#define DENSITY 0.01
+#endif
+
+#ifndef REPEAT
+#define REPEAT 1
+#endif
+
+#include "BenchSparseUtil.h"
+
+#ifndef MINDENSITY
+#define MINDENSITY 0.0004
+#endif
+
+int main(int argc, char *argv[])
+{
+  int rows = SIZE;
+  int cols = SIZE;
+  float density = DENSITY;
+
+  EigenSparseMatrix sm1(rows,cols), sm2(rows,cols), sm3(rows,cols), sm4(rows,cols);
+
+  BenchTimer timer;
+  for (float density = DENSITY; density>=MINDENSITY; density*=0.5)
+  {
+    fillMatrix(density, rows, cols, sm1);
+    fillMatrix(density, rows, cols, sm2);
+
+    // dense matrices
+    #ifdef DENSEMATRIX
+    {
+      std::cout << "Eigen Dense\t" << density*100 << "%\n";
+      DenseMatrix m1(rows,cols), m2(rows,cols), m3(rows,cols);
+      eiToDense(sm1, m1);
+      eiToDense(sm2, m2);
+
+      timer.reset();
+      timer.start();
+      for (int k=0; k<REPEAT; ++k)
+        m3 = m1 * m2;
+      timer.stop();
+      std::cout << "   a * b:\t" << timer.value() << endl;
+
+      timer.reset();
+      timer.start();
+      for (int k=0; k<REPEAT; ++k)
+        m3 = m1.transpose() * m2;
+      timer.stop();
+      std::cout << "   a' * b:\t" << timer.value() << endl;
+
+      timer.reset();
+      timer.start();
+      for (int k=0; k<REPEAT; ++k)
+        m3 = m1.transpose() * m2.transpose();
+      timer.stop();
+      std::cout << "   a' * b':\t" << timer.value() << endl;
+
+      timer.reset();
+      timer.start();
+      for (int k=0; k<REPEAT; ++k)
+        m3 = m1 * m2.transpose();
+      timer.stop();
+      std::cout << "   a * b':\t" << timer.value() << endl;
+    }
+    #endif
+
+    // eigen sparse matrices
+    {
+      std::cout << "Eigen sparse\t" << density*100 << "%\n";
+
+      timer.reset();
+      timer.start();
+      for (int k=0; k<REPEAT; ++k)
+        sm3 = sm1 * sm2;
+      timer.stop();
+      std::cout << "   a * b:\t" << timer.value() << endl;
+
+      timer.reset();
+      timer.start();
+      for (int k=0; k<REPEAT; ++k)
+        sm3 = sm1.transpose() * sm2;
+      timer.stop();
+      std::cout << "   a' * b:\t" << timer.value() << endl;
+
+      timer.reset();
+      timer.start();
+      for (int k=0; k<REPEAT; ++k)
+        sm3 = sm1.transpose() * sm2.transpose();
+      timer.stop();
+      std::cout << "   a' * b':\t" << timer.value() << endl;
+
+      timer.reset();
+      timer.start();
+      for (int k=0; k<REPEAT; ++k)
+        sm3 = sm1 * sm2.transpose();
+      timer.stop();
+      std::cout << "   a * b' :\t" << timer.value() << endl;
+    }
+
+    // GMM++
+    #ifndef NOGMM
+    {
+      std::cout << "GMM++ sparse\t" << density*100 << "%\n";
+      GmmDynSparse  gmmT3(rows,cols);
+      GmmSparse m1(rows,cols), m2(rows,cols), m3(rows,cols);
+      eiToGmm(sm1, m1);
+      eiToGmm(sm2, m2);
+
+      timer.reset();
+      timer.start();
+      for (int k=0; k<REPEAT; ++k)
+        gmm::mult(m1, m2, gmmT3);
+      timer.stop();
+      std::cout << "   a * b:\t" << timer.value() << endl;
+
+      timer.reset();
+      timer.start();
+      for (int k=0; k<REPEAT; ++k)
+        gmm::mult(gmm::transposed(m1), m2, gmmT3);
+      timer.stop();
+      std::cout << "   a' * b:\t" << timer.value() << endl;
+
+      if (rows<500)
+      {
+        timer.reset();
+        timer.start();
+        for (int k=0; k<REPEAT; ++k)
+          gmm::mult(gmm::transposed(m1), gmm::transposed(m2), gmmT3);
+        timer.stop();
+        std::cout << "   a' * b':\t" << timer.value() << endl;
+
+        timer.reset();
+        timer.start();
+        for (int k=0; k<REPEAT; ++k)
+          gmm::mult(m1, gmm::transposed(m2), gmmT3);
+        timer.stop();
+        std::cout << "   a * b':\t" << timer.value() << endl;
+      }
+      else
+      {
+        std::cout << "   a' * b':\t" << "forever" << endl;
+        std::cout << "   a * b':\t" << "forever" << endl;
+      }
+    }
+    #endif
+
+    // MTL4
+    #ifndef NOMTL
+    {
+      std::cout << "MTL4\t" << density*100 << "%\n";
+      MtlSparse m1(rows,cols), m2(rows,cols), m3(rows,cols);
+      eiToMtl(sm1, m1);
+      eiToMtl(sm2, m2);
+
+      timer.reset();
+      timer.start();
+      for (int k=0; k<REPEAT; ++k)
+        m3 = m1 * m2;
+      timer.stop();
+      std::cout << "   a * b:\t" << timer.value() << endl;
+
+      timer.reset();
+      timer.start();
+      for (int k=0; k<REPEAT; ++k)
+        m3 = trans(m1) * m2;
+      timer.stop();
+      std::cout << "   a' * b:\t" << timer.value() << endl;
+
+      timer.reset();
+      timer.start();
+      for (int k=0; k<REPEAT; ++k)
+        m3 = trans(m1) * trans(m2);
+      timer.stop();
+      std::cout << "  a' * b':\t" << timer.value() << endl;
+
+      timer.reset();
+      timer.start();
+      for (int k=0; k<REPEAT; ++k)
+        m3 = m1 * trans(m2);
+      timer.stop();
+      std::cout << "   a * b' :\t" << timer.value() << endl;
+    }
+    #endif
+
+    std::cout << "\n\n";
+  }
+
+  return 0;
+}
+
--- a/test/cholesky.cpp
+++ b/test/cholesky.cpp
@ -24,6 +24,7 @@

 #include "main.h"
 #include <Eigen/Cholesky>
+#include <Eigen/LU>

 template<typename MatrixType> void cholesky(const MatrixType& m)
 {
@ -34,12 +35,12 @@ template<typename MatrixType> void cholesky(const MatrixType& m)
  int cols = m.cols();

  typedef typename MatrixType::Scalar Scalar;
-  typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, MatrixType::ColsAtCompileTime> SquareMatrixType;
-  typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> VectorType;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MatrixType::RowsAtCompileTime> SquareMatrixType;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;

-  MatrixType a = MatrixType::random(rows,cols).transpose();
-  VectorType b = VectorType::random(cols);
-  SquareMatrixType covMat =  a.adjoint() * a;
+  MatrixType a = MatrixType::random(rows,cols);
+  VectorType b = VectorType::random(rows);
+  SquareMatrixType covMat =  a * a.adjoint();

  CholeskyWithoutSquareRoot<SquareMatrixType> cholnosqrt(covMat);
  VERIFY_IS_APPROX(covMat, cholnosqrt.matrixL() * cholnosqrt.vectorD().asDiagonal() * cholnosqrt.matrixL().adjoint());