From 51c991af459b86aab4a0a9575edba378e20e8621 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Thu, 12 Feb 2009 15:18:59 +0000
Subject: [PATCH] * exit Sum.h, exit Prod.h, welcome vectorization of redux() !
 * add vectorization for minCoeff and maxCoeff

---
 Eigen/Core                               |   2 -
 Eigen/src/Core/Functors.h                |  16 +-
 Eigen/src/Core/GenericPacketMath.h       |   8 +
 Eigen/src/Core/Prod.h                    | 262 ----------------------
 Eigen/src/Core/Redux.h                   | 258 ++++++++++++++++++---
 Eigen/src/Core/Sum.h                     | 272 -----------------------
 Eigen/src/Core/arch/AltiVec/PacketMath.h |  71 ++++--
 Eigen/src/Core/arch/SSE/PacketMath.h     |  56 ++++-
 test/CMakeLists.txt                      |   3 +-
 test/packetmath.cpp                      |  10 +
 test/prod.cpp                            |  86 -------
 test/redux.cpp                           | 127 +++++++++++
 test/sum.cpp                             |  86 -------
 13 files changed, 497 insertions(+), 760 deletions(-)
 delete mode 100644 Eigen/src/Core/Prod.h
 delete mode 100644 Eigen/src/Core/Sum.h
 delete mode 100644 test/prod.cpp
 create mode 100644 test/redux.cpp
 delete mode 100644 test/sum.cpp
diff --git a/Eigen/Core b/Eigen/Core
index ef9ff9a91..b99f49a01 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -138,8 +138,6 @@ namespace Eigen {
 #include "src/Core/Transpose.h"
 #include "src/Core/DiagonalMatrix.h"
 #include "src/Core/DiagonalCoeffs.h"
-#include "src/Core/Sum.h"
-#include "src/Core/Prod.h"
 #include "src/Core/Redux.h"
 #include "src/Core/Visitor.h"
 #include "src/Core/Fuzzy.h"
diff --git a/Eigen/src/Core/Functors.h b/Eigen/src/Core/Functors.h
index 8b9516ae6..57a055662 100644
--- a/Eigen/src/Core/Functors.h
+++ b/Eigen/src/Core/Functors.h
@@ -37,6 +37,9 @@ template<typename Scalar> struct ei_scalar_sum_op EIGEN_EMPTY_STRUCT {
   template<typename PacketScalar>
   EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
   { return ei_padd(a,b); }
+  template<typename PacketScalar>
+  EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const
+  { return ei_predux(a); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_sum_op<Scalar> > {
@@ -56,6 +59,9 @@ template<typename Scalar> struct ei_scalar_product_op EIGEN_EMPTY_STRUCT {
   template<typename PacketScalar>
   EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
   { return ei_pmul(a,b); }
+  template<typename PacketScalar>
+  EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const
+  { return ei_predux_mul(a); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_product_op<Scalar> > {
@@ -75,6 +81,9 @@ template<typename Scalar> struct ei_scalar_min_op EIGEN_EMPTY_STRUCT {
   template<typename PacketScalar>
   EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
   { return ei_pmin(a,b); }
+  template<typename PacketScalar>
+  EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const
+  { return ei_predux_min(a); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_min_op<Scalar> > {
@@ -94,6 +103,9 @@ template<typename Scalar> struct ei_scalar_max_op EIGEN_EMPTY_STRUCT {
   template<typename PacketScalar>
   EIGEN_STRONG_INLINE const PacketScalar packetOp(const PacketScalar& a, const PacketScalar& b) const
   { return ei_pmax(a,b); }
+  template<typename PacketScalar>
+  EIGEN_STRONG_INLINE const Scalar predux(const PacketScalar& a) const
+  { return ei_predux_max(a); }
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_max_op<Scalar> > {
@@ -123,7 +135,7 @@ template<typename Scalar> struct ei_scalar_hypot_op EIGEN_EMPTY_STRUCT {
 };
 template<typename Scalar>
 struct ei_functor_traits<ei_scalar_hypot_op<Scalar> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost };
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess=0 };
 };
 
 // other binary functors:
@@ -197,7 +209,7 @@ struct ei_functor_traits<ei_scalar_abs_op<Scalar> >
 {
   enum {
     Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = false // this could actually be vectorized with SSSE3.
+    PacketAccess = false // FIXME this could actually be vectorized with SSSE3.
   };
 };
 
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 84326b3a7..10d39b68b 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -100,6 +100,14 @@ template<typename Packet> inline typename ei_unpacket_traits<Packet>::type ei_pr
 template<typename Packet> inline typename ei_unpacket_traits<Packet>::type ei_predux_mul(const Packet& a)
 { return a; }
 
+/** \internal \returns the min of the elements of \a a*/
+template<typename Packet> inline typename ei_unpacket_traits<Packet>::type ei_predux_min(const Packet& a)
+{ return a; }
+
+/** \internal \returns the max of the elements of \a a*/
+template<typename Packet> inline typename ei_unpacket_traits<Packet>::type ei_predux_max(const Packet& a)
+{ return a; }
+
 /** \internal \returns the reversed elements of \a a*/
 template<typename Packet> inline Packet ei_preverse(const Packet& a)
 { return a; }
diff --git a/Eigen/src/Core/Prod.h b/Eigen/src/Core/Prod.h
deleted file mode 100644
index 8ad22f579..000000000
--- a/Eigen/src/Core/Prod.h
+++ /dev/null
@@ -1,262 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// Eigen is free software; you can redistribute it and/or
-// modify it under the terms of the GNU Lesser General Public
-// License as published by the Free Software Foundation; either
-// version 3 of the License, or (at your option) any later version.
-//
-// Alternatively, you can redistribute it and/or
-// modify it under the terms of the GNU General Public License as
-// published by the Free Software Foundation; either version 2 of
-// the License, or (at your option) any later version.
-//
-// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
-// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU Lesser General Public
-// License and a copy of the GNU General Public License along with
-// Eigen. If not, see <http://www.gnu.org/licenses/>.
-
-#ifndef EIGEN_PROD_H
-#define EIGEN_PROD_H
-
-/***************************************************************************
-* Part 1 : the logic deciding a strategy for vectorization and unrolling
-***************************************************************************/
-
-template<typename Derived>
-struct ei_prod_traits
-{
-private:
-  enum {
-    PacketSize = ei_packet_traits<typename Derived::Scalar>::size
-  };
-
-public:
-  enum {
-    Vectorization = (int(Derived::Flags)&ActualPacketAccessBit)
-                 && (int(Derived::Flags)&LinearAccessBit)
-                  ? LinearVectorization
-                  : NoVectorization
-  };
-
-private:
-  enum {
-    Cost = Derived::SizeAtCompileTime * Derived::CoeffReadCost
-           + (Derived::SizeAtCompileTime-1) * NumTraits<typename Derived::Scalar>::MulCost,
-    UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Vectorization) == int(NoVectorization) ? 1 : int(PacketSize))
-  };
-
-public:
-  enum {
-    Unrolling = Cost <= UnrollingLimit
-              ? CompleteUnrolling
-              : NoUnrolling
-  };
-};
-
-/***************************************************************************
-* Part 2 : unrollers
-***************************************************************************/
-
-/*** no vectorization ***/
-
-template<typename Derived, int Start, int Length>
-struct ei_prod_novec_unroller
-{
-  enum {
-    HalfLength = Length/2
-  };
-
-  typedef typename Derived::Scalar Scalar;
-
-  inline static Scalar run(const Derived &mat)
-  {
-    return ei_prod_novec_unroller<Derived, Start, HalfLength>::run(mat)
-         * ei_prod_novec_unroller<Derived, Start+HalfLength, Length-HalfLength>::run(mat);
-  }
-};
-
-template<typename Derived, int Start>
-struct ei_prod_novec_unroller<Derived, Start, 1>
-{
-  enum {
-    col = Start / Derived::RowsAtCompileTime,
-    row = Start % Derived::RowsAtCompileTime
-  };
-
-  typedef typename Derived::Scalar Scalar;
-
-  inline static Scalar run(const Derived &mat)
-  {
-    return mat.coeff(row, col);
-  }
-};
-
-/*** vectorization ***/
-  
-template<typename Derived, int Start, int Length>
-struct ei_prod_vec_unroller
-{
-  enum {
-    PacketSize = ei_packet_traits<typename Derived::Scalar>::size,
-    HalfLength = Length/2
-  };
-
-  typedef typename Derived::Scalar Scalar;
-  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
-
-  inline static PacketScalar run(const Derived &mat)
-  {
-    return ei_pmul(
-            ei_prod_vec_unroller<Derived, Start, HalfLength>::run(mat),
-            ei_prod_vec_unroller<Derived, Start+HalfLength, Length-HalfLength>::run(mat) );
-  }
-};
-
-template<typename Derived, int Start>
-struct ei_prod_vec_unroller<Derived, Start, 1>
-{
-  enum {
-    index = Start * ei_packet_traits<typename Derived::Scalar>::size,
-    row = int(Derived::Flags)&RowMajorBit
-        ? index / int(Derived::ColsAtCompileTime)
-        : index % Derived::RowsAtCompileTime,
-    col = int(Derived::Flags)&RowMajorBit
-        ? index % int(Derived::ColsAtCompileTime)
-        : index / Derived::RowsAtCompileTime,
-    alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned
-  };
-
-  typedef typename Derived::Scalar Scalar;
-  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
-
-  inline static PacketScalar run(const Derived &mat)
-  {
-    return mat.template packet<alignment>(row, col);
-  }
-};
-
-/***************************************************************************
-* Part 3 : implementation of all cases
-***************************************************************************/
-
-template<typename Derived,
-         int Vectorization = ei_prod_traits<Derived>::Vectorization,
-         int Unrolling = ei_prod_traits<Derived>::Unrolling
->
-struct ei_prod_impl;
-
-template<typename Derived>
-struct ei_prod_impl<Derived, NoVectorization, NoUnrolling>
-{
-  typedef typename Derived::Scalar Scalar;
-  static Scalar run(const Derived& mat)
-  {
-    ei_assert(mat.rows()>0 && mat.cols()>0 && "you are using a non initialized matrix");
-    Scalar res;
-    res = mat.coeff(0, 0);
-    for(int i = 1; i < mat.rows(); ++i)
-      res *= mat.coeff(i, 0);
-    for(int j = 1; j < mat.cols(); ++j)
-      for(int i = 0; i < mat.rows(); ++i)
-        res *= mat.coeff(i, j);
-    return res;
-  }
-};
-
-template<typename Derived>
-struct ei_prod_impl<Derived, NoVectorization, CompleteUnrolling>
-  : public ei_prod_novec_unroller<Derived, 0, Derived::SizeAtCompileTime>
-{};
-
-template<typename Derived>
-struct ei_prod_impl<Derived, LinearVectorization, NoUnrolling>
-{
-  typedef typename Derived::Scalar Scalar;
-  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
-
-  static Scalar run(const Derived& mat)
-  {
-    const int size = mat.size();
-    const int packetSize = ei_packet_traits<Scalar>::size;
-    const int alignedStart =  (Derived::Flags & AlignedBit)
-                           || !(Derived::Flags & DirectAccessBit)
-                           ? 0
-                           : ei_alignmentOffset(&mat.const_cast_derived().coeffRef(0), size);
-    enum {
-      alignment = (Derived::Flags & DirectAccessBit) || (Derived::Flags & AlignedBit)
-                ? Aligned : Unaligned
-    };
-    const int alignedSize = ((size-alignedStart)/packetSize)*packetSize;
-    const int alignedEnd = alignedStart + alignedSize;
-    Scalar res;
-
-    if(alignedSize)
-    {
-      PacketScalar packet_res = mat.template packet<alignment>(alignedStart);
-      for(int index = alignedStart + packetSize; index < alignedEnd; index += packetSize)
-        packet_res = ei_pmul(packet_res, mat.template packet<alignment>(index));
-      res = ei_predux_mul(packet_res);
-    }
-    else // too small to vectorize anything.
-         // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
-    {
-      res = Scalar(1);
-    }
-
-    for(int index = 0; index < alignedStart; ++index)
-      res *= mat.coeff(index);
-
-    for(int index = alignedEnd; index < size; ++index)
-      res *= mat.coeff(index);
-
-    return res;
-  }
-};
-
-template<typename Derived>
-struct ei_prod_impl<Derived, LinearVectorization, CompleteUnrolling>
-{
-  typedef typename Derived::Scalar Scalar;
-  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
-  enum {
-    PacketSize = ei_packet_traits<Scalar>::size,
-    Size = Derived::SizeAtCompileTime,
-    VectorizationSize = (Size / PacketSize) * PacketSize
-  };
-  static Scalar run(const Derived& mat)
-  {
-    Scalar res = ei_predux_mul(ei_prod_vec_unroller<Derived, 0, Size / PacketSize>::run(mat));
-    if (VectorizationSize != Size)
-      res *= ei_prod_novec_unroller<Derived, VectorizationSize, Size-VectorizationSize>::run(mat);
-    return res;
-  }
-};
-
-/***************************************************************************
-* Part 4 : implementation of MatrixBase methods
-***************************************************************************/
-
-/** \returns the product of all coefficients of *this
-  *
-  * Example: \include MatrixBase_prod.cpp
-  * Output: \verbinclude MatrixBase_prod.out
-  *
-  * \sa sum()
-  */
-template<typename Derived>
-inline typename ei_traits<Derived>::Scalar
-MatrixBase<Derived>::prod() const
-{
-  typedef typename ei_cleantype<typename Derived::Nested>::type ThisNested;
-  return ei_prod_impl<ThisNested>::run(derived());
-}
-
-#endif // EIGEN_PROD_H
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 5b5907073..889a12d30 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -26,48 +26,147 @@
 #ifndef EIGEN_REDUX_H
 #define EIGEN_REDUX_H
 
-template<typename BinaryOp, typename Derived, int Start, int Length>
-struct ei_redux_impl
+// TODO
+//  * implement other kind of vectorization
+//  * factorize code
+
+/***************************************************************************
+* Part 1 : the logic deciding a strategy for vectorization and unrolling
+***************************************************************************/
+
+template<typename Func, typename Derived>
+struct ei_redux_traits
+{
+private:
+  enum {
+    PacketSize = ei_packet_traits<typename Derived::Scalar>::size
+  };
+
+public:
+  enum {
+    Vectorization = (int(Derived::Flags)&ActualPacketAccessBit)
+                 && (int(Derived::Flags)&LinearAccessBit)
+                 && (ei_functor_traits<Func>::PacketAccess)
+                  ? LinearVectorization
+                  : NoVectorization
+  };
+
+private:
+  enum {
+    Cost = Derived::SizeAtCompileTime * Derived::CoeffReadCost
+           + (Derived::SizeAtCompileTime-1) * NumTraits<typename Derived::Scalar>::AddCost,
+    UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Vectorization) == int(NoVectorization) ? 1 : int(PacketSize))
+  };
+
+public:
+  enum {
+    Unrolling = Cost <= UnrollingLimit
+              ? CompleteUnrolling
+              : NoUnrolling
+  };
+};
+
+/***************************************************************************
+* Part 2 : unrollers
+***************************************************************************/
+
+/*** no vectorization ***/
+
+template<typename Func, typename Derived, int Start, int Length>
+struct ei_redux_novec_unroller
 {
   enum {
     HalfLength = Length/2
   };
 
-  typedef typename ei_result_of<BinaryOp(typename Derived::Scalar)>::type Scalar;
+  typedef typename Derived::Scalar Scalar;
 
-  static Scalar run(const Derived &mat, const BinaryOp& func)
+  EIGEN_STRONG_INLINE static Scalar run(const Derived &mat, const Func& func)
   {
-    return func(
-      ei_redux_impl<BinaryOp, Derived, Start, HalfLength>::run(mat, func),
-      ei_redux_impl<BinaryOp, Derived, Start+HalfLength, Length - HalfLength>::run(mat, func));
+    return func(ei_redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
+                ei_redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func));
   }
 };
 
-template<typename BinaryOp, typename Derived, int Start>
-struct ei_redux_impl<BinaryOp, Derived, Start, 1>
+template<typename Func, typename Derived, int Start>
+struct ei_redux_novec_unroller<Func, Derived, Start, 1>
 {
   enum {
     col = Start / Derived::RowsAtCompileTime,
     row = Start % Derived::RowsAtCompileTime
   };
 
-  typedef typename ei_result_of<BinaryOp(typename Derived::Scalar)>::type Scalar;
+  typedef typename Derived::Scalar Scalar;
 
-  static Scalar run(const Derived &mat, const BinaryOp &)
+  EIGEN_STRONG_INLINE static Scalar run(const Derived &mat, const Func&)
   {
     return mat.coeff(row, col);
   }
 };
 
-template<typename BinaryOp, typename Derived, int Start>
-struct ei_redux_impl<BinaryOp, Derived, Start, Dynamic>
+/*** vectorization ***/
+  
+template<typename Func, typename Derived, int Start, int Length>
+struct ei_redux_vec_unroller
 {
-  typedef typename ei_result_of<BinaryOp(typename Derived::Scalar)>::type Scalar;
-  static Scalar run(const Derived& mat, const BinaryOp& func)
+  enum {
+    PacketSize = ei_packet_traits<typename Derived::Scalar>::size,
+    HalfLength = Length/2
+  };
+
+  typedef typename Derived::Scalar Scalar;
+  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+
+  EIGEN_STRONG_INLINE static PacketScalar run(const Derived &mat, const Func& func)
+  {
+    return func.packetOp(
+            ei_redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
+            ei_redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) );
+  }
+};
+
+template<typename Func, typename Derived, int Start>
+struct ei_redux_vec_unroller<Func, Derived, Start, 1>
+{
+  enum {
+    index = Start * ei_packet_traits<typename Derived::Scalar>::size,
+    row = int(Derived::Flags)&RowMajorBit
+        ? index / int(Derived::ColsAtCompileTime)
+        : index % Derived::RowsAtCompileTime,
+    col = int(Derived::Flags)&RowMajorBit
+        ? index % int(Derived::ColsAtCompileTime)
+        : index / Derived::RowsAtCompileTime,
+    alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned
+  };
+
+  typedef typename Derived::Scalar Scalar;
+  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+
+  EIGEN_STRONG_INLINE static PacketScalar run(const Derived &mat, const Func&)
+  {
+    return mat.template packet<alignment>(row, col);
+  }
+};
+
+/***************************************************************************
+* Part 3 : implementation of all cases
+***************************************************************************/
+
+template<typename Func, typename Derived,
+         int Vectorization = ei_redux_traits<Func, Derived>::Vectorization,
+         int Unrolling = ei_redux_traits<Func, Derived>::Unrolling
+>
+struct ei_redux_impl;
+
+template<typename Func, typename Derived>
+struct ei_redux_impl<Func, Derived, NoVectorization, NoUnrolling>
+{
+  typedef typename Derived::Scalar Scalar;
+  static Scalar run(const Derived& mat, const Func& func)
   {
     ei_assert(mat.rows()>0 && mat.cols()>0 && "you are using a non initialized matrix");
     Scalar res;
-    res = mat.coeff(0,0);
+    res = mat.coeff(0, 0);
     for(int i = 1; i < mat.rows(); ++i)
       res = func(res, mat.coeff(i, 0));
     for(int j = 1; j < mat.cols(); ++j)
@@ -77,6 +176,77 @@ struct ei_redux_impl<BinaryOp, Derived, Start, Dynamic>
   }
 };
 
+template<typename Func, typename Derived>
+struct ei_redux_impl<Func,Derived, NoVectorization, CompleteUnrolling>
+  : public ei_redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime>
+{};
+
+template<typename Func, typename Derived>
+struct ei_redux_impl<Func, Derived, LinearVectorization, NoUnrolling>
+{
+  typedef typename Derived::Scalar Scalar;
+  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+
+  static Scalar run(const Derived& mat, const Func& func)
+  {
+    const int size = mat.size();
+    const int packetSize = ei_packet_traits<Scalar>::size;
+    const int alignedStart =  (Derived::Flags & AlignedBit)
+                           || !(Derived::Flags & DirectAccessBit)
+                           ? 0
+                           : ei_alignmentOffset(&mat.const_cast_derived().coeffRef(0), size);
+    enum {
+      alignment = (Derived::Flags & DirectAccessBit) || (Derived::Flags & AlignedBit)
+                ? Aligned : Unaligned
+    };
+    const int alignedSize = ((size-alignedStart)/packetSize)*packetSize;
+    const int alignedEnd = alignedStart + alignedSize;
+    Scalar res;
+    if(alignedSize)
+    {
+      PacketScalar packet_res = mat.template packet<alignment>(alignedStart);
+      for(int index = alignedStart + packetSize; index < alignedEnd; index += packetSize)
+        packet_res = func.packetOp(packet_res, mat.template packet<alignment>(index));
+      res = func.predux(packet_res);
+      
+      for(int index = 0; index < alignedStart; ++index)
+        res = func(res,mat.coeff(index));
+
+      for(int index = alignedEnd; index < size; ++index)
+        res = func(res,mat.coeff(index));
+    }
+    else // too small to vectorize anything.
+         // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
+    {
+      res = mat.coeff(0);
+      for(int index = 1; index < size; ++index)
+        res = func(res,mat.coeff(index));
+    }
+
+    return res;
+  }
+};
+
+template<typename Func, typename Derived>
+struct ei_redux_impl<Func, Derived, LinearVectorization, CompleteUnrolling>
+{
+  typedef typename Derived::Scalar Scalar;
+  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
+  enum {
+    PacketSize = ei_packet_traits<Scalar>::size,
+    Size = Derived::SizeAtCompileTime,
+    VectorizationSize = (Size / PacketSize) * PacketSize
+  };
+  EIGEN_STRONG_INLINE static Scalar run(const Derived& mat, const Func& func)
+  {
+    Scalar res = func.predux(ei_redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+    if (VectorizationSize != Size)
+      res = func(res,ei_redux_novec_unroller<Func, Derived, VectorizationSize, Size-VectorizationSize>::run(mat,func));
+    return res;
+  }
+};
+
+
 /** \returns the result of a full redux operation on the whole matrix or vector using \a func
   *
   * The template parameter \a BinaryOp is the type of the functor \a func which must be
@@ -85,22 +255,20 @@ struct ei_redux_impl<BinaryOp, Derived, Start, Dynamic>
   * \sa MatrixBase::sum(), MatrixBase::minCoeff(), MatrixBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise()
   */
 template<typename Derived>
-template<typename BinaryOp>
-typename ei_result_of<BinaryOp(typename ei_traits<Derived>::Scalar)>::type
-MatrixBase<Derived>::redux(const BinaryOp& func) const
+template<typename Func>
+inline typename ei_result_of<Func(typename ei_traits<Derived>::Scalar)>::type
+MatrixBase<Derived>::redux(const Func& func) const
 {
-  const bool unroll = SizeAtCompileTime * CoeffReadCost
-                    + (SizeAtCompileTime-1) * ei_functor_traits<BinaryOp>::Cost
-                    <= EIGEN_UNROLLING_LIMIT;
+  typename Derived::Nested nested(derived());
   typedef typename ei_cleantype<typename Derived::Nested>::type ThisNested;
-  return ei_redux_impl<BinaryOp, ThisNested, 0, unroll ? int(SizeAtCompileTime) : Dynamic>
-            ::run(derived(), func);
+  return ei_redux_impl<Func, ThisNested>
+            ::run(nested, func);
 }
 
 /** \returns the minimum of all coefficients of *this
   */
 template<typename Derived>
-inline typename ei_traits<Derived>::Scalar
+EIGEN_STRONG_INLINE typename ei_traits<Derived>::Scalar
 MatrixBase<Derived>::minCoeff() const
 {
   return this->redux(Eigen::ei_scalar_min_op<Scalar>());
@@ -109,10 +277,48 @@ MatrixBase<Derived>::minCoeff() const
 /** \returns the maximum of all coefficients of *this
   */
 template<typename Derived>
-inline typename ei_traits<Derived>::Scalar
+EIGEN_STRONG_INLINE typename ei_traits<Derived>::Scalar
 MatrixBase<Derived>::maxCoeff() const
 {
   return this->redux(Eigen::ei_scalar_max_op<Scalar>());
 }
 
+/** \returns the sum of all coefficients of *this
+  *
+  * \sa trace(), prod()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE typename ei_traits<Derived>::Scalar
+MatrixBase<Derived>::sum() const
+{
+  return this->redux(Eigen::ei_scalar_sum_op<Scalar>());
+}
+
+/** \returns the product of all coefficients of *this
+  *
+  * Example: \include MatrixBase_prod.cpp
+  * Output: \verbinclude MatrixBase_prod.out
+  *
+  * \sa sum()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE typename ei_traits<Derived>::Scalar
+MatrixBase<Derived>::prod() const
+{
+  return this->redux(Eigen::ei_scalar_product_op<Scalar>());
+}
+
+/** \returns the trace of \c *this, i.e. the sum of the coefficients on the main diagonal.
+  *
+  * \c *this can be any matrix, not necessarily square.
+  *
+  * \sa diagonal(), sum()
+  */
+template<typename Derived>
+EIGEN_STRONG_INLINE typename ei_traits<Derived>::Scalar
+MatrixBase<Derived>::trace() const
+{
+  return diagonal().sum();
+}
+
 #endif // EIGEN_REDUX_H
diff --git a/Eigen/src/Core/Sum.h b/Eigen/src/Core/Sum.h
deleted file mode 100644
index 61ff7e4a6..000000000
--- a/Eigen/src/Core/Sum.h
+++ /dev/null
@@ -1,272 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// Eigen is free software; you can redistribute it and/or
-// modify it under the terms of the GNU Lesser General Public
-// License as published by the Free Software Foundation; either
-// version 3 of the License, or (at your option) any later version.
-//
-// Alternatively, you can redistribute it and/or
-// modify it under the terms of the GNU General Public License as
-// published by the Free Software Foundation; either version 2 of
-// the License, or (at your option) any later version.
-//
-// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
-// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU Lesser General Public
-// License and a copy of the GNU General Public License along with
-// Eigen. If not, see <http://www.gnu.org/licenses/>.
-
-#ifndef EIGEN_SUM_H
-#define EIGEN_SUM_H
-
-/***************************************************************************
-* Part 1 : the logic deciding a strategy for vectorization and unrolling
-***************************************************************************/
-
-template<typename Derived>
-struct ei_sum_traits
-{
-private:
-  enum {
-    PacketSize = ei_packet_traits<typename Derived::Scalar>::size
-  };
-
-public:
-  enum {
-    Vectorization = (int(Derived::Flags)&ActualPacketAccessBit)
-                 && (int(Derived::Flags)&LinearAccessBit)
-                  ? LinearVectorization
-                  : NoVectorization
-  };
-
-private:
-  enum {
-    Cost = Derived::SizeAtCompileTime * Derived::CoeffReadCost
-           + (Derived::SizeAtCompileTime-1) * NumTraits<typename Derived::Scalar>::AddCost,
-    UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Vectorization) == int(NoVectorization) ? 1 : int(PacketSize))
-  };
-
-public:
-  enum {
-    Unrolling = Cost <= UnrollingLimit
-              ? CompleteUnrolling
-              : NoUnrolling
-  };
-};
-
-/***************************************************************************
-* Part 2 : unrollers
-***************************************************************************/
-
-/*** no vectorization ***/
-
-template<typename Derived, int Start, int Length>
-struct ei_sum_novec_unroller
-{
-  enum {
-    HalfLength = Length/2
-  };
-
-  typedef typename Derived::Scalar Scalar;
-
-  inline static Scalar run(const Derived &mat)
-  {
-    return ei_sum_novec_unroller<Derived, Start, HalfLength>::run(mat)
-         + ei_sum_novec_unroller<Derived, Start+HalfLength, Length-HalfLength>::run(mat);
-  }
-};
-
-template<typename Derived, int Start>
-struct ei_sum_novec_unroller<Derived, Start, 1>
-{
-  enum {
-    col = Start / Derived::RowsAtCompileTime,
-    row = Start % Derived::RowsAtCompileTime
-  };
-
-  typedef typename Derived::Scalar Scalar;
-
-  inline static Scalar run(const Derived &mat)
-  {
-    return mat.coeff(row, col);
-  }
-};
-
-/*** vectorization ***/
-  
-template<typename Derived, int Start, int Length>
-struct ei_sum_vec_unroller
-{
-  enum {
-    PacketSize = ei_packet_traits<typename Derived::Scalar>::size,
-    HalfLength = Length/2
-  };
-
-  typedef typename Derived::Scalar Scalar;
-  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
-
-  inline static PacketScalar run(const Derived &mat)
-  {
-    return ei_padd(
-            ei_sum_vec_unroller<Derived, Start, HalfLength>::run(mat),
-            ei_sum_vec_unroller<Derived, Start+HalfLength, Length-HalfLength>::run(mat) );
-  }
-};
-
-template<typename Derived, int Start>
-struct ei_sum_vec_unroller<Derived, Start, 1>
-{
-  enum {
-    index = Start * ei_packet_traits<typename Derived::Scalar>::size,
-    row = int(Derived::Flags)&RowMajorBit
-        ? index / int(Derived::ColsAtCompileTime)
-        : index % Derived::RowsAtCompileTime,
-    col = int(Derived::Flags)&RowMajorBit
-        ? index % int(Derived::ColsAtCompileTime)
-        : index / Derived::RowsAtCompileTime,
-    alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned
-  };
-
-  typedef typename Derived::Scalar Scalar;
-  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
-
-  inline static PacketScalar run(const Derived &mat)
-  {
-    return mat.template packet<alignment>(row, col);
-  }
-};
-
-/***************************************************************************
-* Part 3 : implementation of all cases
-***************************************************************************/
-
-template<typename Derived,
-         int Vectorization = ei_sum_traits<Derived>::Vectorization,
-         int Unrolling = ei_sum_traits<Derived>::Unrolling
->
-struct ei_sum_impl;
-
-template<typename Derived>
-struct ei_sum_impl<Derived, NoVectorization, NoUnrolling>
-{
-  typedef typename Derived::Scalar Scalar;
-  static Scalar run(const Derived& mat)
-  {
-    ei_assert(mat.rows()>0 && mat.cols()>0 && "you are using a non initialized matrix");
-    Scalar res;
-    res = mat.coeff(0, 0);
-    for(int i = 1; i < mat.rows(); ++i)
-      res += mat.coeff(i, 0);
-    for(int j = 1; j < mat.cols(); ++j)
-      for(int i = 0; i < mat.rows(); ++i)
-        res += mat.coeff(i, j);
-    return res;
-  }
-};
-
-template<typename Derived>
-struct ei_sum_impl<Derived, NoVectorization, CompleteUnrolling>
-  : public ei_sum_novec_unroller<Derived, 0, Derived::SizeAtCompileTime>
-{};
-
-template<typename Derived>
-struct ei_sum_impl<Derived, LinearVectorization, NoUnrolling>
-{
-  typedef typename Derived::Scalar Scalar;
-  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
-
-  static Scalar run(const Derived& mat)
-  {
-    const int size = mat.size();
-    const int packetSize = ei_packet_traits<Scalar>::size;
-    const int alignedStart =  (Derived::Flags & AlignedBit)
-                           || !(Derived::Flags & DirectAccessBit)
-                           ? 0
-                           : ei_alignmentOffset(&mat.const_cast_derived().coeffRef(0), size);
-    enum {
-      alignment = (Derived::Flags & DirectAccessBit) || (Derived::Flags & AlignedBit)
-                ? Aligned : Unaligned
-    };
-    const int alignedSize = ((size-alignedStart)/packetSize)*packetSize;
-    const int alignedEnd = alignedStart + alignedSize;
-    Scalar res;
-
-    if(alignedSize)
-    {
-      PacketScalar packet_res = mat.template packet<alignment>(alignedStart);
-      for(int index = alignedStart + packetSize; index < alignedEnd; index += packetSize)
-        packet_res = ei_padd(packet_res, mat.template packet<alignment>(index));
-      res = ei_predux(packet_res);
-    }
-    else // too small to vectorize anything.
-         // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
-    {
-      res = Scalar(0);
-    }
-
-    for(int index = 0; index < alignedStart; ++index)
-      res += mat.coeff(index);
-
-    for(int index = alignedEnd; index < size; ++index)
-      res += mat.coeff(index);
-
-    return res;
-  }
-};
-
-template<typename Derived>
-struct ei_sum_impl<Derived, LinearVectorization, CompleteUnrolling>
-{
-  typedef typename Derived::Scalar Scalar;
-  typedef typename ei_packet_traits<Scalar>::type PacketScalar;
-  enum {
-    PacketSize = ei_packet_traits<Scalar>::size,
-    Size = Derived::SizeAtCompileTime,
-    VectorizationSize = (Size / PacketSize) * PacketSize
-  };
-  static Scalar run(const Derived& mat)
-  {
-    Scalar res = ei_predux(ei_sum_vec_unroller<Derived, 0, Size / PacketSize>::run(mat));
-    if (VectorizationSize != Size)
-      res += ei_sum_novec_unroller<Derived, VectorizationSize, Size-VectorizationSize>::run(mat);
-    return res;
-  }
-};
-
-/***************************************************************************
-* Part 4 : implementation of MatrixBase methods
-***************************************************************************/
-
-/** \returns the sum of all coefficients of *this
-  *
-  * \sa trace(), prod()
-  */
-template<typename Derived>
-inline typename ei_traits<Derived>::Scalar
-MatrixBase<Derived>::sum() const
-{
-  typedef typename ei_cleantype<typename Derived::Nested>::type ThisNested;
-  return ei_sum_impl<ThisNested>::run(derived());
-}
-
-/** \returns the trace of \c *this, i.e. the sum of the coefficients on the main diagonal.
-  *
-  * \c *this can be any matrix, not necessarily square.
-  *
-  * \sa diagonal(), sum()
-  */
-template<typename Derived>
-inline typename ei_traits<Derived>::Scalar
-MatrixBase<Derived>::trace() const
-{
-  return diagonal().sum();
-}
-
-#endif // EIGEN_SUM_H
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 37eafa81f..b90a6fae1 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -177,7 +177,7 @@ template<> inline v4f  ei_ploadu(const float*  from)
   return (v4f) vec_perm(MSQ, LSQ, mask);           // align the data
 }
 
-template<> inline v4i    ei_ploadu(const int*    from)
+template<> inline v4i  ei_ploadu(const int*    from)
 {
   // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
   __vector unsigned char MSQ, LSQ;
@@ -198,7 +198,7 @@ template<> inline v4f  ei_pset1(const float&  from)
   return vc;
 }
 
-template<> inline v4i    ei_pset1(const int&    from)
+template<> inline v4i  ei_pset1(const int&    from)
 {
   int __attribute__(aligned(16)) ai[4];
   ai[0] = from;
@@ -248,26 +248,28 @@ template<> inline void ei_pstoreu(int*    to , const v4i&    from )
 
 template<> inline float  ei_pfirst(const v4f&  a)
 {
-  float __attribute__(aligned(16)) af[4];
+  float EIGEN_ALIGN_128 af[4];
   vec_st(a, 0, af);
   return af[0];
 }
 
 template<> inline int    ei_pfirst(const v4i&  a)
 {
-  int __attribute__(aligned(16)) ai[4];
+  int EIGEN_ALIGN_128 ai[4];
   vec_st(a, 0, ai);
   return ai[0];
 }
 
 template<> EIGEN_STRONG_INLINE v4f ei_preverse(const v4f& a)
 {
-  static const __vector unsigned char reverse_mask = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
+  static const __vector unsigned char reverse_mask =
+    {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
   return (v4f)vec_perm((__vector unsigned char)a,(__vector unsigned char)a,reverse_mask);
 }
 template<> EIGEN_STRONG_INLINE v4i ei_preverse(const v4i& a)
 {
-  static const __vector unsigned char __attribute__(aligned(16)) reverse_mask = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
+  static const __vector unsigned char __attribute__(aligned(16)) reverse_mask =
+    {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
   return (v4i)vec_perm((__vector unsigned char)a,(__vector unsigned char)a,reverse_mask);
 }
 
@@ -344,26 +346,59 @@ inline int ei_predux(const v4i& a)
   return ei_pfirst(sum);
 }
 
+// implement other reductions operators
 inline float ei_predux_mul(const v4f& a)
 {
-  v4f b, sum;
-  b = (v4f)vec_sld(a, a, 8);
-  sum = ei_pmul(a, b);
-  b = (v4f)vec_sld(sum, sum, 4);
-  sum = ei_pmul(sum, b);
-  return ei_pfirst(sum);
+  v4f prod;
+  prod = ei_pmul(a, (v4f)vec_sld(a, a, 8));
+  return ei_pfirst(ei_pmul(prod, (v4f)vec_sld(prod, prod, 4)));
 }
 
 inline int ei_predux_mul(const v4i& a)
 {
-  v4i b, sum;
-  b = (v4i)vec_sld(a, a, 8);
-  sum = ei_pmul(a, b);
-  b = (v4i)vec_sld(sum, sum, 4);
-  sum = ei_pmul(sum, b);
-  return ei_pfirst(sum);
+  EIGEN_ALIGN_128 int aux[4];
+  ei_pstore(aux, a);
+  return aux[0] * aux[1] * aux[2] * aux[3];
 }
 
+inline float ei_predux_min(const v4f& a)
+{
+  EIGEN_ALIGN_128 float aux[4];
+  ei_pstore(aux, a);
+  register float aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
+  register float aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
+  return aux0<aux2 ? aux0 : aux2;
+}
+
+inline int ei_predux_min(const v4i& a)
+{
+  EIGEN_ALIGN_128 int aux[4];
+  ei_pstore(aux, a);
+  register int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
+  register int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
+  return aux0<aux2 ? aux0 : aux2;
+}
+
+inline float ei_predux_max(const v4f& a)
+{
+  EIGEN_ALIGN_128 float aux[4];
+  ei_pstore(aux, a);
+  register float aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
+  register float aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
+  return aux0>aux2 ? aux0 : aux2;
+}
+
+inline int ei_predux_max(const v4i& a)
+{
+  EIGEN_ALIGN_128 int aux[4];
+  ei_pstore(aux, a);
+  register int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
+  register int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
+  return aux0>aux2 ? aux0 : aux2;
+}
+
+
+
 template<int Offset>
 struct ei_palign_impl<Offset, v4f>
 {
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 7bef79309..c2fbcd7cc 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -53,6 +53,7 @@ template<> EIGEN_STRONG_INLINE __m128  ei_pmul<__m128>(const __m128&  a, const _
 template<> EIGEN_STRONG_INLINE __m128d ei_pmul<__m128d>(const __m128d& a, const __m128d& b) { return _mm_mul_pd(a,b); }
 template<> EIGEN_STRONG_INLINE __m128i ei_pmul<__m128i>(const __m128i& a, const __m128i& b)
 {
+  // this version is very slightly faster than 4 scalar products
   return _mm_or_si128(
     _mm_and_si128(
       _mm_mul_epu32(a,b),
@@ -76,18 +77,18 @@ template<> EIGEN_STRONG_INLINE __m128i ei_pmadd(const __m128i& a, const __m128i&
 
 template<> EIGEN_STRONG_INLINE __m128  ei_pmin<__m128>(const __m128&  a, const __m128&  b) { return _mm_min_ps(a,b); }
 template<> EIGEN_STRONG_INLINE __m128d ei_pmin<__m128d>(const __m128d& a, const __m128d& b) { return _mm_min_pd(a,b); }
-// FIXME this vectorized min operator is likely to be slower than the standard one
 template<> EIGEN_STRONG_INLINE __m128i ei_pmin<__m128i>(const __m128i& a, const __m128i& b)
 {
+  // after some bench, this version *is* faster than a scalar implementation
   __m128i mask = _mm_cmplt_epi32(a,b);
   return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
 }
 
 template<> EIGEN_STRONG_INLINE __m128  ei_pmax<__m128>(const __m128&  a, const __m128&  b) { return _mm_max_ps(a,b); }
 template<> EIGEN_STRONG_INLINE __m128d ei_pmax<__m128d>(const __m128d& a, const __m128d& b) { return _mm_max_pd(a,b); }
-// FIXME this vectorized max operator is likely to be slower than the standard one
 template<> EIGEN_STRONG_INLINE __m128i ei_pmax<__m128i>(const __m128i& a, const __m128i& b)
 {
+  // after some bench, this version *is* faster than a scalar implementation
   __m128i mask = _mm_cmpgt_epi32(a,b);
   return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
 }
@@ -216,6 +217,7 @@ template<> EIGEN_STRONG_INLINE __m128i ei_preduxp<__m128i>(const __m128i* vecs)
 
 // Other reduction functions:
 
+// mul
 template<> EIGEN_STRONG_INLINE float ei_predux_mul<__m128>(const __m128& a)
 {
   __m128 tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
@@ -227,8 +229,54 @@ template<> EIGEN_STRONG_INLINE double ei_predux_mul<__m128d>(const __m128d& a)
 }
 template<> EIGEN_STRONG_INLINE int ei_predux_mul<__m128i>(const __m128i& a)
 {
-  __m128i tmp = ei_pmul(a, _mm_unpackhi_epi64(a,a));
-  return ei_pfirst(tmp) * ei_pfirst(_mm_shuffle_epi32(tmp, 1));
+  // after some experiments, it is seems this is the fastest way to implement it
+  // for GCC (eg., reusing ei_pmul is very slow !)
+  // TODO try to call _mm_mul_epu32 directly
+  EIGEN_ALIGN_128 int aux[4];
+  ei_pstore(aux, a);
+  return  (aux[0] * aux[1]) * (aux[2] * aux[3]);;
+}
+
+// min
+template<> EIGEN_STRONG_INLINE float ei_predux_min<__m128>(const __m128& a)
+{
+  __m128 tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
+  return ei_pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double ei_predux_min<__m128d>(const __m128d& a)
+{
+  return ei_pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
+}
+template<> EIGEN_STRONG_INLINE int ei_predux_min<__m128i>(const __m128i& a)
+{
+  // after some experiments, it is seems this is the fastest way to implement it
+  // for GCC (eg., it does not like using std::min after the ei_pstore !!)
+  EIGEN_ALIGN_128 int aux[4];
+  ei_pstore(aux, a);
+  register int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
+  register int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
+  return aux0<aux2 ? aux0 : aux2;
+}
+
+// max
+template<> EIGEN_STRONG_INLINE float ei_predux_max<__m128>(const __m128& a)
+{
+  __m128 tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
+  return ei_pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+}
+template<> EIGEN_STRONG_INLINE double ei_predux_max<__m128d>(const __m128d& a)
+{
+  return ei_pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
+}
+template<> EIGEN_STRONG_INLINE int ei_predux_max<__m128i>(const __m128i& a)
+{
+  // after some experiments, it is seems this is the fastest way to implement it
+  // for GCC (eg., it does not like using std::min after the ei_pstore !!)
+  EIGEN_ALIGN_128 int aux[4];
+  ei_pstore(aux, a);
+  register int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
+  register int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
+  return aux0>aux2 ? aux0 : aux2;
 }
 
 #if (defined __GNUC__)
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index a20a6a12f..4046cf533 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -111,7 +111,7 @@ ei_add_test(vectorization_logic)
 ei_add_test(basicstuff)
 ei_add_test(linearstructure)
 ei_add_test(cwiseop)
-ei_add_test(sum)
+ei_add_test(redux)
 ei_add_test(product_small)
 ei_add_test(product_large ${EI_OFLAG})
 ei_add_test(adjoint)
@@ -149,7 +149,6 @@ ei_add_test(sparse_basic)
 ei_add_test(sparse_product)
 ei_add_test(sparse_solvers " " "${SPARSE_LIBS}")
 ei_add_test(reverse)
-ei_add_test(prod)
 
 # print a summary of the different options
 message("************************************************************")
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 0397d9b28..f56532f91 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -129,6 +129,16 @@ template<typename Scalar> void packetmath()
   for (int i=0; i<PacketSize; ++i)
     ref[0] *= data1[i];
   VERIFY(ei_isApprox(ref[0], ei_predux_mul(ei_pload(data1))) && "ei_predux_mul");
+  
+  ref[0] = data1[0];
+  for (int i=0; i<PacketSize; ++i)
+    ref[0] = std::min(ref[0],data1[i]);
+  VERIFY(ei_isApprox(ref[0], ei_predux_min(ei_pload(data1))) && "ei_predux_min");
+  
+  ref[0] = data1[0];
+  for (int i=0; i<PacketSize; ++i)
+    ref[0] = std::min(ref[0],data1[i]);
+  VERIFY(ei_isApprox(ref[0], ei_predux_min(ei_pload(data1))) && "ei_predux_max");
 
   for (int j=0; j<PacketSize; ++j)
   {
diff --git a/test/prod.cpp b/test/prod.cpp
deleted file mode 100644
index 12f8c0c48..000000000
--- a/test/prod.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// Eigen is free software; you can redistribute it and/or
-// modify it under the terms of the GNU Lesser General Public
-// License as published by the Free Software Foundation; either
-// version 3 of the License, or (at your option) any later version.
-//
-// Alternatively, you can redistribute it and/or
-// modify it under the terms of the GNU General Public License as
-// published by the Free Software Foundation; either version 2 of
-// the License, or (at your option) any later version.
-//
-// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
-// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU Lesser General Public
-// License and a copy of the GNU General Public License along with
-// Eigen. If not, see <http://www.gnu.org/licenses/>.
-
-#include "main.h"
-
-template<typename MatrixType> void matrixProd(const MatrixType& m)
-{
-  typedef typename MatrixType::Scalar Scalar;
-
-  int rows = m.rows();
-  int cols = m.cols();
-
-  MatrixType m1 = MatrixType::Random(rows, cols);
-
-  VERIFY_IS_MUCH_SMALLER_THAN(MatrixType::Zero(rows, cols).prod(), Scalar(1));
-  VERIFY_IS_APPROX(MatrixType::Ones(rows, cols).prod(), Scalar(1));
-  Scalar x = Scalar(1);
-  for(int i = 0; i < rows; i++) for(int j = 0; j < cols; j++) x *= m1(i,j);
-  VERIFY_IS_APPROX(m1.prod(), x);
-}
-
-template<typename VectorType> void vectorProd(const VectorType& w)
-{
-  typedef typename VectorType::Scalar Scalar;
-  int size = w.size();
-
-  VectorType v = VectorType::Random(size);
-  for(int i = 1; i < size; i++)
-  {
-    Scalar s = Scalar(1);
-    for(int j = 0; j < i; j++) s *= v[j];
-    VERIFY_IS_APPROX(s, v.start(i).prod());
-  }
-
-  for(int i = 0; i < size-1; i++)
-  {
-    Scalar s = Scalar(1);
-    for(int j = i; j < size; j++) s *= v[j];
-    VERIFY_IS_APPROX(s, v.end(size-i).prod());
-  }
-
-  for(int i = 0; i < size/2; i++)
-  {
-    Scalar s = Scalar(1);
-    for(int j = i; j < size-i; j++) s *= v[j];
-    VERIFY_IS_APPROX(s, v.segment(i, size-2*i).prod());
-  }
-}
-
-void test_prod()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST( matrixProd(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST( matrixProd(Matrix2f()) );
-    CALL_SUBTEST( matrixProd(Matrix4d()) );
-    CALL_SUBTEST( matrixProd(MatrixXcf(3, 3)) );
-    CALL_SUBTEST( matrixProd(MatrixXf(8, 12)) );
-    CALL_SUBTEST( matrixProd(MatrixXi(8, 12)) );
-  }
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST( vectorProd(VectorXf(5)) );
-    CALL_SUBTEST( vectorProd(VectorXd(10)) );
-    CALL_SUBTEST( vectorProd(VectorXf(33)) );
-  }
-}
diff --git a/test/redux.cpp b/test/redux.cpp
new file mode 100644
index 000000000..dee9d4bbd
--- /dev/null
+++ b/test/redux.cpp
@@ -0,0 +1,127 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#include "main.h"
+
+template<typename MatrixType> void matrixRedux(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+
+  int rows = m.rows();
+  int cols = m.cols();
+
+  MatrixType m1 = MatrixType::Random(rows, cols);
+
+  VERIFY_IS_MUCH_SMALLER_THAN(MatrixType::Zero(rows, cols).sum(), Scalar(1));
+  VERIFY_IS_APPROX(MatrixType::Ones(rows, cols).sum(), Scalar(float(rows*cols))); // the float() here to shut up excessive MSVC warning about int->complex conversion being lossy
+  Scalar s(0), p(1), minc(ei_real(m1.coeff(0))), maxc(ei_real(m1.coeff(0)));
+  for(int j = 0; j < cols; j++)
+  for(int i = 0; i < rows; i++)
+  {
+    s += m1(i,j);
+    p *= m1(i,j);
+    minc = std::min(ei_real(minc), ei_real(m1(i,j)));
+    maxc = std::max(ei_real(maxc), ei_real(m1(i,j)));
+  }
+  VERIFY_IS_APPROX(m1.sum(), s);
+  VERIFY_IS_APPROX(m1.prod(), p);
+  VERIFY_IS_APPROX(m1.real().minCoeff(), ei_real(minc));
+  VERIFY_IS_APPROX(m1.real().maxCoeff(), ei_real(maxc));
+}
+
+template<typename VectorType> void vectorRedux(const VectorType& w)
+{
+  typedef typename VectorType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  int size = w.size();
+
+  VectorType v = VectorType::Random(size);
+  for(int i = 1; i < size; i++)
+  {
+    Scalar s(0), p(1);
+    RealScalar minc(ei_real(v.coeff(0))), maxc(ei_real(v.coeff(0)));
+    for(int j = 0; j < i; j++)
+    {
+      s += v[j];
+      p *= v[j];
+      minc = std::min(minc, ei_real(v[j]));
+      maxc = std::max(maxc, ei_real(v[j]));
+    }
+    VERIFY_IS_APPROX(s, v.start(i).sum());
+    VERIFY_IS_APPROX(p, v.start(i).prod());
+    VERIFY_IS_APPROX(minc, v.real().start(i).minCoeff());
+    VERIFY_IS_APPROX(maxc, v.real().start(i).maxCoeff());
+  }
+
+  for(int i = 0; i < size-1; i++)
+  {
+    Scalar s(0), p(1);
+    RealScalar minc(ei_real(v.coeff(i))), maxc(ei_real(v.coeff(i)));
+    for(int j = i; j < size; j++)
+    {
+      s += v[j];
+      p *= v[j];
+      minc = std::min(minc, ei_real(v[j]));
+      maxc = std::max(maxc, ei_real(v[j]));
+    }
+    VERIFY_IS_APPROX(s, v.end(size-i).sum());
+    VERIFY_IS_APPROX(p, v.end(size-i).prod());
+    VERIFY_IS_APPROX(minc, v.real().end(size-i).minCoeff());
+    VERIFY_IS_APPROX(maxc, v.real().end(size-i).maxCoeff());
+  }
+
+  for(int i = 0; i < size/2; i++)
+  {
+    Scalar s(0), p(1);
+    RealScalar minc(ei_real(v.coeff(i))), maxc(ei_real(v.coeff(i)));
+    for(int j = i; j < size-i; j++)
+    {
+      s += v[j];
+      p *= v[j];
+      minc = std::min(minc, ei_real(v[j]));
+      maxc = std::max(maxc, ei_real(v[j]));
+    }
+    VERIFY_IS_APPROX(s, v.segment(i, size-2*i).sum());
+    VERIFY_IS_APPROX(p, v.segment(i, size-2*i).prod());
+    VERIFY_IS_APPROX(minc, v.real().segment(i, size-2*i).minCoeff());
+    VERIFY_IS_APPROX(maxc, v.real().segment(i, size-2*i).maxCoeff());
+  }
+}
+
+void test_redux()
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST( matrixRedux(Matrix<float, 1, 1>()) );
+    CALL_SUBTEST( matrixRedux(Matrix2f()) );
+    CALL_SUBTEST( matrixRedux(Matrix4d()) );
+    CALL_SUBTEST( matrixRedux(MatrixXcf(3, 3)) );
+    CALL_SUBTEST( matrixRedux(MatrixXd(8, 12)) );
+    CALL_SUBTEST( matrixRedux(MatrixXi(8, 12)) );
+  }
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST( vectorRedux(VectorXf(5)) );
+    CALL_SUBTEST( vectorRedux(VectorXd(10)) );
+    CALL_SUBTEST( vectorRedux(VectorXf(33)) );
+  }
+}
diff --git a/test/sum.cpp b/test/sum.cpp
deleted file mode 100644
index fe707e9b2..000000000
--- a/test/sum.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. Eigen itself is part of the KDE project.
-//
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// Eigen is free software; you can redistribute it and/or
-// modify it under the terms of the GNU Lesser General Public
-// License as published by the Free Software Foundation; either
-// version 3 of the License, or (at your option) any later version.
-//
-// Alternatively, you can redistribute it and/or
-// modify it under the terms of the GNU General Public License as
-// published by the Free Software Foundation; either version 2 of
-// the License, or (at your option) any later version.
-//
-// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
-// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU Lesser General Public
-// License and a copy of the GNU General Public License along with
-// Eigen. If not, see <http://www.gnu.org/licenses/>.
-
-#include "main.h"
-
-template<typename MatrixType> void matrixSum(const MatrixType& m)
-{
-  typedef typename MatrixType::Scalar Scalar;
-
-  int rows = m.rows();
-  int cols = m.cols();
-
-  MatrixType m1 = MatrixType::Random(rows, cols);
-
-  VERIFY_IS_MUCH_SMALLER_THAN(MatrixType::Zero(rows, cols).sum(), Scalar(1));
-  VERIFY_IS_APPROX(MatrixType::Ones(rows, cols).sum(), Scalar(float(rows*cols))); // the float() here to shut up excessive MSVC warning about int->complex conversion being lossy
-  Scalar x = Scalar(0);
-  for(int i = 0; i < rows; i++) for(int j = 0; j < cols; j++) x += m1(i,j);
-  VERIFY_IS_APPROX(m1.sum(), x);
-}
-
-template<typename VectorType> void vectorSum(const VectorType& w)
-{
-  typedef typename VectorType::Scalar Scalar;
-  int size = w.size();
-
-  VectorType v = VectorType::Random(size);
-  for(int i = 1; i < size; i++)
-  {
-    Scalar s = Scalar(0);
-    for(int j = 0; j < i; j++) s += v[j];
-    VERIFY_IS_APPROX(s, v.start(i).sum());
-  }
-
-  for(int i = 0; i < size-1; i++)
-  {
-    Scalar s = Scalar(0);
-    for(int j = i; j < size; j++) s += v[j];
-    VERIFY_IS_APPROX(s, v.end(size-i).sum());
-  }
-
-  for(int i = 0; i < size/2; i++)
-  {
-    Scalar s = Scalar(0);
-    for(int j = i; j < size-i; j++) s += v[j];
-    VERIFY_IS_APPROX(s, v.segment(i, size-2*i).sum());
-  }
-}
-
-void test_sum()
-{
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST( matrixSum(Matrix<float, 1, 1>()) );
-    CALL_SUBTEST( matrixSum(Matrix2f()) );
-    CALL_SUBTEST( matrixSum(Matrix4d()) );
-    CALL_SUBTEST( matrixSum(MatrixXcf(3, 3)) );
-    CALL_SUBTEST( matrixSum(MatrixXf(8, 12)) );
-    CALL_SUBTEST( matrixSum(MatrixXi(8, 12)) );
-  }
-  for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST( vectorSum(VectorXf(5)) );
-    CALL_SUBTEST( vectorSum(VectorXd(10)) );
-    CALL_SUBTEST( vectorSum(VectorXf(33)) );
-  }
-}