From fc3fd8ab57b334f37cbd512baf99e9b1850153f8 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 7 Jul 2010 18:10:11 +0200
Subject: [PATCH 01/17] mention that array = matrix is fine too

---
 doc/C03_TutorialArrayClass.dox | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/doc/C03_TutorialArrayClass.dox b/doc/C03_TutorialArrayClass.dox
index e82631878..0828bf39b 100644
--- a/doc/C03_TutorialArrayClass.dox
+++ b/doc/C03_TutorialArrayClass.dox
@@ -129,7 +129,7 @@ Mixing matrices and arrays in an expression is forbidden with Eigen. However,
 it is easy to convert from one to the other with \link MatrixBase::array() .array() \endlink and 
 \link ArrayBase::matrix() .matrix() \endlink.
 
-On the other hand, assigning a matrix expression to an array expression is allowed.
+On the other hand, assigning a matrix (resp. array) expression to an array (resp. matrix) expression is allowed.
 
 \subsection TutorialArrayClassInteropMatrix Matrix to array example
 The following example shows how to use array operations on a Matrix object by employing

From 2066ed91de31b118e20c40fdc74badac8d02dd22 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Wed, 7 Jul 2010 22:50:19 +0200
Subject: [PATCH 02/17] enabling aligned loads/store for complex<double> is
 much more tricky, so the temporary fix is to always perform unaligned
 load/store

---
 Eigen/src/Core/arch/SSE/Complex.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 4ecfc2f43..6c91386c6 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -261,12 +261,14 @@ template<> EIGEN_STRONG_INLINE Packet1cd ei_por    <Packet1cd>(const Packet1cd&
 template<> EIGEN_STRONG_INLINE Packet1cd ei_pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ei_pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(_mm_load_pd((const double*)from)); }
+// FIXME force unaligned load, this is a temporary fix 
+template<> EIGEN_STRONG_INLINE Packet1cd ei_pload <std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ei_ploadu<std::complex<double> >(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ei_ploadu((const double*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ei_pset1<std::complex<double> >(const std::complex<double>&  from)
 { /* here we really have to use unaligned loads :( */ return ei_ploadu(&from); }
 
-template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd((double*)to, from.v); }
+// FIXME force unaligned store, this is a temporary fix
+template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstoreu((double*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, from.v); }
 
 template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<double> >(const std::complex<double> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
@@ -274,7 +276,7 @@ template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<double> >(const std
 template<> EIGEN_STRONG_INLINE std::complex<double>  ei_pfirst<Packet1cd>(const Packet1cd& a)
 {
   EIGEN_ALIGN16 std::complex<double> res;
-  _mm_store_pd((double*)&res, a.v);
+  ei_pstore(&res, a);
   return res;
 }
 

From 9852e7b9cbb6cbfbd9769642355bdac52eb4b4d7 Mon Sep 17 00:00:00 2001
From: Carlos Becker <carlosbecker@gmail.com>
Date: Thu, 8 Jul 2010 17:42:23 +0100
Subject: [PATCH 03/17] Reductions/Broadcasting/Visitor Tutorial added

---
 ...TutorialReductionsVisitorsBroadcasting.dox | 199 ++++++++++++++++++
 ...ionsVisitorsBroadcasting_broadcast_1nn.cpp |  24 +++
 ...onsVisitorsBroadcasting_broadcast_1nn.cpp~ |  24 +++
 ...sVisitorsBroadcasting_broadcast_simple.cpp |  21 ++
 ...VisitorsBroadcasting_broadcast_simple.cpp~ |  21 ++
 ...sBroadcasting_broadcast_simple_rowwise.cpp |  20 ++
 ...ReductionsVisitorsBroadcasting_colwise.cpp |  13 ++
 ...ReductionsVisitorsBroadcasting_maxnorm.cpp |  19 ++
 ...ReductionsVisitorsBroadcasting_rowwise.cpp |  13 ++
 ...eductionsVisitorsBroadcasting_visitors.cpp |  26 +++
 ...ductionsVisitorsBroadcasting_visitors.cpp~ |  26 +++
 11 files changed, 406 insertions(+)
 create mode 100644 doc/C07_TutorialReductionsVisitorsBroadcasting.dox
 create mode 100644 doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.cpp
 create mode 100644 doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.cpp~
 create mode 100644 doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple.cpp
 create mode 100644 doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple.cpp~
 create mode 100644 doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple_rowwise.cpp
 create mode 100644 doc/examples/Tutorial_ReductionsVisitorsBroadcasting_colwise.cpp
 create mode 100644 doc/examples/Tutorial_ReductionsVisitorsBroadcasting_maxnorm.cpp
 create mode 100644 doc/examples/Tutorial_ReductionsVisitorsBroadcasting_rowwise.cpp
 create mode 100644 doc/examples/Tutorial_ReductionsVisitorsBroadcasting_visitors.cpp
 create mode 100644 doc/examples/Tutorial_ReductionsVisitorsBroadcasting_visitors.cpp~

diff --git a/doc/C07_TutorialReductionsVisitorsBroadcasting.dox b/doc/C07_TutorialReductionsVisitorsBroadcasting.dox
new file mode 100644
index 000000000..17124cb3b
--- /dev/null
+++ b/doc/C07_TutorialReductionsVisitorsBroadcasting.dox
@@ -0,0 +1,199 @@
+namespace Eigen {
+
+/** \page TutorialReductionsVisitorsBroadcasting Tutorial page 7 - Reductions, visitors and broadcasting
+    \ingroup Tutorial
+
+\li \b Previous: \ref TutorialAdvancedInitialization
+\li \b Next: \ref TutorialLinearAlgebra
+
+This tutorial explains Eigen's reductions, visitors and broadcasting and how they are used with
+\link MatrixBase matrices \endlink and \link ArrayBase arrays \endlink.
+
+\b Table \b of \b contents
+  - \ref TutorialReductionsVisitorsBroadcastingReductions
+    - FIXME: .redux()
+  - \ref TutorialReductionsVisitorsBroadcastingVisitors
+  - \ref TutorialReductionsVisitorsBroadcastingPartialReductions
+    - \ref TutorialReductionsVisitorsBroadcastingPartialReductionsCombined
+  - \ref TutorialReductionsVisitorsBroadcastingBroadcasting
+    - \ref TutorialReductionsVisitorsBroadcastingBroadcastingCombined
+
+
+\section TutorialReductionsVisitorsBroadcastingReductions Reductions
+In Eigen, a reduction is a function that is applied to a certain matrix or array, returning a single 
+value of type scalar. One of the most used reductions is \link MatrixBase::sum() .sum() \endlink,
+which returns the addition of all the coefficients inside a given matrix or array.
+
+<table class="tutorial_code"><tr><td>
+Example: \include tut_arithmetic_redux_basic.cpp
+</td>
+<td>
+Output: \include tut_arithmetic_redux_basic.out
+</td></tr></table>
+
+The \em trace of a matrix, as returned by the function \c trace(), is the sum of the diagonal coefficients and can also be computed as efficiently using <tt>a.diagonal().sum()</tt>, as we will see later on.
+
+\section TutorialReductionsVisitorsBroadcastingVisitors Visitors
+Visitors are useful when the location of a coefficient inside a Matrix or 
+\link ArrayBase Array \endlink wants to be obtained. The simplest example are the
+\link MatrixBase::maxCoeff() maxCoeff(&x,&y) \endlink and 
+\link MatrixBase::minCoeff() minCoeff(&x,&y) \endlink, that can be used to find
+the location of the greatest or smallest coefficient in a Matrix or 
+\link ArrayBase Array \endlink:
+
+The arguments passed to a visitor are pointers to the variables where the
+row and column position are to be stored. These variables are of type 
+\link DenseBase::Index Index \endlink FIXME: link? ok?, as shown below:
+
+<table class="tutorial_code"><tr><td>
+\include Tutorial_ReductionsVisitorsBroadcasting_visitors.cpp
+</td>
+<td>
+Output:
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_visitors.out
+</td></tr></table>
+
+Note that both functions also return the value of the minimum or maximum coefficient if needed,
+as if it was a typical reduction operation.
+
+\section TutorialReductionsVisitorsBroadcastingPartialReductions Partial reductions
+Partial reductions are reductions that can operate column- or row-wise on a Matrix or 
+\link ArrayBase Array \endlink, applying the reduction operation on each column or row and 
+returning a column or row-vector with the corresponding values. Partial reductions are applied 
+with \link DenseBase::colwise() colwise() \endlink or \link DenseBase::rowwise() rowwise() \endlink.
+
+A simple example is obtaining the sum of the elements 
+in each column in a given matrix, storing the result in a row-vector:
+
+<table class="tutorial_code"><tr><td>
+\include Tutorial_ReductionsVisitorsBroadcasting_colwise.cpp
+</td>
+<td>
+Output:
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_colwise.out
+</td></tr></table>
+
+The same operation can be performed row-wise:
+
+<table class="tutorial_code"><tr><td>
+\include Tutorial_ReductionsVisitorsBroadcasting_rowwise.cpp
+</td>
+<td>
+Output:
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_rowwise.out
+</td></tr></table>
+
+<b>Note that column-wise operations return a 'row-vector' while row-wise operations
+return a 'column-vector'</b>
+
+\subsection TutorialReductionsVisitorsBroadcastingPartialReductionsCombined Combining partial reductions with other operations
+It is also possible to use the result of a partial reduction to do further processing.
+Here there is another example that aims to find the the column whose sum of elements is the maximum
+ within a matrix. With column-wise partial reductions this can be coded as:
+
+<table class="tutorial_code"><tr><td>
+\include Tutorial_ReductionsVisitorsBroadcasting_maxnorm.cpp
+</td>
+<td>
+Output:
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_maxnorm.out
+</td></tr></table>
+
+The previous example applies the \link DenseBase::sum() sum() \endlink reduction on each column
+though the \link DenseBase::colwise() colwise() \endlink visitor, obtaining a new matrix whose
+size is 1x4.
+
+Therefore, if
+\f[
+\mbox{m} = \begin{bmatrix} 1 & 2 & 6 & 9 \\
+                    3 & 1 & 7 & 2 \end{bmatrix}
+\f]
+
+then
+
+\f[
+\mbox{m.colwise().sum()} = \begin{bmatrix} 4 & 3 & 13 & 11 \end{bmatrix}
+\f]
+
+The \link DenseBase::maxCoeff() maxCoeff() \endlink reduction is finally applied 
+to obtain the column index where the maximum sum is found, 
+which is the column index 2 (third column) in this case.
+
+
+\section TutorialReductionsVisitorsBroadcastingBroadcasting Broadcasting
+The concept behind broadcasting is similar to partial reductions, with the difference that broadcasting 
+constructs an expression where a vector (column or row) is interpreted as a matrix by replicating it in 
+one direction.
+
+A simple example is to add a certain column-vector to each column in a matrix. 
+This can be accomplished with:
+
+<table class="tutorial_code"><tr><td>
+\include Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple.cpp
+</td>
+<td>
+Output:
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple.out
+</td></tr></table>
+
+It is important to point out that the vector to be added column-wise or row-wise must be of type Vector,
+and cannot be a Matrix. If this is not met then you will get compile-time error. This also means that
+broadcasting operations can only be applied with an object of type Vector, when operating with Matrix.
+The same applies for the \link ArrayBase Array \endlink class, where the equivalent for VectorXf is ArrayXf.
+
+Therefore, to perform the same operation row-wise we can do:
+
+<table class="tutorial_code"><tr><td>
+\include Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple_rowwise.cpp
+</td>
+<td>
+Output:
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple_rowwise.out
+</td></tr></table>
+
+\subsection TutorialReductionsVisitorsBroadcastingBroadcastingCombined Combining broadcasting with other operations
+Broadcasting can also be combined with other operations, such as Matrix or \link ArrayBase Array \endlink operations, 
+reductions and partial reductions.
+
+Now that broadcasting, reductions and partial reductions have been introduced, we can dive into a more advanced example that finds
+the nearest neighbour of a vector <tt>v</tt> within the columns of matrix <tt>m</tt>. The Euclidean distance will be used in this example,
+computing the squared Euclidean distance with the partial reduction named \link DenseBase::squaredNorm() squaredNorm() \endlink:
+
+<table class="tutorial_code"><tr><td>
+\include Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.cpp
+</td>
+<td>
+Output:
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.out
+</td></tr></table>
+
+The line that does the job is 
+\code
+  (m.colwise() - v).colwise().squaredNorm().minCoeff(&index);
+\endcode
+
+We will go step by step to understand what is happening:
+
+  - <tt>m.colwise() - v</tt> is a broadcasting operation, substracting <tt>v</tt> from each column in <tt>m</tt>. The result of this operation
+would be a new matrix whose size is the same as matrix <tt>m</tt>: \f[
+  \mbox{m.colwise() - v} = 
+  \begin{bmatrix}
+    -1 & 21 & 4 & 7 \\
+     0 & 8  & 4 & -1
+  \end{bmatrix}
+\f]
+
+  - <tt>(m.colwise() - v).colwise().squaredNorm()</tt> is a partial reduction, computing the squared norm column-wise. The result of
+this operation would be a row-vector where each coefficient is the squared Euclidean distance between each column in <tt>m</tt> and <tt>v</tt>: \f[
+  \mbox{(m.colwise() - v).colwise().squaredNorm()} =
+  \begin{bmatrix}
+     1 & 505 & 32 & 50
+  \end{bmatrix}
+\f]
+
+  - Finally, <tt>minCoeff(&index)</tt> is used to obtain the index of the column in <tt>m</tt> that is closer to <tt>v</tt> in terms of Euclidean
+distance.
+
+*/
+
+}
diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.cpp
new file mode 100644
index 000000000..334b4d852
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.cpp
@@ -0,0 +1,24 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+  Eigen::MatrixXf m(2,4);
+  Eigen::VectorXf v(2);
+  
+  m << 1, 23, 6, 9,
+       3, 11, 7, 2;
+       
+  v << 2,
+       3;
+
+  MatrixXf::Index index;
+  // find nearest neighbour
+  (m.colwise() - v).colwise().squaredNorm().minCoeff(&index);
+
+  cout << "Nearest neighbour is column " << index << ":" << endl;
+  cout << m.col(index) << endl;
+}
diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.cpp~ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.cpp~
new file mode 100644
index 000000000..de05ab961
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.cpp~
@@ -0,0 +1,24 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+  Eigen::MatrixXf m(2,4);
+  Eigen::VectorXf v(2);
+  
+  m << 1, 2, 6, 9,
+       3, 1, 7, 2;
+       
+  v << 2,
+       3;
+
+  MatrixXf::Index index;
+  // find nearest neighbour
+  (m.colwise() - v).colwise().squaredNorm().minCoeff(&index);
+
+  cout << "Nearest neighbour is column " << index << ":" << endl;
+  cout << m.col(index) << endl;
+}
diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple.cpp
new file mode 100644
index 000000000..e6c87c6a4
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple.cpp
@@ -0,0 +1,21 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+int main()
+{
+  Eigen::MatrixXf mat(2,4);
+  Eigen::VectorXf v(2);
+  
+  mat << 1, 2, 6, 9,
+         3, 1, 7, 2;
+         
+  v << 0,
+       1;
+       
+  //add v to each column of m
+  mat.colwise() += v;
+  
+  std::cout << "Broadcasting result: " << std::endl;
+  std::cout << mat << std::endl;
+}
diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple.cpp~ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple.cpp~
new file mode 100644
index 000000000..f4e6a0db0
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple.cpp~
@@ -0,0 +1,21 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+int main()
+{
+  Eigen::MatrixXf mat(2,4);
+  Eigen::MatrixXf v(2);
+  
+  mat << 1, 2, 6, 9,
+         3, 1, 7, 2;
+         
+  v << 0,
+       1;
+       
+  //add v to each column of m
+  mat.colwise() += v;
+  
+  std::cout << "Broadcasting result: " << std::endl;
+  std::cout << mat << std::endl;
+}
diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple_rowwise.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple_rowwise.cpp
new file mode 100644
index 000000000..9959c7909
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple_rowwise.cpp
@@ -0,0 +1,20 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+int main()
+{
+  Eigen::MatrixXf mat(2,4);
+  Eigen::VectorXf v(4);
+  
+  mat << 1, 2, 6, 9,
+         3, 1, 7, 2;
+         
+  v << 0,1,2,3;
+       
+  //add v to each row of m
+  mat.rowwise() += v;
+  
+  std::cout << "Broadcasting result: " << std::endl;
+  std::cout << mat << std::endl;
+}
diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_colwise.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_colwise.cpp
new file mode 100644
index 000000000..df6825663
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_colwise.cpp
@@ -0,0 +1,13 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+int main()
+{
+  Eigen::MatrixXf mat(2,4);
+  mat << 1, 2, 6, 9,
+         3, 1, 7, 2;
+  
+  std::cout << "Column's maximum: " << std::endl
+   << mat.colwise().maxCoeff() << std::endl;
+}
diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_maxnorm.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_maxnorm.cpp
new file mode 100644
index 000000000..cb46887b6
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_maxnorm.cpp
@@ -0,0 +1,19 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+int main()
+{
+  Eigen::MatrixXf mat(2,4);
+  mat << 1, 2, 6, 9,
+         3, 1, 7, 2;
+  
+  int   maxIndex;
+  float maxNorm = mat.colwise().sum().maxCoeff(&maxIndex);
+  
+  std::cout << "Maximum sum at position " << maxIndex << std::endl;
+
+  std::cout << "The corresponding vector is: " << std::endl;
+  std::cout << mat.col( maxIndex ) << std::endl;
+  std::cout << "And its sum is is: " << maxNorm << std::endl;
+}
diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_rowwise.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_rowwise.cpp
new file mode 100644
index 000000000..80427c9f7
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_rowwise.cpp
@@ -0,0 +1,13 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+int main()
+{
+  Eigen::MatrixXf mat(2,4);
+  mat << 1, 2, 6, 9,
+         3, 1, 7, 2;
+  
+  std::cout << "Row's maximum: " << std::endl
+   << mat.rowwise().maxCoeff() << std::endl;
+}
diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_visitors.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_visitors.cpp
new file mode 100644
index 000000000..b54e9aa31
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_visitors.cpp
@@ -0,0 +1,26 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+  Eigen::MatrixXf m(2,2);
+  
+  m << 1, 2,
+       3, 4;
+
+  //get location of maximum
+  MatrixXf::Index maxRow, maxCol;
+  float max = m.maxCoeff(&maxRow, &maxCol);
+
+  //get location of minimum
+  MatrixXf::Index minRow, minCol;
+  float min = m.minCoeff(&minRow, &minCol);
+
+  cout << "Max: " << max <<  ", at: " <<
+     maxRow << "," << maxCol << endl;
+  cout << "Min: " << min << ", at: " <<
+     minRow << "," << minCol << endl;
+}
diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_visitors.cpp~ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_visitors.cpp~
new file mode 100644
index 000000000..ff7ed9ee4
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_visitors.cpp~
@@ -0,0 +1,26 @@
+#include <iostream>
+#include <Eigen/Dense>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+  Eigen::MatrixXf m(2,2);
+  
+  m << 1, 2,
+       3, 4;
+
+  //get location of maximum
+  MatrixXf::Index maxRow, maxCol;
+  m.maxCoeff(&maxRow, &maxCol);
+
+  //get location of minimum
+  MatrixXf::Index minRow, minCol;
+  m.minCoeff(&minRow, &minCol);
+
+  cout << "Max at: " <<
+     maxRow << "," << maxCol << endl;
+  cout << "Min at: " <<
+     minRow << "," << minCol << endl;
+}

From cb3aad1d91082dcf5edc6e5fe0fe883bbe816ebc Mon Sep 17 00:00:00 2001
From: Carlos Becker <carlosbecker@gmail.com>
Date: Thu, 8 Jul 2010 17:45:25 +0100
Subject: [PATCH 04/17] Reductions/Broadcasting/Visitor Tutorial added to index

---
 doc/Overview.dox | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/Overview.dox b/doc/Overview.dox
index 72a30ac69..8cf4e098f 100644
--- a/doc/Overview.dox
+++ b/doc/Overview.dox
@@ -24,7 +24,7 @@ For a first contact with Eigen, the best place is to have a look at the \ref Get
     - \ref TutorialBlockOperations
     - \ref TutorialAdvancedInitialization
     - \ref TutorialLinearAlgebra
-    - Coming soon: "Reductions, visitors, and broadcasting"
+    - \ref TutorialReductionsVisitorsBroadcasting
     - \ref TutorialGeometry
     - \ref TutorialSparse
   - \ref QuickRefPage

From 951da96f146f3c6dc806a9e3d0fbbd96bbeb495b Mon Sep 17 00:00:00 2001
From: Carlos Becker <carlosbecker@gmail.com>
Date: Thu, 8 Jul 2010 18:16:39 +0100
Subject: [PATCH 05/17] Added more redux types/examples in tutorial and fixed
 some display issues

---
 ...TutorialReductionsVisitorsBroadcasting.dox | 51 +++++++++++++++----
 ...nsVisitorsBroadcasting_reductions_bool.cpp | 24 +++++++++
 ...nsVisitorsBroadcasting_reductions_norm.cpp | 28 ++++++++++
 ...sVisitorsBroadcasting_reductions_norm.cpp~ | 28 ++++++++++
 4 files changed, 122 insertions(+), 9 deletions(-)
 create mode 100644 doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_bool.cpp
 create mode 100644 doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.cpp
 create mode 100644 doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.cpp~

diff --git a/doc/C07_TutorialReductionsVisitorsBroadcasting.dox b/doc/C07_TutorialReductionsVisitorsBroadcasting.dox
index 17124cb3b..b2c5b62cb 100644
--- a/doc/C07_TutorialReductionsVisitorsBroadcasting.dox
+++ b/doc/C07_TutorialReductionsVisitorsBroadcasting.dox
@@ -11,6 +11,8 @@ This tutorial explains Eigen's reductions, visitors and broadcasting and how the
 
 \b Table \b of \b contents
   - \ref TutorialReductionsVisitorsBroadcastingReductions
+    - \ref TutorialReductionsVisitorsBroadcastingReductionsNorm
+    - \ref TutorialReductionsVisitorsBroadcastingReductionsBool
     - FIXME: .redux()
   - \ref TutorialReductionsVisitorsBroadcastingVisitors
   - \ref TutorialReductionsVisitorsBroadcastingPartialReductions
@@ -21,7 +23,7 @@ This tutorial explains Eigen's reductions, visitors and broadcasting and how the
 
 \section TutorialReductionsVisitorsBroadcastingReductions Reductions
 In Eigen, a reduction is a function that is applied to a certain matrix or array, returning a single 
-value of type scalar. One of the most used reductions is \link MatrixBase::sum() .sum() \endlink,
+value of type scalar. One of the most used reductions is \link DenseBase::sum() .sum() \endlink,
 which returns the addition of all the coefficients inside a given matrix or array.
 
 <table class="tutorial_code"><tr><td>
@@ -33,6 +35,37 @@ Output: \include tut_arithmetic_redux_basic.out
 
 The \em trace of a matrix, as returned by the function \c trace(), is the sum of the diagonal coefficients and can also be computed as efficiently using <tt>a.diagonal().sum()</tt>, as we will see later on.
 
+
+\subsection TutorialReductionsVisitorsBroadcastingReductionsNorm Norm reductions
+Eigen also provides reductions to obtain the norm or squared norm of a vector with \link DenseBase::norm() norm() \endlink and \link DenseBase::squaredNorm() squaredNorm() \endlink respectively.
+These operations can also operate on objects such as Matrices or Arrays, as shown in the following example:
+
+<table class="tutorial_code"><tr><td>
+Example: \include Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.cpp
+</td>
+<td>
+Output:
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.out
+</td></tr></table>
+
+\subsection TutorialReductionsVisitorsBroadcastingReductionsBool Boolean-like reductions
+
+Another interesting type of reductions are the ones that deal with \b true and \b false values:
+  - \link DenseBase::all() all() \endlink returns \b true if all of the coefficients in a given Matrix or \link ArrayBase Array \endlink are \b true .
+  - \link DenseBase::any() any() \endlink returns \b true if at least one of the coefficients in a given Matrix or \link ArrayBase Array \endlink are \b true .
+  - \link DenseBase::count() count() \endlink returns the number of \b true coefficients in a given Matrix or \link ArrayBase Array \endlink.
+
+Their behaviour can be seen in the following example:
+
+<table class="tutorial_code"><tr><td>
+Example: \include Tutorial_ReductionsVisitorsBroadcasting_reductions_bool.cpp
+</td>
+<td>
+Output:
+\verbinclude Tutorial_ReductionsVisitorsBroadcasting_reductions_bool.out
+</td></tr></table>
+
+
 \section TutorialReductionsVisitorsBroadcastingVisitors Visitors
 Visitors are useful when the location of a coefficient inside a Matrix or 
 \link ArrayBase Array \endlink wants to be obtained. The simplest example are the
@@ -43,10 +76,10 @@ the location of the greatest or smallest coefficient in a Matrix or
 
 The arguments passed to a visitor are pointers to the variables where the
 row and column position are to be stored. These variables are of type 
-\link DenseBase::Index Index \endlink FIXME: link? ok?, as shown below:
+\link DenseBase::Index Index \endlink (FIXME: link ok?), as shown below:
 
 <table class="tutorial_code"><tr><td>
-\include Tutorial_ReductionsVisitorsBroadcasting_visitors.cpp
+Example: \include Tutorial_ReductionsVisitorsBroadcasting_visitors.cpp
 </td>
 <td>
 Output:
@@ -66,7 +99,7 @@ A simple example is obtaining the sum of the elements
 in each column in a given matrix, storing the result in a row-vector:
 
 <table class="tutorial_code"><tr><td>
-\include Tutorial_ReductionsVisitorsBroadcasting_colwise.cpp
+Example: \include Tutorial_ReductionsVisitorsBroadcasting_colwise.cpp
 </td>
 <td>
 Output:
@@ -76,7 +109,7 @@ Output:
 The same operation can be performed row-wise:
 
 <table class="tutorial_code"><tr><td>
-\include Tutorial_ReductionsVisitorsBroadcasting_rowwise.cpp
+Example: \include Tutorial_ReductionsVisitorsBroadcasting_rowwise.cpp
 </td>
 <td>
 Output:
@@ -92,7 +125,7 @@ Here there is another example that aims to find the the column whose sum of elem
  within a matrix. With column-wise partial reductions this can be coded as:
 
 <table class="tutorial_code"><tr><td>
-\include Tutorial_ReductionsVisitorsBroadcasting_maxnorm.cpp
+Example: \include Tutorial_ReductionsVisitorsBroadcasting_maxnorm.cpp
 </td>
 <td>
 Output:
@@ -129,7 +162,7 @@ A simple example is to add a certain column-vector to each column in a matrix.
 This can be accomplished with:
 
 <table class="tutorial_code"><tr><td>
-\include Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple.cpp
+Example: \include Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple.cpp
 </td>
 <td>
 Output:
@@ -144,7 +177,7 @@ The same applies for the \link ArrayBase Array \endlink class, where the equival
 Therefore, to perform the same operation row-wise we can do:
 
 <table class="tutorial_code"><tr><td>
-\include Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple_rowwise.cpp
+Example: \include Tutorial_ReductionsVisitorsBroadcasting_broadcast_simple_rowwise.cpp
 </td>
 <td>
 Output:
@@ -160,7 +193,7 @@ the nearest neighbour of a vector <tt>v</tt> within the columns of matrix <tt>m<
 computing the squared Euclidean distance with the partial reduction named \link DenseBase::squaredNorm() squaredNorm() \endlink:
 
 <table class="tutorial_code"><tr><td>
-\include Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.cpp
+Example: \include Tutorial_ReductionsVisitorsBroadcasting_broadcast_1nn.cpp
 </td>
 <td>
 Output:
diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_bool.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_bool.cpp
new file mode 100644
index 000000000..10916877f
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_bool.cpp
@@ -0,0 +1,24 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+  MatrixXf m(2,2), n(2,2);
+  
+  m << 0,2,
+       3,4;
+
+  n << 1,2,
+       3,4;
+  
+  cout << "m.all()   = " << m.all() << endl;
+  cout << "m.any()   = " << m.any() << endl;
+  cout << "m.count() = " << m.count() << endl;
+  cout << endl;
+  cout << "n.all()   = " << n.all() << endl;
+  cout << "n.any()   = " << n.any() << endl;
+  cout << "n.count() = " << n.count() << endl;
+}
diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.cpp b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.cpp
new file mode 100644
index 000000000..f3364d7fc
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.cpp
@@ -0,0 +1,28 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+  VectorXf v(2);
+  MatrixXf m(2,2), n(2,2);
+  
+  v << 5,
+       10;
+  
+  m << 2,2,
+       3,4;
+
+  n << 1, 2,
+       32,12;
+  
+  cout << "v.norm() = " << v.norm() << endl;
+  cout << "m.norm() = " << m.norm() << endl;
+  cout << "n.norm() = " << n.norm() << endl;
+  cout << endl;
+  cout << "v.squaredNorm() = " << v.squaredNorm() << endl;
+  cout << "m.squaredNorm() = " << m.squaredNorm() << endl;
+  cout << "n.squaredNorm() = " << n.squaredNorm() << endl;
+}
diff --git a/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.cpp~ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.cpp~
new file mode 100644
index 000000000..03057f8d2
--- /dev/null
+++ b/doc/examples/Tutorial_ReductionsVisitorsBroadcasting_reductions_norm.cpp~
@@ -0,0 +1,28 @@
+#include <Eigen/Dense>
+#include <iostream>
+
+using namespace std;
+using namespace Eigen;
+
+int main()
+{
+  VectorXf v(2);
+  MatrixXf m(2,2), n(2,2);
+  
+  v << 2,
+       5;
+  
+  m << 0,2,
+       3,4;
+
+  n << 1,2,
+       3,4;
+  
+  cout << "v.norm() = " << m.norm() << endl;
+  cout << "m.norm() = " << m.norm() << endl;
+  cout << "n.norm() = " << m.norm() << endl;
+  cout << endl;
+  cout << "v.squaredNorm() = " << v.squaredNorm() << endl;
+  cout << "m.squaredNorm() = " << m.squaredNorm() << endl;
+  cout << "n.squaredNorm() = " << n.squaredNorm() << endl;
+}

From 551cb9b7b4e56c51b56404723d582b7438a06bf9 Mon Sep 17 00:00:00 2001
From: Thomas Capricelli <orzel@freehackers.org>
Date: Fri, 9 Jul 2010 03:59:36 +0200
Subject: [PATCH 06/17] bench: use of Eigen/Array is deprecated + fix includes
 for iostream

---
 bench/basicbench.cxxlist   | 17 ++++++++++++-----
 bench/basicbenchmark.cpp   |  1 +
 bench/benchBlasGemm.cpp    |  3 ++-
 bench/benchCholesky.cpp    |  4 +++-
 bench/benchEigenSolver.cpp |  4 +++-
 bench/benchFFT.cpp         |  2 ++
 bench/benchVecAdd.cpp      |  1 +
 bench/bench_norm.cpp       |  3 ++-
 bench/bench_reverse.cpp    |  3 ++-
 bench/bench_sum.cpp        |  1 +
 bench/benchmark.cpp        |  5 ++++-
 bench/benchmarkSlice.cpp   |  4 +++-
 bench/benchmarkX.cpp       |  3 +++
 bench/benchmarkXcwise.cpp  |  1 +
 bench/quat_slerp.cpp       |  4 +++-
 bench/sparse_cholesky.cpp  |  1 +
 bench/vdw_new.cpp          |  3 ++-
 17 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/bench/basicbench.cxxlist b/bench/basicbench.cxxlist
index 5d670ef4b..a8ab34e0d 100644
--- a/bench/basicbench.cxxlist
+++ b/bench/basicbench.cxxlist
@@ -4,18 +4,25 @@
 # CLIST[((g++))]="g++-3.4 -O3 -DNDEBUG -finline-limit=20000"
 
 # CLIST[((g++))]="g++-4.1 -O3 -DNDEBUG"
-CLIST[((g++))]="g++-4.1 -O3 -DNDEBUG -finline-limit=20000"
+#CLIST[((g++))]="g++-4.1 -O3 -DNDEBUG -finline-limit=20000"
 
 # CLIST[((g++))]="g++-4.2 -O3 -DNDEBUG"
-CLIST[((g++))]="g++-4.2 -O3 -DNDEBUG -finline-limit=20000"
+#CLIST[((g++))]="g++-4.2 -O3 -DNDEBUG -finline-limit=20000"
 # CLIST[((g++))]="g++-4.2 -O3 -DNDEBUG -finline-limit=20000 -fprofile-generate"
 # CLIST[((g++))]="g++-4.2 -O3 -DNDEBUG -finline-limit=20000 -fprofile-use"
 
 # CLIST[((g++))]="g++-4.3 -O3 -DNDEBUG"
-CLIST[((g++))]="g++-4.3 -O3 -DNDEBUG -finline-limit=20000"
+#CLIST[((g++))]="g++-4.3 -O3 -DNDEBUG -finline-limit=20000"
 # CLIST[((g++))]="g++-4.3 -O3 -DNDEBUG -finline-limit=20000 -fprofile-generate"
 # CLIST[((g++))]="g++-4.3 -O3 -DNDEBUG -finline-limit=20000 -fprofile-use"
 
-# CLIST[((g++))]="icpc -fast -DNDEBUG -fno-exceptions -no-inline-max-size"
 # CLIST[((g++))]="icpc -fast -DNDEBUG -fno-exceptions -no-inline-max-size -prof-genx"
-# CLIST[((g++))]="icpc -fast -DNDEBUG -fno-exceptions -no-inline-max-size -prof-use"
\ No newline at end of file
+# CLIST[((g++))]="icpc -fast -DNDEBUG -fno-exceptions -no-inline-max-size -prof-use"
+
+#CLIST[((g++))]="/opt/intel/Compiler/11.1/072/bin/intel64/icpc -fast -DNDEBUG -fno-exceptions -no-inline-max-size -lrt"
+CLIST[((g++))]="/home/orzel/svn/llvm/Release/bin/clang++ -O3 -DNDEBUG -DEIGEN_DONT_VECTORIZE -lrt"
+CLIST[((g++))]="/home/orzel/svn/llvm/Release/bin/clang++ -O3 -DNDEBUG -lrt"
+CLIST[((g++))]="g++-4.4.4 -O3 -DNDEBUG -DEIGEN_DONT_VECTORIZE -lrt"
+CLIST[((g++))]="g++-4.4.4 -O3 -DNDEBUG -lrt"
+CLIST[((g++))]="g++-4.5.0 -O3 -DNDEBUG -DEIGEN_DONT_VECTORIZE -lrt"
+CLIST[((g++))]="g++-4.5.0 -O3 -DNDEBUG -lrt"
diff --git a/bench/basicbenchmark.cpp b/bench/basicbenchmark.cpp
index bd500c06a..a26ea853f 100644
--- a/bench/basicbenchmark.cpp
+++ b/bench/basicbenchmark.cpp
@@ -1,4 +1,5 @@
 
+#include <iostream>
 #include "BenchUtil.h"
 #include "basicbenchmark.h"
 
diff --git a/bench/benchBlasGemm.cpp b/bench/benchBlasGemm.cpp
index a4a9e780a..85fbdb219 100644
--- a/bench/benchBlasGemm.cpp
+++ b/bench/benchBlasGemm.cpp
@@ -7,7 +7,8 @@
 // #define EIGEN_DEFAULT_TO_ROW_MAJOR
 #define _FLOAT
 
-#include <Eigen/Array>
+#include <iostream>
+
 #include <Eigen/Core>
 #include "BenchTimer.h"
 
diff --git a/bench/benchCholesky.cpp b/bench/benchCholesky.cpp
index c722d6b98..ca9bb20bc 100644
--- a/bench/benchCholesky.cpp
+++ b/bench/benchCholesky.cpp
@@ -8,7 +8,9 @@
 //  -DTRIES=10
 //  -DSCALAR=double
 
-#include <Eigen/Array>
+#include <iostream>
+
+#include <Eigen/Core>
 #include <Eigen/Cholesky>
 #include <bench/BenchUtil.h>
 using namespace Eigen;
diff --git a/bench/benchEigenSolver.cpp b/bench/benchEigenSolver.cpp
index 395bff3f9..de2fa600e 100644
--- a/bench/benchEigenSolver.cpp
+++ b/bench/benchEigenSolver.cpp
@@ -9,7 +9,9 @@
 //  -DTRIES=10
 //  -DSCALAR=double
 
-#include <Eigen/Array>
+#include <iostream>
+
+#include <Eigen/Core>
 #include <Eigen/QR>
 #include <bench/BenchUtil.h>
 using namespace Eigen;
diff --git a/bench/benchFFT.cpp b/bench/benchFFT.cpp
index 0f0c9bb93..c3d5da7a1 100644
--- a/bench/benchFFT.cpp
+++ b/bench/benchFFT.cpp
@@ -22,6 +22,8 @@
 // License and a copy of the GNU General Public License along with
 // Eigen. If not, see <http://www.gnu.org/licenses/>.
 
+#include <iostream>
+
 #include <bench/BenchUtil.h>
 #include <complex>
 #include <vector>
diff --git a/bench/benchVecAdd.cpp b/bench/benchVecAdd.cpp
index 396ab6a63..755c75ed5 100644
--- a/bench/benchVecAdd.cpp
+++ b/bench/benchVecAdd.cpp
@@ -1,4 +1,5 @@
 
+#include <iostream>
 #include <Eigen/Core>
 #include <bench/BenchTimer.h>
 using namespace Eigen;
diff --git a/bench/bench_norm.cpp b/bench/bench_norm.cpp
index 7a3dc2e68..1436ddbf5 100644
--- a/bench/bench_norm.cpp
+++ b/bench/bench_norm.cpp
@@ -1,5 +1,6 @@
 #include <typeinfo>
-#include <Eigen/Array>
+#include <iostream>
+#include <Eigen/Core>
 #include "BenchTimer.h"
 using namespace Eigen;
 using namespace std;
diff --git a/bench/bench_reverse.cpp b/bench/bench_reverse.cpp
index 2cedc0d3d..0e695b2f2 100644
--- a/bench/bench_reverse.cpp
+++ b/bench/bench_reverse.cpp
@@ -1,5 +1,6 @@
 
-#include <Eigen/Array>
+#include <iostream>
+#include <Eigen/Core>
 #include <bench/BenchUtil.h>
 using namespace Eigen;
 
diff --git a/bench/bench_sum.cpp b/bench/bench_sum.cpp
index 0e16a8eea..a3d925e4f 100644
--- a/bench/bench_sum.cpp
+++ b/bench/bench_sum.cpp
@@ -1,3 +1,4 @@
+#include <iostream>
 #include <Eigen/Core>
 using namespace Eigen;
 using namespace std;
diff --git a/bench/benchmark.cpp b/bench/benchmark.cpp
index 971de4961..c721b9081 100644
--- a/bench/benchmark.cpp
+++ b/bench/benchmark.cpp
@@ -1,5 +1,8 @@
 // g++ -O3 -DNDEBUG -DMATSIZE=<x> benchmark.cpp -o benchmark && time ./benchmark
-#include <Eigen/Array>
+
+#include <iostream>
+
+#include <Eigen/Core>
 
 #ifndef MATSIZE
 #define MATSIZE 3
diff --git a/bench/benchmarkSlice.cpp b/bench/benchmarkSlice.cpp
index 8c3d4dd10..3d04b6fd5 100644
--- a/bench/benchmarkSlice.cpp
+++ b/bench/benchmarkSlice.cpp
@@ -1,6 +1,8 @@
 // g++ -O3 -DNDEBUG benchmarkX.cpp -o benchmarkX && time ./benchmarkX
 
-#include <Eigen/Array>
+#include <iostream>
+
+#include <Eigen/Core>
 
 using namespace std;
 using namespace Eigen;
diff --git a/bench/benchmarkX.cpp b/bench/benchmarkX.cpp
index 89452b435..8e4b60c2b 100644
--- a/bench/benchmarkX.cpp
+++ b/bench/benchmarkX.cpp
@@ -1,4 +1,7 @@
 // g++ -fopenmp -I .. -O3 -DNDEBUG -finline-limit=1000 benchmarkX.cpp -o b && time ./b
+
+#include <iostream>
+
 #include <Eigen/Core>
 
 using namespace std;
diff --git a/bench/benchmarkXcwise.cpp b/bench/benchmarkXcwise.cpp
index 6a487b409..62437435e 100644
--- a/bench/benchmarkXcwise.cpp
+++ b/bench/benchmarkXcwise.cpp
@@ -1,5 +1,6 @@
 // g++ -O3 -DNDEBUG benchmarkX.cpp -o benchmarkX && time ./benchmarkX
 
+#include <iostream>
 #include <Eigen/Core>
 
 using namespace std;
diff --git a/bench/quat_slerp.cpp b/bench/quat_slerp.cpp
index 029443704..27a2067ab 100644
--- a/bench/quat_slerp.cpp
+++ b/bench/quat_slerp.cpp
@@ -1,4 +1,5 @@
 
+#include <iostream>
 #include <Eigen/Geometry>
 #include <bench/BenchTimer.h>
 using namespace Eigen;
@@ -242,4 +243,5 @@ int main()
   BENCH(slerp_legacy_nlerp);
   BENCH(slerp_rw);
   BENCH(slerp_gael);
-}
\ No newline at end of file
+}
+
diff --git a/bench/sparse_cholesky.cpp b/bench/sparse_cholesky.cpp
index ec8078e4c..4b8ff34f8 100644
--- a/bench/sparse_cholesky.cpp
+++ b/bench/sparse_cholesky.cpp
@@ -1,5 +1,6 @@
 // #define EIGEN_TAUCS_SUPPORT
 // #define EIGEN_CHOLMOD_SUPPORT
+#include <iostream>
 #include <Eigen/Sparse>
 
 // g++ -DSIZE=10000 -DDENSITY=0.001  sparse_cholesky.cpp -I.. -DDENSEMATRI -O3 -g0 -DNDEBUG   -DNBTRIES=1 -I /home/gael/Coding/LinearAlgebra/taucs_full/src/ -I/home/gael/Coding/LinearAlgebra/taucs_full/build/linux/  -L/home/gael/Coding/LinearAlgebra/taucs_full/lib/linux/ -ltaucs /home/gael/Coding/LinearAlgebra/GotoBLAS/libgoto.a -lpthread -I /home/gael/Coding/LinearAlgebra/SuiteSparse/CHOLMOD/Include/ $CHOLLIB -I /home/gael/Coding/LinearAlgebra/SuiteSparse/UFconfig/ /home/gael/Coding/LinearAlgebra/SuiteSparse/CCOLAMD/Lib/libccolamd.a   /home/gael/Coding/LinearAlgebra/SuiteSparse/CHOLMOD/Lib/libcholmod.a -lmetis /home/gael/Coding/LinearAlgebra/SuiteSparse/AMD/Lib/libamd.a  /home/gael/Coding/LinearAlgebra/SuiteSparse/CAMD/Lib/libcamd.a   /home/gael/Coding/LinearAlgebra/SuiteSparse/CCOLAMD/Lib/libccolamd.a  /home/gael/Coding/LinearAlgebra/SuiteSparse/COLAMD/Lib/libcolamd.a -llapack && ./a.out
diff --git a/bench/vdw_new.cpp b/bench/vdw_new.cpp
index b7f74a68f..d2604049f 100644
--- a/bench/vdw_new.cpp
+++ b/bench/vdw_new.cpp
@@ -1,4 +1,5 @@
-#include <Eigen/Array>
+#include <iostream>
+#include <Eigen/Core>
 
 using namespace Eigen;
 

From 2c03ca3325f462c5d7a9383eca3dd988b3b47ae9 Mon Sep 17 00:00:00 2001
From: Jitse Niesen <jitse@maths.leeds.ac.uk>
Date: Fri, 9 Jul 2010 11:46:07 +0100
Subject: [PATCH 07/17] Small changes to tutorial page 2 (matrix arithmetic): *
 slightly more extensive discussion of aliasing * layout: put example code and
 output side-by-side * add some links, etc

---
 doc/C02_TutorialMatrixArithmetic.dox         | 54 +++++++++++++-------
 doc/I11_Aliasing.dox                         | 12 +++++
 doc/snippets/tut_arithmetic_redux_minmax.cpp |  6 ++-
 3 files changed, 52 insertions(+), 20 deletions(-)
 create mode 100644 doc/I11_Aliasing.dox

diff --git a/doc/C02_TutorialMatrixArithmetic.dox b/doc/C02_TutorialMatrixArithmetic.dox
index e566fe451..df2360d40 100644
--- a/doc/C02_TutorialMatrixArithmetic.dox
+++ b/doc/C02_TutorialMatrixArithmetic.dox
@@ -27,7 +27,7 @@ or through special methods such as dot(), cross(), etc.
 For the Matrix class (matrices and vectors), operators are only overloaded to support
 linear-algebraic operations. For example, \c matrix1 \c * \c matrix2 means matrix-matrix product,
 and \c vector \c + \c scalar is just not allowed. If you want to perform all kinds of array operations,
-not linear algebra, see \ref TutorialArrayClass "next page".
+not linear algebra, see the \ref TutorialArrayClass "next page".
 
 \section TutorialArithmeticAddSub Addition and subtraction
 
@@ -39,8 +39,12 @@ also have the same \c Scalar type, as Eigen doesn't do automatic type promotion.
 \li compound operator += as in \c a+=b
 \li compound operator -= as in \c a-=b
 
+<table class="tutorial_code"><tr><td>
 Example: \include tut_arithmetic_add_sub.cpp
-Output: \verbinclude tut_arithmetic_add_sub.out
+</td>
+<td>
+Output: \include tut_arithmetic_add_sub.out
+</td></tr></table>
 
 \section TutorialArithmeticScalarMulDiv Scalar multiplication and division
 
@@ -51,12 +55,17 @@ Multiplication and division by a scalar is very simple too. The operators at han
 \li compound operator *= as in \c matrix*=scalar
 \li compound operator /= as in \c matrix/=scalar
 
+<table class="tutorial_code"><tr><td>
 Example: \include tut_arithmetic_scalar_mul_div.cpp
-Output: \verbinclude tut_arithmetic_scalar_mul_div.out
+</td>
+<td>
+Output: \include tut_arithmetic_scalar_mul_div.out
+</td></tr></table>
+
 
 \section TutorialArithmeticMentionXprTemplates A note about expression templates
 
-This is an advanced topic that we explain in \ref TopicEigenExpressionTemplates "this page",
+This is an advanced topic that we explain on \ref TopicEigenExpressionTemplates "this page",
 but it is useful to just mention it now. In Eigen, arithmetic operators such as \c operator+ don't
 perform any computation by themselves, they just return an "expression object" describing the computation to be
 performed. The actual computation happens later, when the whole expression is evaluated, typically in \c operator=.
@@ -78,7 +87,7 @@ more opportunities for optimization.
 
 \section TutorialArithmeticTranspose Transposition and conjugation
 
-The \c transpose \f$ a^T \f$, \c conjugate \f$ \bar{a} \f$, and the \c adjoint (i.e., conjugate transpose) of the matrix or vector \f$ a \f$, are simply obtained by the functions of the same names.
+The transpose \f$ a^T \f$, conjugate \f$ \bar{a} \f$, and adjoint (i.e., conjugate transpose) \f$ a^* \f$ of a matrix or vector \f$ a \f$ are obtained by the member functions \link DenseBase::transpose() transpose()\endlink, \link MatrixBase::conjugate() conjugate()\endlink, and \link MatrixBase::adjoint() adjoint()\endlink, respectively.
 
 <table class="tutorial_code"><tr><td>
 Example: \include tut_arithmetic_transpose_conjugate.cpp
@@ -89,21 +98,23 @@ Output: \include tut_arithmetic_transpose_conjugate.out
 
 For real matrices, \c conjugate() is a no-operation, and so \c adjoint() is 100% equivalent to \c transpose().
 
-As for basic arithmetic operators, \c transpose and \c adjoint simply return a proxy object without doing the actual transposition. Therefore, <tt>a=a.transpose()</tt> leads to an unexpected result:
+As for basic arithmetic operators, \c transpose() and \c adjoint() simply return a proxy object without doing the actual transposition. If you do <tt>b = a.transpose()</tt>, then the transpose is evaluated at the same time as the result is written into \c b. However, there is a complication here. If you do <tt>a = a.transpose()</tt>, then Eigen starts writing the result into \c a before the evaluation of the transpose is finished. Therefore, the instruction <tt>a = a.transpose()</tt> does not replace \c a with its transpose, as one would expect:
 <table class="tutorial_code"><tr><td>
 Example: \include tut_arithmetic_transpose_aliasing.cpp
 </td>
 <td>
 Output: \include tut_arithmetic_transpose_aliasing.out
 </td></tr></table>
-In "debug mode", i.e., when assertions have not been disabled, such common pitfalls are automatically detected. For \em in-place transposition, simply use the transposeInPlace() function:
+This is the so-called \ref TopicAliasing "aliasing issue". In "debug mode", i.e., when \ref TopicAssertions "assertions" have not been disabled, such common pitfalls are automatically detected. 
+
+For \em in-place transposition, as for instance in <tt>a = a.transpose()</tt>, simply use the \link DenseBase::transposeInPlace() transposeInPlace()\endlink  function:
 <table class="tutorial_code"><tr><td>
 Example: \include tut_arithmetic_transpose_inplace.cpp
 </td>
 <td>
 Output: \include tut_arithmetic_transpose_inplace.out
 </td></tr></table>
-There is also the adjointInPlace() function for complex matrix.
+There is also the \link MatrixBase::adjointInPlace() adjointInPlace()\endlink function for complex matrices.
 
 \section TutorialArithmeticMatrixMul Matrix-matrix and matrix-vector multiplication
 
@@ -112,10 +123,14 @@ case of matrices, they are implicitly handled there too, so matrix-vector produc
 case of matrix-matrix product, and so is vector-vector outer product. Thus, all these cases are handled by just
 two operators:
 \li binary operator * as in \c a*b
-\li compound operator *= as in \c a*=b
+\li compound operator *= as in \c a*=b (this multiplies on the right: \c a*=b is equivalent to <tt>a = a*b</tt>)
 
+<table class="tutorial_code"><tr><td>
 Example: \include tut_arithmetic_matrix_mul.cpp
-Output: \verbinclude tut_arithmetic_matrix_mul.out
+</td>
+<td>
+Output: \include tut_arithmetic_matrix_mul.out
+</td></tr></table>
 
 Note: if you read the above paragraph on expression templates and are worried that doing \c m=m*m might cause
 aliasing issues, be reassured for now: Eigen treats matrix multiplication as a special case and takes care of
@@ -124,7 +139,7 @@ introducing a temporary here, so it will compile \c m=m*m as:
 tmp = m*m;
 m = tmp;
 \endcode
-If you know your matrix product can be safely evaluated into the destination matrix without aliasing issue, then you can use the \c noalias() function to avoid the temporary, e.g.:
+If you know your matrix product can be safely evaluated into the destination matrix without aliasing issue, then you can use the \link MatrixBase::noalias() noalias()\endlink function to avoid the temporary, e.g.:
 \code
 c.noalias() += a * b;
 \endcode
@@ -134,16 +149,20 @@ For more details on this topic, see \ref TopicEigenExpressionTemplates "this pag
 
 \section TutorialArithmeticDotAndCross Dot product and cross product
 
-The above-discussed \c operator* does not allow to compute dot and cross products. For that, you need the dot() and cross() methods.
+The above-discussed \c operator* cannot be used to compute dot and cross products directly. For that, you need the \link MatrixBase::dot() dot()\endlink and \link MatrixBase::cross() cross()\endlink methods.
+<table class="tutorial_code"><tr><td>
 Example: \include tut_arithmetic_dot_cross.cpp
-Output: \verbinclude tut_arithmetic_dot_cross.out
+</td>
+<td>
+Output: \include tut_arithmetic_dot_cross.out
+</td></tr></table>
 
 Remember that cross product is only for vectors of size 3. Dot product is for vectors of any sizes.
 When using complex numbers, Eigen's dot product is conjugate-linear in the first variable and linear in the
 second variable.
 
 \section TutorialArithmeticRedux Basic arithmetic reduction operations
-Eigen also provides some reduction operations to reduce a given matrix or vector to a single value such as the sum (<tt>a.sum()</tt>), product (<tt>a.prod()</tt>), or the maximum (<tt>a.maxCoeff()</tt>) and minimum (<tt>a.minCoeff()</tt>) of all its coefficients.
+Eigen also provides some reduction operations to reduce a given matrix or vector to a single value such as the sum (computed by \link DenseBase::sum() sum()\endlink), product (\link DenseBase::prod() prod()\endlink), or the maximum (\link DenseBase::maxCoeff() maxCoeff()\endlink) and minimum (\link DenseBase::minCoeff() minCoeff()\endlink) of all its coefficients.
 
 <table class="tutorial_code"><tr><td>
 Example: \include tut_arithmetic_redux_basic.cpp
@@ -152,7 +171,7 @@ Example: \include tut_arithmetic_redux_basic.cpp
 Output: \include tut_arithmetic_redux_basic.out
 </td></tr></table>
 
-The \em trace of a matrix, as returned by the function \c trace(), is the sum of the diagonal coefficients and can also be computed as efficiently using <tt>a.diagonal().sum()</tt>, as we will see later on.
+The \em trace of a matrix, as returned by the function \link MatrixBase::trace() trace()\endlink, is the sum of the diagonal coefficients and can also be computed as efficiently using <tt>a.diagonal().sum()</tt>, as we will see later on.
 
 There also exist variants of the \c minCoeff and \c maxCoeff functions returning the coordinates of the respective coefficient via the arguments:
 
@@ -166,7 +185,7 @@ Output: \include tut_arithmetic_redux_minmax.out
 
 \section TutorialArithmeticValidity Validity of operations
 Eigen checks the validity of the operations that you perform. When possible,
-it checks them at compile-time, producing compilation errors. These error messages can be long and ugly,
+it checks them at compile time, producing compilation errors. These error messages can be long and ugly,
 but Eigen writes the important message in UPPERCASE_LETTERS_SO_IT_STANDS_OUT. For example:
 \code
   Matrix3f m;
@@ -175,8 +194,7 @@ but Eigen writes the important message in UPPERCASE_LETTERS_SO_IT_STANDS_OUT. Fo
 \endcode
 
 Of course, in many cases, for example when checking dynamic sizes, the check cannot be performed at compile time.
-Eigen then uses runtime assertions. This means that executing an illegal operation will result in a crash at runtime,
-with an error message.
+Eigen then uses runtime assertions. This means that the program will abort with an error message when executing an illegal operation if it is run in "debug mode", and it will probably crash if assertions are turned off.
 
 \code
   MatrixXf m(3,3);
diff --git a/doc/I11_Aliasing.dox b/doc/I11_Aliasing.dox
new file mode 100644
index 000000000..29092c508
--- /dev/null
+++ b/doc/I11_Aliasing.dox
@@ -0,0 +1,12 @@
+namespace Eigen {
+
+/** \page TopicAliasing Aliasing
+
+What is aliasing? The noalias() and eval() member functions. Which operations are safe and which are not.
+
+TODO: write this dox page!
+
+Is linked from the tutorial on matrix arithmetic.
+
+*/
+}
diff --git a/doc/snippets/tut_arithmetic_redux_minmax.cpp b/doc/snippets/tut_arithmetic_redux_minmax.cpp
index b7b71b077..f4ae7f406 100644
--- a/doc/snippets/tut_arithmetic_redux_minmax.cpp
+++ b/doc/snippets/tut_arithmetic_redux_minmax.cpp
@@ -2,9 +2,11 @@
   std::ptrdiff_t i, j;
   float minOfM = m.minCoeff(&i,&j);
   cout << "Here is the matrix m:\n" << m << endl;
-  cout << "Its minimum coefficient (" << minOfM << ") is at position (" << i << "," << j << ")\n\n";
+  cout << "Its minimum coefficient (" << minOfM 
+       << ") is at position (" << i << "," << j << ")\n\n";
 
   RowVector4i v = RowVector4i::Random();
   int maxOfV = v.maxCoeff(&i);
   cout << "Here is the vector v: " << v << endl;
-  cout << "Its maximum coefficient (" << maxOfV << ") is at position " << i << endl;
\ No newline at end of file
+  cout << "Its maximum coefficient (" << maxOfV 
+       << ") is at position " << i << endl;

From 26cfe5a9584fea56539afa705c9e27ac286412ca Mon Sep 17 00:00:00 2001
From: Jitse Niesen <jitse@maths.leeds.ac.uk>
Date: Fri, 9 Jul 2010 11:59:29 +0100
Subject: [PATCH 08/17] Be consistent in how the tutorial pages link together.

---
 doc/C06_TutorialLinearAlgebra.dox                  | 4 ++--
 doc/C07_TutorialReductionsVisitorsBroadcasting.dox | 6 ++++--
 doc/C08_TutorialGeometry.dox                       | 2 +-
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/doc/C06_TutorialLinearAlgebra.dox b/doc/C06_TutorialLinearAlgebra.dox
index f4c3ecc01..c471a6d04 100644
--- a/doc/C06_TutorialLinearAlgebra.dox
+++ b/doc/C06_TutorialLinearAlgebra.dox
@@ -4,7 +4,7 @@ namespace Eigen {
     \ingroup Tutorial
 
 \li \b Previous: \ref TutorialAdvancedInitialization
-\li \b Next: \ref TutorialGeometry
+\li \b Next: \ref TutorialReductionsVisitorsBroadcasting
 
 This tutorial explains how to solve linear systems, compute various decompositions such as LU,
 QR, %SVD, eigendecompositions... for more advanced topics, don't miss our special page on
@@ -240,7 +240,7 @@ decomposition after you've changed the threshold.
 </tr>
 </table>
 
-\li \b Next: TutorialGeometry
+\li \b Next: \ref TutorialReductionsVisitorsBroadcasting
 
 */
 
diff --git a/doc/C07_TutorialReductionsVisitorsBroadcasting.dox b/doc/C07_TutorialReductionsVisitorsBroadcasting.dox
index b2c5b62cb..1930d7a94 100644
--- a/doc/C07_TutorialReductionsVisitorsBroadcasting.dox
+++ b/doc/C07_TutorialReductionsVisitorsBroadcasting.dox
@@ -3,8 +3,8 @@ namespace Eigen {
 /** \page TutorialReductionsVisitorsBroadcasting Tutorial page 7 - Reductions, visitors and broadcasting
     \ingroup Tutorial
 
-\li \b Previous: \ref TutorialAdvancedInitialization
-\li \b Next: \ref TutorialLinearAlgebra
+\li \b Previous: \ref TutorialLinearAlgebra
+\li \b Next: \ref TutorialGeometry
 
 This tutorial explains Eigen's reductions, visitors and broadcasting and how they are used with
 \link MatrixBase matrices \endlink and \link ArrayBase arrays \endlink.
@@ -227,6 +227,8 @@ this operation would be a row-vector where each coefficient is the squared Eucli
   - Finally, <tt>minCoeff(&index)</tt> is used to obtain the index of the column in <tt>m</tt> that is closer to <tt>v</tt> in terms of Euclidean
 distance.
 
+\li \b Next: \ref TutorialGeometry
+
 */
 
 }
diff --git a/doc/C08_TutorialGeometry.dox b/doc/C08_TutorialGeometry.dox
index 7d0e9f835..f8a53dc67 100644
--- a/doc/C08_TutorialGeometry.dox
+++ b/doc/C08_TutorialGeometry.dox
@@ -3,7 +3,7 @@ namespace Eigen {
 /** \page TutorialGeometry Tutorial page 8 - Geometry
     \ingroup Tutorial
 
-\li \b Previous: \ref TutorialLinearAlgebra
+\li \b Previous: \ref TutorialReductionsVisitorsBroadcasting
 \li \b Next: \ref TutorialSparse
 
 In this tutorial, we will shortly introduce the many possibilities offered by the \ref Geometry_Module "geometry module", namely 2D and 3D rotations and projective or affine transformations.

From d9e134c73c0978d06da98e05b6fc913ee594bceb Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <markos@codex.gr>
Date: Fri, 9 Jul 2010 17:54:41 +0300
Subject: [PATCH 09/17] Altivec port of Complex.h. Note: For some reason g++
 4.4 is >200% slower than g++ 4.3 on altivec code. The same benchmark
 (bench_gemm) was tested, on the same hardware/OS (G4/Debian testing), with
 same CFLAGS. With some code reorganizing I managed to get some minor gain on
 4.4, but I just could not reach 4.3 speed. This is most likely a bug, but I'm
 waiting to see if it's fixed on 4.5. I'll look into this a bit more.

---
 Eigen/src/Core/arch/AltiVec/Complex.h | 216 ++++++++++++++++++++++++++
 1 file changed, 216 insertions(+)
 create mode 100644 Eigen/src/Core/arch/AltiVec/Complex.h

diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
new file mode 100644
index 000000000..62d1d45d1
--- /dev/null
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -0,0 +1,216 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_COMPLEX_ALTIVEC_H
+#define EIGEN_COMPLEX_ALTIVEC_H
+
+static Packet4ui  ei_p4ui_CONJ_XOR = vec_mergeh((Packet4ui)ei_p4i_ZERO, (Packet4ui)ei_p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+static Packet16uc ei_p16uc_COMPLEX_RE   = vec_sld((Packet16uc) vec_splat((Packet4ui)ei_p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)ei_p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc ei_p16uc_COMPLEX_IM   = vec_sld((Packet16uc) vec_splat((Packet4ui)ei_p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)ei_p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+static Packet16uc ei_p16uc_COMPLEX_REV  = vec_sld(ei_p16uc_REVERSE, ei_p16uc_REVERSE, 8);//{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
+static Packet16uc ei_p16uc_COMPLEX_REV2 = vec_sld(ei_p16uc_FORWARD, ei_p16uc_FORWARD, 8);//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+static Packet16uc ei_p16uc_PSET_HI = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)ei_p16uc_FORWARD, 0), (Packet4ui) vec_splat((Packet4ui)ei_p16uc_FORWARD, 1));//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
+static Packet16uc ei_p16uc_PSET_LO = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)ei_p16uc_FORWARD, 2), (Packet4ui) vec_splat((Packet4ui)ei_p16uc_FORWARD, 3));//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+
+//---------- float ----------
+struct Packet2cf
+{
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+  Packet4f  v;
+};
+
+template<> struct ei_packet_traits<std::complex<float> >  : ei_default_packet_traits
+{
+  typedef Packet2cf type;
+  enum {
+    Vectorizable = 1,
+    size = 2,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>&  from)
+{
+  Packet2cf res;
+  /* On AltiVec we cannot load 64-bit registers, so wa have to take care of alignment */
+  if ((ptrdiff_t)&from % 16 == 0) {
+    res.v = ei_pload((const float *)&from);
+    res.v = vec_perm(res.v, res.v, ei_p16uc_PSET_HI);
+  } else {
+    res.v = ei_ploadu((const float *)&from);
+    res.v = vec_perm(res.v, res.v, ei_p16uc_PSET_LO);
+  }
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pnegate(const Packet2cf& a) { return Packet2cf(ei_psub<Packet4f>(ei_p4f_ZERO, a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pconj(const Packet2cf& a) { return Packet2cf((Packet4f)vec_xor((Packet4ui)a.v, ei_p4ui_CONJ_XOR)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet4f v1, v2;
+
+  // Permute and multiply the real parts of a and b
+  v1 = vec_perm(a.v, a.v, ei_p16uc_COMPLEX_RE);
+  // Get the imaginary parts of a
+  v2 = vec_perm(a.v, a.v, ei_p16uc_COMPLEX_IM);
+  // multiply a_re * b 
+  v1 = vec_madd(v1, b.v, ei_p4f_ZERO);
+  // multiply a_im * b and get the conjugate result
+  v2 = vec_madd(v2, b.v, ei_p4f_ZERO);
+  v2 = (Packet4f) vec_xor((Packet4ui)v2, ei_p4ui_CONJ_XOR);
+  // permute back to a proper order
+  v2 = vec_perm(v2, v2, ei_p16uc_COMPLEX_REV);
+  
+  return Packet2cf(vec_add(v1, v2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_or(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_xor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v, vec_nor(b.v,b.v))); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pload <std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(ei_pload((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu<std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ei_ploadu((const float*)from)); }
+
+template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((float*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<float> >(const std::complex<float> *   addr) { vec_dstt((float *)addr, DST_CTRL(2,2,32), DST_CHAN); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float>  ei_pfirst<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> EIGEN_ALIGN16 res[2];
+  ei_pstore((float *)&res, a.v);
+
+  return res[0];
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_preverse(const Packet2cf& a)
+{
+  Packet4f rev_a;
+  rev_a = vec_perm(a.v, a.v, ei_p16uc_COMPLEX_REV2);
+  return Packet2cf(rev_a);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> ei_predux<Packet2cf>(const Packet2cf& a)
+{
+  Packet4f b;
+  Packet2cf sum;
+  b = (Packet4f) vec_sld(a.v, a.v, 8);
+  sum = ei_padd(a, Packet2cf(b));
+  return ei_pfirst(sum);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_preduxp<Packet2cf>(const Packet2cf* vecs)
+{
+  Packet4f b1, b2;
+  
+  b1 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
+  b2 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
+  b2 = (Packet4f) vec_sld(b2, b2, 8);
+  b2 = ei_padd(b1, b2);
+
+  return Packet2cf(b2);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> ei_predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  Packet4f b;
+  Packet2cf prod;
+  b = (Packet4f) vec_sld(a.v, a.v, 8);
+  prod = ei_pmul(a, Packet2cf(b));
+
+  return ei_pfirst(prod);
+}
+
+template<int Offset>
+struct ei_palign_impl<Offset,Packet2cf>
+{
+  EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second)
+  {
+    if (Offset==1)
+    {
+      first.v = vec_sld(first.v, second.v, 8);
+    }
+  }
+};
+
+template<> struct ei_conj_helper<Packet2cf, Packet2cf, false,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return ei_padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return ei_pmul(a, ei_pconj(b));
+  }
+};
+
+template<> struct ei_conj_helper<Packet2cf, Packet2cf, true,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return ei_padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return ei_pmul(ei_pconj(a), b);
+  }
+};
+
+template<> struct ei_conj_helper<Packet2cf, Packet2cf, true,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return ei_padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return ei_pconj(ei_pmul(a, b));
+  }
+};
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for AltiVec
+  Packet2cf res = ei_conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
+  Packet4f s = vec_madd(b.v, b.v, ei_p4f_ZERO);
+  return Packet2cf(ei_pdiv(res.v, vec_add(s,vec_perm(s, s, ei_p16uc_COMPLEX_REV))));
+}
+
+#endif // EIGEN_COMPLEX_ALTIVEC_H

From f6bd508351ae8d96acfdf3f6c506010271e29de9 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <markos@codex.gr>
Date: Fri, 9 Jul 2010 17:56:53 +0300
Subject: [PATCH 10/17] forgot to add the Complex.h include for AltiVec.

---
 Eigen/Core | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Eigen/Core b/Eigen/Core
index 3135d7530..c143b7798 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -224,6 +224,7 @@ using std::size_t;
   #include "src/Core/arch/SSE/Complex.h"
 #elif defined EIGEN_VECTORIZE_ALTIVEC
   #include "src/Core/arch/AltiVec/PacketMath.h"
+  #include "src/Core/arch/AltiVec/Complex.h"
 #elif defined EIGEN_VECTORIZE_NEON
   #include "src/Core/arch/NEON/PacketMath.h"
 #endif

From 642cc27eb1b8d8440ca35f5f08934cce22cd99dc Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <markos@codex.gr>
Date: Fri, 9 Jul 2010 18:08:18 +0300
Subject: [PATCH 11/17] forgot to commit ei_p4f_FORWARD;

---
 Eigen/src/Core/arch/AltiVec/PacketMath.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 9e6a375ea..ad694150a 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -74,6 +74,7 @@ typedef __vector unsigned char  Packet16uc;
 static Packet4f ei_p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 };
 static Packet4i ei_p4i_COUNTDOWN = { 3, 2, 1, 0 };
 static Packet16uc ei_p16uc_REVERSE = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
+static Packet16uc ei_p16uc_FORWARD = vec_lvsl(0, (float*)0);
 
 static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);

From b2effa2b2c464a2e8cb1c4d5b71843b6514a50ae Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 9 Jul 2010 18:05:57 +0200
Subject: [PATCH 12/17] move ei_conj_if to a more appropriate file

---
 Eigen/src/Core/util/BlasUtil.h | 12 ++++++++++++
 Eigen/src/Core/util/Meta.h     | 12 ------------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index 38c86511c..12ac6994f 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -89,6 +89,18 @@ template<typename RealScalar> struct ei_conj_helper<std::complex<RealScalar>, st
   { return Scalar(ei_real(x)*ei_real(y) - ei_imag(x)*ei_imag(y), - ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); }
 };
 
+template<bool Conjugate> struct ei_conj_if;
+
+template<> struct ei_conj_if<true> {
+  template<typename T>
+  inline T operator()(const T& x) { return ei_conj(x); }
+};
+
+template<> struct ei_conj_if<false> {
+  template<typename T>
+  inline const T& operator()(const T& x) { return x; }
+};
+
 // Lightweight helper class to access matrix coefficients.
 // Yes, this is somehow redundant with Map<>, but this version is much much lighter,
 // and so I hope better compilation performance (time and code quality).
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 9d73ef51c..b01ceafb2 100644
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -222,16 +222,4 @@ template<typename T> struct ei_is_diagonal<DiagonalWrapper<T> >
 template<typename T, int S> struct ei_is_diagonal<DiagonalMatrix<T,S> >
 { enum { ret = true }; };
 
-template<bool Conjugate> struct ei_conj_if;
-
-template<> struct ei_conj_if<true> {
-  template<typename T>
-  inline T operator()(const T& x) { return ei_conj(x); }
-};
-
-template<> struct ei_conj_if<false> {
-  template<typename T>
-  inline const T& operator()(const T& x) { return x; }
-};
-
 #endif // EIGEN_META_H

From 96f9015807365d253e1c5ab8a82595973ebfa72e Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Fri, 9 Jul 2010 19:33:43 +0200
Subject: [PATCH 13/17] disable MSVC optimization when the underlying compiler
 is ICC

---
 Eigen/src/Core/arch/SSE/PacketMath.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 9382fbde5..a5d527271 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -264,13 +264,13 @@ template<> EIGEN_STRONG_INLINE void ei_prefetch<float>(const float*   addr) { _m
 template<> EIGEN_STRONG_INLINE void ei_prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 template<> EIGEN_STRONG_INLINE void ei_prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 
-#if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64)
+#if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64) && !defined(__INTEL_COMPILER)
 // The temporary variable fixes an internal compilation error.
 // Direct of the struct members fixed bug #62.
 template<> EIGEN_STRONG_INLINE float  ei_pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
 template<> EIGEN_STRONG_INLINE double ei_pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
 template<> EIGEN_STRONG_INLINE int    ei_pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
-#elif defined(_MSC_VER) && (_MSC_VER <= 1500)
+#elif defined(_MSC_VER) && (_MSC_VER <= 1500) && !defined(__INTEL_COMPILER)
 // The temporary variable fixes an internal compilation error.
 template<> EIGEN_STRONG_INLINE float  ei_pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
 template<> EIGEN_STRONG_INLINE double ei_pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }

From 6ad3f1ab1f950b417788ca0b7bb15bde948c158c Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <markos@codex.gr>
Date: Sat, 10 Jul 2010 00:09:29 +0300
Subject: [PATCH 14/17] Added NEON/Complex.h, ~3.5x faster than scalar
 std::complex<float> minor fix in AltiVec Complex.h

---
 CMakeLists.txt                        |   2 +-
 Eigen/Core                            |   1 +
 Eigen/src/Core/arch/AltiVec/Complex.h |   5 +-
 Eigen/src/Core/arch/NEON/Complex.h    | 257 ++++++++++++++++++++++++++
 Eigen/src/Core/arch/NEON/PacketMath.h |   7 +-
 5 files changed, 265 insertions(+), 7 deletions(-)
 create mode 100644 Eigen/src/Core/arch/NEON/Complex.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2d302386c..ab006d20f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -150,7 +150,7 @@ if(CMAKE_COMPILER_IS_GNUCXX)
 
   option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
   if(EIGEN_TEST_NEON)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=softfp -mfpu=neon -mcpu=cortex-a8")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=hard -mfpu=neon -mcpu=cortex-a8")
     message("Enabling NEON in tests/examples")
   endif()
 
diff --git a/Eigen/Core b/Eigen/Core
index c143b7798..e4b4ad6f0 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -227,6 +227,7 @@ using std::size_t;
   #include "src/Core/arch/AltiVec/Complex.h"
 #elif defined EIGEN_VECTORIZE_NEON
   #include "src/Core/arch/NEON/PacketMath.h"
+  #include "src/Core/arch/NEON/Complex.h"
 #endif
 
 #include "src/Core/arch/Default/Settings.h"
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 62d1d45d1..2dba95a2f 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -132,10 +132,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_preverse(const Packet2cf& a)
 template<> EIGEN_STRONG_INLINE std::complex<float> ei_predux<Packet2cf>(const Packet2cf& a)
 {
   Packet4f b;
-  Packet2cf sum;
   b = (Packet4f) vec_sld(a.v, a.v, 8);
-  sum = ei_padd(a, Packet2cf(b));
-  return ei_pfirst(sum);
+  b = ei_padd(a.v, b);
+  return ei_pfirst(Packet2cf(sum));
 }
 
 template<> EIGEN_STRONG_INLINE Packet2cf ei_preduxp<Packet2cf>(const Packet2cf* vecs)
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
new file mode 100644
index 000000000..bf68a2bbb
--- /dev/null
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -0,0 +1,257 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_COMPLEX_ALTIVEC_H
+#define EIGEN_COMPLEX_ALTIVEC_H
+
+static uint32x4_t ei_p4ui_CONJ_XOR = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+static uint32x2_t ei_p2ui_CONJ_XOR = { 0x00000000, 0x80000000 };
+
+//---------- float ----------
+struct Packet2cf
+{
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+  Packet4f  v;
+};
+
+template<> struct ei_packet_traits<std::complex<float> >  : ei_default_packet_traits
+{
+  typedef Packet2cf type;
+  enum {
+    Vectorizable = 1,
+    size = 2,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct ei_unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pset1<std::complex<float> >(const std::complex<float>&  from)
+{
+  float32x2_t r64;
+  r64 = vld1_f32((float *)&from);
+
+  return Packet2cf(vcombine_f32(r64, r64));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(ei_padd<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(ei_psub<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pnegate(const Packet2cf& a) { return Packet2cf(ei_pnegate<Packet4f>(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pconj(const Packet2cf& a)
+{
+  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v), ei_p4ui_CONJ_XOR)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet4f v1, v2;
+  float32x2_t a_lo, a_hi;
+
+  // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
+  v1 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 0), vdup_lane_f32(vget_high_f32(a.v), 0));
+  // Get the real values of a | a1_im | a1_im | a2_im | a2_im |
+  v2 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 1), vdup_lane_f32(vget_high_f32(a.v), 1));
+  // Multiply the real a with b
+  v1 = vmulq_f32(v1, b.v);
+  // Multiply the imag a with b
+  v2 = vmulq_f32(v2, b.v);
+  // Conjugate v2 
+  v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), ei_p4ui_CONJ_XOR));
+  // Swap real/imag elements in v2.
+  a_lo = vrev64_f32(vget_low_f32(v2));
+  a_hi = vrev64_f32(vget_high_f32(v2));
+  v2 = vcombine_f32(a_lo, a_hi);
+  // Add and return the result
+  return Packet2cf(vaddq_f32(v1, v2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf ei_por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pload <std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(ei_pload((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ei_ploadu<std::complex<float> >(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ei_ploadu((const float*)from)); }
+
+template<> EIGEN_STRONG_INLINE void ei_pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE ei_pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void ei_pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((float*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE void ei_prefetch<std::complex<float> >(const std::complex<float> *   addr) { __pld((float *)addr); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float>  ei_pfirst<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> EIGEN_ALIGN16 x[2];
+  vst1q_f32((float *)x, a.v);
+  return x[0];
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_preverse(const Packet2cf& a)
+{
+  float32x2_t a_lo, a_hi;
+  Packet4f a_r128;
+
+  a_lo = vget_low_f32(a.v);
+  a_hi = vget_high_f32(a.v);
+  a_r128 = vcombine_f32(a_hi, a_lo);
+
+  return Packet2cf(a_r128);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> ei_predux<Packet2cf>(const Packet2cf& a)
+{
+  float32x2_t a1, a2;
+  std::complex<float> s;
+
+  a1 = vget_low_f32(a.v);
+  a2 = vget_high_f32(a.v);
+  a2 = vadd_f32(a1, a2);
+  vst1_f32((float *)&s, a2);
+
+  return s;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_preduxp<Packet2cf>(const Packet2cf* vecs)
+{
+  Packet4f sum1, sum2, sum;
+
+  // Add the first two 64-bit float32x2_t of vecs[0]
+  sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v));
+  sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v));
+  sum = vaddq_f32(sum1, sum2);
+
+  return Packet2cf(sum);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> ei_predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  float32x2_t a1, a2, v1, v2, prod;
+  std::complex<float> s;
+
+  a1 = vget_low_f32(a.v);
+  a2 = vget_high_f32(a.v);
+   // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
+  v1 = vdup_lane_f32(a1, 0);
+  // Get the real values of a | a1_im | a1_im | a2_im | a2_im |
+  v2 = vdup_lane_f32(a1, 1);
+  // Multiply the real a with b
+  v1 = vmul_f32(v1, a2);
+  // Multiply the imag a with b
+  v2 = vmul_f32(v2, a2);
+  // Conjugate v2 
+  v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), ei_p2ui_CONJ_XOR));
+  // Swap real/imag elements in v2.
+  v2 = vrev64_f32(v2);
+  // Add v1, v2
+  prod = vadd_f32(v1, v2);
+
+  vst1_f32((float *)&s, prod);
+
+  return s;
+}
+
+template<int Offset>
+struct ei_palign_impl<Offset,Packet2cf>
+{
+  EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second)
+  {
+    if (Offset==1)
+    {
+      first.v = vextq_f32(first.v, second.v, 2);
+    }
+  }
+};
+
+template<> struct ei_conj_helper<Packet2cf, Packet2cf, false,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return ei_padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return ei_pmul(a, ei_pconj(b));
+  }
+};
+
+template<> struct ei_conj_helper<Packet2cf, Packet2cf, true,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return ei_padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return ei_pmul(ei_pconj(a), b);
+  }
+};
+
+template<> struct ei_conj_helper<Packet2cf, Packet2cf, true,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return ei_padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    return ei_pconj(ei_pmul(a, b));
+  }
+};
+
+template<> EIGEN_STRONG_INLINE Packet2cf ei_pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for AltiVec
+  Packet2cf res = ei_conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
+  Packet4f s, rev_s;
+  float32x2_t a_lo, a_hi;
+
+  // this computes the norm
+  s = vmulq_f32(b.v, b.v);
+  a_lo = vrev64_f32(vget_low_f32(s));
+  a_hi = vrev64_f32(vget_high_f32(s));
+  rev_s = vcombine_f32(a_lo, a_hi);
+
+  return Packet2cf(ei_pdiv(res.v, vaddq_f32(s,rev_s)));
+}
+
+#endif // EIGEN_COMPLEX_ALTIVEC_H
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index aaa27b56d..66a6a30ee 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -63,7 +63,8 @@ template<> struct ei_packet_traits<float>  : ei_default_packet_traits
   enum {
     Vectorizable = 1,
     size = 4,
-
+   
+    HasDiv  = 1,
     // FIXME check the Has*
     HasSin  = 0,
     HasCos  = 0,
@@ -174,8 +175,8 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a,
 template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
 template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
 
-template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from)   { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from)   { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
 
 template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
 template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }

From 6dcd373b9d518c688b16800d9d0d7a88cb5f3dc2 Mon Sep 17 00:00:00 2001
From: Benoit Jacob <jacob.benoit.1@gmail.com>
Date: Fri, 9 Jul 2010 18:51:17 -0400
Subject: [PATCH 15/17] let ei_pset1 use _mm_loaddup_pd. Not a significant
 speed improvement, but also not a speed regression, and replaces 3
 instructions by 1 single instruction.

---
 Eigen/src/Core/arch/SSE/PacketMath.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index a5d527271..dc28bdb56 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -109,8 +109,12 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float&  from) {
   return ei_vec4f_swizzle1(res,0,0,0,0);
 }
 template<> EIGEN_STRONG_INLINE Packet2d ei_pset1<double>(const double&  from) {
+#ifdef EIGEN_VECTORIZE_SSE3
+  return _mm_loaddup_pd(&from);
+#else
   Packet2d res = _mm_set_sd(from);
   return ei_vec2d_swizzle1(res, 0, 0);
+#endif
 }
 #else
 template<> EIGEN_STRONG_INLINE Packet4f ei_pset1<float>(const float&  from) { return _mm_set1_ps(from); }

From c4ef69b5bd46b106d001921867847a741bc6725b Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Sat, 10 Jul 2010 22:37:16 +0200
Subject: [PATCH 16/17] fix compilation: make the check_coordinates* functions
 const

---
 Eigen/src/Core/TriangularMatrix.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h
index c66bbb9fa..e2afa23b0 100644
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@@ -90,7 +90,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
 
   protected:
 
-    void check_coordinates(Index row, Index col)
+    void check_coordinates(Index row, Index col) const
     {
       EIGEN_ONLY_USED_FOR_DEBUG(row);
       EIGEN_ONLY_USED_FOR_DEBUG(col);
@@ -102,12 +102,12 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
     }
 
     #ifdef EIGEN_INTERNAL_DEBUGGING
-    void check_coordinates_internal(Index row, Index col)
+    void check_coordinates_internal(Index row, Index col) const
     {
       check_coordinates(row, col);
     }
     #else
-    void check_coordinates_internal(Index , Index ) {}
+    void check_coordinates_internal(Index , Index ) const {}
     #endif
 
 };

From e5bc9526f16590288edbc3e5fd8837ea81654942 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Sat, 10 Jul 2010 22:53:27 +0200
Subject: [PATCH 17/17] * generalize rowmajor by vector * fix weird compilation
 error when constructing a matrix with a row by matrix product

---
 Eigen/src/Core/Product.h                      | 13 ++++--
 Eigen/src/Core/SolveTriangular.h              |  9 ++--
 Eigen/src/Core/products/GeneralMatrixVector.h | 45 ++++++++++---------
 .../Core/products/TriangularMatrixVector.h    |  8 ++--
 Eigen/src/Core/util/BlasUtil.h                |  5 ++-
 test/product_large.cpp                        | 11 +++++
 6 files changed, 57 insertions(+), 34 deletions(-)

diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
index ca30c4dce..139132c6b 100644
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@@ -324,6 +324,7 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true>
     typedef typename ProductType::ActualRhsType ActualRhsType;
     typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
     typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
+    typedef Map<Matrix<Scalar,Dynamic,1>, Aligned> MappedDest;
 
     ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs());
     ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs());
@@ -342,7 +343,7 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true>
     else
     {
       actualDest = ei_aligned_stack_new(Scalar,dest.size());
-      Map<typename Dest::PlainObject>(actualDest, dest.size()) = dest;
+      MappedDest(actualDest, dest.size()) = dest;
     }
 
     ei_cache_friendly_product_colmajor_times_vector
@@ -353,7 +354,7 @@ template<> struct ei_gemv_selector<OnTheRight,ColMajor,true>
 
     if (!EvalToDest)
     {
-      dest = Map<typename Dest::PlainObject>(actualDest, dest.size());
+      dest = MappedDest(actualDest, dest.size());
       ei_aligned_stack_delete(Scalar, actualDest, dest.size());
     }
   }
@@ -365,6 +366,7 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,true>
   static void run(const ProductType& prod, Dest& dest, typename ProductType::Scalar alpha)
   {
     typedef typename ProductType::Scalar Scalar;
+    typedef typename ProductType::Index Index;
     typedef typename ProductType::ActualLhsType ActualLhsType;
     typedef typename ProductType::ActualRhsType ActualRhsType;
     typedef typename ProductType::_ActualRhsType _ActualRhsType;
@@ -394,9 +396,12 @@ template<> struct ei_gemv_selector<OnTheRight,RowMajor,true>
     }
 
     ei_cache_friendly_product_rowmajor_times_vector
-      <LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate>(
+      <LhsBlasTraits::NeedToConjugate,RhsBlasTraits::NeedToConjugate, Scalar, Index>(
+        actualLhs.rows(), actualLhs.cols(),
         &actualLhs.const_cast_derived().coeffRef(0,0), actualLhs.outerStride(),
-        rhs_data, prod.rhs().size(), dest, actualAlpha);
+        rhs_data, 1,
+        &dest.coeffRef(0,0), dest.innerStride(),
+        actualAlpha);
 
     if (!DirectlyUseRhs) ei_aligned_stack_delete(Scalar, rhs_data, prod.rhs().size());
   }
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index 310e262d8..90ce2a802 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -80,12 +80,13 @@ struct ei_triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,NoUnrolling,RowMajor
         // 2 - it is slighlty faster at runtime
         Index startRow = IsLower ? pi : pi-actualPanelWidth;
         Index startCol = IsLower ? 0 : pi;
-        VectorBlock<Rhs,Dynamic> target(other,startRow,actualPanelWidth);
 
-        ei_cache_friendly_product_rowmajor_times_vector<LhsProductTraits::NeedToConjugate,false>(
+        ei_cache_friendly_product_rowmajor_times_vector<LhsProductTraits::NeedToConjugate,false,Scalar,Index>(
+          actualPanelWidth, r,
           &(actualLhs.const_cast_derived().coeffRef(startRow,startCol)), actualLhs.outerStride(),
-          &(other.coeffRef(startCol)), r,
-          target, Scalar(-1));
+          &(other.coeffRef(startCol)), other.innerStride(),
+          &other.coeffRef(startRow), other.innerStride(),
+          Scalar(-1));
       }
 
       for(Index k=0; k<actualPanelWidth; ++k)
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index e671a657e..0997746ef 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -258,13 +258,15 @@ void ei_cache_friendly_product_colmajor_times_vector(
 }
 
 // TODO add peeling to mask unaligned load/stores
-template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index, typename ResType>
+template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index>
 static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
+  Index rows, Index cols,
   const Scalar* lhs, Index lhsStride,
-  const Scalar* rhs, Index rhsSize,
-  ResType& res,
+  const Scalar* rhs, Index rhsIncr,
+  Scalar* res, Index resIncr,
   Scalar alpha)
 {
+  ei_internal_assert(rhsIncr==1);
   #ifdef _EIGEN_ACCUMULATE_PACKETS
   #error _EIGEN_ACCUMULATE_PACKETS has already been defined
   #endif
@@ -291,22 +293,22 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
   const Index peels = 2;
   const Index PacketAlignedMask = PacketSize-1;
   const Index PeelAlignedMask = PacketSize*peels-1;
-  const Index size = rhsSize;
+  const Index depth = cols;
 
   // How many coeffs of the result do we have to skip to be aligned.
   // Here we assume data are at least aligned on the base scalar type
   // if that's not the case then vectorization is discarded, see below.
-  Index alignedStart = ei_first_aligned(rhs, size);
-  Index alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
+  Index alignedStart = ei_first_aligned(rhs, depth);
+  Index alignedSize = PacketSize>1 ? alignedStart + ((depth-alignedStart) & ~PacketAlignedMask) : 0;
   const Index peeledSize  = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
 
   const Index alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
   Index alignmentPattern = alignmentStep==0 ? AllAligned
-                       : alignmentStep==(PacketSize/2) ? EvenAligned
-                       : FirstAligned;
+                         : alignmentStep==(PacketSize/2) ? EvenAligned
+                         : FirstAligned;
 
   // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = ei_first_aligned(lhs,size);
+  const Index lhsAlignmentOffset = ei_first_aligned(lhs,depth);
 
   // find how many rows do we have to skip to be aligned with rhs (if possible)
   Index skipRows = 0;
@@ -318,7 +320,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
   }
   else if (PacketSize>1)
   {
-    ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0  || size<PacketSize);
+    ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0  || depth<PacketSize);
 
     while (skipRows<PacketSize &&
            alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%PacketSize))
@@ -331,26 +333,26 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
     }
     else
     {
-      skipRows = std::min(skipRows,Index(res.size()));
+      skipRows = std::min(skipRows,Index(rows));
       // note that the skiped columns are processed later.
     }
     ei_internal_assert(  alignmentPattern==NoneAligned
                       || PacketSize==1
-                      || (skipRows + rowsAtOnce >= res.size())
-                      || PacketSize > rhsSize
+                      || (skipRows + rowsAtOnce >= rows)
+                      || PacketSize > depth
                       || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0);
   }
   else if(Vectorizable)
   {
     alignedStart = 0;
-    alignedSize = size;
+    alignedSize = depth;
     alignmentPattern = AllAligned;
   }
 
   Index offset1 = (FirstAligned && alignmentStep==1?3:1);
   Index offset3 = (FirstAligned && alignmentStep==1?1:3);
 
-  Index rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
+  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
   for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
   {
     EIGEN_ALIGN16 Scalar tmp0 = Scalar(0);
@@ -439,17 +441,20 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
 
     // process remaining coeffs (or all if no explicit vectorization)
     // FIXME this loop get vectorized by the compiler !
-    for (Index j=alignedSize; j<size; ++j)
+    for (Index j=alignedSize; j<depth; ++j)
     {
       Scalar b = rhs[j];
       tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
       tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
     }
-    res[i] += alpha*tmp0; res[i+offset1] += alpha*tmp1; res[i+2] += alpha*tmp2; res[i+offset3] += alpha*tmp3;
+    res[i*resIncr]            += alpha*tmp0;
+    res[(i+offset1)*resIncr]  += alpha*tmp1;
+    res[(i+2)*resIncr]        += alpha*tmp2;
+    res[(i+offset3)*resIncr]  += alpha*tmp3;
   }
 
   // process remaining first and last rows (at most columnsAtOnce-1)
-  Index end = res.size();
+  Index end = rows;
   Index start = rowBound;
   do
   {
@@ -477,9 +482,9 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
 
       // process remaining scalars
       // FIXME this loop get vectorized by the compiler !
-      for (Index j=alignedSize; j<size; ++j)
+      for (Index j=alignedSize; j<depth; ++j)
         tmp0 += cj.pmul(lhs0[j], rhs[j]);
-      res[i] += alpha*tmp0;
+      res[i*resIncr] += alpha*tmp0;
     }
     if (skipRows)
     {
diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h
index abf8097d0..de4f0b7bb 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -119,11 +119,11 @@ struct ei_product_triangular_vector_selector<true,Lhs,Rhs,Result,Mode,ConjLhs,Co
       if (r>0)
       {
         Index s = IsLower ? 0 : pi + actualPanelWidth;
-        Block<Result,Dynamic,1> target(res,pi,0,actualPanelWidth,1);
-        ei_cache_friendly_product_rowmajor_times_vector<ConjLhs,ConjRhs>(
+        ei_cache_friendly_product_rowmajor_times_vector<ConjLhs,ConjRhs,Scalar,Index>(
+            actualPanelWidth, r,
             &(lhs.const_cast_derived().coeffRef(pi,s)), lhs.outerStride(),
-            &(rhs.const_cast_derived().coeffRef(s)), r,
-            target, alpha);
+            &(rhs.const_cast_derived().coeffRef(s)), 1,
+            &res.coeffRef(pi,0), res.innerStride(), alpha);
       }
     }
   }
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
index 12ac6994f..e53311100 100644
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -49,9 +49,10 @@ template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index,
 static void ei_cache_friendly_product_colmajor_times_vector(
   Index size, const Scalar* lhs, Index lhsStride, const RhsType& rhs, Scalar* res, Scalar alpha);
 
-template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index, typename ResType>
+template<bool ConjugateLhs, bool ConjugateRhs, typename Scalar, typename Index>
 static void ei_cache_friendly_product_rowmajor_times_vector(
-  const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsSize, ResType& res, Scalar alpha);
+  Index rows, Index Cols, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsIncr,
+  Scalar* res, Index resIncr, Scalar alpha);
 
 template<typename Scalar> struct ei_conj_helper<Scalar,Scalar,false,false>
 {
diff --git a/test/product_large.cpp b/test/product_large.cpp
index 2d36c5a92..2c5523913 100644
--- a/test/product_large.cpp
+++ b/test/product_large.cpp
@@ -64,5 +64,16 @@ void test_product_large()
     // only makes sure it compiles fine
     computeProductBlockingSizes<float,float>(k1,m1,n1);
   }
+
+  {
+    // test regression in row-vector by matrix (bad Map type)
+    MatrixXf mat1(10,32); mat1.setRandom();
+    MatrixXf mat2(32,32); mat2.setRandom();
+    MatrixXf r1 = mat1.row(2)*mat2.transpose();
+    VERIFY_IS_APPROX(r1, (mat1.row(2)*mat2.transpose()).eval());
+
+    MatrixXf r2 = mat1.row(2)*mat2;
+    VERIFY_IS_APPROX(r2, (mat1.row(2)*mat2).eval());
+  }
 #endif
 }