From 904f35d19497b5442a80a30c42d00fc52c22fe36 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Mon, 5 Oct 2009 17:22:16 +0200
Subject: [PATCH 1/2] discard vectorization in matrix-vector product when data
 is not even aligned on the scalar type size (e.g., for double on 32 bits
 system without -malign-double)

---
 Eigen/src/Core/products/GeneralMatrixVector.h | 35 +++++++++++++------
 test/testsuite.cmake                          |  2 +-
 2 files changed, 25 insertions(+), 12 deletions(-)
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index 57875035a..a18e5ef1d 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -57,8 +57,7 @@ void ei_cache_friendly_product_colmajor_times_vector(
   if(ConjugateRhs)
     alpha = ei_conj(alpha);
 
-//   std::cerr << "prod " << size << " " << rhs.size() << "\n";
-
+  typedef typename NumTraits<Scalar>::Real RealScalar;
   typedef typename ei_packet_traits<Scalar>::type Packet;
   const int PacketSize = sizeof(Packet)/sizeof(Scalar);
 
@@ -69,9 +68,9 @@ void ei_cache_friendly_product_colmajor_times_vector(
   const int PeelAlignedMask = PacketSize*peels-1;
 
   // How many coeffs of the result do we have to skip to be aligned.
-  // Here we assume data are at least aligned on the base scalar type that is mandatory anyway.
-  const int alignedStart = ei_alignmentOffset(res,size);
-  const int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
+  // Here we assume data are at least aligned on the base scalar type.
+  int alignedStart = ei_alignmentOffset(res,size);
+  int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
   const int peeledSize  = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
 
   const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
@@ -84,12 +83,18 @@ void ei_cache_friendly_product_colmajor_times_vector(
 
   // find how many columns do we have to skip to be aligned with the result (if possible)
   int skipColumns = 0;
-  if (PacketSize>1)
+  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
+  if( (size_t(lhs)%sizeof(RealScalar)) || (size_t(res)%sizeof(RealScalar)) )
+  {
+    alignedSize = 0;
+    alignedStart = 0;
+  }
+  else if (PacketSize>1)
   {
     ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size<PacketSize);
 
     while (skipColumns<PacketSize &&
-           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%PacketSize))
+          alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%PacketSize))
       ++skipColumns;
     if (skipColumns==PacketSize)
     {
@@ -263,6 +268,7 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
 
   ei_conj_helper<ConjugateLhs,ConjugateRhs> cj;
 
+  typedef typename NumTraits<Scalar>::Real RealScalar;
   typedef typename ei_packet_traits<Scalar>::type Packet;
   const int PacketSize = sizeof(Packet)/sizeof(Scalar);
 
@@ -274,9 +280,10 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
   const int size = rhsSize;
 
   // How many coeffs of the result do we have to skip to be aligned.
-  // Here we assume data are at least aligned on the base scalar type that is mandatory anyway.
-  const int alignedStart = ei_alignmentOffset(rhs, size);
-  const int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
+  // Here we assume data are at least aligned on the base scalar type
+  // if that's not the case then vectorization is discarded, see below.
+  int alignedStart = ei_alignmentOffset(rhs, size);
+  int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
   const int peeledSize  = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
 
   const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
@@ -289,7 +296,13 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
 
   // find how many rows do we have to skip to be aligned with rhs (if possible)
   int skipRows = 0;
-  if (PacketSize>1)
+  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
+  if( (size_t(lhs)%sizeof(RealScalar)) || (size_t(rhs)%sizeof(RealScalar)) )
+  {
+    alignedSize = 0;
+    alignedStart = 0;
+  }
+  else if (PacketSize>1)
   {
     ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0  || size<PacketSize);
 
diff --git a/test/testsuite.cmake b/test/testsuite.cmake
index 147b82d46..b223345b1 100644
--- a/test/testsuite.cmake
+++ b/test/testsuite.cmake
@@ -177,7 +177,7 @@ if(WIN32 AND NOT UNIX)
     SET (CTEST_INITIAL_CACHE "
       MAKECOMMAND:STRING=nmake -i
       CMAKE_MAKE_PROGRAM:FILEPATH=nmake
-      CMAKE_GENERATOR:INTERNAL=NMake Makefiles  
+      CMAKE_GENERATOR:INTERNAL=NMake Makefiles
       CMAKE_BUILD_TYPE:STRING=Release
       BUILDNAME:STRING=${EIGEN_BUILD_STRING}
       SITE:STRING=${EIGEN_SITE}

From 4cf73660273fa8c1ea9db4e41bc576f93fd0ccc9 Mon Sep 17 00:00:00 2001
From: Gael Guennebaud <g.gael@free.fr>
Date: Tue, 6 Oct 2009 10:24:41 +0200
Subject: [PATCH 2/2] fix compilation in stable norm, move a platform check to
 the unit tests

---
 Eigen/src/Core/StableNorm.h | 14 ++++----------
 test/stable_norm.cpp        | 15 +++++++++++++++
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h
index facab9dbd..b4a5d0bfd 100644
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -108,21 +108,15 @@ MatrixBase<Derived>::blueNorm() const
     iemax = std::numeric_limits<RealScalar>::max_exponent;  // maximum exponent
     rbig  = std::numeric_limits<RealScalar>::max();         // largest floating-point number
 
-    // Check the basic machine-dependent constants.
-    if(iemin > 1 - 2*it || 1+it>iemax || (it==2 && ibeta<5)
-      || (it<=4 && ibeta <= 3 ) || it<2)
-    {
-      ei_assert(false && "the algorithm cannot be guaranteed on this computer");
-    }
     iexp  = -((1-iemin)/2);
-    b1    = RealScalar(std::pow(double(ibeta),iexp));  // lower boundary of midrange
+    b1    = RealScalar(std::pow<RealScalar>(ibeta,iexp));  // lower boundary of midrange
     iexp  = (iemax + 1 - it)/2;
-    b2    = RealScalar(std::pow(double(ibeta),iexp));   // upper boundary of midrange
+    b2    = RealScalar(std::pow<RealScalar>(ibeta,iexp));   // upper boundary of midrange
 
     iexp  = (2-iemin)/2;
-    s1m   = RealScalar(std::pow(double(ibeta),iexp));   // scaling factor for lower range
+    s1m   = RealScalar(std::pow<RealScalar>(ibeta,iexp));   // scaling factor for lower range
     iexp  = - ((iemax+it)/2);
-    s2m   = RealScalar(std::pow(double(ibeta),iexp));   // scaling factor for upper range
+    s2m   = RealScalar(std::pow<RealScalar>(ibeta,iexp));   // scaling factor for upper range
 
     overfl  = rbig*s2m;             // overfow boundary for abig
     eps     = RealScalar(std::pow(double(ibeta), 1-it));
diff --git a/test/stable_norm.cpp b/test/stable_norm.cpp
index 726512ec0..6ce99477d 100644
--- a/test/stable_norm.cpp
+++ b/test/stable_norm.cpp
@@ -33,6 +33,21 @@ template<typename MatrixType> void stable_norm(const MatrixType& m)
   typedef typename MatrixType::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
 
+  // Check the basic machine-dependent constants.
+  {
+    int nbig, ibeta, it, iemin, iemax, iexp;
+    RealScalar abig, eps;
+
+    ibeta = std::numeric_limits<RealScalar>::radix;         // base for floating-point numbers
+    it    = std::numeric_limits<RealScalar>::digits;        // number of base-beta digits in mantissa
+    iemin = std::numeric_limits<RealScalar>::min_exponent;  // minimum exponent
+    iemax = std::numeric_limits<RealScalar>::max_exponent;  // maximum exponent
+
+    VERIFY( (!(iemin > 1 - 2*it || 1+it>iemax || (it==2 && ibeta<5) || (it<=4 && ibeta <= 3 ) || it<2))
+           && "the stable norm algorithm cannot be guaranteed on this computer");
+  }
+
+
   int rows = m.rows();
   int cols = m.cols();