Hey, the copyCoeff machinery is finally used for more than just implementing swap :)

Add an internal pseudo expression (SelfCwiseBinaryOp) allowing operators like
+=, -= and *= to be optimized through that copyCoeff machinery.
This makes it easy to enforce aligned loads and stores for the destination
matrix everywhere.
Gael Guennebaud 2009-11-20 15:39:38 +01:00
parent e3d890bc5a
commit eb8f450071
11 changed files with 229 additions and 51 deletions
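In a nutshell: where m += other used to be implemented as m = m + other, the
compound operators now wrap the destination in the new SelfCwiseBinaryOp pseudo
expression and assign the right-hand side to that wrapper, so the functor is
fused into the assignment loop and the destination is read, modified and
written back in place. Roughly, as a simplified sketch of the pattern used
throughout the diffs below:

    // before: evaluate the expression m + other, then copy it into m
    m = m + other;

    // after: the assignment loop calls copyCoeff/copyPacket on the wrapper,
    // which reads m, applies the functor, and writes m back
    SelfCwiseBinaryOp<ei_scalar_sum_op<Scalar>, Derived> tmp(m.derived());
    tmp = other;   // per coefficient: m(i) = m(i) + other(i)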

Eigen/Core

@@ -167,6 +167,7 @@ struct Dense {};
 #include "src/Core/ReturnByValue.h"
 #include "src/Core/NoAlias.h"
 #include "src/Core/Matrix.h"
+#include "src/Core/SelfCwiseBinaryOp.h"
 #include "src/Core/CwiseBinaryOp.h"
 #include "src/Core/CwiseUnaryOp.h"
 #include "src/Core/CwiseNullaryOp.h"

Eigen/src/Core/CwiseBinaryOp.h

@@ -178,7 +178,9 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
 {
-  return *this = *this - other;
+  SelfCwiseBinaryOp<ei_scalar_difference_op<Scalar>, Derived> tmp(derived());
+  tmp = other;
+  return derived();
 }
 
 /** replaces \c *this by \c *this + \a other.
@@ -190,7 +192,9 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
 {
-  return *this = *this + other;
+  SelfCwiseBinaryOp<ei_scalar_sum_op<Scalar>, Derived> tmp(derived());
+  tmp = other;
+  return derived();
 }
 
 #endif // EIGEN_CWISE_BINARY_OP_H

Eigen/src/Core/CwiseUnaryOp.h

@@ -33,9 +33,17 @@ EIGEN_STRONG_INLINE const CwiseUnaryOp<ei_scalar_opposite_op<typename ei_traits<
 operator-() const { return derived(); }
 
 EIGEN_STRONG_INLINE Derived& operator*=(const Scalar& other)
-{ return *this = *this * other; }
+{
+  SelfCwiseBinaryOp<ei_scalar_product_op<Scalar>, Derived> tmp(derived());
+  tmp = PlainMatrixType::Constant(rows(),cols(),other);
+  return derived();
+}
 
 EIGEN_STRONG_INLINE Derived& operator/=(const Scalar& other)
-{ return *this = *this / other; }
+{
+  SelfCwiseBinaryOp<typename ei_meta_if<NumTraits<Scalar>::HasFloatingPoint,ei_scalar_product_op<Scalar>,ei_scalar_quotient_op<Scalar> >::ret, Derived> tmp(derived());
+  tmp = PlainMatrixType::Constant(rows(),cols(), NumTraits<Scalar>::HasFloatingPoint ? Scalar(1)/other : other);
+  return derived();
+}
 
 /** \returns an expression of \c *this scaled by the scalar factor \a scalar */
 EIGEN_STRONG_INLINE const ScalarMultipleReturnType
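Note the trick in operator/=: when the scalar type is a floating-point type,
dividing each coefficient by other is replaced by one up-front reciprocal
followed by coefficient-wise multiplication, which vectorizes far better than
a packet division; exact (e.g. integer) types keep the true quotient functor.
A sketch of what the ei_meta_if selection above boils down to (not the actual
meta-programmed code):

    if (NumTraits<Scalar>::HasFloatingPoint)
      *this *= Scalar(1) / other;        // one divide, then n packet multiplies
    else
      for (int i = 0; i < size(); ++i)   // exact types need a true division each
        coeffRef(i) /= other;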

Eigen/src/Core/GenericPacketMath.h

@@ -34,6 +34,22 @@
  * of generic vectorized code.
  */
 
+#ifndef EIGEN_DEBUG_ALIGNED_LOAD
+#define EIGEN_DEBUG_ALIGNED_LOAD
+#endif
+
+#ifndef EIGEN_DEBUG_UNALIGNED_LOAD
+#define EIGEN_DEBUG_UNALIGNED_LOAD
+#endif
+
+#ifndef EIGEN_DEBUG_ALIGNED_STORE
+#define EIGEN_DEBUG_ALIGNED_STORE
+#endif
+
+#ifndef EIGEN_DEBUG_UNALIGNED_STORE
+#define EIGEN_DEBUG_UNALIGNED_STORE
+#endif
+
 struct ei_default_packet_traits
 {
   enum {
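These four hooks expand to nothing unless the including translation unit
defines them first, so they add zero overhead to regular builds; defining them
as counting statements before any Eigen header turns every packet load and
store into an auditable event. A minimal sketch of the intended usage (this is
exactly what the new unalignedcount test below does):

    static int nb_loadu = 0;
    #define EIGEN_DEBUG_UNALIGNED_LOAD { nb_loadu++; }
    #include <Eigen/Core>
    // ... run a vectorized expression, then inspect nb_loadu ...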

Eigen/src/Core/MapBase.h

@@ -197,32 +197,6 @@ template<typename Derived> class MapBase
     using Base::operator=;
     using Base::operator*=;
 
-    // FIXME it seems VS does not allow to do "using Base::operator+="
-    // and to overload operator+= at the same time, therefore we have to
-    // explicitly add these two overloads.
-    // Maybe there exists a better solution though.
-    template<typename ProductDerived, typename Lhs,typename Rhs>
-    Derived& operator+=(const Flagged<ProductBase<ProductDerived,Lhs,Rhs>, 0, EvalBeforeAssigningBit>& other)
-    { return Base::operator+=(other); }
-
-    template<typename ProductDerived, typename Lhs,typename Rhs>
-    Derived& operator-=(const Flagged<ProductBase<ProductDerived,Lhs,Rhs>, 0, EvalBeforeAssigningBit>& other)
-    { return Base::operator-=(other); }
-
-    template<typename OtherDerived>
-    Derived& operator+=(const MatrixBase<OtherDerived>& other)
-    { return derived() = forceAligned() + other; }
-
-    template<typename OtherDerived>
-    Derived& operator-=(const MatrixBase<OtherDerived>& other)
-    { return derived() = forceAligned() - other; }
-
-    Derived& operator*=(const Scalar& other)
-    { return derived() = forceAligned() * other; }
-
-    Derived& operator/=(const Scalar& other)
-    { return derived() = forceAligned() / other; }
-
   protected:
 
     void checkDataAlignment() const
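All these Map-specific overloads (and their forceAligned() detour) are now
redundant: the generic MatrixBase compound operators go through
SelfCwiseBinaryOp, which exposes coeffRef() so the assignment logic can align
the destination accesses on its own. A usage sketch (not code from this
commit), assuming data points to 16-byte aligned memory:

    Map<VectorXf> m(data, 40);
    m *= 2.f;   // resolves to MatrixBase::operator*= and still uses aligned packets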

Eigen/src/Core/SelfCwiseBinaryOp.h (new file)

@@ -0,0 +1,113 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_SELFCWISEBINARYOP_H
+#define EIGEN_SELFCWISEBINARYOP_H
+
+/** \class SelfCwiseBinaryOp
+  *
+  * \internal
+  *
+  * \brief Internal helper class for optimizing operators like +=, -=
+  */
+
+template<typename BinaryOp, typename MatrixType>
+struct ei_traits<SelfCwiseBinaryOp<BinaryOp,MatrixType> > : ei_traits<MatrixType> {};
+
+template<typename BinaryOp, typename MatrixType> class SelfCwiseBinaryOp
+  : public MatrixBase<SelfCwiseBinaryOp<BinaryOp,MatrixType> >
+{
+  public:
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(SelfCwiseBinaryOp)
+    typedef typename ei_packet_traits<Scalar>::type Packet;
+
+    using Base::operator=;
+
+    inline SelfCwiseBinaryOp(MatrixType& xpr, const BinaryOp& func = BinaryOp()) : m_matrix(xpr), m_functor(func) {}
+
+    inline int rows() const { return m_matrix.rows(); }
+    inline int cols() const { return m_matrix.cols(); }
+    inline int stride() const { return m_matrix.stride(); }
+
+    // note that this function is needed by assign to correctly align loads/stores
+    // TODO make Assign use .data()
+    inline Scalar& coeffRef(int row, int col)
+    {
+      return m_matrix.const_cast_derived().coeffRef(row, col);
+    }
+
+    // note that this function is needed by assign to correctly align loads/stores
+    // TODO make Assign use .data()
+    inline Scalar& coeffRef(int index)
+    {
+      return m_matrix.const_cast_derived().coeffRef(index);
+    }
+
+    template<typename OtherDerived>
+    void copyCoeff(int row, int col, const MatrixBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      ei_internal_assert(row >= 0 && row < rows()
+                         && col >= 0 && col < cols());
+      Scalar& tmp = m_matrix.coeffRef(row,col);
+      tmp = m_functor(tmp, _other.coeff(row,col));
+    }
+
+    template<typename OtherDerived>
+    void copyCoeff(int index, const MatrixBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      ei_internal_assert(index >= 0 && index < m_matrix.size());
+      Scalar& tmp = m_matrix.coeffRef(index);
+      tmp = m_functor(tmp, _other.coeff(index));
+    }
+
+    template<typename OtherDerived, int StoreMode, int LoadMode>
+    void copyPacket(int row, int col, const MatrixBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      ei_internal_assert(row >= 0 && row < rows()
+                         && col >= 0 && col < cols());
+      m_matrix.template writePacket<StoreMode>(row, col,
+        m_functor.packetOp(m_matrix.template packet<StoreMode>(row, col),_other.template packet<LoadMode>(row, col)) );
+    }
+
+    template<typename OtherDerived, int StoreMode, int LoadMode>
+    void copyPacket(int index, const MatrixBase<OtherDerived>& other)
+    {
+      OtherDerived& _other = other.const_cast_derived();
+      ei_internal_assert(index >= 0 && index < m_matrix.size());
+      m_matrix.template writePacket<StoreMode>(index,
+        m_functor.packetOp(m_matrix.template packet<StoreMode>(index),_other.template packet<LoadMode>(index)) );
+    }
+
+  protected:
+    MatrixType& m_matrix;
+    const BinaryOp& m_functor;
+
+  private:
+    SelfCwiseBinaryOp& operator=(const SelfCwiseBinaryOp&);
+};
+
+#endif // EIGEN_SELFCWISEBINARYOP_H
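The wrapper never yields values through coeff(); instead the assignment loop,
on seeing a destination that implements copyCoeff/copyPacket like this one,
performs one fused read-modify-write per coefficient or packet. Conceptually,
tmp = other executes something like the following (a simplified sketch of the
vectorized path, not the actual Assign.h code; alignedStart/alignedEnd are
hypothetical names):

    for (int index = alignedStart; index < alignedEnd; index += PacketSize)
      // StoreMode is Aligned for the destination, since Assign can query
      // coeffRef() to locate the first aligned coefficient at run time;
      // LoadMode reflects whatever the source expression can guarantee
      dst.template copyPacket<OtherDerived, Aligned, LoadMode>(index, src);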

Eigen/src/Core/arch/SSE/PacketMath.h

@@ -172,14 +172,14 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_pandnot<Packet4f>(const Packet4f& a,
 template<> EIGEN_STRONG_INLINE Packet2d ei_pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i ei_pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { return _mm_load_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet2d ei_pload<double>(const double* from) { return _mm_load_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_pload<float>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_pload<double>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_pload<int>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
 
 #if (!defined __GNUC__) && (!defined __ICC)
-template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { return _mm_loadu_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<double>(const double* from) { return _mm_loadu_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu<double>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
 #else
 // Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
 // require pointer casting to incompatible pointer types and leads to invalid code
@@ -188,6 +188,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu<int>(const int* from) { return
 // TODO: do the same for MSVC (ICC is compatible)
 template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from)
 {
+  EIGEN_DEBUG_UNALIGNED_LOAD
   __m128 res;
   asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from), [dummy] "m" (*(from+1)) );
   asm volatile ("movhps %[from2], %[r]" : [r] "+x" (res) : [from2] "m" (*(from+2)), [dummy] "m" (*(from+3)) );
@@ -195,6 +196,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ei_ploadu(const float* from)
 }
 template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from)
 {
+  EIGEN_DEBUG_UNALIGNED_LOAD
   __m128d res;
   asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from) );
   asm volatile ("movhpd %[from1], %[r]" : [r] "+x" (res) : [from1] "m" (*(from+1)) );
@@ -202,6 +204,7 @@ template<> EIGEN_STRONG_INLINE Packet2d ei_ploadu(const double* from)
 }
 template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from)
 {
+  EIGEN_DEBUG_UNALIGNED_LOAD
   __m128i res;
   asm volatile ("movsd %[from0], %[r]" : [r] "=x" (res) : [from0] "m" (*from), [dummy] "m" (*(from+1)) );
   asm volatile ("movhps %[from2], %[r]" : [r] "+x" (res) : [from2] "m" (*(from+2)), [dummy] "m" (*(from+3)) );
@@ -209,16 +212,17 @@ template<> EIGEN_STRONG_INLINE Packet4i ei_ploadu(const int* from)
 }
 #endif
 
-template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const Packet4f& from) { _mm_store_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void ei_pstore<double>(double* to, const Packet2d& from) { _mm_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int* to, const Packet4i& from) { _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void ei_pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void ei_pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void ei_pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
 
 template<> EIGEN_STRONG_INLINE void ei_pstoreu<double>(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE
   _mm_storel_pd((to), from);
   _mm_storeh_pd((to+1), from);
 }
-template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float* to, const Packet4f& from) { ei_pstoreu((double*)to, _mm_castps_pd(from)); }
-template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int* to, const Packet4i& from) { ei_pstoreu((double*)to, _mm_castsi128_pd(from)); }
+template<> EIGEN_STRONG_INLINE void ei_pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, _mm_castps_pd(from)); }
+template<> EIGEN_STRONG_INLINE void ei_pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE ei_pstoreu((double*)to, _mm_castsi128_pd(from)); }
 
 #if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64)
 // The temporary variable fixes an internal compilation error.

Eigen/src/Core/util/ForwardDeclarations.h

@@ -48,6 +48,7 @@ template<typename NullaryOp, typename MatrixType> class CwiseNullaryOp;
 template<typename UnaryOp, typename MatrixType> class CwiseUnaryOp;
 template<typename ViewOp, typename MatrixType> class CwiseUnaryView;
 template<typename BinaryOp, typename Lhs, typename Rhs> class CwiseBinaryOp;
+template<typename BinOp, typename MatrixType> class SelfCwiseBinaryOp;
 template<typename Derived, typename Lhs, typename Rhs> class ProductBase;
 
 template<typename Derived> class DiagonalBase;

test/CMakeLists.txt

@@ -99,6 +99,7 @@ ei_add_test(vectorization_logic)
 ei_add_test(basicstuff)
 ei_add_test(linearstructure)
 ei_add_test(cwiseop)
+ei_add_test(unalignedcount)
 ei_add_test(redux)
 ei_add_test(visitor)
 ei_add_test(product_small)

test/unalignedcount.cpp (new file)

@@ -0,0 +1,56 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+static int nb_load;
+static int nb_loadu;
+static int nb_store;
+static int nb_storeu;
+
+#define EIGEN_DEBUG_ALIGNED_LOAD { nb_load++; }
+#define EIGEN_DEBUG_UNALIGNED_LOAD { nb_loadu++; }
+#define EIGEN_DEBUG_ALIGNED_STORE { nb_store++; }
+#define EIGEN_DEBUG_UNALIGNED_STORE { nb_storeu++; }
+
+#define VERIFY_ALIGNED_UNALIGNED_COUNT(XPR,AL,UL,AS,US) {\
+    nb_load = nb_loadu = nb_store = nb_storeu = 0; \
+    XPR; \
+    if(!(nb_load==AL && nb_loadu==UL && nb_store==AS && nb_storeu==US)) \
+      std::cerr << " >> " << nb_load << ", " << nb_loadu << ", " << nb_store << ", " << nb_storeu << "\n"; \
+    VERIFY( (#XPR) && nb_load==AL && nb_loadu==UL && nb_store==AS && nb_storeu==US ); \
+  }
+
+#include "main.h"
+
+void test_unalignedcount()
+{
+#ifdef EIGEN_VECTORIZE_SSE
+  VectorXf a(40), b(40);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 20, 0, 10, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) += b.segment(0,40), 10, 10, 10, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) -= b.segment(0,40), 10, 10, 10, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) *= 3.5, 10, 0, 10, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) /= 3.5, 10, 0, 10, 0);
+#endif
+}
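The expected counts are easy to check by hand: 40 floats are exactly 10 SSE
packets of 4 floats each. A worked breakdown (my reading of the numbers, not
part of the commit):

    // a += b : a and b are whole VectorXf objects, statically Aligned
    //   10 loads of a + 10 loads of b (aligned), 10 stores to a    -> 20,0,10,0
    // a.segment(0,40) += b.segment(0,40) : segments lose the compile-time
    //   alignment flag, but the destination is re-aligned at run time:
    //   10 aligned dst loads + 10 unaligned src loads, 10 aligned stores -> 10,10,10,0
    // a.segment(0,40) *= 3.5 (and /=) : the Constant() source needs no loads
    //   10 aligned loads, 10 aligned stores                        -> 10,0,10,0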