From 102e029dad8b756d65f48e3cc0b8cdb07a48dd87 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 2 May 2008 13:30:12 +0000 Subject: [PATCH] Removed ei_pload1, use posix_memalign to allocate aligned memory, and make Product ok when only one side is vectorizable (and the product is still vectorized) --- Eigen/Core | 5 ++ Eigen/src/Core/MatrixStorage.h | 10 +++- Eigen/src/Core/PacketMath.h | 4 -- Eigen/src/Core/Product.h | 89 +++++++++++++++++++++++----------- 4 files changed, 73 insertions(+), 35 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index b22c0a0ba..02b2132e2 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -10,6 +10,11 @@ #endif #endif +#ifdef EIGEN_VECTORIZE +// it seems we cannot assume posix_memalign is defined in the stdlib header +extern "C" int posix_memalign (void **, size_t, size_t) throw (); +#endif + #include #include #include diff --git a/Eigen/src/Core/MatrixStorage.h b/Eigen/src/Core/MatrixStorage.h index c8ee7a62c..6f88a31d6 100644 --- a/Eigen/src/Core/MatrixStorage.h +++ b/Eigen/src/Core/MatrixStorage.h @@ -54,7 +54,13 @@ T* ei_aligned_malloc(size_t size) { #ifdef EIGEN_VECTORIZE if (ei_packet_traits::size>1) - return static_cast(_mm_malloc(sizeof(T)*size, 16)); + { + void* ptr; + if (posix_memalign(&ptr, 16, size*sizeof(T))==0) + return static_cast(ptr); + else + return 0; + } else #endif return new T[size]; @@ -65,7 +71,7 @@ void ei_aligned_free(T* ptr) { #ifdef EIGEN_VECTORIZE if (ei_packet_traits::size>1) - _mm_free(ptr); + free(ptr); else #endif delete[] ptr; diff --git a/Eigen/src/Core/PacketMath.h b/Eigen/src/Core/PacketMath.h index 0f3a132a9..3534452ff 100644 --- a/Eigen/src/Core/PacketMath.h +++ b/Eigen/src/Core/PacketMath.h @@ -90,10 +90,6 @@ inline __m128 ei_pload(const float* from) { return _mm_load_ps(from); } inline __m128d ei_pload(const double* from) { return _mm_load_pd(from); } inline __m128i ei_pload(const int* from) { return _mm_load_si128(reinterpret_cast(from)); } -inline __m128 ei_pload1(const float* from) { return _mm_load1_ps(from); } -inline __m128d ei_pload1(const double* from) { return _mm_load1_pd(from); } -inline __m128i ei_pload1(const int* from) { return _mm_set1_epi32(*from); } - inline __m128 ei_pset1(const float& from) { return _mm_set1_ps(from); } inline __m128d ei_pset1(const double& from) { return _mm_set1_pd(from); } inline __m128i ei_pset1(const int& from) { return _mm_set1_epi32(from); } diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index cdb4e88b4..b1892e1df 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -60,29 +60,44 @@ struct ei_product_unroller static void run(int, int, const Lhs&, const Rhs&, typename Lhs::Scalar&) {} }; - template -struct ei_packet_product_unroller +struct ei_packet_product_unroller; + +template +struct ei_packet_product_unroller { static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) { - ei_packet_product_unroller::run(row, col, lhs, rhs, res); - if (RowMajor) - res = ei_padd(res, ei_pmul(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col))); - else - res = ei_padd(res, ei_pmul(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col)))); + ei_packet_product_unroller::run(row, col, lhs, rhs, res); + res = ei_padd(res, ei_pmul(ei_pset1(lhs.coeff(row, Index)), rhs.packetCoeff(Index, col))); } }; -template -struct ei_packet_product_unroller +template +struct ei_packet_product_unroller { static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) { - if (RowMajor) - res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.packetCoeff(0, col)); - else - res = ei_pmul(lhs.packetCoeff(row, 0), ei_pset1(rhs.coeff(0, col))); + ei_packet_product_unroller::run(row, col, lhs, rhs, res); + res = ei_padd(res, ei_pmul(lhs.packetCoeff(row, Index), ei_pset1(rhs.coeff(Index, col)))); + } +}; + +template +struct ei_packet_product_unroller +{ + static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + { + res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.packetCoeff(0, col)); + } +}; + +template +struct ei_packet_product_unroller +{ + static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res) + { + res = ei_pmul(lhs.packetCoeff(row, 0), ei_pset1(rhs.coeff(0, col))); } }; @@ -92,6 +107,16 @@ struct ei_packet_product_unroller struct ProductPacketCoeffImpl { + inline static typename Product::PacketScalar execute(const Product& product, int row, int col) + { return product._packetCoeffRowMajor(row,col); } +}; + +template struct ProductPacketCoeffImpl { + inline static typename Product::PacketScalar execute(const Product& product, int row, int col) + { return product._packetCoeffColumnMajor(row,col); } +}; + /** \class Product * * \brief Expression of the product of two matrices @@ -158,6 +183,7 @@ template class Product : ei_no_assignm public: EIGEN_GENERIC_PUBLIC_INTERFACE(Product) + friend class ProductPacketCoeffImpl; typedef typename ei_traits::LhsNested LhsNested; typedef typename ei_traits::RhsNested RhsNested; typedef typename ei_traits::_LhsNested _LhsNested; @@ -202,32 +228,37 @@ template class Product : ei_no_assignm return res; } - PacketScalar _packetCoeff(int row, int col) const EIGEN_ALWAYS_INLINE + PacketScalar _packetCoeff(int row, int col) const { - PacketScalar res; if(Lhs::ColsAtCompileTime <= EIGEN_UNROLLING_LIMIT) { + PacketScalar res; ei_packet_product_unroller ::run(row, col, m_lhs, m_rhs, res); + return res; } else - { - if (Flags&RowMajorBit) - { - res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)),m_rhs.packetCoeff(0, col)); - for(int i = 1; i < m_lhs.cols(); i++) - res = ei_padd(res, ei_pmul(ei_pset1(m_lhs.coeff(row, i)), m_rhs.packetCoeff(i, col))); - } - else - { - res = ei_pmul(m_lhs.packetCoeff(row, 0), ei_pset1(m_rhs.coeff(0, col))); - for(int i = 1; i < m_lhs.cols(); i++) - res = ei_padd(res, ei_pmul(m_lhs.packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col)))); - } - } + return ProductPacketCoeffImpl::execute(*this, row, col); + } + + PacketScalar _packetCoeffRowMajor(int row, int col) const + { + PacketScalar res; + res = ei_pmul(ei_pset1(m_lhs.coeff(row, 0)),m_rhs.packetCoeff(0, col)); + for(int i = 1; i < m_lhs.cols(); i++) + res = ei_padd(res, ei_pmul(ei_pset1(m_lhs.coeff(row, i)), m_rhs.packetCoeff(i, col))); + return res; + } + + PacketScalar _packetCoeffColumnMajor(int row, int col) const + { + PacketScalar res; + res = ei_pmul(m_lhs.packetCoeff(row, 0), ei_pset1(m_rhs.coeff(0, col))); + for(int i = 1; i < m_lhs.cols(); i++) + res = ei_padd(res, ei_pmul(m_lhs.packetCoeff(row, i), ei_pset1(m_rhs.coeff(i, col)))); return res; }