diff --git a/CMakeLists.txt b/CMakeLists.txt
index 29d3896d8..c9d5af679 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,10 @@ IF(CMAKE_COMPILER_IS_GNUCXX)
     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2")
     MESSAGE("Enabling SSE2 in tests/examples")
   ENDIF(TEST_SSE2)
+  IF(TEST_SSE3)
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse3")
+    MESSAGE("Enabling SSE3 in tests/examples")
+  ENDIF(TEST_SSE3)
   IF(TEST_ALTIVEC)
     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
     MESSAGE("Enabling AltiVec in tests/examples")
diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h
index d5604824f..1c292d104 100644
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@@ -52,6 +52,9 @@ private:
     InnerSize = int(Derived::Flags)&RowMajorBit
               ? Derived::ColsAtCompileTime
               : Derived::RowsAtCompileTime,
+    InnerMaxSize = int(Derived::Flags)&RowMajorBit
+                 ? Derived::MaxColsAtCompileTime
+                 : Derived::MaxRowsAtCompileTime,
     PacketSize = ei_packet_traits::size
   };
@@ -60,7 +63,9 @@ private:
       && ((int(Derived::Flags)&RowMajorBit)==(int(OtherDerived::Flags)&RowMajorBit)),
     MayInnerVectorize = MightVectorize && InnerSize!=Dynamic && int(InnerSize)%int(PacketSize)==0,
     MayLinearVectorize = MightVectorize && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
-    MaySliceVectorize = MightVectorize && InnerSize==Dynamic
+    MaySliceVectorize = MightVectorize && InnerMaxSize==Dynamic /* slice vectorization can be slow, so we only
+      want it if the slices are big, which is indicated by InnerMaxSize rather than InnerSize, think of the case
+      of a dynamic block in a fixed-size matrix */
   };

 public:
@@ -349,7 +354,7 @@ struct ei_assign_impl
 template
 struct ei_assign_impl
 {
-  inline static void run(Derived1 &dst, const Derived2 &src)
+  static void run(Derived1 &dst, const Derived2 &src)
   {
     const int size = Derived1::SizeAtCompileTime;
     const int packetSize = ei_packet_traits::size;
@@ -383,8 +388,30 @@ struct ei_assign_impl
 {
   static void run(Derived1 &dst, const Derived2 &src)
   {
-    //FIXME unimplemented, so for now we fall back to non-vectorized path
-    ei_assign_impl::run(dst, src);
+    const int packetSize = ei_packet_traits::size;
+    const bool rowMajor = Derived1::Flags&RowMajorBit;
+    const int innerSize = rowMajor ? dst.cols() : dst.rows();
+    const int outerSize = rowMajor ? dst.rows() : dst.cols();
+    const int alignedInnerSize = (innerSize/packetSize)*packetSize;
+
+    for(int i = 0; i < outerSize; i++)
+    {
+      // do the vectorizable part of the assignment
+      for (int index = 0; index(row, col, src.template packet(row, col));
+      }
+
+      // do the non-vectorizable part of the assignment
+      for (int index = alignedInnerSize; index
diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h
 struct ei_product_coeff_impl;
-template
+template
 struct ei_product_packet_impl;

 template class ei_product_eval_to_column_major;
@@ -188,10 +188,6 @@ template class Product
        Unroll ? InnerSize-1 : Dynamic, _LhsNested, _RhsNested> ScalarCoeffImpl;
-    typedef ei_product_packet_impl PacketCoeffImpl;
-
   public:

     template
@@ -232,7 +228,10 @@ template class Product
     const PacketScalar _packet(int row, int col) const
     {
       PacketScalar res;
-      PacketCoeffImpl::run(row, col, m_lhs, m_rhs, res);
+      ei_product_packet_impl
+        ::run(row, col, m_lhs, m_rhs, res);
       return res;
     }
@@ -356,63 +355,63 @@ struct ei_product_coeff_impl
 *** Packet path  ***
 *******************/

-template
-struct ei_product_packet_impl
+template
+struct ei_product_packet_impl
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
-    ei_product_packet_impl::run(row, col, lhs, rhs, res);
-    res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet(Index, col), res);
+    ei_product_packet_impl::run(row, col, lhs, rhs, res);
+    res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet(Index, col), res);
   }
 };

-template
-struct ei_product_packet_impl
+template
+struct ei_product_packet_impl
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
-    ei_product_packet_impl::run(row, col, lhs, rhs, res);
-    res = ei_pmadd(lhs.template packet(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
+    ei_product_packet_impl::run(row, col, lhs, rhs, res);
+    res = ei_pmadd(lhs.template packet(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
   }
 };

-template
-struct ei_product_packet_impl
+template
+struct ei_product_packet_impl
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
-    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet(0, col));
+    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet(0, col));
   }
 };

-template
-struct ei_product_packet_impl
+template
+struct ei_product_packet_impl
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
   {
-    res = ei_pmul(lhs.template packet(row, 0), ei_pset1(rhs.coeff(0, col)));
+    res = ei_pmul(lhs.template packet(row, 0), ei_pset1(rhs.coeff(0, col)));
   }
 };

-template
-struct ei_product_packet_impl
+template
+struct ei_product_packet_impl
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
   {
-    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet(0, col));
+    res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet(0, col));
     for(int i = 1; i < lhs.cols(); i++)
-      res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet(i, col), res);
+      res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet(i, col), res);
   }
 };

-template
-struct ei_product_packet_impl
+template
+struct ei_product_packet_impl
 {
   inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
   {
-    res = ei_pmul(lhs.template packet(row, 0), ei_pset1(rhs.coeff(0, col)));
+    res = ei_pmul(lhs.template packet(row, 0), ei_pset1(rhs.coeff(0, col)));
     for(int i = 1; i < lhs.cols(); i++)
-      res = ei_pmadd(lhs.template packet(row, i), ei_pset1(rhs.coeff(i, col)), res);
+      res = ei_pmadd(lhs.template packet(row, i), ei_pset1(rhs.coeff(i, col)), res);
   }
 };
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index ffd6aebeb..03fa6bce5 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -94,7 +94,7 @@ inline void ei_pstore(int* to, const __m128i& from) { _mm_store_si128(reinter
 inline void ei_pstoreu(float* to, const __m128& from) { _mm_storeu_ps(to, from); }
 inline void ei_pstoreu(double* to, const __m128d& from) { _mm_storeu_pd(to, from); }
-inline void ei_pstoreu(int* to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
+inline void ei_pstoreu(int* to, const __m128i& from) { _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }

 inline float ei_pfirst(const __m128& a) { return _mm_cvtss_f32(a); }
 inline double ei_pfirst(const __m128d& a) { return _mm_cvtsd_f64(a); }
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index f89a07e3b..14ed29a3d 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -11,7 +11,7 @@ ENDIF(CMAKE_COMPILER_IS_GNUCXX)

 OPTION(EIGEN_NO_ASSERTION_CHECKING "Disable checking of assertions" OFF)

-# similar to SET_TARGET_PROPERTIES but append the property instead of overwritting it
+# similar to SET_TARGET_PROPERTIES but append the property instead of overwriting it
 MACRO(EI_ADD_TARGET_PROPERTY target prop value)

   GET_TARGET_PROPERTY(previous ${target} ${prop})
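
The Assign.h hunk is the heart of the patch: instead of falling back to a fully scalar copy, the slice-vectorized assignment walks each inner slice (a column of a column-major expression, a row of a row-major one), copies the leading multiple of the packet size with packet loads and stores, and finishes the remainder with scalar copies. A minimal standalone sketch of that loop structure, using plain float arrays and raw SSE intrinsics instead of Eigen's packet abstraction (the function name and signature here are illustrative, not Eigen API):

#include <emmintrin.h>

// Copy a dynamically-sized column-major block one column ("inner slice") at a time.
// Within each column, the first (rows / 4) * 4 entries are copied with 4-wide SSE
// packets and the tail is copied scalar by scalar, mirroring the loop in the hunk.
void copy_block_sliced(const float* src, float* dst, int rows, int cols, int stride)
{
  const int packetSize = 4;                                // floats per __m128
  const int alignedRows = (rows / packetSize) * packetSize;
  for (int j = 0; j < cols; ++j)                           // outer loop over slices
  {
    const float* srcCol = src + j * stride;
    float* dstCol = dst + j * stride;
    for (int i = 0; i < alignedRows; i += packetSize)      // vectorizable part
      _mm_storeu_ps(dstCol + i, _mm_loadu_ps(srcCol + i));
    for (int i = alignedRows; i < rows; ++i)               // non-vectorizable remainder
      dstCol[i] = srcCol[i];
  }
}

Unaligned loads and stores are used in this sketch because a dynamic block of a larger matrix carries no alignment guarantee; the hunk's (stripped) load-mode arguments presumably make the same choice.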
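The Product.h specializations compute one packet of a matrix product by broadcasting a single coefficient of one operand (ei_pset1) and multiply-accumulating it against a packet of the other operand (ei_pmul, then ei_pmadd); the new template parameter threaded through them (its arguments were stripped by the formatting above) is what the rewritten _packet() call now passes down. The same accumulation pattern written directly with SSE for row-major float operands; an illustrative sketch, not Eigen's implementation:

#include <emmintrin.h>

// Compute four adjacent result entries res[0..3] = sum_k lhs(row, k) * rhs(k, col + 0..3)
// for row-major float matrices, broadcasting each lhs coefficient and accumulating it
// against one rhs packet per depth step (depth is assumed to be at least 1).
__m128 product_packet(const float* lhs, const float* rhs, int row, int col,
                      int depth, int lhsStride, int rhsStride)
{
  __m128 res = _mm_mul_ps(_mm_set1_ps(lhs[row * lhsStride + 0]),
                          _mm_loadu_ps(rhs + 0 * rhsStride + col));
  for (int k = 1; k < depth; ++k)
    res = _mm_add_ps(res, _mm_mul_ps(_mm_set1_ps(lhs[row * lhsStride + k]),
                                     _mm_loadu_ps(rhs + k * rhsStride + col)));
  return res;
}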
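The PacketMath.h one-liner fixes a genuine bug: ei_pstoreu is the unaligned store (its float and double overloads call _mm_storeu_ps and _mm_storeu_pd), but the int overload was calling _mm_store_si128, which requires a 16-byte-aligned destination, instead of _mm_storeu_si128. A small standalone illustration of the difference, not part of the patch:

#include <emmintrin.h>

int main()
{
  alignas(16) int buffer[8] = {0};
  const __m128i v = _mm_set1_epi32(42);

  _mm_store_si128(reinterpret_cast<__m128i*>(buffer), v);        // fine: destination is 16-byte aligned
  _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer + 1), v);   // fine: storeu tolerates any alignment
  // _mm_store_si128(reinterpret_cast<__m128i*>(buffer + 1), v); // undefined behaviour: misaligned aligned store
  return 0;
}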