mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-06-04 18:54:00 +08:00
* implement slice vectorization. Because it uses unaligned packet access, it is not certain that it will bring a performance improvement: benchmarking needed.
* improve logic choosing slice vectorization.
* fix typo in SSE packet math, causing crash in unaligned case.
* fix bug in Product, causing crash in unaligned case.
* add TEST_SSE3 CMake option.
This commit is contained in:
parent
8cef541b5a
commit
8a967fb17c
@ -22,6 +22,10 @@ IF(CMAKE_COMPILER_IS_GNUCXX)
|
|||||||
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2")
|
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2")
|
||||||
MESSAGE("Enabling SSE2 in tests/examples")
|
MESSAGE("Enabling SSE2 in tests/examples")
|
||||||
ENDIF(TEST_SSE2)
|
ENDIF(TEST_SSE2)
|
||||||
|
IF(TEST_SSE3)
|
||||||
|
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse3")
|
||||||
|
MESSAGE("Enabling SSE3 in tests/examples")
|
||||||
|
ENDIF(TEST_SSE3)
|
||||||
IF(TEST_ALTIVEC)
|
IF(TEST_ALTIVEC)
|
||||||
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
|
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")
|
||||||
MESSAGE("Enabling AltiVec in tests/examples")
|
MESSAGE("Enabling AltiVec in tests/examples")
|
||||||
|
@ -52,6 +52,9 @@ private:
|
|||||||
InnerSize = int(Derived::Flags)&RowMajorBit
|
InnerSize = int(Derived::Flags)&RowMajorBit
|
||||||
? Derived::ColsAtCompileTime
|
? Derived::ColsAtCompileTime
|
||||||
: Derived::RowsAtCompileTime,
|
: Derived::RowsAtCompileTime,
|
||||||
|
InnerMaxSize = int(Derived::Flags)&RowMajorBit
|
||||||
|
? Derived::MaxColsAtCompileTime
|
||||||
|
: Derived::MaxRowsAtCompileTime,
|
||||||
PacketSize = ei_packet_traits<typename Derived::Scalar>::size
|
PacketSize = ei_packet_traits<typename Derived::Scalar>::size
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -60,7 +63,9 @@ private:
|
|||||||
&& ((int(Derived::Flags)&RowMajorBit)==(int(OtherDerived::Flags)&RowMajorBit)),
|
&& ((int(Derived::Flags)&RowMajorBit)==(int(OtherDerived::Flags)&RowMajorBit)),
|
||||||
MayInnerVectorize = MightVectorize && InnerSize!=Dynamic && int(InnerSize)%int(PacketSize)==0,
|
MayInnerVectorize = MightVectorize && InnerSize!=Dynamic && int(InnerSize)%int(PacketSize)==0,
|
||||||
MayLinearVectorize = MightVectorize && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
|
MayLinearVectorize = MightVectorize && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
|
||||||
MaySliceVectorize = MightVectorize && InnerSize==Dynamic
|
MaySliceVectorize = MightVectorize && InnerMaxSize==Dynamic /* slice vectorization can be slow, so we only
|
||||||
|
want it if the slices are big, which is indicated by InnerMaxSize rather than InnerSize, think of the case
|
||||||
|
of a dynamic block in a fixed-size matrix */
|
||||||
};
|
};
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@ -349,7 +354,7 @@ struct ei_assign_impl<Derived1, Derived2, LinearVectorization, NoUnrolling>
|
|||||||
template<typename Derived1, typename Derived2>
|
template<typename Derived1, typename Derived2>
|
||||||
struct ei_assign_impl<Derived1, Derived2, LinearVectorization, CompleteUnrolling>
|
struct ei_assign_impl<Derived1, Derived2, LinearVectorization, CompleteUnrolling>
|
||||||
{
|
{
|
||||||
inline static void run(Derived1 &dst, const Derived2 &src)
|
static void run(Derived1 &dst, const Derived2 &src)
|
||||||
{
|
{
|
||||||
const int size = Derived1::SizeAtCompileTime;
|
const int size = Derived1::SizeAtCompileTime;
|
||||||
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
|
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
|
||||||
@ -383,8 +388,30 @@ struct ei_assign_impl<Derived1, Derived2, SliceVectorization, NoUnrolling>
|
|||||||
{
|
{
|
||||||
static void run(Derived1 &dst, const Derived2 &src)
|
static void run(Derived1 &dst, const Derived2 &src)
|
||||||
{
|
{
|
||||||
//FIXME unimplemented, so for now we fall back to non-vectorized path
|
const int packetSize = ei_packet_traits<typename Derived1::Scalar>::size;
|
||||||
ei_assign_impl<Derived1, Derived2, NoVectorization, NoUnrolling>::run(dst, src);
|
const bool rowMajor = Derived1::Flags&RowMajorBit;
|
||||||
|
const int innerSize = rowMajor ? dst.cols() : dst.rows();
|
||||||
|
const int outerSize = rowMajor ? dst.rows() : dst.cols();
|
||||||
|
const int alignedInnerSize = (innerSize/packetSize)*packetSize;
|
||||||
|
|
||||||
|
for(int i = 0; i < outerSize; i++)
|
||||||
|
{
|
||||||
|
// do the vectorizable part of the assignment
|
||||||
|
for (int index = 0; index<alignedInnerSize ; index+=packetSize)
|
||||||
|
{
|
||||||
|
const int row = rowMajor ? i : index;
|
||||||
|
const int col = rowMajor ? index : i;
|
||||||
|
dst.template writePacket<UnAligned>(row, col, src.template packet<UnAligned>(row, col));
|
||||||
|
}
|
||||||
|
|
||||||
|
// do the non-vectorizable part of the assignment
|
||||||
|
for (int index = alignedInnerSize; index<innerSize ; index++)
|
||||||
|
{
|
||||||
|
const int row = rowMajor ? i : index;
|
||||||
|
const int col = rowMajor ? index : i;
|
||||||
|
dst.coeffRef(row, col) = src.coeff(row, col);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -38,7 +38,7 @@ enum {
|
|||||||
template<int VectorizationMode, int Index, typename Lhs, typename Rhs>
|
template<int VectorizationMode, int Index, typename Lhs, typename Rhs>
|
||||||
struct ei_product_coeff_impl;
|
struct ei_product_coeff_impl;
|
||||||
|
|
||||||
template<int StorageOrder, int Index, typename Lhs, typename Rhs, typename PacketScalar>
|
template<int StorageOrder, int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
|
||||||
struct ei_product_packet_impl;
|
struct ei_product_packet_impl;
|
||||||
|
|
||||||
template<typename T> class ei_product_eval_to_column_major;
|
template<typename T> class ei_product_eval_to_column_major;
|
||||||
@ -188,10 +188,6 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
|
|||||||
Unroll ? InnerSize-1 : Dynamic,
|
Unroll ? InnerSize-1 : Dynamic,
|
||||||
_LhsNested, _RhsNested> ScalarCoeffImpl;
|
_LhsNested, _RhsNested> ScalarCoeffImpl;
|
||||||
|
|
||||||
typedef ei_product_packet_impl<Flags&RowMajorBit ? RowMajorProduct : ColMajorProduct,
|
|
||||||
Unroll ? InnerSize-1 : Dynamic,
|
|
||||||
_LhsNested, _RhsNested, PacketScalar> PacketCoeffImpl;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
template<typename Lhs, typename Rhs>
|
template<typename Lhs, typename Rhs>
|
||||||
@ -232,7 +228,10 @@ template<typename LhsNested, typename RhsNested, int ProductMode> class Product
|
|||||||
const PacketScalar _packet(int row, int col) const
|
const PacketScalar _packet(int row, int col) const
|
||||||
{
|
{
|
||||||
PacketScalar res;
|
PacketScalar res;
|
||||||
PacketCoeffImpl::run(row, col, m_lhs, m_rhs, res);
|
ei_product_packet_impl<Flags&RowMajorBit ? RowMajorProduct : ColMajorProduct,
|
||||||
|
Unroll ? InnerSize-1 : Dynamic,
|
||||||
|
_LhsNested, _RhsNested, PacketScalar, LoadMode>
|
||||||
|
::run(row, col, m_lhs, m_rhs, res);
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -356,63 +355,63 @@ struct ei_product_coeff_impl<InnerVectorization, Index, Lhs, Rhs>
|
|||||||
*** Packet path ***
|
*** Packet path ***
|
||||||
*******************/
|
*******************/
|
||||||
|
|
||||||
template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
|
template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
|
||||||
struct ei_product_packet_impl<RowMajorProduct, Index, Lhs, Rhs, PacketScalar>
|
struct ei_product_packet_impl<RowMajorProduct, Index, Lhs, Rhs, PacketScalar, LoadMode>
|
||||||
{
|
{
|
||||||
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
|
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
|
||||||
{
|
{
|
||||||
ei_product_packet_impl<RowMajorProduct, Index-1, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
|
ei_product_packet_impl<RowMajorProduct, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
|
||||||
res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet<Aligned>(Index, col), res);
|
res = ei_pmadd(ei_pset1(lhs.coeff(row, Index)), rhs.template packet<LoadMode>(Index, col), res);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template<int Index, typename Lhs, typename Rhs, typename PacketScalar>
|
template<int Index, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
|
||||||
struct ei_product_packet_impl<ColMajorProduct, Index, Lhs, Rhs, PacketScalar>
|
struct ei_product_packet_impl<ColMajorProduct, Index, Lhs, Rhs, PacketScalar, LoadMode>
|
||||||
{
|
{
|
||||||
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
|
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
|
||||||
{
|
{
|
||||||
ei_product_packet_impl<ColMajorProduct, Index-1, Lhs, Rhs, PacketScalar>::run(row, col, lhs, rhs, res);
|
ei_product_packet_impl<ColMajorProduct, Index-1, Lhs, Rhs, PacketScalar, LoadMode>::run(row, col, lhs, rhs, res);
|
||||||
res = ei_pmadd(lhs.template packet<Aligned>(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
|
res = ei_pmadd(lhs.template packet<LoadMode>(row, Index), ei_pset1(rhs.coeff(Index, col)), res);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename Lhs, typename Rhs, typename PacketScalar>
|
template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
|
||||||
struct ei_product_packet_impl<RowMajorProduct, 0, Lhs, Rhs, PacketScalar>
|
struct ei_product_packet_impl<RowMajorProduct, 0, Lhs, Rhs, PacketScalar, LoadMode>
|
||||||
{
|
{
|
||||||
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
|
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
|
||||||
{
|
{
|
||||||
res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<Aligned>(0, col));
|
res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename Lhs, typename Rhs, typename PacketScalar>
|
template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
|
||||||
struct ei_product_packet_impl<ColMajorProduct, 0, Lhs, Rhs, PacketScalar>
|
struct ei_product_packet_impl<ColMajorProduct, 0, Lhs, Rhs, PacketScalar, LoadMode>
|
||||||
{
|
{
|
||||||
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
|
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar &res)
|
||||||
{
|
{
|
||||||
res = ei_pmul(lhs.template packet<Aligned>(row, 0), ei_pset1(rhs.coeff(0, col)));
|
res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col)));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template<int StorageOrder, typename Lhs, typename Rhs, typename PacketScalar>
|
template<int StorageOrder, typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
|
||||||
struct ei_product_packet_impl<StorageOrder, Dynamic, Lhs, Rhs, PacketScalar>
|
struct ei_product_packet_impl<StorageOrder, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
|
||||||
{
|
{
|
||||||
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
|
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
|
||||||
{
|
{
|
||||||
res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<Aligned>(0, col));
|
res = ei_pmul(ei_pset1(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
|
||||||
for(int i = 1; i < lhs.cols(); i++)
|
for(int i = 1; i < lhs.cols(); i++)
|
||||||
res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<Aligned>(i, col), res);
|
res = ei_pmadd(ei_pset1(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename Lhs, typename Rhs, typename PacketScalar>
|
template<typename Lhs, typename Rhs, typename PacketScalar, int LoadMode>
|
||||||
struct ei_product_packet_impl<ColMajorProduct, Dynamic, Lhs, Rhs, PacketScalar>
|
struct ei_product_packet_impl<ColMajorProduct, Dynamic, Lhs, Rhs, PacketScalar, LoadMode>
|
||||||
{
|
{
|
||||||
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
|
inline static void run(int row, int col, const Lhs& lhs, const Rhs& rhs, PacketScalar& res)
|
||||||
{
|
{
|
||||||
res = ei_pmul(lhs.template packet<Aligned>(row, 0), ei_pset1(rhs.coeff(0, col)));
|
res = ei_pmul(lhs.template packet<LoadMode>(row, 0), ei_pset1(rhs.coeff(0, col)));
|
||||||
for(int i = 1; i < lhs.cols(); i++)
|
for(int i = 1; i < lhs.cols(); i++)
|
||||||
res = ei_pmadd(lhs.template packet<Aligned>(row, i), ei_pset1(rhs.coeff(i, col)), res);
|
res = ei_pmadd(lhs.template packet<LoadMode>(row, i), ei_pset1(rhs.coeff(i, col)), res);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -94,7 +94,7 @@ inline void ei_pstore(int* to, const __m128i& from) { _mm_store_si128(reinter
|
|||||||
|
|
||||||
inline void ei_pstoreu(float* to, const __m128& from) { _mm_storeu_ps(to, from); }
|
inline void ei_pstoreu(float* to, const __m128& from) { _mm_storeu_ps(to, from); }
|
||||||
inline void ei_pstoreu(double* to, const __m128d& from) { _mm_storeu_pd(to, from); }
|
inline void ei_pstoreu(double* to, const __m128d& from) { _mm_storeu_pd(to, from); }
|
||||||
inline void ei_pstoreu(int* to, const __m128i& from) { _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
|
inline void ei_pstoreu(int* to, const __m128i& from) { _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
|
||||||
|
|
||||||
inline float ei_pfirst(const __m128& a) { return _mm_cvtss_f32(a); }
|
inline float ei_pfirst(const __m128& a) { return _mm_cvtss_f32(a); }
|
||||||
inline double ei_pfirst(const __m128d& a) { return _mm_cvtsd_f64(a); }
|
inline double ei_pfirst(const __m128d& a) { return _mm_cvtsd_f64(a); }
|
||||||
|
@ -11,7 +11,7 @@ ENDIF(CMAKE_COMPILER_IS_GNUCXX)
|
|||||||
|
|
||||||
OPTION(EIGEN_NO_ASSERTION_CHECKING "Disable checking of assertions" OFF)
|
OPTION(EIGEN_NO_ASSERTION_CHECKING "Disable checking of assertions" OFF)
|
||||||
|
|
||||||
# similar to SET_TARGET_PROPERTIES but append the property instead of overwritting it
|
# similar to SET_TARGET_PROPERTIES but append the property instead of overwriting it
|
||||||
MACRO(EI_ADD_TARGET_PROPERTY target prop value)
|
MACRO(EI_ADD_TARGET_PROPERTY target prop value)
|
||||||
|
|
||||||
GET_TARGET_PROPERTY(previous ${target} ${prop})
|
GET_TARGET_PROPERTY(previous ${target} ${prop})
|
||||||
|
Loading…
x
Reference in New Issue
Block a user