mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-09-25 07:43:14 +08:00
Re-enable fast pset1-pstore by introducing a new higher-level pstore1 function
This commit is contained in:
parent
951e238430
commit
c8e1b679fa
@ -270,6 +270,14 @@ Packet psqrt(const Packet& a) { return sqrt(a); }
|
|||||||
* The following functions might not have to be overwritten for vectorized types
|
* The following functions might not have to be overwritten for vectorized types
|
||||||
***************************************************************************/
|
***************************************************************************/
|
||||||
|
|
||||||
|
/** \internal Stores a packet whose coefficients all equal \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16-byte aligned. */
|
||||||
|
// NOTE: this function must be templated on the packet type rather than the scalar type, because several packet types may exist for the same scalar type.
|
||||||
|
template<typename Packet>
|
||||||
|
inline void pstore1(typename unpacket_traits<Packet>::type* to, const typename unpacket_traits<Packet>::type& a)
|
||||||
|
{
|
||||||
|
pstore(to, pset1<Packet>(a)); // generic fallback: build the constant packet with pset1, then write it out with pstore
|
||||||
|
}
|
||||||
|
|
||||||
/** \internal \returns a * b + c (coeff-wise) */
|
/** \internal \returns a * b + c (coeff-wise) */
|
||||||
template<typename Packet> inline Packet
|
template<typename Packet> inline Packet
|
||||||
pmadd(const Packet& a,
|
pmadd(const Packet& a,
|
||||||
|
@ -305,6 +305,19 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d&
|
|||||||
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, _mm_castps_pd(from)); }
|
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, _mm_castps_pd(from)); }
|
||||||
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, _mm_castsi128_pd(from)); }
|
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, _mm_castsi128_pd(from)); }
|
||||||
|
|
||||||
|
// Specialization of pstore1 for Packet4f: stores a packet built from the single float \a a to \a to.
// It is written out explicitly because some compilers might be tempted to perform multiple moves instead of using a vector path.
|
||||||
|
template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
|
||||||
|
{
|
||||||
|
Packet4f pa = _mm_set_ss(a); // puts a into the lowest lane of the register (the intrinsic zeroes the remaining lanes)
|
||||||
|
pstore(to, vec4f_swizzle1(pa,0,0,0,0)); // presumably replicates lane 0 into all four lanes before the store -- confirm against vec4f_swizzle1
|
||||||
|
}
|
||||||
|
// Specialization of pstore1 for Packet2d: stores a packet built from the single double \a a to \a to.
// It is written out explicitly because some compilers might be tempted to perform multiple moves instead of using a vector path.
|
||||||
|
template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
|
||||||
|
{
|
||||||
|
Packet2d pa = _mm_set_sd(a); // puts a into the lowest lane of the register (the intrinsic zeroes the upper lane)
|
||||||
|
pstore(to, vec2d_swizzle1(pa,0,0)); // presumably replicates lane 0 into both lanes before the store -- confirm against vec2d_swizzle1
|
||||||
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
||||||
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
||||||
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
||||||
|
@ -199,7 +199,7 @@ public:
|
|||||||
EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
|
EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
|
||||||
{
|
{
|
||||||
for(DenseIndex k=0; k<n; k++)
|
for(DenseIndex k=0; k<n; k++)
|
||||||
pstore(&b[k*RhsPacketSize], pset1<RhsPacket>(rhs[k]));
|
pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
|
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
|
||||||
@ -270,7 +270,7 @@ public:
|
|||||||
EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
|
EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
|
||||||
{
|
{
|
||||||
for(DenseIndex k=0; k<n; k++)
|
for(DenseIndex k=0; k<n; k++)
|
||||||
pstore(&b[k*RhsPacketSize], pset1<RhsPacket>(rhs[k]));
|
pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
|
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
|
||||||
@ -363,8 +363,8 @@ public:
|
|||||||
{
|
{
|
||||||
if(Vectorizable)
|
if(Vectorizable)
|
||||||
{
|
{
|
||||||
pstore((RealScalar*)&b[k*ResPacketSize*2+0], pset1<RealPacket>(real(rhs[k])));
|
pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0], real(rhs[k]));
|
||||||
pstore((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], pset1<RealPacket>(imag(rhs[k])));
|
pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k]));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
b[k] = rhs[k];
|
b[k] = rhs[k];
|
||||||
@ -475,7 +475,7 @@ public:
|
|||||||
EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
|
EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
|
||||||
{
|
{
|
||||||
for(DenseIndex k=0; k<n; k++)
|
for(DenseIndex k=0; k<n; k++)
|
||||||
pstore(&b[k*RhsPacketSize], pset1<RhsPacket>(rhs[k]));
|
pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
|
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
|
||||||
@ -1009,12 +1009,7 @@ EIGEN_ASM_COMMENT("mybegin4");
|
|||||||
for(Index j2=packet_cols; j2<cols; j2++)
|
for(Index j2=packet_cols; j2<cols; j2++)
|
||||||
{
|
{
|
||||||
// unpack B
|
// unpack B
|
||||||
{
|
|
||||||
traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);
|
traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);
|
||||||
// const RhsScalar* blB = &blockB[j2*strideB+offsetB];
|
|
||||||
// for(Index k=0; k<depth; k++)
|
|
||||||
// pstore(&unpackedB[k*RhsPacketSize], pset1<RhsPacket>(blB[k]));
|
|
||||||
}
|
|
||||||
|
|
||||||
for(Index i=0; i<peeled_mc; i+=mr)
|
for(Index i=0; i<peeled_mc; i+=mr)
|
||||||
{
|
{
|
||||||
|
Loading…
x
Reference in New Issue
Block a user