mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-12 03:39:01 +08:00
Created the ptranspose packet primitive that can transpose an array of N packets, where N is the number of words in each packet. This primitive will be used to complete the vectorization of the gemm_pack_lhs and gemm_pack_rhs functions.
Implemented the primitive using SSE instructions.
This commit is contained in:
parent
14bc4b9704
commit
a419cea4a0
@ -386,9 +386,22 @@ template<> inline std::complex<double> pmul(const std::complex<double>& a, const
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/***************************************************************************
|
||||||
|
* Kernel, that is a collection of N packets where N is the number of words
|
||||||
|
* in the packet.
|
||||||
|
***************************************************************************/
|
||||||
|
template <typename Packet> struct Kernel {
|
||||||
|
Packet packet[unpacket_traits<Packet>::size];
|
||||||
|
};
|
||||||
|
|
||||||
|
/** \internal
  * Generic version of the in-place packet transpose.
  *
  * The body is empty, so the kernel is returned untouched. That is the right
  * answer when the "packet" is a single scalar (a 1x1 matrix is its own
  * transpose). Packet types holding more than one coefficient are expected
  * to supply their own specialization.
  */
template <typename Packet>
EIGEN_DEVICE_FUNC inline void ptranspose(Kernel<Packet>& /*unused*/)
{
  // Intentionally empty: transposing a 1x1 matrix changes nothing.
}
|
||||||
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
|
||||||
#endif // EIGEN_GENERIC_PACKET_MATH_H
|
#endif // EIGEN_GENERIC_PACKET_MATH_H
|
||||||
|
|
||||||
|
@ -435,6 +435,16 @@ EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
|
|||||||
return Packet1cd(preverse(x.v));
|
return Packet1cd(preverse(x.v));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** \internal
  * In-place transpose of a 2x2 block stored as two Packet2cf.
  *
  * Each packet's register is reinterpreted as two 64-bit lanes (presumably
  * one std::complex<float> per lane -- confirm against the Packet2cf
  * definition), which reduces the transpose to a 2x2 shuffle of doubles.
  */
template<>
EIGEN_DEVICE_FUNC inline void ptranspose(Kernel<Packet2cf>& kernel)
{
  // Reinterpret the float registers as pairs of 64-bit lanes (no data change).
  const __m128d row0 = _mm_castps_pd(kernel.packet[0].v);
  const __m128d row1 = _mm_castps_pd(kernel.packet[1].v);

  // Gather the low lanes and the high lanes of both rows before storing
  // anything, so neither input is overwritten while still needed.
  const __m128d lowLanes  = _mm_unpacklo_pd(row0, row1);
  const __m128d highLanes = _mm_unpackhi_pd(row0, row1);

  kernel.packet[0].v = _mm_castpd_ps(lowLanes);
  kernel.packet[1].v = _mm_castpd_ps(highLanes);
}
|
||||||
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
@ -707,6 +707,31 @@ struct palign_impl<Offset,Packet2d>
|
|||||||
};
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/** \internal
  * In-place transpose of a 4x4 block stored as four Packet4f, delegated to
  * the _MM_TRANSPOSE4_PS macro.
  */
template<>
EIGEN_DEVICE_FUNC inline void ptranspose(Kernel<Packet4f>& kernel)
{
  // The macro both reads and assigns its arguments, so hand it references
  // to the kernel's own storage rather than copies.
  Packet4f& p0 = kernel.packet[0];
  Packet4f& p1 = kernel.packet[1];
  Packet4f& p2 = kernel.packet[2];
  Packet4f& p3 = kernel.packet[3];

  _MM_TRANSPOSE4_PS(p0, p1, p2, p3);
}
|
||||||
|
|
||||||
|
/** \internal
  * In-place transpose of a 2x2 block stored as two Packet2d.
  *
  * After the call packet[0] holds the low lanes of both inputs and packet[1]
  * holds their high lanes.
  */
template<>
EIGEN_DEVICE_FUNC inline void ptranspose(Kernel<Packet2d>& kernel)
{
  // Snapshot both rows first; each output needs both original inputs.
  const __m128d upper = kernel.packet[0];
  const __m128d lower = kernel.packet[1];

  kernel.packet[0] = _mm_unpacklo_pd(upper, lower);
  kernel.packet[1] = _mm_unpackhi_pd(upper, lower);
}
|
||||||
|
|
||||||
|
/** \internal
  * In-place transpose of a 4x4 block stored as four Packet4i.
  *
  * Done in two stages: 32-bit interleaves pair up rows (0,1) and (2,3), then
  * 64-bit interleaves combine those pairs into the transposed rows.
  */
template<>
EIGEN_DEVICE_FUNC inline void ptranspose(Kernel<Packet4i>& kernel)
{
  // Stage 1: interleave 32-bit elements of neighbouring rows.
  //   lo01 = a0 b0 a1 b1    hi01 = a2 b2 a3 b3
  //   lo23 = c0 d0 c1 d1    hi23 = c2 d2 c3 d3
  const __m128i lo01 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
  const __m128i hi01 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
  const __m128i lo23 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
  const __m128i hi23 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);

  // Stage 2: interleave 64-bit halves to assemble each transposed row.
  kernel.packet[0] = _mm_unpacklo_epi64(lo01, lo23);  // a0 b0 c0 d0
  kernel.packet[1] = _mm_unpackhi_epi64(lo01, lo23);  // a1 b1 c1 d1
  kernel.packet[2] = _mm_unpacklo_epi64(hi01, hi23);  // a2 b2 c2 d2
  kernel.packet[3] = _mm_unpackhi_epi64(hi01, hi23);  // a3 b3 c3 d3
}
|
||||||
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
@ -208,6 +208,18 @@ template<typename Scalar> void packetmath()
|
|||||||
ref[i] = data1[PacketSize-i-1];
|
ref[i] = data1[PacketSize-i-1];
|
||||||
internal::pstore(data2, internal::preverse(internal::pload<Packet>(data1)));
|
internal::pstore(data2, internal::preverse(internal::pload<Packet>(data1)));
|
||||||
VERIFY(areApprox(ref, data2, PacketSize) && "internal::preverse");
|
VERIFY(areApprox(ref, data2, PacketSize) && "internal::preverse");
|
||||||
|
|
||||||
|
internal::Kernel<Packet> kernel;
|
||||||
|
for (int i=0; i<PacketSize; ++i) {
|
||||||
|
kernel.packet[i] = internal::pload<Packet>(data1+i*PacketSize);
|
||||||
|
}
|
||||||
|
ptranspose(kernel);
|
||||||
|
for (int i=0; i<PacketSize; ++i) {
|
||||||
|
internal::pstore(data2, kernel.packet[i]);
|
||||||
|
for (int j = 0; j < PacketSize; ++j) {
|
||||||
|
VERIFY(isApproxAbs(data2[j], data1[i+j*PacketSize], refvalue));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Scalar> void packetmath_real()
|
template<typename Scalar> void packetmath_real()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user