mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-05-19 08:07:36 +08:00
AVX512: implement faster ploadquad<Packet16f> thus speeding up GEMM
This commit is contained in:
parent
1c09ee8541
commit
cca6c207f4
@ -526,13 +526,11 @@ EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
|
|||||||
// {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
|
// {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
|
EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
|
||||||
Packet16f tmp = _mm512_undefined_ps();
|
Packet16f tmp = _mm512_castps128_ps512(pload<Packet4f>(from));
|
||||||
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0);
|
const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
|
||||||
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1);
|
return _mm512_permutexvar_ps(scatter_mask, tmp);
|
||||||
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2);
|
|
||||||
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3);
|
|
||||||
return tmp;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Loads 2 doubles from memory and returns the packet
|
// Loads 2 doubles from memory and returns the packet
|
||||||
// {a0, a0, a0, a0, a1, a1, a1, a1}
|
// {a0, a0, a0, a0, a1, a1, a1, a1}
|
||||||
template <>
|
template <>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user