more pblend optimizations

This commit is contained in:
Charles Schlosser 2024-04-19 02:02:27 +00:00 committed by Rasmus Munk Larsen
parent f0795d35e3
commit 5635d37f46
2 changed files with 25 additions and 28 deletions

View File

@ -2134,35 +2134,28 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4d, 4>& kernel) {
kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
}
// Expands a 4-entry Selector into a 256-bit integer vector with one 64-bit lane per entry.
// Lane i holds `0 - ifPacket.select[i]`.
// NOTE(review): presumably each select entry is 0 or 1, making every lane all-zero or
// all-one bits -- confirm against the Selector definition.
EIGEN_STRONG_INLINE __m256i avx_blend_mask(const Selector<4>& ifPacket) {
  const auto lane0 = 0 - ifPacket.select[0];
  const auto lane1 = 0 - ifPacket.select[1];
  const auto lane2 = 0 - ifPacket.select[2];
  const auto lane3 = 0 - ifPacket.select[3];
  // _mm256_set_epi64x lists its arguments from the highest lane down to the lowest.
  return _mm256_set_epi64x(lane3, lane2, lane1, lane0);
}
// Expands an 8-entry Selector into a 256-bit integer vector with one 32-bit lane per entry.
// Lane i holds `0 - ifPacket.select[i]`.
// NOTE(review): presumably each select entry is 0 or 1, making every lane all-zero or
// all-one bits -- confirm against the Selector definition.
EIGEN_STRONG_INLINE __m256i avx_blend_mask(const Selector<8>& ifPacket) {
  const auto lane0 = 0 - ifPacket.select[0];
  const auto lane1 = 0 - ifPacket.select[1];
  const auto lane2 = 0 - ifPacket.select[2];
  const auto lane3 = 0 - ifPacket.select[3];
  const auto lane4 = 0 - ifPacket.select[4];
  const auto lane5 = 0 - ifPacket.select[5];
  const auto lane6 = 0 - ifPacket.select[6];
  const auto lane7 = 0 - ifPacket.select[7];
  // _mm256_set_epi32 lists its arguments from the highest lane down to the lowest.
  return _mm256_set_epi32(lane7, lane6, lane5, lane4, lane3, lane2, lane1, lane0);
}
// pblend specialization for Packet8f.
// Converts the 8-entry selector into a per-lane mask via avx_blend_mask, reinterprets that
// mask as a float vector, and forwards it to pselect together with thenPacket and elsePacket.
//
// ifPacket   - selector with one entry per packet lane.
// thenPacket - first value operand handed to pselect.
// elsePacket - second value operand handed to pselect.
template <>
EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket,
                                    const Packet8f& elsePacket) {
  // avx_blend_mask assembles the mask with _mm256_set_epi32 only, so a single definition
  // of true_mask serves every build; no EIGEN_VECTORIZE_AVX2 branch is required here.
  const __m256 true_mask = _mm256_castsi256_ps(avx_blend_mask(ifPacket));
  return pselect<Packet8f>(true_mask, thenPacket, elsePacket);
}
// pblend specialization for Packet4d.
// Converts the 4-entry selector into a per-lane mask via avx_blend_mask, reinterprets that
// mask as a double vector, and forwards it to pselect together with thenPacket and elsePacket.
//
// ifPacket   - selector with one entry per packet lane.
// thenPacket - first value operand handed to pselect.
// elsePacket - second value operand handed to pselect.
template <>
EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket,
                                    const Packet4d& elsePacket) {
  // avx_blend_mask assembles the mask with _mm256_set_epi64x only, so a single definition
  // of true_mask serves every build; no EIGEN_VECTORIZE_AVX2 branch is required here.
  const __m256d true_mask = _mm256_castsi256_pd(avx_blend_mask(ifPacket));
  return pselect<Packet4d>(true_mask, thenPacket, elsePacket);
}

View File

@ -2232,18 +2232,24 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
}
// Expands a 2-entry Selector into a 128-bit integer vector with one 64-bit lane per entry.
// Lane i holds `0 - ifPacket.select[i]`.
// NOTE(review): presumably each select entry is 0 or 1, making every lane all-zero or
// all-one bits -- confirm against the Selector definition.
EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<2>& ifPacket) {
  const auto lane0 = 0 - ifPacket.select[0];
  const auto lane1 = 0 - ifPacket.select[1];
  // _mm_set_epi64x takes the high lane first, then the low lane.
  return _mm_set_epi64x(lane1, lane0);
}
// Expands a 4-entry Selector into a 128-bit integer vector with one 32-bit lane per entry.
// Lane i holds `0 - ifPacket.select[i]`.
// NOTE(review): presumably each select entry is 0 or 1, making every lane all-zero or
// all-one bits -- confirm against the Selector definition.
EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<4>& ifPacket) {
  const auto lane0 = 0 - ifPacket.select[0];
  const auto lane1 = 0 - ifPacket.select[1];
  const auto lane2 = 0 - ifPacket.select[2];
  const auto lane3 = 0 - ifPacket.select[3];
  // _mm_set_epi32 lists its arguments from the highest lane down to the lowest.
  return _mm_set_epi32(lane3, lane2, lane1, lane0);
}
// pblend specialization for Packet2l.
// Converts the 2-entry selector into a per-lane integer mask via sse_blend_mask and forwards
// it to pselect together with thenPacket and elsePacket.
//
// ifPacket   - selector with one entry per packet lane.
// thenPacket - first value operand handed to pselect.
// elsePacket - second value operand handed to pselect.
template <>
EIGEN_STRONG_INLINE Packet2l pblend(const Selector<2>& ifPacket, const Packet2l& thenPacket,
                                    const Packet2l& elsePacket) {
  // The mask comes solely from sse_blend_mask, so true_mask is defined exactly once.
  const __m128i true_mask = sse_blend_mask(ifPacket);
  return pselect<Packet2l>(true_mask, thenPacket, elsePacket);
}
// pblend specialization for Packet4i.
// Converts the 4-entry selector into a per-lane integer mask via sse_blend_mask and forwards
// it to pselect together with thenPacket and elsePacket.
//
// ifPacket   - selector with one entry per packet lane.
// thenPacket - first value operand handed to pselect.
// elsePacket - second value operand handed to pselect.
template <>
EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
                                    const Packet4i& elsePacket) {
  // The mask comes solely from sse_blend_mask, so true_mask is defined exactly once.
  const __m128i true_mask = sse_blend_mask(ifPacket);
  return pselect<Packet4i>(true_mask, thenPacket, elsePacket);
}
template <>
@ -2254,15 +2260,13 @@ EIGEN_STRONG_INLINE Packet4ui pblend(const Selector<4>& ifPacket, const Packet4u
// pblend specialization for Packet4f.
// Converts the 4-entry selector into a per-lane integer mask via sse_blend_mask, reinterprets
// it as a float vector, and forwards it to pselect together with thenPacket and elsePacket.
//
// ifPacket   - selector with one entry per packet lane.
// thenPacket - first value operand handed to pselect.
// elsePacket - second value operand handed to pselect.
template <>
EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
                                    const Packet4f& elsePacket) {
  // The mask comes solely from sse_blend_mask, so true_mask is defined exactly once.
  const __m128i true_mask = sse_blend_mask(ifPacket);
  // pselect<Packet4f> is given a float-typed mask, hence the bit-preserving cast.
  return pselect<Packet4f>(_mm_castsi128_ps(true_mask), thenPacket, elsePacket);
}
// pblend specialization for Packet2d.
// Converts the 2-entry selector into a per-lane integer mask via sse_blend_mask, reinterprets
// it as a double vector, and forwards it to pselect together with thenPacket and elsePacket.
//
// ifPacket   - selector with one entry per packet lane.
// thenPacket - first value operand handed to pselect.
// elsePacket - second value operand handed to pselect.
template <>
EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
                                    const Packet2d& elsePacket) {
  // The mask comes solely from sse_blend_mask, so true_mask is defined exactly once.
  const __m128i true_mask = sse_blend_mask(ifPacket);
  // pselect<Packet2d> is given a double-typed mask, hence the bit-preserving cast.
  return pselect<Packet2d>(_mm_castsi128_pd(true_mask), thenPacket, elsePacket);
}