AVX512 and AVX2 support for Packet16i and Packet8i added

This commit is contained in:
Jakub Lichman 2021-08-25 19:38:23 +00:00 committed by Rasmus Munk Larsen
parent ab28419298
commit dc5b1f7d75
4 changed files with 439 additions and 22 deletions

View File

@ -196,23 +196,21 @@ struct packet_traits<bfloat16> : default_packet_traits {
HasNdtri = 1
};
};
#endif
template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
template<> struct scalar_div_cost<double,true> { enum { value = 16 }; };
/* Proper support for integers is only provided by AVX2. In the meantime, we'll
use SSE instructions and packets to deal with integers.
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet8i type;
typedef Packet4i half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=8
};
};
*/
#endif
template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
template<> struct scalar_div_cost<double,true> { enum { value = 16 }; };
template<> struct unpacket_traits<Packet8f> {
typedef float type;
@ -226,8 +224,16 @@ template<> struct unpacket_traits<Packet4d> {
typedef Packet2d half;
enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
};
template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false, masked_load_available=false, masked_store_available=false}; };
template<> struct unpacket_traits<Packet8bf> { typedef bfloat16 type; typedef Packet8bf half; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; };
template<> struct unpacket_traits<Packet8i> {
typedef int type;
typedef Packet4i half;
enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
};
template<> struct unpacket_traits<Packet8bf> {
typedef bfloat16 type;
typedef Packet8bf half;
enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
};
// Helper function for bit packing snippet of low precision comparison.
// It packs the flags from 16x16 to 8x16.
@ -258,6 +264,7 @@ template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { r
template<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
// Returns {a, a+1, ..., a+7}.
template<> EIGEN_STRONG_INLINE Packet8i plset<Packet8i>(const int& a) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_add_epi32(_mm256_set1_epi32(a), _mm256_set_epi32(7,6,5,4,3,2,1,0));
#else
  // _mm256_add_epi32 requires AVX2; on AVX-only builds compute each 128-bit
  // half with SSE2 adds and recombine.
  __m128i lo = _mm_add_epi32(_mm_set1_epi32(a), _mm_set_epi32(3,2,1,0));
  __m128i hi = _mm_add_epi32(_mm_set1_epi32(a), _mm_set_epi32(7,6,5,4));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
#endif
}
template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
@ -291,6 +298,10 @@ template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
{
return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
}
// Two's-complement negation: 0 - a (INT_MIN maps to itself).
template<> EIGEN_STRONG_INLINE Packet8i pnegate(const Packet8i& a)
{
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_sub_epi32(_mm256_set1_epi32(0), a);
#else
  // _mm256_sub_epi32 requires AVX2; fall back to per-half SSE2 subtraction,
  // mirroring the pcmp_le/pcmp_lt fallbacks below.
  __m128i lo = _mm_sub_epi32(_mm_setzero_si128(), _mm256_extractf128_si256(a, 0));
  __m128i hi = _mm_sub_epi32(_mm_setzero_si128(), _mm256_extractf128_si256(a, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
#endif
}
template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
@ -352,7 +363,26 @@ template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt(const Packet4d& a, const Packet4
template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt_or_nan(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a, b, _CMP_NGE_UQ); }
template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); }
// Element-wise a <= b, returning all-ones (true) / all-zeros (false) per lane.
template<> EIGEN_STRONG_INLINE Packet8i pcmp_le(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
// a <= b  <=>  !(a > b): compute greater-than, then invert all bits.
return _mm256_xor_si256(_mm256_cmpgt_epi32(a,b), _mm256_set1_epi32(-1));
#else
// AVX1 has no 256-bit integer compare: process each 128-bit half with SSE.
__m128i lo = _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
lo = _mm_xor_si128(lo, _mm_set1_epi32(-1));
__m128i hi = _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
hi = _mm_xor_si128(hi, _mm_set1_epi32(-1));
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
// Element-wise a < b, returning all-ones (true) / all-zeros (false) per lane.
template<> EIGEN_STRONG_INLINE Packet8i pcmp_lt(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
// a < b  <=>  b > a; only a greater-than compare exists for epi32.
return _mm256_cmpgt_epi32(b,a);
#else
// AVX1 has no 256-bit integer compare: process each 128-bit half with SSE.
__m128i lo = _mm_cmpgt_epi32(_mm256_extractf128_si256(b, 0), _mm256_extractf128_si256(a, 0));
__m128i hi = _mm_cmpgt_epi32(_mm256_extractf128_si256(b, 1), _mm256_extractf128_si256(a, 1));
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_cmpeq_epi32(a,b);
@ -388,6 +418,9 @@ template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const
return _mm256_min_pd(b,a);
#endif
}
// Element-wise signed 32-bit minimum.
template<> EIGEN_STRONG_INLINE Packet8i pmin<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_min_epi32(a, b);
#else
  // _mm256_min_epi32 requires AVX2; fall back to per-half SSE4.1 minima,
  // consistent with the pcmp_le/pcmp_lt fallbacks above.
  __m128i lo = _mm_min_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_min_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
#endif
}
template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
@ -411,6 +444,9 @@ template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const
return _mm256_max_pd(b,a);
#endif
}
// Element-wise signed 32-bit maximum.
template<> EIGEN_STRONG_INLINE Packet8i pmax<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_max_epi32(a, b);
#else
  // _mm256_max_epi32 requires AVX2; fall back to per-half SSE4.1 maxima,
  // consistent with the pcmp_le/pcmp_lt fallbacks above.
  __m128i lo = _mm_max_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
  __m128i hi = _mm_max_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
#endif
}
// Add specializations for min/max with prescribed NaN progation.
template<>
@ -611,6 +647,12 @@ template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
return _mm256_permute_pd(tmp, 3<<2);
}
// Loads 4 integers from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
template<> EIGEN_STRONG_INLINE Packet8i ploaddup<Packet8i>(const int* from)
{
#ifdef EIGEN_VECTORIZE_AVX2
  // ploaddup has no alignment guarantee on `from`, so use the unaligned load.
  const Packet8i a = _mm256_castsi128_si256(ploadu<Packet4i>(from));
  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
#else
  // _mm256_permutevar8x32_epi32 requires AVX2. On AVX-only builds reuse the
  // bitwise-equivalent float shuffles (same trick as ploaddup<Packet8f>):
  // broadcast the 128-bit block to both halves, ...
  __m256 tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
  // ... mimic an in-place permutation of the lower 128 bits using a blend, ...
  tmp = _mm256_blend_ps(tmp, _mm256_castps128_ps256(_mm_permute_ps(_mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
  // ... then apply one in-lane permutation to pair up the elements.
  return _mm256_castps_si256(_mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2)));
#endif
}
// Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
@ -618,6 +660,10 @@ template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
}
// Loads 2 integers from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
template<> EIGEN_STRONG_INLINE Packet8i ploadquad<Packet8i>(const int* from)
{
  // _mm256_insertf128_si256 is plain AVX and bit-identical to the AVX2-only
  // _mm256_inserti128_si256 used previously, so this also works on AVX-only builds.
  return _mm256_insertf128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from+1)), 1);
}
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
@ -646,6 +692,11 @@ template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const dou
{
return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
// Strided gather: element i of the result is from[i*stride].
// _mm256_set_epi32 takes arguments from the most-significant element down.
template<> EIGEN_DEVICE_FUNC inline Packet8i pgather<int, Packet8i>(const int* from, Index stride)
{
return _mm256_set_epi32(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride)
{
@ -670,6 +721,20 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to,
to[stride*2] = _mm_cvtsd_f64(high);
to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
}
// Strided scatter: to[i*stride] receives element i of `from`.
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet8i>(int* to, const Packet8i& from, Index stride)
{
  // _mm256_extractf128_si256 is plain AVX and bit-identical to the AVX2-only
  // _mm256_extracti128_si256 used previously, so this also works on AVX-only builds.
  __m128i low = _mm256_extractf128_si256(from, 0);
  to[stride*0] = _mm_extract_epi32(low, 0);
  to[stride*1] = _mm_extract_epi32(low, 1);
  to[stride*2] = _mm_extract_epi32(low, 2);
  to[stride*3] = _mm_extract_epi32(low, 3);
  __m128i high = _mm256_extractf128_si256(from, 1);
  to[stride*4] = _mm_extract_epi32(high, 0);
  to[stride*5] = _mm_extract_epi32(high, 1);
  to[stride*6] = _mm_extract_epi32(high, 2);
  to[stride*7] = _mm_extract_epi32(high, 3);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
{
@ -720,6 +785,10 @@ template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
return _mm256_permute_pd(swap_halves,5);
#endif
}
// Reverses element order: {a7, a6, ..., a0}.
template<> EIGEN_STRONG_INLINE Packet8i preverse(const Packet8i& a)
{
// Reuse the float reversal: the permutation is purely positional, so a
// bitwise cast to/from Packet8f preserves the integer payload.
return _mm256_castps_si256(preverse(_mm256_castsi256_ps(a)));
}
// pabs should be ok
template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
@ -732,6 +801,10 @@ template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
return _mm256_and_pd(a,mask);
}
// Element-wise absolute value (INT_MIN stays INT_MIN, as with std::abs wraparound).
template<> EIGEN_STRONG_INLINE Packet8i pabs(const Packet8i& a)
{
#ifdef EIGEN_VECTORIZE_AVX2
  return _mm256_abs_epi32(a);
#else
  // _mm256_abs_epi32 requires AVX2; fall back to per-half SSSE3 abs,
  // consistent with the pcmp_le/pcmp_lt fallbacks above.
  __m128i lo = _mm_abs_epi32(_mm256_extractf128_si256(a, 0));
  __m128i hi = _mm_abs_epi32(_mm256_extractf128_si256(a, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
#endif
}
template<> EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Packet8f& exponent) {
return pfrexp_generic(a,exponent);
@ -803,11 +876,19 @@ template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
{
return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
}
// Horizontal sum of all 8 elements.
template<> EIGEN_STRONG_INLINE int predux<Packet8i>(const Packet8i& a)
{
// Add the two 128-bit halves element-wise, then let the SSE Packet4i
// reduction finish the horizontal sum.
return predux(Packet4i(_mm_add_epi32(_mm256_castsi256_si128(a),_mm256_extractf128_si256(a,1))));
}
template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a)
{
return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
}
// Pairwise reduction to half width: result element i is a[i] + a[i+4].
template<> EIGEN_STRONG_INLINE Packet4i predux_half_dowto4<Packet8i>(const Packet8i& a)
{
return _mm_add_epi32(_mm256_castsi256_si128(a),_mm256_extractf128_si256(a,1));
}
template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
{
@ -905,6 +986,55 @@ ptranspose(PacketBlock<Packet8f,4>& kernel) {
kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
}
#define MM256_SHUFFLE_EPI32(A, B, M) \
_mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B), M))
// Transposes an 8x8 block of int32 held in 8 Packet8i registers, in place.
// Same shuffle network as the Packet8f transpose: unpack pairs, shuffle into
// 4-element rows, then swap 128-bit lanes across register pairs.
// NOTE(review): _mm256_unpacklo/hi_epi32 require AVX2 — this has no AVX-only
// fallback; confirm it is only reachable in AVX2 builds.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8i,8>& kernel) {
// Interleave low/high 32-bit elements of adjacent register pairs.
__m256i T0 = _mm256_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
__m256i T1 = _mm256_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
__m256i T2 = _mm256_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
__m256i T3 = _mm256_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
__m256i T4 = _mm256_unpacklo_epi32(kernel.packet[4], kernel.packet[5]);
__m256i T5 = _mm256_unpackhi_epi32(kernel.packet[4], kernel.packet[5]);
__m256i T6 = _mm256_unpacklo_epi32(kernel.packet[6], kernel.packet[7]);
__m256i T7 = _mm256_unpackhi_epi32(kernel.packet[6], kernel.packet[7]);
// Combine the interleaved halves into transposed 4-element rows per lane.
__m256i S0 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(1,0,1,0));
__m256i S1 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(3,2,3,2));
__m256i S2 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(1,0,1,0));
__m256i S3 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(3,2,3,2));
__m256i S4 = MM256_SHUFFLE_EPI32(T4,T6,_MM_SHUFFLE(1,0,1,0));
__m256i S5 = MM256_SHUFFLE_EPI32(T4,T6,_MM_SHUFFLE(3,2,3,2));
__m256i S6 = MM256_SHUFFLE_EPI32(T5,T7,_MM_SHUFFLE(1,0,1,0));
__m256i S7 = MM256_SHUFFLE_EPI32(T5,T7,_MM_SHUFFLE(3,2,3,2));
// Exchange 128-bit lanes (0x20 = low lanes, 0x31 = high lanes) to finish.
kernel.packet[0] = _mm256_permute2f128_si256(S0, S4, 0x20);
kernel.packet[1] = _mm256_permute2f128_si256(S1, S5, 0x20);
kernel.packet[2] = _mm256_permute2f128_si256(S2, S6, 0x20);
kernel.packet[3] = _mm256_permute2f128_si256(S3, S7, 0x20);
kernel.packet[4] = _mm256_permute2f128_si256(S0, S4, 0x31);
kernel.packet[5] = _mm256_permute2f128_si256(S1, S5, 0x31);
kernel.packet[6] = _mm256_permute2f128_si256(S2, S6, 0x31);
kernel.packet[7] = _mm256_permute2f128_si256(S3, S7, 0x31);
}
// Transposes a 4x8 block of int32 (4 Packet8i registers), in place.
// NOTE(review): _mm256_unpacklo/hi_epi32 require AVX2 — this has no AVX-only
// fallback; confirm it is only reachable in AVX2 builds.
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8i,4>& kernel) {
// Interleave, shuffle into rows, then swap 128-bit lanes — same scheme as
// the 8x8 variant above but with only two register pairs.
__m256i T0 = _mm256_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
__m256i T1 = _mm256_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
__m256i T2 = _mm256_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
__m256i T3 = _mm256_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
__m256i S0 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(1,0,1,0));
__m256i S1 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(3,2,3,2));
__m256i S2 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(1,0,1,0));
__m256i S3 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(3,2,3,2));
kernel.packet[0] = _mm256_permute2f128_si256(S0, S1, 0x20);
kernel.packet[1] = _mm256_permute2f128_si256(S2, S3, 0x20);
kernel.packet[2] = _mm256_permute2f128_si256(S0, S1, 0x31);
kernel.packet[3] = _mm256_permute2f128_si256(S2, S3, 0x31);
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4d,4>& kernel) {
__m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);

View File

@ -153,17 +153,16 @@ template<> struct packet_traits<double> : default_packet_traits
};
};
/* TODO Implement AVX512 for integers
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet16i type;
typedef Packet8i half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=8
size=16
};
};
*/
template <>
struct unpacket_traits<Packet16f> {
@ -183,7 +182,7 @@ template <>
struct unpacket_traits<Packet16i> {
typedef int type;
typedef Packet8i half;
enum { size = 16, alignment=Aligned64, vectorizable=false, masked_load_available=false, masked_store_available=false };
enum { size = 16, alignment=Aligned64, vectorizable=true, masked_load_available=false, masked_store_available=false };
};
template<>
@ -254,6 +253,12 @@ EIGEN_STRONG_INLINE Packet8d plset<Packet8d>(const double& a) {
return _mm512_add_pd(_mm512_set1_pd(a),
_mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
}
// Returns {a, a+1, ..., a+15}.
template <>
EIGEN_STRONG_INLINE Packet16i plset<Packet16i>(const int& a) {
return _mm512_add_epi32(
_mm512_set1_epi32(a),
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
}
template <>
EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a,
@ -295,6 +300,10 @@ template <>
EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) {
return _mm512_sub_pd(_mm512_set1_pd(0.0), a);
}
// Two's-complement negation: 0 - a (INT_MIN maps to itself).
template <>
EIGEN_STRONG_INLINE Packet16i pnegate(const Packet16i& a) {
  const __m512i zero = _mm512_setzero_si512();
  return _mm512_sub_epi32(zero, a);
}
template <>
EIGEN_STRONG_INLINE Packet16f pconj(const Packet16f& a) {
@ -379,6 +388,11 @@ EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a,
// Arguments are reversed to match NaN propagation behavior of std::min.
return _mm512_min_pd(b, a);
}
// Element-wise signed 32-bit minimum. The (b, a) argument order mirrors the
// float/double variants (NaN propagation); for integers the order is irrelevant.
template <>
EIGEN_STRONG_INLINE Packet16i pmin<Packet16i>(const Packet16i& a,
const Packet16i& b) {
return _mm512_min_epi32(b, a);
}
template <>
EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a,
@ -392,6 +406,11 @@ EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
// Arguments are reversed to match NaN propagation behavior of std::max.
return _mm512_max_pd(b, a);
}
// Element-wise signed 32-bit maximum. The (b, a) argument order mirrors the
// float/double variants (NaN propagation); for integers the order is irrelevant.
template <>
EIGEN_STRONG_INLINE Packet16i pmax<Packet16i>(const Packet16i& a,
const Packet16i& b) {
return _mm512_max_epi32(b, a);
}
// Add specializations for min/max with prescribed NaN progation.
template<>
@ -493,10 +512,17 @@ template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, cons
}
template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) {
__mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ);
__mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ);
return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu);
}
// Element-wise a <= b. AVX512 compares yield a bit-mask; expand it back to an
// all-ones/all-zeros vector so the result is usable as a Packet16i boolean.
template<> EIGEN_STRONG_INLINE Packet16i pcmp_le(const Packet16i& a, const Packet16i& b) {
__mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LE);
return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu);
}
// Element-wise a < b. AVX512 compares yield a bit-mask; expand it back to an
// all-ones/all-zeros vector so the result is usable as a Packet16i boolean.
template<> EIGEN_STRONG_INLINE Packet16i pcmp_lt(const Packet16i& a, const Packet16i& b) {
__mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT);
return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu);
}
template <>
EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) {
@ -746,6 +772,16 @@ EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
}
#endif
// Loads 8 integers from memory and returns the packet
// {a0, a0, a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
template <>
EIGEN_STRONG_INLINE Packet16i ploaddup<Packet16i>(const int* from) {
__m256i low_half = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
// Zero-extending each 32-bit element to 64 bits spreads a0..a7 into the even
// 32-bit slots; the bitwise float cast is free.
__m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
// In-lane permute (2,2,0,0) copies each even slot over the following odd
// slot, producing the pairwise duplication.
__m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
return _mm512_castps_si512(pairs);
}
// Loads 4 floats from memory and returns the packet
// {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
template <>
@ -766,6 +802,15 @@ EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
return _mm512_insertf64x4(tmp, lane1, 1);
}
// Loads 4 integers from memory and returns the packet
// {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
template <>
EIGEN_STRONG_INLINE Packet16i ploadquad<Packet16i>(const int* from) {
Packet16i tmp = _mm512_castsi128_si512(ploadu<Packet4i>(from));
// Broadcast each of the four loaded elements into its own group of four slots.
const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
return _mm512_permutexvar_epi32(scatter_mask, tmp);
}
template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ps(to, from);
@ -818,6 +863,15 @@ EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
return _mm512_i32gather_pd(indices, from, 8);
}
template <>
EIGEN_DEVICE_FUNC inline Packet16i pgather<int, Packet16i>(const int* from,
Index stride) {
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
Packet16i stride_multiplier =
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
return _mm512_i32gather_epi32(indices, from, 4);
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
@ -838,6 +892,16 @@ EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
_mm512_i32scatter_pd(to, indices, from, 8);
}
// Strided scatter: to[i*stride] receives element i of `from`, using a hardware
// scatter with byte offsets i*stride*4.
template <>
EIGEN_DEVICE_FUNC inline void pscatter<int, Packet16i>(int* to,
const Packet16i& from,
Index stride) {
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
Packet16i stride_multiplier =
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
_mm512_i32scatter_epi32(to, indices, from, 4);
}
template <>
EIGEN_STRONG_INLINE void pstore1<Packet16f>(float* to, const float& a) {
@ -882,6 +946,11 @@ template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a);
}
// Reverses element order: {a15, a14, ..., a0} via a full cross-lane permute.
template<> EIGEN_STRONG_INLINE Packet16i preverse(const Packet16i& a)
{
return _mm512_permutexvar_epi32(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
}
template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
{
// _mm512_abs_ps intrinsic not found, so hack around it
@ -893,6 +962,10 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
_mm512_set1_epi64(0x7fffffffffffffff)));
}
// Element-wise absolute value (INT_MIN stays INT_MIN).
template<> EIGEN_STRONG_INLINE Packet16i pabs(const Packet16i& a)
{
return _mm512_abs_epi32(a);
}
template<>
EIGEN_STRONG_INLINE Packet16f pfrexp<Packet16f>(const Packet16f& a, Packet16f& exponent){
@ -952,6 +1025,11 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, cons
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
__m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \
__m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)
// _mm512_extracti32x8_epi32 (extract a __m256i from a __m512i) requires
// AVX512DQ; plain AVX512F builds use the 128-bit-based fallback in the #else
// branch below.
#define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT) \
__m256i OUTPUT##_0 = _mm512_extracti32x8_epi32(INPUT, 0); \
__m256i OUTPUT##_1 = _mm512_extracti32x8_epi32(INPUT, 1)
#else
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
__m256 OUTPUT##_0 = _mm256_insertf128_ps( \
@ -960,11 +1038,22 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, cons
__m256 OUTPUT##_1 = _mm256_insertf128_ps( \
_mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
_mm512_extractf32x4_ps(INPUT, 3), 1);
#define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT) \
__m256i OUTPUT##_0 = _mm256_insertf128_si256( \
_mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 0)), \
_mm512_extracti32x4_epi32(INPUT, 1), 1); \
__m256i OUTPUT##_1 = _mm256_insertf128_si256( \
_mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 2)), \
_mm512_extracti32x4_epi32(INPUT, 3), 1);
#endif
#ifdef EIGEN_VECTORIZE_AVX512DQ
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
#define EIGEN_INSERT_8i_INTO_16i(OUTPUT, INPUTA, INPUTB) \
OUTPUT = _mm512_inserti32x8(_mm512_castsi256_si512(INPUTA), INPUTB, 1);
#else
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
OUTPUT = _mm512_undefined_ps(); \
@ -972,6 +1061,13 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, cons
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
#define EIGEN_INSERT_8i_INTO_16i(OUTPUT, INPUTA, INPUTB) \
OUTPUT = _mm512_undefined_epi32(); \
OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTA, 0), 0); \
OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTA, 1), 1); \
OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTB, 0), 2); \
OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTB, 1), 3);
#endif
template <>
@ -1000,6 +1096,24 @@ EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
__m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0)));
}
// Horizontal sum of all 16 elements.
template <>
EIGEN_STRONG_INLINE int predux<Packet16i>(const Packet16i& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
// Fold the two 256-bit halves and reuse the Packet8i reduction.
__m256i lane0 = _mm512_extracti32x8_epi32(a, 0);
__m256i lane1 = _mm512_extracti32x8_epi32(a, 1);
Packet8i x = _mm256_add_epi32(lane0, lane1);
return predux<Packet8i>(x);
#else
// Without AVX512DQ, fold the four 128-bit lanes, then finish the horizontal
// sum with two hadds (element 0 ends up holding the total).
__m128i lane0 = _mm512_extracti32x4_epi32(a, 0);
__m128i lane1 = _mm512_extracti32x4_epi32(a, 1);
__m128i lane2 = _mm512_extracti32x4_epi32(a, 2);
__m128i lane3 = _mm512_extracti32x4_epi32(a, 3);
__m128i sum = _mm_add_epi32(_mm_add_epi32(lane0, lane1), _mm_add_epi32(lane2, lane3));
sum = _mm_hadd_epi32(sum, sum);
sum = _mm_hadd_epi32(sum, _mm_castps_si128(_mm_permute_ps(_mm_castsi128_ps(sum), 1)));
return _mm_cvtsi128_si32(sum);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
@ -1023,6 +1137,22 @@ EIGEN_STRONG_INLINE Packet4d predux_half_dowto4<Packet8d>(const Packet8d& a) {
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
return _mm256_add_pd(lane0, lane1);
}
// Pairwise reduction to half width: result element i is a[i] + a[i+8].
template <>
EIGEN_STRONG_INLINE Packet8i predux_half_dowto4<Packet16i>(const Packet16i& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
__m256i lane0 = _mm512_extracti32x8_epi32(a, 0);
__m256i lane1 = _mm512_extracti32x8_epi32(a, 1);
return _mm256_add_epi32(lane0, lane1);
#else
// Without AVX512DQ: lane0+lane2 gives elements 0..3 of the result,
// lane1+lane3 gives elements 4..7; recombine into one __m256i.
__m128i lane0 = _mm512_extracti32x4_epi32(a, 0);
__m128i lane1 = _mm512_extracti32x4_epi32(a, 1);
__m128i lane2 = _mm512_extracti32x4_epi32(a, 2);
__m128i lane3 = _mm512_extracti32x4_epi32(a, 3);
__m128i sum0 = _mm_add_epi32(lane0, lane2);
__m128i sum1 = _mm_add_epi32(lane1, lane3);
return _mm256_inserti128_si256(_mm256_castsi128_si256(sum0), sum1, 1);
#endif
}
template <>
EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
@ -1352,6 +1482,163 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 8>& kernel) {
PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 6, 8);
PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 7, 8);
}
#define PACK_OUTPUT_I32(OUTPUT, INPUT, INDEX, STRIDE) \
EIGEN_INSERT_8i_INTO_16i(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
#define PACK_OUTPUT_I32_2(OUTPUT, INPUT, INDEX, STRIDE) \
EIGEN_INSERT_8i_INTO_16i(OUTPUT[INDEX], INPUT[2 * INDEX], \
INPUT[2 * INDEX + STRIDE]);
#define SHUFFLE_EPI32(A, B, M) \
_mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(A), _mm512_castsi512_ps(B), M))
// Transposes a 16x16 block of int32 held in 16 Packet16i registers, in place.
// Mirrors the Packet16f transpose: unpack/shuffle within 128-bit lanes, split
// each 512-bit register into two __m256i halves, permute 128-bit lanes into
// transposed 8-element rows, then reassemble 512-bit outputs.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16i, 16>& kernel) {
// Interleave low/high 32-bit elements of adjacent register pairs.
__m512i T0 = _mm512_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
__m512i T1 = _mm512_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
__m512i T2 = _mm512_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
__m512i T3 = _mm512_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
__m512i T4 = _mm512_unpacklo_epi32(kernel.packet[4], kernel.packet[5]);
__m512i T5 = _mm512_unpackhi_epi32(kernel.packet[4], kernel.packet[5]);
__m512i T6 = _mm512_unpacklo_epi32(kernel.packet[6], kernel.packet[7]);
__m512i T7 = _mm512_unpackhi_epi32(kernel.packet[6], kernel.packet[7]);
__m512i T8 = _mm512_unpacklo_epi32(kernel.packet[8], kernel.packet[9]);
__m512i T9 = _mm512_unpackhi_epi32(kernel.packet[8], kernel.packet[9]);
__m512i T10 = _mm512_unpacklo_epi32(kernel.packet[10], kernel.packet[11]);
__m512i T11 = _mm512_unpackhi_epi32(kernel.packet[10], kernel.packet[11]);
__m512i T12 = _mm512_unpacklo_epi32(kernel.packet[12], kernel.packet[13]);
__m512i T13 = _mm512_unpackhi_epi32(kernel.packet[12], kernel.packet[13]);
__m512i T14 = _mm512_unpacklo_epi32(kernel.packet[14], kernel.packet[15]);
__m512i T15 = _mm512_unpackhi_epi32(kernel.packet[14], kernel.packet[15]);
// Combine interleaved halves into transposed 4-element rows per 128-bit lane.
__m512i S0 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
__m512i S1 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
__m512i S2 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
__m512i S3 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
__m512i S4 = SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
__m512i S5 = SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
__m512i S6 = SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
__m512i S7 = SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
__m512i S8 = SHUFFLE_EPI32(T8, T10, _MM_SHUFFLE(1, 0, 1, 0));
__m512i S9 = SHUFFLE_EPI32(T8, T10, _MM_SHUFFLE(3, 2, 3, 2));
__m512i S10 = SHUFFLE_EPI32(T9, T11, _MM_SHUFFLE(1, 0, 1, 0));
__m512i S11 = SHUFFLE_EPI32(T9, T11, _MM_SHUFFLE(3, 2, 3, 2));
__m512i S12 = SHUFFLE_EPI32(T12, T14, _MM_SHUFFLE(1, 0, 1, 0));
__m512i S13 = SHUFFLE_EPI32(T12, T14, _MM_SHUFFLE(3, 2, 3, 2));
__m512i S14 = SHUFFLE_EPI32(T13, T15, _MM_SHUFFLE(1, 0, 1, 0));
__m512i S15 = SHUFFLE_EPI32(T13, T15, _MM_SHUFFLE(3, 2, 3, 2));
// Split each 512-bit register into two __m256i halves (Sk_0 and Sk_1).
EIGEN_EXTRACT_8i_FROM_16i(S0, S0);
EIGEN_EXTRACT_8i_FROM_16i(S1, S1);
EIGEN_EXTRACT_8i_FROM_16i(S2, S2);
EIGEN_EXTRACT_8i_FROM_16i(S3, S3);
EIGEN_EXTRACT_8i_FROM_16i(S4, S4);
EIGEN_EXTRACT_8i_FROM_16i(S5, S5);
EIGEN_EXTRACT_8i_FROM_16i(S6, S6);
EIGEN_EXTRACT_8i_FROM_16i(S7, S7);
EIGEN_EXTRACT_8i_FROM_16i(S8, S8);
EIGEN_EXTRACT_8i_FROM_16i(S9, S9);
EIGEN_EXTRACT_8i_FROM_16i(S10, S10);
EIGEN_EXTRACT_8i_FROM_16i(S11, S11);
EIGEN_EXTRACT_8i_FROM_16i(S12, S12);
EIGEN_EXTRACT_8i_FROM_16i(S13, S13);
EIGEN_EXTRACT_8i_FROM_16i(S14, S14);
EIGEN_EXTRACT_8i_FROM_16i(S15, S15);
// Exchange 128-bit lanes (0x20 = low lanes, 0x31 = high lanes).
PacketBlock<Packet8i, 32> tmp;
tmp.packet[0] = _mm256_permute2f128_si256(S0_0, S4_0, 0x20);
tmp.packet[1] = _mm256_permute2f128_si256(S1_0, S5_0, 0x20);
tmp.packet[2] = _mm256_permute2f128_si256(S2_0, S6_0, 0x20);
tmp.packet[3] = _mm256_permute2f128_si256(S3_0, S7_0, 0x20);
tmp.packet[4] = _mm256_permute2f128_si256(S0_0, S4_0, 0x31);
tmp.packet[5] = _mm256_permute2f128_si256(S1_0, S5_0, 0x31);
tmp.packet[6] = _mm256_permute2f128_si256(S2_0, S6_0, 0x31);
tmp.packet[7] = _mm256_permute2f128_si256(S3_0, S7_0, 0x31);
tmp.packet[8] = _mm256_permute2f128_si256(S0_1, S4_1, 0x20);
tmp.packet[9] = _mm256_permute2f128_si256(S1_1, S5_1, 0x20);
tmp.packet[10] = _mm256_permute2f128_si256(S2_1, S6_1, 0x20);
tmp.packet[11] = _mm256_permute2f128_si256(S3_1, S7_1, 0x20);
tmp.packet[12] = _mm256_permute2f128_si256(S0_1, S4_1, 0x31);
tmp.packet[13] = _mm256_permute2f128_si256(S1_1, S5_1, 0x31);
tmp.packet[14] = _mm256_permute2f128_si256(S2_1, S6_1, 0x31);
tmp.packet[15] = _mm256_permute2f128_si256(S3_1, S7_1, 0x31);
// Second set of __m256i outputs
tmp.packet[16] = _mm256_permute2f128_si256(S8_0, S12_0, 0x20);
tmp.packet[17] = _mm256_permute2f128_si256(S9_0, S13_0, 0x20);
tmp.packet[18] = _mm256_permute2f128_si256(S10_0, S14_0, 0x20);
tmp.packet[19] = _mm256_permute2f128_si256(S11_0, S15_0, 0x20);
tmp.packet[20] = _mm256_permute2f128_si256(S8_0, S12_0, 0x31);
tmp.packet[21] = _mm256_permute2f128_si256(S9_0, S13_0, 0x31);
tmp.packet[22] = _mm256_permute2f128_si256(S10_0, S14_0, 0x31);
tmp.packet[23] = _mm256_permute2f128_si256(S11_0, S15_0, 0x31);
tmp.packet[24] = _mm256_permute2f128_si256(S8_1, S12_1, 0x20);
tmp.packet[25] = _mm256_permute2f128_si256(S9_1, S13_1, 0x20);
tmp.packet[26] = _mm256_permute2f128_si256(S10_1, S14_1, 0x20);
tmp.packet[27] = _mm256_permute2f128_si256(S11_1, S15_1, 0x20);
tmp.packet[28] = _mm256_permute2f128_si256(S8_1, S12_1, 0x31);
tmp.packet[29] = _mm256_permute2f128_si256(S9_1, S13_1, 0x31);
tmp.packet[30] = _mm256_permute2f128_si256(S10_1, S14_1, 0x31);
tmp.packet[31] = _mm256_permute2f128_si256(S11_1, S15_1, 0x31);
// Pack them into the output
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 0, 16);
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 1, 16);
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 2, 16);
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 3, 16);
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 4, 16);
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 5, 16);
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 6, 16);
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 7, 16);
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 8, 16);
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 9, 16);
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 10, 16);
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 11, 16);
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 12, 16);
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 13, 16);
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 14, 16);
PACK_OUTPUT_I32(kernel.packet, tmp.packet, 15, 16);
}
// Transposes a 4x16 block of int32 (4 Packet16i registers), in place.
// Same scheme as the 16x16 variant: in-lane unpack/shuffle, split into
// __m256i halves, permute 128-bit lanes, then reassemble 512-bit outputs.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16i, 4>& kernel) {
__m512i T0 = _mm512_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
__m512i T1 = _mm512_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
__m512i T2 = _mm512_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
__m512i T3 = _mm512_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
__m512i S0 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
__m512i S1 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
__m512i S2 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
__m512i S3 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
// Split each 512-bit register into two __m256i halves (Sk_0 and Sk_1).
EIGEN_EXTRACT_8i_FROM_16i(S0, S0);
EIGEN_EXTRACT_8i_FROM_16i(S1, S1);
EIGEN_EXTRACT_8i_FROM_16i(S2, S2);
EIGEN_EXTRACT_8i_FROM_16i(S3, S3);
PacketBlock<Packet8i, 8> tmp;
tmp.packet[0] = _mm256_permute2f128_si256(S0_0, S1_0, 0x20);
tmp.packet[1] = _mm256_permute2f128_si256(S2_0, S3_0, 0x20);
tmp.packet[2] = _mm256_permute2f128_si256(S0_0, S1_0, 0x31);
tmp.packet[3] = _mm256_permute2f128_si256(S2_0, S3_0, 0x31);
tmp.packet[4] = _mm256_permute2f128_si256(S0_1, S1_1, 0x20);
tmp.packet[5] = _mm256_permute2f128_si256(S2_1, S3_1, 0x20);
tmp.packet[6] = _mm256_permute2f128_si256(S0_1, S1_1, 0x31);
tmp.packet[7] = _mm256_permute2f128_si256(S2_1, S3_1, 0x31);
PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 0, 1);
PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 1, 1);
PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 2, 1);
PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 3, 1);
}
template <>
EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
const Packet16f& /*thenPacket*/,

View File

@ -180,7 +180,6 @@ struct packet_traits<double> : default_packet_traits {
HasRint = 1
};
};
#endif
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet4i type;
@ -194,7 +193,7 @@ template<> struct packet_traits<int> : default_packet_traits
HasBlend = 1
};
};
#endif
template<> struct packet_traits<bool> : default_packet_traits
{
typedef Packet16b type;
@ -233,7 +232,7 @@ template<> struct unpacket_traits<Packet2d> {
template<> struct unpacket_traits<Packet4i> {
typedef int type;
typedef Packet4i half;
enum {size=4, alignment=Aligned16, vectorizable=false, masked_load_available=false, masked_store_available=false};
enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
};
template<> struct unpacket_traits<Packet16b> {
typedef bool type;

View File

@ -337,7 +337,8 @@ struct vectorization_logic_half
((!EIGEN_UNALIGNED_VECTORIZE) && (sizeof(Scalar)==16)) ? NoUnrolling : CompleteUnrolling));
VERIFY(test_assign(Matrix3(),Matrix3().cwiseQuotient(Matrix3()),
PacketTraits::HasDiv ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling));
PacketTraits::HasDiv ? LinearVectorizedTraversal : LinearTraversal,
PacketTraits::HasDiv ? CompleteUnrolling : NoUnrolling));
VERIFY(test_assign(Matrix<Scalar,17,17>(),Matrix<Scalar,17,17>()+Matrix<Scalar,17,17>(),
sizeof(Scalar)==16 ? InnerVectorizedTraversal : (EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal),