mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-06-04 18:54:00 +08:00
Add vectorized integer division for int32 with AVX512, AVX or SSE.
This commit is contained in:
parent
5ffe7b92e0
commit
7b2901e2aa
@ -177,6 +177,10 @@ using std::ptrdiff_t;
|
||||
#include "src/Core/arch/Default/TypeCasting.h"
|
||||
#include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h"
|
||||
|
||||
#ifndef EIGEN_GPU_COMPILE_PHASE
|
||||
#include <csignal>
|
||||
#endif
|
||||
|
||||
#if defined EIGEN_VECTORIZE_AVX512
|
||||
#if defined EIGEN_VECTORIZE_AVX512FP16
|
||||
#include "src/Core/arch/AVX512/PacketMathFP16.h"
|
||||
|
@ -212,6 +212,7 @@ template<> struct packet_traits<int> : default_packet_traits
|
||||
Vectorizable = 1,
|
||||
AlignedOnScalar = 1,
|
||||
HasCmp = 1,
|
||||
HasDiv = 1,
|
||||
size=8
|
||||
};
|
||||
};
|
||||
@ -545,13 +546,19 @@ template<> EIGEN_STRONG_INLINE Packet8i pmul<Packet8i>(const Packet8i& a, const
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/)
|
||||
{ eigen_assert(false && "packet integer division are not supported by AVX");
|
||||
return pset1<Packet8i>(0);
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& a, const Packet8i& b)
|
||||
{
|
||||
#ifdef EIGEN_VECTORIZE_AVX512
|
||||
return _mm512_cvttpd_epi32(_mm512_div_pd(_mm512_cvtepi32_pd(a), _mm512_cvtepi32_pd(b)));
|
||||
#else
|
||||
Packet4i lo = pdiv<Packet4i>(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
|
||||
Packet4i hi = pdiv<Packet4i>(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
|
||||
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_FMA
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
|
||||
return _mm256_fmadd_ps(a, b, c);
|
||||
@ -1226,7 +1233,12 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
|
||||
|
||||
template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x)
|
||||
{
|
||||
return _mm256_movemask_ps(x)!=0;
|
||||
return _mm256_movemask_ps(x) != 0;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8i& x)
|
||||
{
|
||||
return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void
|
||||
|
@ -178,6 +178,7 @@ template<> struct packet_traits<int> : default_packet_traits
|
||||
Vectorizable = 1,
|
||||
AlignedOnScalar = 1,
|
||||
HasCmp = 1,
|
||||
HasDiv = 1,
|
||||
size=16
|
||||
};
|
||||
};
|
||||
@ -389,12 +390,21 @@ EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
|
||||
const Packet16f& b) {
|
||||
return _mm512_div_ps(a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
|
||||
const Packet8d& b) {
|
||||
return _mm512_div_pd(a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16i pdiv<Packet16i>(const Packet16i& a,
|
||||
const Packet16i& b) {
|
||||
Packet8i q_lo = pdiv<Packet8i>(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b,0));
|
||||
Packet8i q_hi = pdiv<Packet8i>(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1));
|
||||
return _mm512_inserti64x4(_mm512_castsi256_si512(q_lo), q_hi, 1);
|
||||
}
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_FMA
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
|
||||
@ -1378,7 +1388,11 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x)
|
||||
return !_mm512_kortestz(tmp,tmp);
|
||||
}
|
||||
|
||||
|
||||
template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16i& x)
|
||||
{
|
||||
__mmask16 tmp = _mm512_test_epi32_mask(x,x);
|
||||
return !_mm512_kortestz(tmp,tmp);
|
||||
}
|
||||
|
||||
#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
|
||||
EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
|
||||
|
@ -195,6 +195,7 @@ template<> struct packet_traits<int> : default_packet_traits
|
||||
Vectorizable = 1,
|
||||
AlignedOnScalar = 1,
|
||||
HasCmp = 1,
|
||||
HasDiv=1,
|
||||
size=4,
|
||||
|
||||
HasShift = 1,
|
||||
@ -369,6 +370,22 @@ template<> EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(const Packet16b& a, con
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a,
|
||||
const Packet4i& b) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX
|
||||
return _mm256_cvttpd_epi32(
|
||||
_mm256_div_pd(_mm256_cvtepi32_pd(a), _mm256_cvtepi32_pd(b)));
|
||||
#else
|
||||
__m128i q_lo = _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(a), _mm_cvtepi32_pd(b)));
|
||||
__m128i q_hi =
|
||||
_mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(vec4i_swizzle1(a, 2, 3, 0, 1)),
|
||||
_mm_cvtepi32_pd(vec4i_swizzle1(b, 2, 3, 0, 1))));
|
||||
return vec4i_swizzle1(_mm_unpacklo_epi32(q_lo, q_hi), 0, 2, 1, 3);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// for some weird raisons, it has to be overloaded for packet of integers
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
|
||||
#ifdef EIGEN_VECTORIZE_FMA
|
||||
@ -1142,6 +1159,11 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
|
||||
return _mm_movemask_ps(x) != 0x0;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x)
|
||||
{
|
||||
return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void
|
||||
ptranspose(PacketBlock<Packet4f,4>& kernel) {
|
||||
_MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
|
||||
|
@ -376,6 +376,24 @@ struct functor_traits<scalar_difference_op<LhsScalar,RhsScalar> > {
|
||||
};
|
||||
};
|
||||
|
||||
template <typename Packet, bool IsInteger = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>
|
||||
struct maybe_raise_div_by_zero {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Packet x) {
|
||||
EIGEN_UNUSED_VARIABLE(x);
|
||||
}
|
||||
};
|
||||
|
||||
#ifndef EIGEN_GPU_COMPILE_PHASE
|
||||
template <typename Packet>
|
||||
struct maybe_raise_div_by_zero<Packet, true> {
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Packet x) {
|
||||
if (EIGEN_PREDICT_FALSE(predux_any(pcmp_eq(x, pzero(x))))) {
|
||||
std::raise(SIGFPE);
|
||||
}
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
/** \internal
|
||||
* \brief Template functor to compute the quotient of two scalars
|
||||
*
|
||||
@ -392,8 +410,10 @@ struct scalar_quotient_op : binary_op_base<LhsScalar,RhsScalar>
|
||||
#endif
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; }
|
||||
template<typename Packet>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
|
||||
{ return internal::pdiv(a,b); }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const {
|
||||
maybe_raise_div_by_zero<Packet>::run(b);
|
||||
return internal::pdiv(a,b);
|
||||
}
|
||||
};
|
||||
template<typename LhsScalar,typename RhsScalar>
|
||||
struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
|
||||
|
Loading…
x
Reference in New Issue
Block a user