mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-13 12:19:12 +08:00
Add partial vectorization for matrices and tensors of bool. This speeds up boolean operations on Tensors by up to 25x.
Benchmark numbers for the logical and of two NxN tensors:

name                                     old time/op   new time/op   delta
BM_booleanAnd_1T/3   [using 1 threads]   14.6ns ± 0%   14.4ns ± 0%    -0.96%
BM_booleanAnd_1T/4   [using 1 threads]   20.5ns ±12%    9.0ns ± 0%   -56.07%
BM_booleanAnd_1T/7   [using 1 threads]   41.7ns ± 0%   10.5ns ± 0%   -74.87%
BM_booleanAnd_1T/8   [using 1 threads]   52.1ns ± 0%   10.1ns ± 0%   -80.59%
BM_booleanAnd_1T/10  [using 1 threads]   76.3ns ± 0%   13.8ns ± 0%   -81.87%
BM_booleanAnd_1T/15  [using 1 threads]    167ns ± 0%     16ns ± 0%   -90.45%
BM_booleanAnd_1T/16  [using 1 threads]    188ns ± 0%     16ns ± 0%   -91.57%
BM_booleanAnd_1T/31  [using 1 threads]    667ns ± 0%     34ns ± 0%   -94.83%
BM_booleanAnd_1T/32  [using 1 threads]    710ns ± 0%     35ns ± 0%   -95.01%
BM_booleanAnd_1T/64  [using 1 threads]   2.80µs ± 0%   0.11µs ± 0%   -95.93%
BM_booleanAnd_1T/128 [using 1 threads]   11.2µs ± 0%    0.4µs ± 0%   -96.11%
BM_booleanAnd_1T/256 [using 1 threads]   44.6µs ± 0%    2.5µs ± 0%   -94.31%
BM_booleanAnd_1T/512 [using 1 threads]    178µs ± 0%     10µs ± 0%   -94.35%
BM_booleanAnd_1T/1k  [using 1 threads]    717µs ± 0%     78µs ± 1%   -89.07%
BM_booleanAnd_1T/2k  [using 1 threads]   2.87ms ± 0%   0.31ms ± 1%   -89.08%
BM_booleanAnd_1T/4k  [using 1 threads]   11.7ms ± 0%    1.9ms ± 4%   -83.55%
BM_booleanAnd_1T/10k [using 1 threads]   70.3ms ± 0%   17.2ms ± 4%   -75.48%
This commit is contained in:
parent
00f6340153
commit
2f6ddaa25c
@ -224,36 +224,6 @@ pabs(const unsigned long long& a) { return a; }
|
|||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||||
parg(const Packet& a) { using numext::arg; return arg(a); }
|
parg(const Packet& a) { using numext::arg; return arg(a); }
|
||||||
|
|
||||||
/** \internal \returns the bitwise and of \a a and \a b */
|
|
||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
|
||||||
pand(const Packet& a, const Packet& b) { return a & b; }
|
|
||||||
|
|
||||||
/** \internal \returns the bitwise or of \a a and \a b */
|
|
||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
|
||||||
por(const Packet& a, const Packet& b) { return a | b; }
|
|
||||||
|
|
||||||
/** \internal \returns the bitwise xor of \a a and \a b */
|
|
||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
|
||||||
pxor(const Packet& a, const Packet& b) { return a ^ b; }
|
|
||||||
|
|
||||||
/** \internal \returns the bitwise andnot of \a a and \a b */
|
|
||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
|
||||||
pandnot(const Packet& a, const Packet& b) { return a & (~b); }
|
|
||||||
|
|
||||||
/** \internal \returns ones */
|
|
||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
|
||||||
ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;}
|
|
||||||
|
|
||||||
template <typename RealScalar>
|
|
||||||
EIGEN_DEVICE_FUNC inline std::complex<RealScalar> ptrue(const std::complex<RealScalar>& /*a*/) {
|
|
||||||
RealScalar b;
|
|
||||||
b = ptrue(b);
|
|
||||||
return std::complex<RealScalar>(b, b);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** \internal \returns the bitwise not of \a a */
|
|
||||||
template <typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
|
||||||
pnot(const Packet& a) { return pxor(ptrue(a), a);}
|
|
||||||
|
|
||||||
/** \internal \returns \a a logically shifted by N bits to the right */
|
/** \internal \returns \a a logically shifted by N bits to the right */
|
||||||
template<int N> EIGEN_DEVICE_FUNC inline int
|
template<int N> EIGEN_DEVICE_FUNC inline int
|
||||||
@ -294,6 +264,35 @@ pldexp(const Packet &a, const Packet &exponent) {
|
|||||||
return ldexp(a, static_cast<int>(exponent));
|
return ldexp(a, static_cast<int>(exponent));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Notice: The following ops accept and operate on bitwise masks.
|
||||||
|
// The value of each field in a mask is Scalar(0) or ~Scalar(0).
|
||||||
|
// For boolean packets like Packet16b, this is different from the
|
||||||
|
// representation of true and false, which are 1 and 0.
|
||||||
|
// As an example
|
||||||
|
// ptrue<Packet16b>() = 0xffffffffffffffffffffffffffffffff
|
||||||
|
// while
|
||||||
|
// pset1<Packet16b>(true) = 0x01010101010101010101010101010101
|
||||||
|
|
||||||
|
/** \internal \returns the bitwise and of \a a and \a b */
|
||||||
|
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||||
|
pand(const Packet& a, const Packet& b) { return a & b; }
|
||||||
|
|
||||||
|
/** \internal \returns the bitwise or of \a a and \a b */
|
||||||
|
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||||
|
por(const Packet& a, const Packet& b) { return a | b; }
|
||||||
|
|
||||||
|
/** \internal \returns the bitwise xor of \a a and \a b */
|
||||||
|
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||||
|
pxor(const Packet& a, const Packet& b) { return a ^ b; }
|
||||||
|
|
||||||
|
/** \internal \returns the bitwise and of \a a and not \a b */
|
||||||
|
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||||
|
pandnot(const Packet& a, const Packet& b) { return a & (~b); }
|
||||||
|
|
||||||
|
/** \internal \returns ones */
|
||||||
|
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||||
|
ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;}
|
||||||
|
|
||||||
/** \internal \returns zeros */
|
/** \internal \returns zeros */
|
||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||||
pzero(const Packet& a) { return pxor(a,a); }
|
pzero(const Packet& a) { return pxor(a,a); }
|
||||||
@ -308,21 +307,16 @@ template<> EIGEN_DEVICE_FUNC inline double pzero<double>(const double& a) {
|
|||||||
return 0.;
|
return 0.;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** \internal \returns bits of \a or \b according to the input bit mask \a mask */
|
template <typename RealScalar>
|
||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
EIGEN_DEVICE_FUNC inline std::complex<RealScalar> ptrue(const std::complex<RealScalar>& /*a*/) {
|
||||||
pselect(const Packet& mask, const Packet& a, const Packet& b) {
|
RealScalar b;
|
||||||
return por(pand(a,mask),pandnot(b,mask));
|
b = ptrue(b);
|
||||||
|
return std::complex<RealScalar>(b, b);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline float pselect<float>(
|
/** \internal \returns the bitwise not of \a a */
|
||||||
const float& mask, const float& a, const float&b) {
|
template <typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||||
return numext::equal_strict(mask,0.f) ? b : a;
|
pnot(const Packet& a) { return pxor(ptrue(a), a);}
|
||||||
}
|
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline double pselect<double>(
|
|
||||||
const double& mask, const double& a, const double& b) {
|
|
||||||
return numext::equal_strict(mask,0.) ? b : a;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** \internal \returns a <= b as a bit mask */
|
/** \internal \returns a <= b as a bit mask */
|
||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||||
@ -340,6 +334,24 @@ pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); }
|
|||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||||
pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); }
|
pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); }
|
||||||
|
|
||||||
|
/** \internal \returns \a a or \a b for each field in the packet according to \a mask */
|
||||||
|
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||||
|
pselect(const Packet& mask, const Packet& a, const Packet& b) {
|
||||||
|
return por(pand(a,mask),pandnot(b,mask));
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_DEVICE_FUNC inline float pselect<float>(
|
||||||
|
const float& cond, const float& a, const float&b) {
|
||||||
|
return numext::equal_strict(cond,0.f) ? b : a;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_DEVICE_FUNC inline double pselect<double>(
|
||||||
|
const double& cond, const double& a, const double& b) {
|
||||||
|
return numext::equal_strict(cond,0.) ? b : a;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/** \internal \returns the absolute difference of \a a and \a b (coeff-wise) */
|
/** \internal \returns the absolute difference of \a a and \a b (coeff-wise) */
|
||||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||||
pabsdiff(const Packet& a, const Packet& b) { return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b)); }
|
pabsdiff(const Packet& a, const Packet& b) { return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b)); }
|
||||||
|
@ -44,10 +44,12 @@ typedef __m128d Packet2d;
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
|
typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
|
||||||
|
typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
|
||||||
|
|
||||||
template<> struct is_arithmetic<__m128> { enum { value = true }; };
|
template<> struct is_arithmetic<__m128> { enum { value = true }; };
|
||||||
template<> struct is_arithmetic<__m128i> { enum { value = true }; };
|
template<> struct is_arithmetic<__m128i> { enum { value = true }; };
|
||||||
template<> struct is_arithmetic<__m128d> { enum { value = true }; };
|
template<> struct is_arithmetic<__m128d> { enum { value = true }; };
|
||||||
|
template<> struct is_arithmetic<Packet16b> { enum { value = true }; };
|
||||||
|
|
||||||
#define EIGEN_SSE_SHUFFLE_MASK(p,q,r,s) ((s)<<6|(r)<<4|(q)<<2|(p))
|
#define EIGEN_SSE_SHUFFLE_MASK(p,q,r,s) ((s)<<6|(r)<<4|(q)<<2|(p))
|
||||||
|
|
||||||
@ -158,6 +160,30 @@ template<> struct packet_traits<int> : default_packet_traits
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<> struct packet_traits<bool> : default_packet_traits
|
||||||
|
{
|
||||||
|
typedef Packet16b type;
|
||||||
|
typedef Packet16b half;
|
||||||
|
enum {
|
||||||
|
Vectorizable = 1,
|
||||||
|
AlignedOnScalar = 1,
|
||||||
|
HasHalfPacket = 0,
|
||||||
|
size=16,
|
||||||
|
|
||||||
|
HasAdd = 0,
|
||||||
|
HasSub = 0,
|
||||||
|
HasShift = 0,
|
||||||
|
HasMul = 0,
|
||||||
|
HasNegate = 0,
|
||||||
|
HasAbs = 0,
|
||||||
|
HasAbs2 = 0,
|
||||||
|
HasMin = 0,
|
||||||
|
HasMax = 0,
|
||||||
|
HasConj = 0,
|
||||||
|
HasReduxp = 0
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
template<> struct unpacket_traits<Packet4f> {
|
template<> struct unpacket_traits<Packet4f> {
|
||||||
typedef float type;
|
typedef float type;
|
||||||
typedef Packet4f half;
|
typedef Packet4f half;
|
||||||
@ -174,6 +200,11 @@ template<> struct unpacket_traits<Packet4i> {
|
|||||||
typedef Packet4i half;
|
typedef Packet4i half;
|
||||||
enum {size=4, alignment=Aligned16, vectorizable=false, masked_load_available=false, masked_store_available=false};
|
enum {size=4, alignment=Aligned16, vectorizable=false, masked_load_available=false, masked_store_available=false};
|
||||||
};
|
};
|
||||||
|
template<> struct unpacket_traits<Packet16b> {
|
||||||
|
typedef bool type;
|
||||||
|
typedef Packet16b half;
|
||||||
|
enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
|
||||||
|
};
|
||||||
|
|
||||||
#ifndef EIGEN_VECTORIZE_AVX
|
#ifndef EIGEN_VECTORIZE_AVX
|
||||||
template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
|
template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
|
||||||
@ -192,6 +223,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { re
|
|||||||
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
|
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
|
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
|
||||||
#endif
|
#endif
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet16b pset1<Packet16b>(const bool& from) { return _mm_set1_epi8(static_cast<char>(from)); }
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) { return _mm_castsi128_ps(pset1<Packet4i>(from)); }
|
template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) { return _mm_castsi128_ps(pset1<Packet4i>(from)); }
|
||||||
|
|
||||||
@ -385,8 +417,11 @@ template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a,b); }
|
template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a,b); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
|
template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { return _mm_cmpeq_epi8(a,b); }
|
||||||
|
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); }
|
template<> EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(const Packet16b& a) { return _mm_cmpeq_epi32(a, a); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f
|
template<> EIGEN_STRONG_INLINE Packet4f
|
||||||
ptrue<Packet4f>(const Packet4f& a) {
|
ptrue<Packet4f>(const Packet4f& a) {
|
||||||
Packet4i b = _mm_castps_si128(a);
|
Packet4i b = _mm_castps_si128(a);
|
||||||
@ -398,17 +433,21 @@ ptrue<Packet2d>(const Packet2d& a) {
|
|||||||
return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
|
return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
|
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
|
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
|
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet16b pand<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
|
template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
|
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
|
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet16b por<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
|
template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
|
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
|
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet16b pxor<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
|
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
|
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
|
||||||
@ -471,6 +510,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
|
|||||||
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
|
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
|
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
|
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet16b pload<Packet16b>(const bool* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
|
||||||
|
|
||||||
#if EIGEN_COMP_MSVC
|
#if EIGEN_COMP_MSVC
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
|
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
|
||||||
@ -505,6 +545,11 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
|
|||||||
EIGEN_DEBUG_UNALIGNED_LOAD
|
EIGEN_DEBUG_UNALIGNED_LOAD
|
||||||
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
|
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
|
||||||
}
|
}
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet16b ploadu<Packet16b>(const bool* from) {
|
||||||
|
EIGEN_DEBUG_UNALIGNED_LOAD
|
||||||
|
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
|
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
|
||||||
{
|
{
|
||||||
@ -522,10 +567,12 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
|
|||||||
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
|
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
|
||||||
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
|
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
|
||||||
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
|
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
|
||||||
|
template<> EIGEN_STRONG_INLINE void pstore<bool>(bool* to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
|
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
|
||||||
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); }
|
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); }
|
||||||
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
|
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
|
||||||
|
/** \internal Stores the packet \a from to \a to with an unaligned store (_mm_storeu_si128),
  * so \a to does not need to be 16-byte aligned. Tagged with the unaligned-store debug
  * macro, matching the other pstoreu overloads, since this is not an aligned store. */
template<> EIGEN_STRONG_INLINE void pstoreu<bool>(bool* to, const Packet16b& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
|
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
|
||||||
{
|
{
|
||||||
|
@ -382,11 +382,14 @@ struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
|
|||||||
struct scalar_boolean_and_op {
|
struct scalar_boolean_and_op {
|
||||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op)
|
EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op)
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; }
|
||||||
|
template<typename Packet>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
|
||||||
|
{ return internal::pand(a,b); }
|
||||||
};
|
};
|
||||||
template<> struct functor_traits<scalar_boolean_and_op> {
|
template<> struct functor_traits<scalar_boolean_and_op> {
|
||||||
enum {
|
enum {
|
||||||
Cost = NumTraits<bool>::AddCost,
|
Cost = NumTraits<bool>::AddCost,
|
||||||
PacketAccess = false
|
PacketAccess = true
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -398,11 +401,14 @@ template<> struct functor_traits<scalar_boolean_and_op> {
|
|||||||
struct scalar_boolean_or_op {
|
struct scalar_boolean_or_op {
|
||||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op)
|
EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op)
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; }
|
||||||
|
template<typename Packet>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
|
||||||
|
{ return internal::por(a,b); }
|
||||||
};
|
};
|
||||||
template<> struct functor_traits<scalar_boolean_or_op> {
|
template<> struct functor_traits<scalar_boolean_or_op> {
|
||||||
enum {
|
enum {
|
||||||
Cost = NumTraits<bool>::AddCost,
|
Cost = NumTraits<bool>::AddCost,
|
||||||
PacketAccess = false
|
PacketAccess = true
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -414,11 +420,14 @@ template<> struct functor_traits<scalar_boolean_or_op> {
|
|||||||
struct scalar_boolean_xor_op {
|
struct scalar_boolean_xor_op {
|
||||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op)
|
EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op)
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; }
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; }
|
||||||
|
template<typename Packet>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
|
||||||
|
{ return internal::pxor(a,b); }
|
||||||
};
|
};
|
||||||
template<> struct functor_traits<scalar_boolean_xor_op> {
|
template<> struct functor_traits<scalar_boolean_xor_op> {
|
||||||
enum {
|
enum {
|
||||||
Cost = NumTraits<bool>::AddCost,
|
Cost = NumTraits<bool>::AddCost,
|
||||||
PacketAccess = false
|
PacketAccess = true
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -70,6 +70,23 @@ void test_cast() {
|
|||||||
test_cast_helper<FromScalar, FromPacket, ToScalar, ToPacket, CanCast>::run();
|
test_cast_helper<FromScalar, FromPacket, ToScalar, ToPacket, CanCast>::run();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename Scalar,typename Packet> void packetmath_boolean()
|
||||||
|
{
|
||||||
|
const int PacketSize = internal::unpacket_traits<Packet>::size;
|
||||||
|
const int size = 2*PacketSize;
|
||||||
|
EIGEN_ALIGN_MAX Scalar data1[size];
|
||||||
|
EIGEN_ALIGN_MAX Scalar data2[size];
|
||||||
|
EIGEN_ALIGN_MAX Scalar ref[size];
|
||||||
|
|
||||||
|
for (int i=0; i<size; ++i)
|
||||||
|
{
|
||||||
|
data1[i] = internal::random<Scalar>();
|
||||||
|
}
|
||||||
|
CHECK_CWISE2_IF(true, internal::por, internal::por);
|
||||||
|
CHECK_CWISE2_IF(true, internal::pxor, internal::pxor);
|
||||||
|
CHECK_CWISE2_IF(true, internal::pand, internal::pand);
|
||||||
|
}
|
||||||
|
|
||||||
template<typename Scalar,typename Packet> void packetmath()
|
template<typename Scalar,typename Packet> void packetmath()
|
||||||
{
|
{
|
||||||
typedef internal::packet_traits<Scalar> PacketTraits;
|
typedef internal::packet_traits<Scalar> PacketTraits;
|
||||||
@ -337,21 +354,6 @@ template<typename Scalar,typename Packet> void packetmath()
|
|||||||
VERIFY(test::areApprox(ref, data2, PacketSize) && "internal::pinsertlast");
|
VERIFY(test::areApprox(ref, data2, PacketSize) && "internal::pinsertlast");
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
|
||||||
for (int i=0; i<PacketSize; ++i)
|
|
||||||
{
|
|
||||||
data1[i] = internal::random<Scalar>();
|
|
||||||
unsigned char v = internal::random<bool>() ? 0xff : 0;
|
|
||||||
char* bytes = (char*)(data1+PacketSize+i);
|
|
||||||
for(int k=0; k<int(sizeof(Scalar)); ++k) {
|
|
||||||
bytes[k] = v;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
CHECK_CWISE2_IF(true, internal::por, internal::por);
|
|
||||||
CHECK_CWISE2_IF(true, internal::pxor, internal::pxor);
|
|
||||||
CHECK_CWISE2_IF(true, internal::pand, internal::pand);
|
|
||||||
CHECK_CWISE2_IF(true, internal::pandnot, internal::pandnot);
|
|
||||||
}
|
|
||||||
{
|
{
|
||||||
for (int i = 0; i < PacketSize; ++i) {
|
for (int i = 0; i < PacketSize; ++i) {
|
||||||
// "if" mask
|
// "if" mask
|
||||||
@ -377,8 +379,17 @@ template<typename Scalar,typename Packet> void packetmath()
|
|||||||
}
|
}
|
||||||
|
|
||||||
CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt);
|
CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt);
|
||||||
|
|
||||||
|
for (int i=0; i<size; ++i)
|
||||||
|
{
|
||||||
|
data1[i] = internal::random<Scalar>();
|
||||||
|
}
|
||||||
|
CHECK_CWISE2_IF(true, internal::pandnot, internal::pandnot);
|
||||||
|
|
||||||
|
packetmath_boolean<Scalar, Packet>();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<typename Scalar,typename Packet> void packetmath_real()
|
template<typename Scalar,typename Packet> void packetmath_real()
|
||||||
{
|
{
|
||||||
typedef internal::packet_traits<Scalar> PacketTraits;
|
typedef internal::packet_traits<Scalar> PacketTraits;
|
||||||
@ -807,6 +818,9 @@ EIGEN_DECLARE_TEST(packetmath)
|
|||||||
CALL_SUBTEST_11( test::runner<std::complex<float> >::run() );
|
CALL_SUBTEST_11( test::runner<std::complex<float> >::run() );
|
||||||
CALL_SUBTEST_12( test::runner<std::complex<double> >::run() );
|
CALL_SUBTEST_12( test::runner<std::complex<double> >::run() );
|
||||||
CALL_SUBTEST_13(( packetmath<half,internal::packet_traits<half>::type>() ));
|
CALL_SUBTEST_13(( packetmath<half,internal::packet_traits<half>::type>() ));
|
||||||
|
#ifdef EIGEN_PACKET_MATH_SSE_H
|
||||||
|
CALL_SUBTEST_14(( packetmath_boolean<bool,internal::packet_traits<bool>::type>() ));
|
||||||
|
#endif
|
||||||
g_first_pass = false;
|
g_first_pass = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -7,6 +7,8 @@
|
|||||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
#include <numeric>
|
||||||
|
|
||||||
#include "main.h"
|
#include "main.h"
|
||||||
|
|
||||||
#include <Eigen/CXX11/Tensor>
|
#include <Eigen/CXX11/Tensor>
|
||||||
@ -193,8 +195,8 @@ static void test_constants()
|
|||||||
|
|
||||||
static void test_boolean()
|
static void test_boolean()
|
||||||
{
|
{
|
||||||
Tensor<int, 1> vec(6);
|
Tensor<int, 1> vec(31);
|
||||||
std::copy_n(std::begin({0, 1, 2, 3, 4, 5}), 6, vec.data());
|
std::iota(vec.data(), vec.data() + 31, 0);
|
||||||
|
|
||||||
// Test ||.
|
// Test ||.
|
||||||
Tensor<bool, 1> bool1 = vec < vec.constant(1) || vec > vec.constant(4);
|
Tensor<bool, 1> bool1 = vec < vec.constant(1) || vec > vec.constant(4);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user