mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-14 20:56:00 +08:00
optimized conjugate products for SSE3
This commit is contained in:
parent
65257f6b29
commit
a2415388ef
@ -147,10 +147,14 @@ template<> struct ei_conj_helper<Packet2cf, Packet2cf, false,true>
|
|||||||
|
|
||||||
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
|
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
|
||||||
{
|
{
|
||||||
|
#ifdef EIGEN_VECTORIZE_SSE3
|
||||||
|
return ei_pmul(a, ei_pconj(b));
|
||||||
|
#else
|
||||||
const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
|
const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
|
||||||
return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
|
return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
|
||||||
_mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3),
|
_mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3),
|
||||||
ei_vec4f_swizzle1(b.v, 1, 0, 3, 2))));
|
ei_vec4f_swizzle1(b.v, 1, 0, 3, 2))));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -161,10 +165,14 @@ template<> struct ei_conj_helper<Packet2cf, Packet2cf, true,false>
|
|||||||
|
|
||||||
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
|
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
|
||||||
{
|
{
|
||||||
|
#ifdef EIGEN_VECTORIZE_SSE3
|
||||||
|
return ei_pmul(ei_pconj(a), b);
|
||||||
|
#else
|
||||||
const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
|
const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
|
||||||
return Packet2cf(_mm_add_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
|
return Packet2cf(_mm_add_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
|
||||||
_mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3),
|
_mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3),
|
||||||
ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
|
ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -175,10 +183,14 @@ template<> struct ei_conj_helper<Packet2cf, Packet2cf, true,true>
|
|||||||
|
|
||||||
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
|
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
|
||||||
{
|
{
|
||||||
|
#ifdef EIGEN_VECTORIZE_SSE3
|
||||||
|
return ei_pconj(ei_pmul(a, b));
|
||||||
|
#else
|
||||||
const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
|
const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
|
||||||
return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
|
return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
|
||||||
_mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3),
|
_mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3),
|
||||||
ei_vec4f_swizzle1(b.v, 1, 0, 3, 2))));
|
ei_vec4f_swizzle1(b.v, 1, 0, 3, 2))));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -300,10 +312,14 @@ template<> struct ei_conj_helper<Packet1cd, Packet1cd, false,true>
|
|||||||
|
|
||||||
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
|
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
|
||||||
{
|
{
|
||||||
|
#ifdef EIGEN_VECTORIZE_SSE3
|
||||||
|
return ei_pmul(a, ei_pconj(b));
|
||||||
|
#else
|
||||||
const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
|
const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
|
||||||
return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), mask),
|
return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), mask),
|
||||||
_mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
|
_mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
|
||||||
ei_vec2d_swizzle1(b.v, 1, 0))));
|
ei_vec2d_swizzle1(b.v, 1, 0))));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -314,10 +330,14 @@ template<> struct ei_conj_helper<Packet1cd, Packet1cd, true,false>
|
|||||||
|
|
||||||
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
|
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
|
||||||
{
|
{
|
||||||
|
#ifdef EIGEN_VECTORIZE_SSE3
|
||||||
|
return ei_pmul(ei_pconj(a), b);
|
||||||
|
#else
|
||||||
const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
|
const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
|
||||||
return Packet1cd(_mm_add_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v),
|
return Packet1cd(_mm_add_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v),
|
||||||
_mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
|
_mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
|
||||||
ei_vec2d_swizzle1(b.v, 1, 0)), mask)));
|
ei_vec2d_swizzle1(b.v, 1, 0)), mask)));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -328,10 +348,14 @@ template<> struct ei_conj_helper<Packet1cd, Packet1cd, true,true>
|
|||||||
|
|
||||||
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
|
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
|
||||||
{
|
{
|
||||||
|
#ifdef EIGEN_VECTORIZE_SSE3
|
||||||
|
return ei_pconj(ei_pmul(a, b));
|
||||||
|
#else
|
||||||
const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
|
const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
|
||||||
return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), mask),
|
return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(ei_vec2d_swizzle1(a.v, 0, 0), b.v), mask),
|
||||||
_mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
|
_mm_mul_pd(ei_vec2d_swizzle1(a.v, 1, 1),
|
||||||
ei_vec2d_swizzle1(b.v, 1, 0))));
|
ei_vec2d_swizzle1(b.v, 1, 0))));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -73,9 +73,6 @@ static void run(Index rows, Index cols, Index depth,
|
|||||||
ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
ei_const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||||
ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
ei_const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||||
|
|
||||||
if (ConjugateRhs)
|
|
||||||
alpha = ei_conj(alpha);
|
|
||||||
|
|
||||||
typedef typename ei_packet_traits<Scalar>::type PacketType;
|
typedef typename ei_packet_traits<Scalar>::type PacketType;
|
||||||
typedef ei_product_blocking_traits<Scalar> Blocking;
|
typedef ei_product_blocking_traits<Scalar> Blocking;
|
||||||
|
|
||||||
@ -83,9 +80,18 @@ static void run(Index rows, Index cols, Index depth,
|
|||||||
Index mc = std::min(rows,blocking.mc()); // cache block size along the M direction
|
Index mc = std::min(rows,blocking.mc()); // cache block size along the M direction
|
||||||
//Index nc = blocking.nc(); // cache block size along the N direction
|
//Index nc = blocking.nc(); // cache block size along the N direction
|
||||||
|
|
||||||
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs;
|
// FIXME starting from SSE3, normal complex product cannot be optimized as well as
|
||||||
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs;
|
// conjugate product, therefore it is better to conjugate during the copies.
|
||||||
ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp;
|
// With SSE2, this is the other way round.
|
||||||
|
ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder, ConjugateLhs> pack_lhs;
|
||||||
|
ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder, ConjugateRhs> pack_rhs;
|
||||||
|
ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr> gebp;
|
||||||
|
|
||||||
|
// if (ConjugateRhs)
|
||||||
|
// alpha = ei_conj(alpha);
|
||||||
|
// ei_gemm_pack_lhs<Scalar, Index, Blocking::mr, LhsStorageOrder> pack_lhs;
|
||||||
|
// ei_gemm_pack_rhs<Scalar, Index, Blocking::nr, RhsStorageOrder> pack_rhs;
|
||||||
|
// ei_gebp_kernel<Scalar, Index, Blocking::mr, Blocking::nr, ConjugateLhs, ConjugateRhs> gebp;
|
||||||
|
|
||||||
#ifdef EIGEN_HAS_OPENMP
|
#ifdef EIGEN_HAS_OPENMP
|
||||||
if(info)
|
if(info)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user