diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 3f7a04b7d..fcc5219f5 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -79,9 +79,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf ei_pmul(const Packet2cf& a, { // TODO optimize it for SSE3 and 4 const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000)); - return Packet2cf(_mm_add_ps(_mm_mul_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a.v), 0xa0)), b.v), - _mm_xor_ps(_mm_mul_ps(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a.v), 0xf5)), - _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b.v), 0xb1 ))), mask))); + return Packet2cf(_mm_add_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), + _mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3), + ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask))); } // template<> EIGEN_STRONG_INLINE Packet2cf ei_pmadd(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) @@ -157,4 +157,46 @@ struct ei_palign_impl } }; +template<> struct ei_conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return ei_padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), + _mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3), + ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)))); + } +}; + +template<> struct ei_conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return ei_padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet2cf(_mm_add_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), + _mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3), + ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask))); + } +}; + +template<> struct ei_conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return ei_padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(ei_vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), + _mm_mul_ps(ei_vec4f_swizzle1(a.v, 1, 1, 3, 3), + ei_vec4f_swizzle1(b.v, 1, 0, 3, 2)))); + } +}; + #endif // EIGEN_COMPLEX_SSE_H diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 8a3cdf679..141754d98 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -58,6 +58,7 @@ template<> struct ei_is_arithmetic<__m128d> { enum { ret = true }; }; #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i ei_p4i_##NAME = ei_pset1(X) + template<> struct ei_packet_traits : ei_default_packet_traits { typedef Packet4f type; enum {size=4}; diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 4a2ebb713..cf133f68f 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -134,13 +134,13 @@ inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, st } #ifdef EIGEN_HAS_FUSE_CJMADD - #define CJMADD(A,B,C,T) C = cj.pmadd(A,B,C); + #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); #else - #define CJMADD(A,B,C,T) T = B; T = cj.pmul(A,T); C = ei_padd(C,T); + #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = ei_padd(C,T); #endif // optimized GEneral packed Block * packed Panel product kernel -template +template struct ei_gebp_kernel { void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, @@ -150,7 +150,8 @@ struct ei_gebp_kernel enum { PacketSize = ei_packet_traits::size }; if(strideA==-1) strideA = depth; if(strideB==-1) strideB = depth; - Conj cj; + ei_conj_helper cj; + ei_conj_helper pcj; Index packet_cols = (cols/nr) * nr; const Index peeled_mc = (rows/mr)*mr; const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= PacketSize ? PacketSize : 0); @@ -263,38 +264,38 @@ EIGEN_ASM_COMMENT("mybegin"); A0 = ei_pload(&blA[0*PacketSize]); A1 = ei_pload(&blA[1*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A1,B0,C4,B0); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A1,B0,C4,B0); B0 = ei_pload(&blB[1*PacketSize]); - CJMADD(A0,B0,C1,T0); - CJMADD(A1,B0,C5,B0); + MADD(pcj,A0,B0,C1,T0); + MADD(pcj,A1,B0,C5,B0); A0 = ei_pload(&blA[2*PacketSize]); A1 = ei_pload(&blA[3*PacketSize]); B0 = ei_pload(&blB[2*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A1,B0,C4,B0); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A1,B0,C4,B0); B0 = ei_pload(&blB[3*PacketSize]); - CJMADD(A0,B0,C1,T0); - CJMADD(A1,B0,C5,B0); + MADD(pcj,A0,B0,C1,T0); + MADD(pcj,A1,B0,C5,B0); A0 = ei_pload(&blA[4*PacketSize]); A1 = ei_pload(&blA[5*PacketSize]); B0 = ei_pload(&blB[4*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A1,B0,C4,B0); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A1,B0,C4,B0); B0 = ei_pload(&blB[5*PacketSize]); - CJMADD(A0,B0,C1,T0); - CJMADD(A1,B0,C5,B0); + MADD(pcj,A0,B0,C1,T0); + MADD(pcj,A1,B0,C5,B0); A0 = ei_pload(&blA[6*PacketSize]); A1 = ei_pload(&blA[7*PacketSize]); B0 = ei_pload(&blB[6*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A1,B0,C4,B0); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A1,B0,C4,B0); B0 = ei_pload(&blB[7*PacketSize]); - CJMADD(A0,B0,C1,T0); - CJMADD(A1,B0,C5,B0); + MADD(pcj,A0,B0,C1,T0); + MADD(pcj,A1,B0,C5,B0); EIGEN_ASM_COMMENT("myend"); } else @@ -309,59 +310,59 @@ EIGEN_ASM_COMMENT("mybegin"); B0 = ei_pload(&blB[0*PacketSize]); B1 = ei_pload(&blB[1*PacketSize]); - CJMADD(A0,B0,C0,T0); + MADD(pcj,A0,B0,C0,T0); B2 = ei_pload(&blB[2*PacketSize]); - CJMADD(A1,B0,C4,B0); + MADD(pcj,A1,B0,C4,B0); B3 = ei_pload(&blB[3*PacketSize]); B0 = ei_pload(&blB[4*PacketSize]); - CJMADD(A0,B1,C1,T0); - CJMADD(A1,B1,C5,B1); + MADD(pcj,A0,B1,C1,T0); + MADD(pcj,A1,B1,C5,B1); B1 = ei_pload(&blB[5*PacketSize]); - CJMADD(A0,B2,C2,T0); - CJMADD(A1,B2,C6,B2); + MADD(pcj,A0,B2,C2,T0); + MADD(pcj,A1,B2,C6,B2); B2 = ei_pload(&blB[6*PacketSize]); - CJMADD(A0,B3,C3,T0); + MADD(pcj,A0,B3,C3,T0); A0 = ei_pload(&blA[2*PacketSize]); - CJMADD(A1,B3,C7,B3); + MADD(pcj,A1,B3,C7,B3); A1 = ei_pload(&blA[3*PacketSize]); B3 = ei_pload(&blB[7*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A1,B0,C4,B0); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A1,B0,C4,B0); B0 = ei_pload(&blB[8*PacketSize]); - CJMADD(A0,B1,C1,T0); - CJMADD(A1,B1,C5,B1); + MADD(pcj,A0,B1,C1,T0); + MADD(pcj,A1,B1,C5,B1); B1 = ei_pload(&blB[9*PacketSize]); - CJMADD(A0,B2,C2,T0); - CJMADD(A1,B2,C6,B2); + MADD(pcj,A0,B2,C2,T0); + MADD(pcj,A1,B2,C6,B2); B2 = ei_pload(&blB[10*PacketSize]); - CJMADD(A0,B3,C3,T0); + MADD(pcj,A0,B3,C3,T0); A0 = ei_pload(&blA[4*PacketSize]); - CJMADD(A1,B3,C7,B3); + MADD(pcj,A1,B3,C7,B3); A1 = ei_pload(&blA[5*PacketSize]); B3 = ei_pload(&blB[11*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A1,B0,C4,B0); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A1,B0,C4,B0); B0 = ei_pload(&blB[12*PacketSize]); - CJMADD(A0,B1,C1,T0); - CJMADD(A1,B1,C5,B1); + MADD(pcj,A0,B1,C1,T0); + MADD(pcj,A1,B1,C5,B1); B1 = ei_pload(&blB[13*PacketSize]); - CJMADD(A0,B2,C2,T0); - CJMADD(A1,B2,C6,B2); + MADD(pcj,A0,B2,C2,T0); + MADD(pcj,A1,B2,C6,B2); B2 = ei_pload(&blB[14*PacketSize]); - CJMADD(A0,B3,C3,T0); + MADD(pcj,A0,B3,C3,T0); A0 = ei_pload(&blA[6*PacketSize]); - CJMADD(A1,B3,C7,B3); + MADD(pcj,A1,B3,C7,B3); A1 = ei_pload(&blA[7*PacketSize]); B3 = ei_pload(&blB[15*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A1,B0,C4,B0); - CJMADD(A0,B1,C1,T0); - CJMADD(A1,B1,C5,B1); - CJMADD(A0,B2,C2,T0); - CJMADD(A1,B2,C6,B2); - CJMADD(A0,B3,C3,T0); - CJMADD(A1,B3,C7,B3); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A1,B0,C4,B0); + MADD(pcj,A0,B1,C1,T0); + MADD(pcj,A1,B1,C5,B1); + MADD(pcj,A0,B2,C2,T0); + MADD(pcj,A1,B2,C6,B2); + MADD(pcj,A0,B3,C3,T0); + MADD(pcj,A1,B3,C7,B3); EIGEN_ASM_COMMENT("myend"); } @@ -381,11 +382,11 @@ EIGEN_ASM_COMMENT("myend"); A0 = ei_pload(&blA[0*PacketSize]); A1 = ei_pload(&blA[1*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A1,B0,C4,B0); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A1,B0,C4,B0); B0 = ei_pload(&blB[1*PacketSize]); - CJMADD(A0,B0,C1,T0); - CJMADD(A1,B0,C5,B0); + MADD(pcj,A0,B0,C1,T0); + MADD(pcj,A1,B0,C5,B0); } else { @@ -399,16 +400,16 @@ EIGEN_ASM_COMMENT("myend"); B0 = ei_pload(&blB[0*PacketSize]); B1 = ei_pload(&blB[1*PacketSize]); - CJMADD(A0,B0,C0,T0); + MADD(pcj,A0,B0,C0,T0); B2 = ei_pload(&blB[2*PacketSize]); - CJMADD(A1,B0,C4,B0); + MADD(pcj,A1,B0,C4,B0); B3 = ei_pload(&blB[3*PacketSize]); - CJMADD(A0,B1,C1,T0); - CJMADD(A1,B1,C5,B1); - CJMADD(A0,B2,C2,T0); - CJMADD(A1,B2,C6,B2); - CJMADD(A0,B3,C3,T0); - CJMADD(A1,B3,C7,B3); + MADD(pcj,A0,B1,C1,T0); + MADD(pcj,A1,B1,C5,B1); + MADD(pcj,A0,B2,C2,T0); + MADD(pcj,A1,B2,C6,B2); + MADD(pcj,A0,B3,C3,T0); + MADD(pcj,A1,B3,C7,B3); } blB += nr*PacketSize; @@ -468,23 +469,23 @@ EIGEN_ASM_COMMENT("myend"); A0 = ei_pload(&blA[0*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]); B1 = ei_pload(&blB[1*PacketSize]); - CJMADD(A0,B0,C0,B0); + MADD(pcj,A0,B0,C0,B0); B0 = ei_pload(&blB[2*PacketSize]); - CJMADD(A0,B1,C1,B1); + MADD(pcj,A0,B1,C1,B1); A0 = ei_pload(&blA[1*PacketSize]); B1 = ei_pload(&blB[3*PacketSize]); - CJMADD(A0,B0,C0,B0); + MADD(pcj,A0,B0,C0,B0); B0 = ei_pload(&blB[4*PacketSize]); - CJMADD(A0,B1,C1,B1); + MADD(pcj,A0,B1,C1,B1); A0 = ei_pload(&blA[2*PacketSize]); B1 = ei_pload(&blB[5*PacketSize]); - CJMADD(A0,B0,C0,B0); + MADD(pcj,A0,B0,C0,B0); B0 = ei_pload(&blB[6*PacketSize]); - CJMADD(A0,B1,C1,B1); + MADD(pcj,A0,B1,C1,B1); A0 = ei_pload(&blA[3*PacketSize]); B1 = ei_pload(&blB[7*PacketSize]); - CJMADD(A0,B0,C0,B0); - CJMADD(A0,B1,C1,B1); + MADD(pcj,A0,B0,C0,B0); + MADD(pcj,A0,B1,C1,B1); } else { @@ -494,41 +495,41 @@ EIGEN_ASM_COMMENT("myend"); B0 = ei_pload(&blB[0*PacketSize]); B1 = ei_pload(&blB[1*PacketSize]); - CJMADD(A0,B0,C0,B0); + MADD(pcj,A0,B0,C0,B0); B2 = ei_pload(&blB[2*PacketSize]); B3 = ei_pload(&blB[3*PacketSize]); B0 = ei_pload(&blB[4*PacketSize]); - CJMADD(A0,B1,C1,B1); + MADD(pcj,A0,B1,C1,B1); B1 = ei_pload(&blB[5*PacketSize]); - CJMADD(A0,B2,C2,B2); + MADD(pcj,A0,B2,C2,B2); B2 = ei_pload(&blB[6*PacketSize]); - CJMADD(A0,B3,C3,B3); + MADD(pcj,A0,B3,C3,B3); A0 = ei_pload(&blA[1*PacketSize]); B3 = ei_pload(&blB[7*PacketSize]); - CJMADD(A0,B0,C0,B0); + MADD(pcj,A0,B0,C0,B0); B0 = ei_pload(&blB[8*PacketSize]); - CJMADD(A0,B1,C1,B1); + MADD(pcj,A0,B1,C1,B1); B1 = ei_pload(&blB[9*PacketSize]); - CJMADD(A0,B2,C2,B2); + MADD(pcj,A0,B2,C2,B2); B2 = ei_pload(&blB[10*PacketSize]); - CJMADD(A0,B3,C3,B3); + MADD(pcj,A0,B3,C3,B3); A0 = ei_pload(&blA[2*PacketSize]); B3 = ei_pload(&blB[11*PacketSize]); - CJMADD(A0,B0,C0,B0); + MADD(pcj,A0,B0,C0,B0); B0 = ei_pload(&blB[12*PacketSize]); - CJMADD(A0,B1,C1,B1); + MADD(pcj,A0,B1,C1,B1); B1 = ei_pload(&blB[13*PacketSize]); - CJMADD(A0,B2,C2,B2); + MADD(pcj,A0,B2,C2,B2); B2 = ei_pload(&blB[14*PacketSize]); - CJMADD(A0,B3,C3,B3); + MADD(pcj,A0,B3,C3,B3); A0 = ei_pload(&blA[3*PacketSize]); B3 = ei_pload(&blB[15*PacketSize]); - CJMADD(A0,B0,C0,B0); - CJMADD(A0,B1,C1,B1); - CJMADD(A0,B2,C2,B2); - CJMADD(A0,B3,C3,B3); + MADD(pcj,A0,B0,C0,B0); + MADD(pcj,A0,B1,C1,B1); + MADD(pcj,A0,B2,C2,B2); + MADD(pcj,A0,B3,C3,B3); } blB += 4*nr*PacketSize; @@ -546,9 +547,9 @@ EIGEN_ASM_COMMENT("myend"); A0 = ei_pload(&blA[0*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]); - CJMADD(A0,B0,C0,T0); + MADD(pcj,A0,B0,C0,T0); B0 = ei_pload(&blB[1*PacketSize]); - CJMADD(A0,B0,C1,T0); + MADD(pcj,A0,B0,C1,T0); } else { @@ -563,10 +564,10 @@ EIGEN_ASM_COMMENT("myend"); B2 = ei_pload(&blB[2*PacketSize]); B3 = ei_pload(&blB[3*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A0,B1,C1,T1); - CJMADD(A0,B2,C2,T0); - CJMADD(A0,B3,C3,T1); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A0,B1,C1,T1); + MADD(pcj,A0,B2,C2,T0); + MADD(pcj,A0,B3,C3,T1); } blB += nr*PacketSize; @@ -598,9 +599,9 @@ EIGEN_ASM_COMMENT("myend"); A0 = blA[k]; B0 = blB[0*PacketSize]; - CJMADD(A0,B0,C0,T0); + MADD(cj,A0,B0,C0,T0); B0 = blB[1*PacketSize]; - CJMADD(A0,B0,C1,T0); + MADD(cj,A0,B0,C1,T0); } else { @@ -615,10 +616,10 @@ EIGEN_ASM_COMMENT("myend"); B2 = blB[2*PacketSize]; B3 = blB[3*PacketSize]; - CJMADD(A0,B0,C0,T0); - CJMADD(A0,B1,C1,T1); - CJMADD(A0,B2,C2,T0); - CJMADD(A0,B3,C3,T1); + MADD(cj,A0,B0,C0,T0); + MADD(cj,A0,B1,C1,T1); + MADD(cj,A0,B2,C2,T0); + MADD(cj,A0,B3,C3,T1); } blB += nr*PacketSize; @@ -664,8 +665,8 @@ EIGEN_ASM_COMMENT("myend"); A0 = ei_pload(&blA[0*PacketSize]); A1 = ei_pload(&blA[1*PacketSize]); B0 = ei_pload(&blB[0*PacketSize]); - CJMADD(A0,B0,C0,T0); - CJMADD(A1,B0,C4,T1); + MADD(pcj,A0,B0,C0,T0); + MADD(pcj,A1,B0,C4,T1); blB += PacketSize; blA += mr; @@ -686,8 +687,7 @@ EIGEN_ASM_COMMENT("myend"); for(Index k=0; k cj; - if(ConjugateRhs) - alpha = ei_conj(alpha); + ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A0)(&lhs0[j]), ptmp0), \ + pcj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs1[j]), ptmp1)), \ + ei_padd(pcj.pmul(EIGEN_CAT(ei_ploa , A2)(&lhs2[j]), ptmp2), \ + pcj.pmul(EIGEN_CAT(ei_ploa , A13)(&lhs3[j]), ptmp3)) ))) typedef typename NumTraits::Real RealScalar; typedef typename ei_packet_traits::type Packet; const Index PacketSize = sizeof(Packet)/sizeof(Scalar); + ei_conj_helper cj; + ei_conj_helper pcj; + if(ConjugateRhs) + alpha = ei_conj(alpha); + enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned }; const Index columnsAtOnce = 4; const Index peels = 2; @@ -168,19 +169,19 @@ void ei_cache_friendly_product_colmajor_times_vector( A00 = ei_pload (&lhs0[j]); A10 = ei_pload (&lhs0[j+PacketSize]); - A00 = cj.pmadd(A00, ptmp0, ei_pload(&res[j])); - A10 = cj.pmadd(A10, ptmp0, ei_pload(&res[j+PacketSize])); + A00 = pcj.pmadd(A00, ptmp0, ei_pload(&res[j])); + A10 = pcj.pmadd(A10, ptmp0, ei_pload(&res[j+PacketSize])); - A00 = cj.pmadd(A01, ptmp1, A00); + A00 = pcj.pmadd(A01, ptmp1, A00); A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01); - A00 = cj.pmadd(A02, ptmp2, A00); + A00 = pcj.pmadd(A02, ptmp2, A00); A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02); - A00 = cj.pmadd(A03, ptmp3, A00); + A00 = pcj.pmadd(A03, ptmp3, A00); ei_pstore(&res[j],A00); A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03); - A10 = cj.pmadd(A11, ptmp1, A10); - A10 = cj.pmadd(A12, ptmp2, A10); - A10 = cj.pmadd(A13, ptmp3, A10); + A10 = pcj.pmadd(A11, ptmp1, A10); + A10 = pcj.pmadd(A12, ptmp2, A10); + A10 = pcj.pmadd(A13, ptmp3, A10); ei_pstore(&res[j+PacketSize],A10); } } @@ -225,10 +226,10 @@ void ei_cache_friendly_product_colmajor_times_vector( // process aligned result's coeffs if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0) for (Index j = alignedStart;j cj; + ptmp0 = pcj.pmadd(EIGEN_CAT(ei_ploa,A0) (&lhs0[j]), b, ptmp0); \ + ptmp1 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs1[j]), b, ptmp1); \ + ptmp2 = pcj.pmadd(EIGEN_CAT(ei_ploa,A2) (&lhs2[j]), b, ptmp2); \ + ptmp3 = pcj.pmadd(EIGEN_CAT(ei_ploa,A13)(&lhs3[j]), b, ptmp3); } typedef typename NumTraits::Real RealScalar; typedef typename ei_packet_traits::type Packet; const Index PacketSize = sizeof(Packet)/sizeof(Scalar); + ei_conj_helper cj; + ei_conj_helper pcj; + enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 }; const Index rowsAtOnce = 4; const Index peels = 2; @@ -386,19 +388,19 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector( A12 = ei_pload(&lhs2[j-2+PacketSize]); ei_palign<2>(A02,A12); A13 = ei_pload(&lhs3[j-3+PacketSize]); ei_palign<3>(A03,A13); - ptmp0 = cj.pmadd(ei_pload (&lhs0[j]), b, ptmp0); - ptmp1 = cj.pmadd(A01, b, ptmp1); + ptmp0 = pcj.pmadd(ei_pload (&lhs0[j]), b, ptmp0); + ptmp1 = pcj.pmadd(A01, b, ptmp1); A01 = ei_pload(&lhs1[j-1+2*PacketSize]); ei_palign<1>(A11,A01); - ptmp2 = cj.pmadd(A02, b, ptmp2); + ptmp2 = pcj.pmadd(A02, b, ptmp2); A02 = ei_pload(&lhs2[j-2+2*PacketSize]); ei_palign<2>(A12,A02); - ptmp3 = cj.pmadd(A03, b, ptmp3); + ptmp3 = pcj.pmadd(A03, b, ptmp3); A03 = ei_pload(&lhs3[j-3+2*PacketSize]); ei_palign<3>(A13,A03); b = ei_pload(&rhs[j+PacketSize]); - ptmp0 = cj.pmadd(ei_pload (&lhs0[j+PacketSize]), b, ptmp0); - ptmp1 = cj.pmadd(A11, b, ptmp1); - ptmp2 = cj.pmadd(A12, b, ptmp2); - ptmp3 = cj.pmadd(A13, b, ptmp3); + ptmp0 = pcj.pmadd(ei_pload (&lhs0[j+PacketSize]), b, ptmp0); + ptmp1 = pcj.pmadd(A11, b, ptmp1); + ptmp2 = pcj.pmadd(A12, b, ptmp2); + ptmp3 = pcj.pmadd(A13, b, ptmp3); } } for (Index j = peeledSize; j > gebp_kernel; + ei_gebp_kernel gebp_kernel; ei_symm_pack_lhs pack_lhs; ei_gemm_pack_rhs pack_rhs; ei_gemm_pack_lhs pack_lhs_transposed; @@ -353,7 +353,7 @@ struct ei_product_selfadjoint_matrix > gebp_kernel; + ei_gebp_kernel gebp_kernel; ei_gemm_pack_lhs pack_lhs; ei_symm_pack_rhs pack_rhs; diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index d6933adb6..4514c7692 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -46,8 +46,11 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector( FirstTriangular = IsRowMajor == IsLower }; - ei_conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0; - ei_conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1; + ei_conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0; + ei_conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1; + + ei_conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> pcj0; + ei_conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1; Scalar cjAlpha = ConjugateRhs ? ei_conj(alpha) : alpha; @@ -121,9 +124,9 @@ static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector( Packet Bi = ei_ploadu(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases Packet Xi = ei_pload (resIt); - Xi = cj0.pmadd(A0i,ptmp0, cj0.pmadd(A1i,ptmp1,Xi)); - ptmp2 = cj1.pmadd(A0i, Bi, ptmp2); - ptmp3 = cj1.pmadd(A1i, Bi, ptmp3); + Xi = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi)); + ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2); + ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3); ei_pstore(resIt,Xi); resIt += PacketSize; } for (size_t i=alignedEnd; i +template struct ei_sybb_kernel; /* Optimized selfadjoint product (_SYRK) */ @@ -84,12 +84,15 @@ struct ei_selfadjoint_product::IsComplex && !AAT, NumTraits::IsComplex && AAT> Conj; + enum { + ConjLhs = NumTraits::IsComplex && !AAT, + ConjRhs = NumTraits::IsComplex && AAT + }; - ei_gebp_kernel gebp_kernel; + ei_gebp_kernel gebp_kernel; ei_gemm_pack_rhs pack_rhs; ei_gemm_pack_lhs pack_lhs; - ei_sybb_kernel sybb; + ei_sybb_kernel sybb; for(Index k2=0; k2& SelfAdjointView // while the selfadjoint block overlapping the diagonal is evaluated into a // small temporary buffer which is then accumulated into the result using a // triangular traversal. -template +template struct ei_sybb_kernel { enum { @@ -172,7 +175,7 @@ struct ei_sybb_kernel }; void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB, Index size, Index depth, Scalar* workspace) { - ei_gebp_kernel gebp_kernel; + ei_gebp_kernel gebp_kernel; Matrix buffer; // let's process the block per panel of actual_mc x BlockSize, diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index fd0a7c2b2..be9362958 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -129,7 +129,7 @@ struct ei_product_triangular_matrix_matrix > gebp_kernel; + ei_gebp_kernel gebp_kernel; ei_gemm_pack_lhs pack_lhs; ei_gemm_pack_rhs pack_rhs; @@ -254,10 +254,10 @@ struct ei_product_triangular_matrix_matrix > gebp_kernel; + ei_gebp_kernel gebp_kernel; ei_gemm_pack_lhs pack_lhs; ei_gemm_pack_rhs pack_rhs; - ei_gemm_pack_rhs pack_rhs_panel; + ei_gemm_pack_rhs pack_rhs_panel; for(Index k2=IsLower ? 0 : depth; IsLower ? k20; diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index 7038627fb..0fce7159e 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -74,9 +74,9 @@ struct ei_triangular_solve_matrix conj; - ei_gebp_kernel > gebp_kernel; + ei_gebp_kernel gebp_kernel; ei_gemm_pack_lhs pack_lhs; - ei_gemm_pack_rhs pack_rhs; + ei_gemm_pack_rhs pack_rhs; for(Index k2=IsLower ? 0 : size; IsLower ? k20; @@ -212,9 +212,9 @@ struct ei_triangular_solve_matrix conj; - ei_gebp_kernel > gebp_kernel; + ei_gebp_kernel gebp_kernel; ei_gemm_pack_rhs pack_rhs; - ei_gemm_pack_rhs pack_rhs_panel; + ei_gemm_pack_rhs pack_rhs_panel; ei_gemm_pack_lhs pack_lhs_panel; for(Index k2=IsLower ? size : 0; diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index cd0ed0ede..38c86511c 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -29,12 +29,7 @@ // implement and control fast level 2 and level 3 BLAS-like routines. // forward declarations - -// Provides scalar/packet-wise product and product with accumulation -// with optional conjugation of the arguments. -template struct ei_conj_helper; - -template > +template struct ei_gebp_kernel; template @@ -58,42 +53,40 @@ template struct ei_conj_helper +template struct ei_conj_helper { - template - EIGEN_STRONG_INLINE T pmadd(const T& x, const T& y, const T& c) const { return ei_pmadd(x,y,c); } - template - EIGEN_STRONG_INLINE T pmul(const T& x, const T& y) const { return ei_pmul(x,y); } + EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return ei_pmadd(x,y,c); } + EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const { return ei_pmul(x,y); } }; -template<> struct ei_conj_helper +template struct ei_conj_helper, std::complex, false,true> { - template std::complex - pmadd(const std::complex& x, const std::complex& y, const std::complex& c) const + typedef std::complex Scalar; + EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return c + pmul(x,y); } - template std::complex pmul(const std::complex& x, const std::complex& y) const - { return std::complex(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_imag(x)*ei_real(y) - ei_real(x)*ei_imag(y)); } + EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const + { return Scalar(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_imag(x)*ei_real(y) - ei_real(x)*ei_imag(y)); } }; -template<> struct ei_conj_helper +template struct ei_conj_helper, std::complex, true,false> { - template std::complex - pmadd(const std::complex& x, const std::complex& y, const std::complex& c) const + typedef std::complex Scalar; + EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return c + pmul(x,y); } - template std::complex pmul(const std::complex& x, const std::complex& y) const - { return std::complex(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } + EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const + { return Scalar(ei_real(x)*ei_real(y) + ei_imag(x)*ei_imag(y), ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } }; -template<> struct ei_conj_helper +template struct ei_conj_helper, std::complex, true,true> { - template std::complex - pmadd(const std::complex& x, const std::complex& y, const std::complex& c) const + typedef std::complex Scalar; + EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return c + pmul(x,y); } - template std::complex pmul(const std::complex& x, const std::complex& y) const - { return std::complex(ei_real(x)*ei_real(y) - ei_imag(x)*ei_imag(y), - ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } + EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const + { return Scalar(ei_real(x)*ei_real(y) - ei_imag(x)*ei_imag(y), - ei_real(x)*ei_imag(y) - ei_imag(x)*ei_real(y)); } }; // Lightweight helper class to access matrix coefficients. diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index ef923d867..a757bef76 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -106,6 +106,10 @@ template::value> struct ProductReturnType; +// Provides scalar/packet-wise product and product with accumulation +// with optional conjugation of the arguments. +template struct ei_conj_helper; + template struct ei_scalar_sum_op; template struct ei_scalar_difference_op; template struct ei_scalar_product_op; diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index 49a0d8f5d..94bb5569e 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -324,7 +324,7 @@ void /*EIGEN_DONT_INLINE*/ ei_apply_rotation_in_the_plane(VectorX& _x, VectorY& const Packet pc = ei_pset1(Scalar(j.c())); const Packet ps = ei_pset1(Scalar(j.s())); - ei_conj_helper::IsComplex,false> cj; + ei_conj_helper::IsComplex,false> pcj; for(Index i=0; i