diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index e1c258ff0..9c9dd52f2 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -136,7 +136,7 @@ inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, st // FIXME #ifndef EIGEN_HAS_FUSE_CJMADD #define EIGEN_HAS_FUSE_CJMADD -#endif +#endif #ifdef EIGEN_HAS_FUSE_CJMADD #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); #else @@ -144,7 +144,7 @@ inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, st #endif /* optimized GEneral packed Block * packed Panel product kernel - * + * * Mixing type logic: C += A * B * | A | B | comments * |real |cplx | no vectorization yet, would require to pack A with duplication @@ -156,10 +156,7 @@ struct ei_gebp_kernel typedef typename ei_scalar_product_traits::ReturnType ResScalar; enum { - Vectorizable = ei_product_blocking_traits::Vectorizable /*ei_packet_traits::Vectorizable - && ei_packet_traits::Vectorizable - && (ei_is_same_type::ret - || (NumTraits::IsComplex && !NumTraits::IsComplex))*/, + Vectorizable = ei_product_blocking_traits::Vectorizable, LhsPacketSize = Vectorizable ? ei_packet_traits::size : 1, RhsPacketSize = Vectorizable ? ei_packet_traits::size : 1, ResPacketSize = Vectorizable ? ei_packet_traits::size : 1 @@ -173,7 +170,7 @@ struct ei_gebp_kernel typedef typename ei_meta_if::ret RhsPacket; typedef typename ei_meta_if::ret ResPacket; - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, + void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0) { if(strideA==-1) strideA = depth; @@ -447,6 +444,7 @@ EIGEN_ASM_COMMENT("myend"); } ResPacket R0, R1, R2, R3, R4, R5, R6, R7; + ResPacket alphav = ei_pset1(alpha); R0 = ei_ploadu(r0); R1 = ei_ploadu(r1); @@ -457,14 +455,14 @@ EIGEN_ASM_COMMENT("myend"); if(nr==4) R6 = ei_ploadu(r2 + ResPacketSize); if(nr==4) R7 = ei_ploadu(r3 + ResPacketSize); - C0 = ei_padd(R0, C0); - C1 = ei_padd(R1, C1); - if(nr==4) C2 = ei_padd(R2, C2); - if(nr==4) C3 = ei_padd(R3, C3); - C4 = ei_padd(R4, C4); - C5 = ei_padd(R5, C5); - if(nr==4) C6 = ei_padd(R6, C6); - if(nr==4) C7 = ei_padd(R7, C7); + C0 = ei_pmadd(C0, alphav, R0); + C1 = ei_pmadd(C1, alphav, R1); + if(nr==4) C2 = ei_pmadd(C2, alphav, R2); + if(nr==4) C3 = ei_pmadd(C3, alphav, R3); + C4 = ei_pmadd(C4, alphav, R4); + C5 = ei_pmadd(C5, alphav, R5); + if(nr==4) C6 = ei_pmadd(C6, alphav, R6); + if(nr==4) C7 = ei_pmadd(C7, alphav, R7); ei_pstoreu(r0, C0); ei_pstoreu(r1, C1); @@ -483,10 +481,10 @@ EIGEN_ASM_COMMENT("myend"); // gets res block as register ResPacket C0, C1, C2, C3; - C0 = ei_ploadu(&res[(j2+0)*resStride + i]); - C1 = ei_ploadu(&res[(j2+1)*resStride + i]); - if(nr==4) C2 = ei_ploadu(&res[(j2+2)*resStride + i]); - if(nr==4) C3 = ei_ploadu(&res[(j2+3)*resStride + i]); + C0 = ei_pset1(ResScalar(0)); + C1 = ei_pset1(ResScalar(0)); + if(nr==4) C2 = ei_pset1(ResScalar(0)); + if(nr==4) C3 = ei_pset1(ResScalar(0)); // performs "inner" product const RhsScalar* blB = unpackedB; @@ -573,24 +571,18 @@ EIGEN_ASM_COMMENT("myend"); if(nr==2) { LhsPacket A0; - RhsPacket B0; - #ifndef EIGEN_HAS_FUSE_CJMADD - RhsPacket T0; - #endif + RhsPacket B0, B1; A0 = ei_pload(&blA[0*LhsPacketSize]); B0 = ei_pload(&blB[0*RhsPacketSize]); - MADD(pcj,A0,B0,C0,T0); - B0 = ei_pload(&blB[1*RhsPacketSize]); - MADD(pcj,A0,B0,C1,T0); + B1 = ei_pload(&blB[1*RhsPacketSize]); + MADD(pcj,A0,B0,C0,B0); + MADD(pcj,A0,B1,C1,B1); } else { LhsPacket A0; RhsPacket B0, B1, B2, B3; - #ifndef EIGEN_HAS_FUSE_CJMADD - RhsPacket T0, T1; - #endif A0 = ei_pload(&blA[0*LhsPacketSize]); B0 = ei_pload(&blB[0*RhsPacketSize]); @@ -598,20 +590,38 @@ EIGEN_ASM_COMMENT("myend"); B2 = ei_pload(&blB[2*RhsPacketSize]); B3 = ei_pload(&blB[3*RhsPacketSize]); - MADD(pcj,A0,B0,C0,T0); - MADD(pcj,A0,B1,C1,T1); - MADD(pcj,A0,B2,C2,T0); - MADD(pcj,A0,B3,C3,T1); + MADD(pcj,A0,B0,C0,B0); + MADD(pcj,A0,B1,C1,B1); + MADD(pcj,A0,B2,C2,B2); + MADD(pcj,A0,B3,C3,B3); } blB += nr*RhsPacketSize; blA += LhsPacketSize; } - ei_pstoreu(&res[(j2+0)*resStride + i], C0); - ei_pstoreu(&res[(j2+1)*resStride + i], C1); - if(nr==4) ei_pstoreu(&res[(j2+2)*resStride + i], C2); - if(nr==4) ei_pstoreu(&res[(j2+3)*resStride + i], C3); + ResPacket R0, R1, R2, R3; + ResPacket alphav = ei_pset1(alpha); + + ResScalar* r0 = &res[(j2+0)*resStride + i]; + ResScalar* r1 = r0 + resStride; + ResScalar* r2 = r1 + resStride; + ResScalar* r3 = r2 + resStride; + + R0 = ei_ploadu(r0); + R1 = ei_ploadu(r1); + if(nr==4) R2 = ei_ploadu(r2); + if(nr==4) R3 = ei_ploadu(r3); + + C0 = ei_pmadd(C0, alphav, R0); + C1 = ei_pmadd(C1, alphav, R1); + if(nr==4) C2 = ei_pmadd(C2, alphav, R2); + if(nr==4) C3 = ei_pmadd(C3, alphav, R3); + + ei_pstoreu(r0, C0); + ei_pstoreu(r1, C1); + if(nr==4) ei_pstoreu(r2, C2); + if(nr==4) ei_pstoreu(r3, C3); } for(Index i=peeled_mc2; i(&res[(j2+0)*resStride + i]); - C4 = ei_ploadu(&res[(j2+0)*resStride + i + ResPacketSize]); + C0 = ei_pset1(ResScalar(0)); + C4 = ei_pset1(ResScalar(0)); + const RhsScalar* blB = unpackedB; for(Index k=0; k(&blA[0*LhsPacketSize]); A1 = ei_pload(&blA[1*LhsPacketSize]); B0 = ei_pload(&blB[0*RhsPacketSize]); MADD(pcj,A0,B0,C0,T0); - MADD(pcj,A1,B0,C4,T1); + MADD(pcj,A1,B0,C4,B0); blB += RhsPacketSize; blA += mr; } + ResPacket R0, R4; + ResPacket alphav = ei_pset1(alpha); - ei_pstoreu(&res[(j2+0)*resStride + i], C0); - ei_pstoreu(&res[(j2+0)*resStride + i + ResPacketSize], C4); + ResScalar* r0 = &res[(j2+0)*resStride + i]; + + R0 = ei_ploadu(r0); + R4 = ei_ploadu(r0+ResPacketSize); + + C0 = ei_pmadd(C0, alphav, R0); + C4 = ei_pmadd(C4, alphav, R4); + + ei_pstoreu(r0, C0); + ei_pstoreu(r0+ResPacketSize, C4); } if(rows-peeled_mc>=LhsPacketSize) { @@ -718,20 +733,21 @@ EIGEN_ASM_COMMENT("myend"); const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsPacketSize]; ei_prefetch(&blA[0]); - ResPacket C0 = ei_ploadu(&res[(j2+0)*resStride + i]); + ResPacket C0 = ei_pset1(ResScalar(0)); const RhsScalar* blB = unpackedB; for(Index k=0; k(blA), ei_pload(blB), C0, T0); + LhsPacket A0 = ei_pload(blA); + RhsPacket B0 = ei_pload(blB); + MADD(pcj, A0, B0, C0, B0); blB += RhsPacketSize; blA += LhsPacketSize; } - ei_pstoreu(&res[(j2+0)*resStride + i], C0); + ResPacket alphav = ei_pset1(alpha); + ResPacket R0 = ei_ploadu(&res[(j2+0)*resStride + i]); + ei_pstoreu(&res[(j2+0)*resStride + i], ei_pmadd(C0, alphav, R0)); } for(Index i=peeled_mc2; i::size }; ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); ei_conj_if::IsComplex && Conjugate> cj; ei_const_blas_data_mapper lhs(_lhs,lhsStride); - bool hasAlpha = alpha != Scalar(1); Index count = 0; Index peeled_mc = (rows/mr)*mr; for(Index i=0; i=PacketSize) { if(PanelMode) count += PacketSize*offset; - if(hasAlpha) - for(Index k=0; k { typedef typename ei_packet_traits::type Packet; enum { PacketSize = ei_packet_traits::size }; - void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Scalar alpha, Index depth, Index cols, + void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0) { ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); ei_conj_if::IsComplex && Conjugate> cj; - bool hasAlpha = alpha != Scalar(1); Index packet_cols = (cols/nr) * nr; Index count = 0; for(Index j2=0; j2 const Scalar* b1 = &rhs[(j2+1)*rhsStride]; const Scalar* b2 = &rhs[(j2+2)*rhsStride]; const Scalar* b3 = &rhs[(j2+3)*rhsStride]; - if (hasAlpha) - for(Index k=0; k { if(PanelMode) count += offset; const Scalar* b0 = &rhs[(j2+0)*rhsStride]; - if (hasAlpha) - for(Index k=0; k { enum { PacketSize = ei_packet_traits::size }; - void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Scalar alpha, Index depth, Index cols, + void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0) { ei_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); ei_conj_if::IsComplex && Conjugate> cj; - bool hasAlpha = alpha != Scalar(1); Index packet_cols = (cols/nr) * nr; Index count = 0; for(Index j2=0; j2 const Scalar* b0 = &rhs[j2]; for(Index k=0; k rhs(_rhs,rhsStride); typedef ei_product_blocking_traits Blocking; - bool alphaOnLhs = NumTraits::IsComplex && !NumTraits::IsComplex; Index kc = blocking.kc(); // cache block size along the K direction Index mc = std::min(rows,blocking.mc()); // cache block size along the M direction @@ -87,8 +86,6 @@ static void run(Index rows, Index cols, Index depth, ei_gemm_pack_rhs pack_rhs; ei_gebp_kernel gebp; -// if ((ConjugateRhs && !alphaOnLhs) || (ConjugateLhs && alphaOnLhs)) -// alpha = ei_conj(alpha); // ei_gemm_pack_lhs pack_lhs; // ei_gemm_pack_rhs pack_rhs; // ei_gebp_kernel gebp; @@ -178,9 +175,6 @@ static void run(Index rows, Index cols, Index depth, RhsScalar *blockB = blocking.blockB()==0 ? ei_aligned_stack_new(RhsScalar, sizeB) : blocking.blockB(); RhsScalar *blockW = blocking.blockW()==0 ? ei_aligned_stack_new(RhsScalar, sizeW) : blocking.blockW(); - LhsScalar lhsAlpha = alphaOnLhs ? ei_get_factor::run(alpha) : LhsScalar(1); - RhsScalar rhsAlpha = alphaOnLhs ? RhsScalar(1) : ei_get_factor::run(alpha); - // For each horizontal panel of the rhs, and corresponding panel of the lhs... // (==GEMM_VAR1) for(Index k2=0; k2 Pack rhs's panel into a sequential chunk of memory (L2 caching) // Note that this panel will be read as many times as the number of blocks in the lhs's // vertical panel which is, in practice, a very low number. - pack_rhs(blockB, &rhs(k2,0), rhsStride, rhsAlpha, actual_kc, cols); + pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); // For each mc x kc block of the lhs's vertical panel... @@ -203,10 +197,10 @@ static void run(Index rows, Index cols, Index depth, // We pack the lhs's block into a sequential chunk of memory (L1 caching) // Note that this block will be read a very high number of times, which is equal to the number of // micro vertical panel of the large rhs's panel (e.g., cols/4 times). - pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc, lhsAlpha); + pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); // Everything is packed, we can now call the block * panel kernel: - gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, -1, -1, 0, 0, blockW); + gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); } } diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 27eb4b2f8..0488114b9 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -89,7 +89,7 @@ template struct ei_symm_pack_rhs { enum { PacketSize = ei_packet_traits::size }; - void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Scalar alpha, Index rows, Index cols, Index k2) + void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2) { Index end_k = k2 + rows; Index count = 0; @@ -101,12 +101,12 @@ struct ei_symm_pack_rhs { for(Index k=k2; k lhs(_lhs,lhsStride); ei_const_blas_data_mapper rhs(_rhs,rhsStride); - if (ConjugateRhs) - alpha = ei_conj(alpha); - typedef ei_product_blocking_traits Blocking; Index kc = size; // cache block size along the K direction @@ -282,7 +279,7 @@ struct ei_product_selfadjoint_matrix transposed packed copy @@ -294,7 +291,7 @@ struct ei_product_selfadjoint_matrix() (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); } } @@ -338,9 +335,6 @@ struct ei_product_selfadjoint_matrix lhs(_lhs,lhsStride); - if (ConjugateRhs) - alpha = ei_conj(alpha); - typedef ei_product_blocking_traits Blocking; Index kc = size; // cache block size along the K direction @@ -361,7 +355,7 @@ struct ei_product_selfadjoint_matrix GEPP for(Index i2=0; i2 mat(_mat,matStride); - if(AAT) - alpha = ei_conj(alpha); +// if(AAT) +// alpha = ei_conj(alpha); typedef ei_product_blocking_traits Blocking; @@ -99,7 +99,7 @@ struct ei_selfadjoint_product processed with a special kernel // 3 - after the diagonal => processed with gebp or skipped if (UpLo==Lower) - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2), + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, std::min(size,i2), alpha, -1, -1, 0, 0, allocatedBlockB); - sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, allocatedBlockB); + sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB); if (UpLo==Upper) { Index j2 = i2+actual_mc; - gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, std::max(Index(0),size-j2), + gebp_kernel(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, std::max(Index(0), size-j2), alpha, -1, -1, 0, 0, allocatedBlockB); } } @@ -173,7 +173,7 @@ struct ei_sybb_kernel PacketSize = ei_packet_traits::size, BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) }; - void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB, Index size, Index depth, Scalar* workspace) + void operator()(Scalar* res, Index resStride, const Scalar* blockA, const Scalar* blockB, Index size, Index depth, Scalar alpha, Scalar* workspace) { ei_gebp_kernel gebp_kernel; Matrix buffer; @@ -186,14 +186,15 @@ struct ei_sybb_kernel const Scalar* actual_b = blockB+j*depth; if(UpLo==Upper) - gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, -1, -1, 0, 0, workspace); + gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, + -1, -1, 0, 0, workspace); // selfadjoint micro block { Index i = j; buffer.setZero(); // 1 - apply the kernel on the temporary buffer - gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, + gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, -1, -1, 0, 0, workspace); // 2 - triangular accumulation for(Index j1=0; j1 lhs(_lhs,lhsStride); ei_const_blas_data_mapper rhs(_rhs,rhsStride); - if (ConjugateRhs) - alpha = ei_conj(alpha); - typedef ei_product_blocking_traits Blocking; enum { SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr), @@ -147,7 +144,7 @@ struct ei_product_triangular_matrix_matrix skip it @@ -176,7 +173,7 @@ struct ei_product_triangular_matrix_matrix() (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols); + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); } } } @@ -231,9 +228,6 @@ struct ei_product_triangular_matrix_matrix lhs(_lhs,lhsStride); ei_const_blas_data_mapper rhs(_rhs,rhsStride); - if (ConjugateRhs) - alpha = ei_conj(alpha); - typedef ei_product_blocking_traits Blocking; enum { SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Blocking::mr,Blocking::nr), @@ -280,7 +274,7 @@ struct ei_product_triangular_matrix_matrix0) @@ -293,7 +287,7 @@ struct ei_product_triangular_matrix_matrix0) @@ -149,7 +149,7 @@ struct ei_triangular_solve_matrix0) pack_rhs_panel(blockB+j2*actual_kc, - &rhs(actual_k2+panelOffset, actual_j2), triStride, -1, + &rhs(actual_k2+panelOffset, actual_j2), triStride, panelLength, actualPanelWidth, actual_kc, panelOffset); } @@ -273,6 +273,7 @@ struct ei_triangular_solve_matrix0) gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, - actual_mc, actual_kc, rs, + actual_mc, actual_kc, rs, Scalar(-1), -1, -1, 0, 0, allocatedBlockB); } }