diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 68fd7678e..c591cc1f9 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1103,7 +1103,7 @@ EIGEN_ASM_COMMENT("mybegin4"); #undef CJMADD // pack a block of the lhs -// The travesal is as follow (mr==4): +// The traversal is as follow (mr==4): // 0 4 8 12 ... // 1 5 9 13 ... // 2 6 10 14 ... @@ -1119,11 +1119,15 @@ EIGEN_ASM_COMMENT("mybegin4"); template struct gemm_pack_lhs { - void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0) { -// enum { PacketSize = packet_traits::size }; + typedef typename packet_traits::type Packet; + enum { PacketSize = packet_traits::size }; + + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + eigen_assert( (StorageOrder==RowMajor) || (Pack1%PacketSize)==0 && Pack1<=4*PacketSize); conj_if::IsComplex && Conjugate> cj; const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; @@ -1131,9 +1135,44 @@ struct gemm_pack_lhs for(Index i=0; i=1*PacketSize) A = ploadu(&lhs(i+0*PacketSize, k)); + if(Pack1>=2*PacketSize) B = ploadu(&lhs(i+1*PacketSize, k)); + if(Pack1>=3*PacketSize) C = ploadu(&lhs(i+2*PacketSize, k)); + if(Pack1>=4*PacketSize) D = ploadu(&lhs(i+3*PacketSize, k)); + if(Pack1>=1*PacketSize) { pstore(blockA+count, cj(A)); count+=PacketSize; } + if(Pack1>=2*PacketSize) { pstore(blockA+count, cj(B)); count+=PacketSize; } + if(Pack1>=3*PacketSize) { pstore(blockA+count, cj(C)); count+=PacketSize; } + if(Pack1>=4*PacketSize) { pstore(blockA+count, cj(D)); count+=PacketSize; } + } + } + else + { + for(Index k=0; k=Pack2) @@ -1167,9 +1206,10 @@ struct gemm_pack_rhs { typedef typename packet_traits::type Packet; enum { PacketSize = packet_traits::size }; - void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0) { + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR"); eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; Index packet_cols = (cols/nr) * nr; @@ -1214,9 +1254,10 @@ template { enum { PacketSize = packet_traits::size }; - void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0) { + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; Index packet_cols = (cols/nr) * nr;