diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index 3c1a7fc40..098b7c4e1 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -15,10 +15,8 @@ namespace Eigen {
 namespace internal {
 
 /* Optimized col-major matrix * vector product:
- * This algorithm processes 4 columns at onces that allows to both reduce
- * the number of load/stores of the result by a factor 4 and to reduce
- * the instruction dependency. Moreover, we know that all bands have the
- * same alignment pattern.
+ * This algorithm processes the matrix per vertical panels,
+ * which are then processed horizontally in chunks of 8*PacketSize x 1 vertical segments.
  *
  * Mixing type logic: C += alpha * A * B
  * | A | B |alpha| comments
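The traversal introduced by the new comment can be pictured with a plain scalar reference: walk the matrix by vertical panels of columns, and inside each panel walk the rows in fixed-size chunks, keeping one accumulator per row of the chunk so that the result vector is read and written only once per chunk. The sketch below is an illustration only, not Eigen code; gemv_colmajor_sketch, its parameters, and the fixed chunk of 8 rows are assumptions made for the example (the real kernel uses chunks of 8*PacketSize rows held in SIMD registers, plus conjugation handling).

    #include <algorithm>

    // Scalar sketch of the panel/chunk traversal (no SIMD, no conjugation).
    // Illustrative names only, not Eigen API.
    void gemv_colmajor_sketch(int rows, int cols, int lhsStride,
                              const double* lhs, const double* rhs,
                              double* res, double alpha, int block_cols = 16)
    {
      const int chunk = 8;                                   // rows processed together
      for (int j2 = 0; j2 < cols; j2 += block_cols)          // vertical panel of columns
      {
        const int jend = std::min(j2 + block_cols, cols);
        for (int i = 0; i < rows; i += chunk)                // horizontal chunk of rows
        {
          const int iend = std::min(i + chunk, rows);
          double acc[chunk] = {};                            // one accumulator per row of the chunk
          for (int j = j2; j < jend; ++j)                    // accumulate the whole panel...
            for (int k = i; k < iend; ++k)
              acc[k - i] += lhs[j * lhsStride + k] * rhs[j];
          for (int k = i; k < iend; ++k)                     // ...then update res once per chunk
            res[k] += alpha * acc[k - i];
        }
      }
    }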
@@ -27,33 +25,7 @@ namespace internal {
  * |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
  * |cplx |real |real | optimal case, vectorization possible via real-cplx mul
  *
- * Accesses to the matrix coefficients follow the following logic:
- *
- * - if all columns have the same alignment then
- *   - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
- *   - otherwise perform unaligned loads only (-> NoneAligned case)
- * - otherwise
- *   - if even columns have the same alignment then
- *     // odd columns are guaranteed to have the same alignment too
- *     - if even or odd columns have the same alignment as the result, then
- *       // for a register size of 2 scalars, this is guarantee to be the case (e.g., SSE with double)
- *       - perform half aligned and half unaligned loads (-> EvenAligned case)
- *       - otherwise perform unaligned loads only (-> NoneAligned case)
- *   - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
- *     - one over 4 consecutive columns is guaranteed to be aligned with the result vector,
- *       perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case)
- *       // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
- *     - otherwise,
- *       // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
- *       // we currently fall back to the NoneAligned case
- *
  * The same reasoning apply for the transposed case.
- *
- * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
- * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
- * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow
- * compared to unaligned loads on a 4 byte boundary.
- *
  */
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
 struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
@@ -87,238 +59,145 @@ EIGEN_DONT_INLINE static void run(
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
 EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
   Index rows, Index cols,
-  const LhsMapper& lhs,
+  const LhsMapper& alhs,
   const RhsMapper& rhs,
         ResScalar* res, Index resIncr,
   RhsScalar alpha)
 {
   EIGEN_UNUSED_VARIABLE(resIncr);
   eigen_internal_assert(resIncr==1);
-  #ifdef _EIGEN_ACCUMULATE_PACKETS
-  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
-  #endif
-  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
-    pstore(&res[j], \
-      padd(pload<ResPacket>(&res[j]), \
-        padd( \
-          padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j),    ptmp0), \
-               pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j),   ptmp1)), \
-          padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j),    ptmp2), \
-               pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j),   ptmp3)) )))
-
-  typedef typename LhsMapper::VectorMapper LhsScalars;
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate proper code.
+  LhsMapper lhs(alhs);
 
   conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
   conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
-  if(ConjugateRhs)
-    alpha = numext::conj(alpha);
-
-  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
-  const Index columnsAtOnce = 4;
-  const Index peels = 2;
-  const Index LhsPacketAlignedMask = LhsPacketSize-1;
-  const Index ResPacketAlignedMask = ResPacketSize-1;
-//  const Index PeelAlignedMask = ResPacketSize*peels-1;
-  const Index size = rows;
   const Index lhsStride = lhs.stride();
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum { LhsAlignment = Unaligned };
 
-  // How many coeffs of the result do we have to skip to be aligned.
-  // Here we assume data are at least aligned on the base scalar type.
-  Index alignedStart = internal::first_default_aligned(res,size);
-  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
-  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
+  const Index n8 = rows-8*ResPacketSize+1;
+  const Index n4 = rows-4*ResPacketSize+1;
+  const Index n3 = rows-3*ResPacketSize+1;
+  const Index n2 = rows-2*ResPacketSize+1;
+  const Index n1 = rows-1*ResPacketSize+1;
 
-  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
-  Index alignmentPattern = alignmentStep==0 ? AllAligned
-                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
-                         : FirstAligned;
+  // TODO: improve the following heuristic:
+  const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4);
+  ResPacket palpha = pset1<ResPacket>(alpha);
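To make the new loop bounds and the column-blocking heuristic concrete, here is the plain arithmetic for one hypothetical configuration (ResPacketSize == 8, 4-byte scalars, rows = 1003, cols = 300); the numbers and the standalone program are illustrative only, not part of the patch.

    #include <cstdio>

    int main()
    {
      const long rows = 1003, cols = 300, lhsStride = rows;
      const long ResPacketSize = 8;                 // assumed 8-lane packet (e.g. 8 floats)
      const long sizeofLhsScalar = 4;               // assumed 4-byte LhsScalar

      // A chunk of k*ResPacketSize rows may start at row i only while i < nk:
      const long n8 = rows - 8*ResPacketSize + 1;   // 940
      const long n4 = rows - 4*ResPacketSize + 1;   // 972
      const long n1 = rows - 1*ResPacketSize + 1;   // 996
      // Starting from i=0, the 8-packet loop stops at i=960, the 4- and 1-packet
      // blocks advance i to 992 and 1000, and rows 1000..1002 take the scalar tail.

      // Column panels are kept narrow when a full column of the lhs is large,
      // so the part of the lhs currently traversed stays cache resident:
      const long block_cols = cols < 128 ? cols : (lhsStride*sizeofLhsScalar < 32000 ? 16 : 4);  // 16 here

      std::printf("n8=%ld n4=%ld n1=%ld block_cols=%ld\n", n8, n4, n1, block_cols);
      return 0;
    }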
 
-  // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = lhs.firstAligned(size);
-
-  // find how many columns do we have to skip to be aligned with the result (if possible)
-  Index skipColumns = 0;
-  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) )
+  for(Index j2=0; j2<cols; j2+=block_cols)
   {
-    alignedSize = 0;
-    alignedStart = 0;
-    alignmentPattern = NoneAligned;
-  }
-  else if(LhsPacketSize > 4)
-  {
-    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
-    // Currently, it seems to be better to perform unaligned loads anyway
-    alignmentPattern = NoneAligned;
-  }
-  else if (LhsPacketSize>1)
-  {
-  //    eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
+    Index jend = numext::mini(j2+block_cols,cols);
+    Index i=0;
+    for(; i<n8; i+=ResPacketSize*8)
+    {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0)),
+                c3 = pset1<ResPacket>(ResScalar(0)),
+                c4 = pset1<ResPacket>(ResScalar(0)),
+                c5 = pset1<ResPacket>(ResScalar(0)),
+                c6 = pset1<ResPacket>(ResScalar(0)),
+                c7 = pset1<ResPacket>(ResScalar(0));
 
-    /* eigen_internal_assert( (alignmentPattern==NoneAligned)
-                      || (skipColumns + columnsAtOnce >= cols)
-                      || LhsPacketSize > size
-                      || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
-  }
-  else if(Vectorizable)
-  {
-    alignedStart = 0;
-    alignedSize = size;
-    alignmentPattern = AllAligned;
-  }
-
-  const Index offset1 = (FirstAligned && alignmentStep==1)?3:1;
-  const Index offset3 = (FirstAligned && alignmentStep==1)?1:3;
-
-  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
-  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
-  {
-    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
-              ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
-              ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
-              ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));
-
-    // this helps a lot generating better binary code
-    const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0),   lhs1 = lhs.getVectorMapper(0, i+offset1),
-                     lhs2 = lhs.getVectorMapper(0, i+2),   lhs3 = lhs.getVectorMapper(0, i+offset3);
-
-    if (Vectorizable)
-    {
-      /* explicit vectorization */
-      // process initial unaligned coeffs
-      for (Index j=0; j<alignedStart; ++j)
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
+        c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*4,j),b0,c4);
+        c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*5,j),b0,c5);
+        c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*6,j),b0,c6);
+        c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*7,j),b0,c7);
+      }
-
-      if (alignedSize>alignedStart)
-      {
-        switch(alignmentPattern)
-        {
-          case AllAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
-            break;
-          case EvenAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
-            break;
-          case FirstAligned:
-          {
-            Index j = alignedStart;
-            if(peels>1)
-            {
-              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
-              ResPacket T0, T1;
-
-              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
-              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
-              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
-
-              for (; j<peeledSize; j+=peels*ResPacketSize)
-              {
-                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
-                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
-                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);
-
-                A00 = lhs0.template load<LhsPacket, Aligned>(j);
-                A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
-                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
-                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
-
-                T0  = pcj.pmadd(A01, ptmp1, T0);
-                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
-                T0  = pcj.pmadd(A02, ptmp2, T0);
-                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
-                T0  = pcj.pmadd(A03, ptmp3, T0);
-                pstore(&res[j],T0);
-                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);
-                T1  = pcj.pmadd(A11, ptmp1, T1);
-                T1  = pcj.pmadd(A12, ptmp2, T1);
-                T1  = pcj.pmadd(A13, ptmp3, T1);
-                pstore(&res[j+ResPacketSize],T1);
-              }
-            }
-            for (; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
-            break;
-          }
-          default:
-            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
-            break;
-        }
-      }
-    }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
+      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
+      pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu<ResPacket>(res+i+ResPacketSize*4)));
+      pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu<ResPacket>(res+i+ResPacketSize*5)));
+      pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu<ResPacket>(res+i+ResPacketSize*6)));
+      pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu<ResPacket>(res+i+ResPacketSize*7)));
+    }
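Each pcj.pmadd(a,b,c) in the kernel is a packet fused multiply-accumulate with the conjugation flags baked in, which is what makes the real/complex cases of the mixing-type table above work. The scalar analogue below, with std::complex<float> and a hypothetical pmadd_like helper, is only a sketch of those semantics, not Eigen's conj_helper implementation.

    #include <cassert>
    #include <complex>

    // Hypothetical helper: scalar analogue of pcj.pmadd(a,b,c) with compile-time
    // conjugation flags, mirroring what conj_helper<..,ConjLhs,ConjRhs>::pmadd does.
    template<bool ConjLhs, bool ConjRhs>
    std::complex<float> pmadd_like(std::complex<float> a, std::complex<float> b, std::complex<float> c)
    {
      if (ConjLhs) a = std::conj(a);
      if (ConjRhs) b = std::conj(b);
      return a * b + c;
    }

    int main()
    {
      const std::complex<float> a(1.f, 2.f), b(3.f, -1.f), c(0.5f, 0.f);
      assert((pmadd_like<false,false>(a, b, c) == a * b + c));
      assert((pmadd_like<true,false>(a, b, c) == std::conj(a) * b + c));
      return 0;
    }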
+    if(i<n4)
+    {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0)),
+                c3 = pset1<ResPacket>(ResScalar(0));
+
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
+      }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
+      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
+
+      i+=ResPacketSize*4;
+    }
+    if(i<n3)
+    {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0));
+
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+      }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
+
+      i+=ResPacketSize*3;
+    }
+    if(i<n2)
+    {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0));
+
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+      }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      i+=ResPacketSize*2;
+    }
+    if(i<n1)
+    {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0));
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
+      }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      i+=ResPacketSize;
+    }
+    for(;i<rows;++i)
+    {
+      ResScalar c0(0);
+      for(Index j=j2; j<jend; j+=1)
+        c0 += cj.pmul(lhs(i,j), rhs(j,0));
+      res[i] += alpha*c0;
+    }
+  }
-  Index end = cols;
-  Index start = columnBound;
-  do
-  {
-    for (Index k=start; k<end; ++k)
-    {
-      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
-      const LhsScalars lhs0 = lhs.getVectorMapper(0, k);
-
-      if (Vectorizable)
-      {
-        /* explicit vectorization */
-        // process first unaligned result's coeffs
-        for (Index j=0; j<alignedStart; ++j)
-          res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
-        // process aligned result's coeffs
-        if (lhs0.template aligned<LhsPacket>(alignedStart))
-          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
-            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
-        else
-          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
-            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
-      }
-
-      // process remaining scalars (or all if no explicit vectorization)
-      for (Index i=alignedSize; i<size; ++i)
-        res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
-    }
 }
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ ... @@
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
     return ploadt<Packet, AlignmentType>(&operator()(i, j));
   }
+
+  template <typename PacketT, int AlignmentT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
+    //return ploadt<PacketT, AlignmentT>(&operator()(i, j));
+    return ploadu<PacketT>(m_data+j*m_stride+i);
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
     return ploadt<HalfPacket, AlignmentType>(&operator()(i, j));
   }
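The kernel reads the lhs through a data-mapper accessor rather than raw pointers; the load() added in the last hunk is what provides a packet-wide, possibly unaligned read at position (i,j). The toy column-major mapper below sketches that idea with a std::array standing in for a SIMD packet; ToyMapper and its members are illustrative names, not Eigen's blas_data_mapper API.

    #include <array>
    #include <cstddef>

    // Toy column-major mapper: element (i,j) lives at data[j*stride + i]; load<N>()
    // returns N consecutive coefficients of column j starting at row i, standing in
    // for an unaligned SIMD load (ploadu). Illustrative only.
    template<typename Scalar>
    struct ToyMapper
    {
      const Scalar* data;
      std::ptrdiff_t stride;   // distance between the starts of two consecutive columns

      Scalar operator()(std::ptrdiff_t i, std::ptrdiff_t j) const { return data[j*stride + i]; }

      template<int PacketSize>
      std::array<Scalar, PacketSize> load(std::ptrdiff_t i, std::ptrdiff_t j) const
      {
        std::array<Scalar, PacketSize> p{};
        for (int k = 0; k < PacketSize; ++k)
          p[k] = data[j*stride + i + k];
        return p;
      }
    };

    int main()
    {
      const float a[6] = {1, 2, 3, 4, 5, 6};        // 3x2 column-major matrix
      ToyMapper<float> m{a, 3};
      std::array<float, 2> seg = m.load<2>(1, 1);   // rows 1..2 of column 1 -> {5, 6}
      return (seg[0] == 5.f && seg[1] == 6.f) ? 0 : 1;
    }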