This commit is contained in:
Everton Constantino 2021-05-10 19:59:47 +00:00
parent b2cd094863
commit 70c0363c28
3 changed files with 518 additions and 492 deletions

View File

@ -14,494 +14,470 @@ namespace Eigen {
namespace internal { namespace internal {
template<int CPU, typename LhsScalar, typename RhsScalar> // template<int CPU, typename LhsScalar, typename RhsScalar>
constexpr int SHAPES_COUNT<0, CPU, LhsScalar, RhsScalar> = 9; // constexpr int SHAPES_COUNT<0, CPU, LhsScalar, RhsScalar> = 9;
template<int CPU, typename LhsScalar, typename RhsScalar> // template<int CPU, typename LhsScalar, typename RhsScalar>
constexpr int SHAPES<0, CPU, LhsScalar, RhsScalar>[SHAPES_COUNT<0, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] = // constexpr int SHAPES<0, CPU, LhsScalar, RhsScalar>[SHAPES_COUNT<0, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
{ /*0*/ {1,1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END}, // { /*0*/ {1,1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
/*1*/ {4,1,1, 0, 0, SHAPES_POINTER_END}, // /*1*/ {4,1,1, 0, 0, SHAPES_POINTER_END},
/*2*/ {8,1,1, 0, 1, SHAPES_POINTER_END}, // /*2*/ {8,1,1, 0, 1, SHAPES_POINTER_END},
/*3*/ {1,1,4, 2, SHAPES_POINTER_END, SHAPES_POINTER_END}, // /*3*/ {1,1,4, 2, SHAPES_POINTER_END, SHAPES_POINTER_END},
/*4*/ {4,1,4, 2, 2, SHAPES_POINTER_END}, // /*4*/ {4,1,4, 2, 2, SHAPES_POINTER_END},
/*5*/ {4,4,4, 2, 2, 3}, // /*5*/ {4,4,4, 2, 2, 3},
/*6*/ {4,8,4, 2, 2, 4}, // /*6*/ {4,8,4, 2, 2, 4},
/*7*/ {8,1,4, 2, 4, SHAPES_POINTER_END}, // /*7*/ {8,1,4, 2, 4, SHAPES_POINTER_END},
/*8*/ {8,4,4, 2, 4, 6}}; // /*8*/ {8,4,4, 2, 4, 6}};
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper> // template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1> // struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1>
{ // {
using LinearMapper = typename DataMapper::LinearMapper; // using LinearMapper = typename DataMapper::LinearMapper;
using AccPacket = typename packet_traits<Scalar>::type; // using AccPacket = typename packet_traits<Scalar>::type;
using ResPacket = typename packet_traits<ResScalar>::type; // using ResPacket = typename packet_traits<ResScalar>::type;
PacketBlock<AccPacket,2> _acc; // PacketBlock<AccPacket,2> _acc;
EIGEN_STRONG_INLINE void zero() // EIGEN_STRONG_INLINE void zero()
{ // {
_acc.packet[0] = pset1<AccPacket>(0); // _acc.packet[0] = pset1<AccPacket>(0);
_acc.packet[1] = pset1<AccPacket>(0); // _acc.packet[1] = pset1<AccPacket>(0);
} // }
template<typename ResPacket_> // template<typename ResPacket_>
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) // EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
{ // {
_acc.packet[0] *= pAlpha; // _acc.packet[0] *= pAlpha;
_acc.packet[1] *= pAlpha; // _acc.packet[1] *= pAlpha;
} // }
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) // EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
{ // {
//eigen_assert(false && "4x1"); // PacketBlock<ResPacket, 1> block;
// block.packet[0] = dest.template loadPacket<ResPacket>(row, col) + _acc.packet[0];
// dest.template storePacketBlock<AccPacket, 1>(row, col, block);
// block.packet[0] = dest.template loadPacket<ResPacket>(row + 4, col) + _acc.packet[1];
// dest.template storePacketBlock<AccPacket, 1>(row + 4, col, block);
// }
// };
// template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1>
// {
// using LinearMapper = typename DataMapper::LinearMapper;
// using AccPacket = typename packet_traits<Scalar>::type;
// using ResPacket = typename packet_traits<ResScalar>::type;
// AccPacket _acc;
// EIGEN_STRONG_INLINE void zero()
// {
// _acc = pset1<AccPacket>(0);
// }
// template<typename ResPacket_>
// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
// {
// _acc *= pAlpha;
// }
// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
// {
// PacketBlock<ResPacket, 1> block;
// block.packet[0] = dest.template loadPacket<ResPacket>(row, col) + _acc;
// dest.template storePacketBlock<AccPacket, 1>(row, col, block);
// }
// };
// template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4>
// {
// using LinearMapper = typename DataMapper::LinearMapper;
// using AccPacket = typename packet_traits<Scalar>::type;
// using ResPacket = typename packet_traits<ResScalar>::type;
// AccPacket _acc;
// EIGEN_STRONG_INLINE void zero()
// {
// _acc = pset1<AccPacket>(0);
// }
// template<typename ResPacket_>
// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
// {
// _acc *= pAlpha;
// }
// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
// {
// ResPacket r = dest.template gatherPacket<ResPacket>(row, col) + _acc;
// dest.template scatterPacket<ResPacket>(row, col, r);
// }
// };
// template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
// {
// using LinearMapper = typename DataMapper::LinearMapper;
// using AccPacket = typename packet_traits<Scalar>::type;
// using ResPacket = typename packet_traits<ResScalar>::type;
// PacketBlock<AccPacket, 4> _acc;
// EIGEN_STRONG_INLINE void zero()
// {
// _acc.packet[0] = pset1<AccPacket>(0);
// _acc.packet[1] = pset1<AccPacket>(0);
// _acc.packet[2] = pset1<AccPacket>(0);
// _acc.packet[3] = pset1<AccPacket>(0);
// }
// template<typename ResPacket_>
// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
// {
// _acc.packet[0] *= pAlpha;
// _acc.packet[1] *= pAlpha;
// _acc.packet[2] *= pAlpha;
// _acc.packet[3] *= pAlpha;
// }
// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
// {
// LinearMapper r0 = dest.getLinearMapper(row, col + 0); // LinearMapper r0 = dest.getLinearMapper(row, col + 0);
// LinearMapper r1 = dest.getLinearMapper(row, col + 1);
//r0.storePacket(0, r0.template loadPacket<ResPacket>(0) + _acc); // LinearMapper r2 = dest.getLinearMapper(row, col + 2);
PacketBlock<ResPacket, 1> block; // LinearMapper r3 = dest.getLinearMapper(row, col + 3);
block.packet[0] = dest.template loadPacket<ResPacket>(row, col) + _acc.packet[0];
dest.template storePacketBlock<AccPacket, 1>(row, col, block); // r0.storePacket(0, r0.template loadPacket<ResPacket>(0) + _acc.packet[0]);
block.packet[0] = dest.template loadPacket<ResPacket>(row + 4, col) + _acc.packet[1]; // r1.storePacket(0, r1.template loadPacket<ResPacket>(0) + _acc.packet[1]);
dest.template storePacketBlock<AccPacket, 1>(row + 4, col, block); // r2.storePacket(0, r2.template loadPacket<ResPacket>(0) + _acc.packet[2]);
/* // r3.storePacket(0, r3.template loadPacket<ResPacket>(0) + _acc.packet[3]);
dest(row + 0, col + 0) += _acc.packet[0][0]; // }
dest(row + 1, col + 0) += _acc.packet[0][1]; // };
dest(row + 2, col + 0) += _acc.packet[0][2];
dest(row + 3, col + 0) += _acc.packet[0][3]; // template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
dest(row + 4, col + 0) += _acc.packet[1][0]; // {
dest(row + 5, col + 0) += _acc.packet[1][1]; // using LinearMapper = typename DataMapper::LinearMapper;
dest(row + 6, col + 0) += _acc.packet[1][2]; // using AccPacket = typename packet_traits<Scalar>::type;
dest(row + 7, col + 0) += _acc.packet[1][3]; // using ResPacket = typename packet_traits<ResScalar>::type;
*/
} // PacketBlock<AccPacket, 4> _acc1;
}; // PacketBlock<AccPacket, 4> _acc2;
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper> // EIGEN_STRONG_INLINE void zero()
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1> // {
{ // _acc1.packet[0] = pset1<AccPacket>(0);
using LinearMapper = typename DataMapper::LinearMapper; // _acc1.packet[1] = pset1<AccPacket>(0);
using AccPacket = typename packet_traits<Scalar>::type; // _acc1.packet[2] = pset1<AccPacket>(0);
using ResPacket = typename packet_traits<ResScalar>::type; // _acc1.packet[3] = pset1<AccPacket>(0);
AccPacket _acc; // _acc2.packet[0] = pset1<AccPacket>(0);
// _acc2.packet[1] = pset1<AccPacket>(0);
EIGEN_STRONG_INLINE void zero() // _acc2.packet[2] = pset1<AccPacket>(0);
{ // _acc2.packet[3] = pset1<AccPacket>(0);
_acc = pset1<AccPacket>(0); // }
}
// template<typename ResPacket_>
template<typename ResPacket_> // EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) // {
{ // _acc1.packet[0] *= pAlpha;
_acc *= pAlpha; // _acc1.packet[1] *= pAlpha;
} // _acc1.packet[2] *= pAlpha;
// _acc1.packet[3] *= pAlpha;
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
{ // _acc2.packet[0] *= pAlpha;
//eigen_assert(false && "4x1"); // _acc2.packet[1] *= pAlpha;
//LinearMapper r0 = dest.getLinearMapper(row, col + 0); // _acc2.packet[2] *= pAlpha;
// _acc2.packet[3] *= pAlpha;
//r0.storePacket(0, r0.template loadPacket<ResPacket>(0) + _acc); // }
PacketBlock<ResPacket, 1> block;
block.packet[0] = dest.template loadPacket<ResPacket>(row, col) + _acc; // EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
dest.template storePacketBlock<AccPacket, 1>(row, col, block); // {
/* // LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0);
dest(row + 0, col) += _acc[0]; // LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1);
dest(row + 1, col) += _acc[1]; // LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2);
dest(row + 2, col) += _acc[2]; // LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3);
dest(row + 3, col) += _acc[3];
*/ // LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0);
} // LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1);
}; // LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2);
// LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3);
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4>
{ // r00.storePacket(0, r00.template loadPacket<ResPacket>(0) + _acc1.packet[0]);
using LinearMapper = typename DataMapper::LinearMapper; // r01.storePacket(0, r01.template loadPacket<ResPacket>(0) + _acc1.packet[1]);
using AccPacket = typename packet_traits<Scalar>::type; // r02.storePacket(0, r02.template loadPacket<ResPacket>(0) + _acc1.packet[2]);
using ResPacket = typename packet_traits<ResScalar>::type; // r03.storePacket(0, r03.template loadPacket<ResPacket>(0) + _acc1.packet[3]);
AccPacket _acc; // r10.storePacket(0, r10.template loadPacket<ResPacket>(0) + _acc2.packet[0]);
// r11.storePacket(0, r11.template loadPacket<ResPacket>(0) + _acc2.packet[1]);
EIGEN_STRONG_INLINE void zero() // r12.storePacket(0, r12.template loadPacket<ResPacket>(0) + _acc2.packet[2]);
{ // r13.storePacket(0, r13.template loadPacket<ResPacket>(0) + _acc2.packet[3]);
_acc = pset1<AccPacket>(0); // }
} // };
template<typename ResPacket_> // #define MICRO_8x1x4() \
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) // pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
{ // lhsPackMap.advance(4*1); \
_acc *= pAlpha; // pLhs2 = pload<LhsPacket>(lhsPackMap.pCur); \
} // pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
// pRhs0 = pset1<RhsPacket>(pRhs[0]); \
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) // pRhs1 = pset1<RhsPacket>(pRhs[1]); \
{ // pRhs2 = pset1<RhsPacket>(pRhs[2]); \
ResPacket r = dest.template gatherPacket<ResPacket>(row, col) + _acc; // pRhs3 = pset1<RhsPacket>(pRhs[3]); \
dest.template scatterPacket<ResPacket>(row, col, r); // acc._acc1.packet[0] += pLhs*pRhs0; \
} // acc._acc1.packet[1] += pLhs*pRhs1; \
}; // acc._acc1.packet[2] += pLhs*pRhs2; \
// acc._acc1.packet[3] += pLhs*pRhs3; \
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper> // acc._acc2.packet[0] += pLhs2*pRhs0; \
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4> // acc._acc2.packet[1] += pLhs2*pRhs1; \
{ // acc._acc2.packet[2] += pLhs2*pRhs2; \
using LinearMapper = typename DataMapper::LinearMapper; // acc._acc2.packet[3] += pLhs2*pRhs3; \
using AccPacket = typename packet_traits<Scalar>::type; // lhsPackMap.advance(4*1); \
using ResPacket = typename packet_traits<ResScalar>::type; // rhsPackMap.advance(1*4);
PacketBlock<AccPacket, 4> _acc; // template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 8, 4>
EIGEN_STRONG_INLINE void zero() // {
{ // EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
_acc.packet[0] = pset1<AccPacket>(0); // RhsPackMap& rhsPackMap,
_acc.packet[1] = pset1<AccPacket>(0); // Index rowIdx, Index colIdx, Index depthIdx,
_acc.packet[2] = pset1<AccPacket>(0); // Accumulator& acc)
_acc.packet[3] = pset1<AccPacket>(0); // {
} // using LhsPacket = typename packet_traits<LhsScalar>::type;
// using RhsPacket = typename packet_traits<RhsScalar>::type;
template<typename ResPacket_>
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) // asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x4\n\t");
{
_acc.packet[0] *= pAlpha; // LhsPacket pLhs, pLhs2;
_acc.packet[1] *= pAlpha; // RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
_acc.packet[2] *= pAlpha;
_acc.packet[3] *= pAlpha; // MICRO_8x1x4();
} // MICRO_8x1x4();
// MICRO_8x1x4();
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) // MICRO_8x1x4();
{ // MICRO_8x1x4();
LinearMapper r0 = dest.getLinearMapper(row, col + 0); // MICRO_8x1x4();
LinearMapper r1 = dest.getLinearMapper(row, col + 1); // MICRO_8x1x4();
LinearMapper r2 = dest.getLinearMapper(row, col + 2); // MICRO_8x1x4();
LinearMapper r3 = dest.getLinearMapper(row, col + 3);
// asm __volatile__("#END_NEON_MICROKERNEL_8x8x4\n\t");
r0.storePacket(0, r0.template loadPacket<ResPacket>(0) + _acc.packet[0]); // };
r1.storePacket(0, r1.template loadPacket<ResPacket>(0) + _acc.packet[1]); // };
r2.storePacket(0, r2.template loadPacket<ResPacket>(0) + _acc.packet[2]);
r3.storePacket(0, r3.template loadPacket<ResPacket>(0) + _acc.packet[3]); // template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
} // struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 4, 4>
}; // {
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper> // RhsPackMap& rhsPackMap,
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4> // Index rowIdx, Index colIdx, Index depthIdx,
{ // Accumulator& acc)
using LinearMapper = typename DataMapper::LinearMapper; // {
using AccPacket = typename packet_traits<Scalar>::type; // using LhsPacket = typename packet_traits<LhsScalar>::type;
using ResPacket = typename packet_traits<ResScalar>::type; // using RhsPacket = typename packet_traits<RhsScalar>::type;
PacketBlock<AccPacket, 4> _acc1; // asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x4x4\n\t");
PacketBlock<AccPacket, 4> _acc2;
// LhsPacket pLhs, pLhs2;
EIGEN_STRONG_INLINE void zero() // RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
{
_acc1.packet[0] = pset1<AccPacket>(0); // prefetch(lhsPackMap.pCur + 2*32);
_acc1.packet[1] = pset1<AccPacket>(0); // prefetch(rhsPackMap.pCur + 2*16);
_acc1.packet[2] = pset1<AccPacket>(0); // MICRO_8x1x4();
_acc1.packet[3] = pset1<AccPacket>(0); // MICRO_8x1x4();
// MICRO_8x1x4();
_acc2.packet[0] = pset1<AccPacket>(0); // MICRO_8x1x4();
_acc2.packet[1] = pset1<AccPacket>(0);
_acc2.packet[2] = pset1<AccPacket>(0); // asm __volatile__("#END_NEON_MICROKERNEL_8x4x4\n\t");
_acc2.packet[3] = pset1<AccPacket>(0); // };
} // };
template<typename ResPacket_> // template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) // struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 4>
{ // {
_acc1.packet[0] *= pAlpha; // EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
_acc1.packet[1] *= pAlpha; // RhsPackMap& rhsPackMap,
_acc1.packet[2] *= pAlpha; // Index rowIdx, Index colIdx, Index depthIdx,
_acc1.packet[3] *= pAlpha; // Accumulator& acc)
// {
_acc2.packet[0] *= pAlpha; // using LhsPacket = typename packet_traits<LhsScalar>::type;
_acc2.packet[1] *= pAlpha; // using RhsPacket = typename packet_traits<RhsScalar>::type;
_acc2.packet[2] *= pAlpha;
_acc2.packet[3] *= pAlpha; // asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t");
}
// LhsPacket pLhs, pLhs2;
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) // RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
{
LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0); // MICRO_8x1x4();
LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1);
LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2); // asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t");
LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3); // };
// };
LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0);
LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1); // #define MICRO_4x1x4() \
LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2); // pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3); // pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
// pRhs0 = pset1<RhsPacket>(pRhs[0]); \
// pRhs1 = pset1<RhsPacket>(pRhs[1]); \
r00.storePacket(0, r00.template loadPacket<ResPacket>(0) + _acc1.packet[0]); // pRhs2 = pset1<RhsPacket>(pRhs[2]); \
r01.storePacket(0, r01.template loadPacket<ResPacket>(0) + _acc1.packet[1]); // pRhs3 = pset1<RhsPacket>(pRhs[3]); \
r02.storePacket(0, r02.template loadPacket<ResPacket>(0) + _acc1.packet[2]); // acc._acc.packet[0] += pLhs*pRhs0; \
r03.storePacket(0, r03.template loadPacket<ResPacket>(0) + _acc1.packet[3]); // acc._acc.packet[1] += pLhs*pRhs1; \
// acc._acc.packet[2] += pLhs*pRhs2; \
r10.storePacket(0, r10.template loadPacket<ResPacket>(0) + _acc2.packet[0]); // acc._acc.packet[3] += pLhs*pRhs3; \
r11.storePacket(0, r11.template loadPacket<ResPacket>(0) + _acc2.packet[1]); // lhsPackMap.advance(4*1); \
r12.storePacket(0, r12.template loadPacket<ResPacket>(0) + _acc2.packet[2]); // rhsPackMap.advance(1*4);
r13.storePacket(0, r13.template loadPacket<ResPacket>(0) + _acc2.packet[3]);
} // template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
}; // struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 8, 4>
// {
#define MICRO_8x1x4() \ // EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \ // RhsPackMap& rhsPackMap,
lhsPackMap.advance(4*1); \ // Index rowIdx, Index colIdx, Index depthIdx,
pLhs2 = pload<LhsPacket>(lhsPackMap.pCur); \ // Accumulator& acc)
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \ // {
pRhs0 = pset1<RhsPacket>(pRhs[0]); \ // using LhsPacket = typename packet_traits<LhsScalar>::type;
pRhs1 = pset1<RhsPacket>(pRhs[1]); \ // using RhsPacket = typename packet_traits<RhsScalar>::type;
pRhs2 = pset1<RhsPacket>(pRhs[2]); \
pRhs3 = pset1<RhsPacket>(pRhs[3]); \ // asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
acc._acc1.packet[0] += pLhs*pRhs0; \ // LhsPacket pLhs;
acc._acc1.packet[1] += pLhs*pRhs1; \ // RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
acc._acc1.packet[2] += pLhs*pRhs2; \
acc._acc1.packet[3] += pLhs*pRhs3; \ // MICRO_4x1x4();
acc._acc2.packet[0] += pLhs2*pRhs0; \ // MICRO_4x1x4();
acc._acc2.packet[1] += pLhs2*pRhs1; \ // MICRO_4x1x4();
acc._acc2.packet[2] += pLhs2*pRhs2; \ // MICRO_4x1x4();
acc._acc2.packet[3] += pLhs2*pRhs3; \ // MICRO_4x1x4();
lhsPackMap.advance(4*1); \ // MICRO_4x1x4();
rhsPackMap.advance(1*4); // MICRO_4x1x4();
// MICRO_4x1x4();
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 8, 4> // asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
{ // };
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, // };
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx, // template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
Accumulator& acc) // struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 4, 4>
{ // {
using LhsPacket = typename packet_traits<LhsScalar>::type; // EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
using RhsPacket = typename packet_traits<RhsScalar>::type; // RhsPackMap& rhsPackMap,
// Index rowIdx, Index colIdx, Index depthIdx,
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x4\n\t"); // Accumulator& acc)
// {
LhsPacket pLhs, pLhs2; // using LhsPacket = typename packet_traits<LhsScalar>::type;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; // using RhsPacket = typename packet_traits<RhsScalar>::type;
MICRO_8x1x4(); // asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
MICRO_8x1x4(); // LhsPacket pLhs;
MICRO_8x1x4(); // RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
MICRO_8x1x4();
MICRO_8x1x4(); // MICRO_4x1x4();
MICRO_8x1x4(); // MICRO_4x1x4();
MICRO_8x1x4(); // MICRO_4x1x4();
MICRO_8x1x4(); // MICRO_4x1x4();
// asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
asm __volatile__("#END_NEON_MICROKERNEL_8x8x4\n\t"); // };
}; // };
};
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator> // struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 4>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 4, 4> // {
{ // EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, // RhsPackMap& rhsPackMap,
RhsPackMap& rhsPackMap, // Index rowIdx, Index colIdx, Index depthIdx,
Index rowIdx, Index colIdx, Index depthIdx, // Accumulator& acc)
Accumulator& acc) // {
{ // using LhsPacket = typename packet_traits<LhsScalar>::type;
using LhsPacket = typename packet_traits<LhsScalar>::type; // using RhsPacket = typename packet_traits<RhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x4\n\t");
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x4x4\n\t");
// LhsPacket pLhs;
LhsPacket pLhs, pLhs2; // RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
// MICRO_4x1x4();
MICRO_8x1x4();
MICRO_8x1x4(); // asm __volatile__("#END_NEON_MICROKERNEL_4x1x4\n\t");
MICRO_8x1x4(); // };
MICRO_8x1x4(); // };
asm __volatile__("#END_NEON_MICROKERNEL_8x4x4\n\t"); // template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
}; // struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 1>
}; // {
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator> // RhsPackMap& rhsPackMap,
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 4> // Index rowIdx, Index colIdx, Index depthIdx,
{ // Accumulator& acc)
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, // {
RhsPackMap& rhsPackMap, // using LhsPacket = typename packet_traits<LhsScalar>::type;
Index rowIdx, Index colIdx, Index depthIdx, // using RhsPacket = typename packet_traits<RhsScalar>::type;
Accumulator& acc)
{ // asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type; // LhsPacket pLhs = pload<LhsPacket>(lhsPackMap.pCur);
// RhsPacket pRhs = pset1<RhsPacket>(*rhsPackMap.pCur);
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t");
// acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]);
LhsPacket pLhs, pLhs2; // lhsPackMap.advance(4*1);
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; // pLhs = pload<LhsPacket>(lhsPackMap.pCur);
// acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]);
MICRO_8x1x4();
// lhsPackMap.advance(4*1);
asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t"); // rhsPackMap.advance(1);
}; // asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
}; // };
// };
#define MICRO_4x1x4() \
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \ // template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \ // struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 1>
pRhs0 = pset1<RhsPacket>(pRhs[0]); \ // {
pRhs1 = pset1<RhsPacket>(pRhs[1]); \ // EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
pRhs2 = pset1<RhsPacket>(pRhs[2]); \ // RhsPackMap& rhsPackMap,
pRhs3 = pset1<RhsPacket>(pRhs[3]); \ // Index rowIdx, Index colIdx, Index depthIdx,
acc._acc.packet[0] += pLhs*pRhs0; \ // Accumulator& acc)
acc._acc.packet[1] += pLhs*pRhs1; \ // {
acc._acc.packet[2] += pLhs*pRhs2; \ // using LhsPacket = typename packet_traits<LhsScalar>::type;
acc._acc.packet[3] += pLhs*pRhs3; \ // using RhsPacket = typename packet_traits<RhsScalar>::type;
lhsPackMap.advance(4*1); \
rhsPackMap.advance(1*4); // asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator> // LhsPacket pLhs = pload<LhsPacket>(lhsPackMap.pCur);
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 8, 4> // RhsPacket pRhs = pset1<RhsPacket>(*rhsPackMap.pCur);
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, // //acc._acc += pRhs*pLhs;
RhsPackMap& rhsPackMap, // acc._acc = pmadd(pRhs, pLhs, acc._acc);
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc) // lhsPackMap.advance(4*1);
{ // rhsPackMap.advance(1);
using LhsPacket = typename packet_traits<LhsScalar>::type; // asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
using RhsPacket = typename packet_traits<RhsScalar>::type; // };
// };
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
LhsPacket pLhs; // template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; // struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 1, 1, 4>
// {
MICRO_4x1x4(); // EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
MICRO_4x1x4(); // RhsPackMap& rhsPackMap,
MICRO_4x1x4(); // Index rowIdx, Index colIdx, Index depthIdx,
MICRO_4x1x4(); // Accumulator& acc)
MICRO_4x1x4(); // {
MICRO_4x1x4(); // using RhsPacket = typename packet_traits<RhsScalar>::type;
MICRO_4x1x4(); // using LhsPacket = typename packet_traits<LhsScalar>::type;
MICRO_4x1x4();
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_1x1x4\n\t");
asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
}; // LhsPacket pLhs = pset1<LhsPacket>(*lhsPackMap.pCur);
}; // RhsPacket pRhs = pload<RhsPacket>(rhsPackMap.pCur);
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator> // acc._acc += pLhs*pRhs;
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 4, 4>
{ // lhsPackMap.advance(1);
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, // rhsPackMap.advance(4*1);
RhsPackMap& rhsPackMap, // asm __volatile__("#END_NEON_MICROKERNEL_1x1x4\n\t");
Index rowIdx, Index colIdx, Index depthIdx, // };
Accumulator& acc) // };
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
LhsPacket pLhs;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
MICRO_4x1x4();
MICRO_4x1x4();
MICRO_4x1x4();
MICRO_4x1x4();
asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
};
};
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 4>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x4\n\t");
LhsPacket pLhs;
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
MICRO_4x1x4();
asm __volatile__("#END_NEON_MICROKERNEL_4x1x4\n\t");
};
};
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 1>
{
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
RhsPackMap& rhsPackMap,
Index rowIdx, Index colIdx, Index depthIdx,
Accumulator& acc)
{
using LhsPacket = typename packet_traits<LhsScalar>::type;
using RhsPacket = typename packet_traits<RhsScalar>::type;
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
LhsPacket pLhs = pload<LhsPacket>(lhsPackMap.pCur);
RhsPacket pRhs = pset1<RhsPacket>(*rhsPackMap.pCur);
acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]);
lhsPackMap.advance(4*1);
pLhs = pload<LhsPacket>(lhsPackMap.pCur);
acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]);
lhsPackMap.advance(4*1);
rhsPackMap.advance(1);
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
};
};
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 1>
{
  // 4x1x1 micro-kernel: a single depth step over a 4-row LHS strip and one
  // RHS column, fused-multiply-accumulated into acc._acc.
  EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
                    RhsPackMap& rhsPackMap,
                    Index rowIdx, Index colIdx, Index depthIdx,
                    Accumulator& acc)
  {
    using LhsPacket = typename packet_traits<LhsScalar>::type;
    using RhsPacket = typename packet_traits<RhsScalar>::type;
    asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
    // Broadcast the single RHS scalar, then load one LHS packet.
    RhsPacket rhsVec = pset1<RhsPacket>(*rhsPackMap.pCur);
    LhsPacket lhsVec = pload<LhsPacket>(lhsPackMap.pCur);
    acc._acc = pmadd(rhsVec, lhsVec, acc._acc);
    // Consume 4 LHS entries and 1 RHS entry.
    lhsPackMap.advance(4);
    rhsPackMap.advance(1);
    asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
  };
};
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 1, 1, 4>
{
  // 1x1x4 micro-kernel: one depth step over a single LHS row and a 4-wide
  // RHS strip. Broadcasts one LHS scalar against one RHS packet.
  EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
                    RhsPackMap& rhsPackMap,
                    Index rowIdx, Index colIdx, Index depthIdx,
                    Accumulator& acc)
  {
    using RhsPacket = typename packet_traits<RhsScalar>::type;
    using LhsPacket = typename packet_traits<LhsScalar>::type;
    asm __volatile__("#BEGIN_NEON_MICROKERNEL_1x1x4\n\t");
    LhsPacket pLhs = pset1<LhsPacket>(*lhsPackMap.pCur);
    RhsPacket pRhs = pload<RhsPacket>(rhsPackMap.pCur);
    // Fixed: was `acc._acc += pLhs*pRhs`. Use pmadd like every sibling
    // micro-kernel so the fused multiply-add instruction is emitted.
    acc._acc = pmadd(pLhs, pRhs, acc._acc);
    lhsPackMap.advance(1);
    rhsPackMap.advance(4*1);
    asm __volatile__("#END_NEON_MICROKERNEL_1x1x4\n\t");
  };
};
} // end namespace internal } // end namespace internal
} // end namespace Eigen } // end namespace Eigen

View File

@ -321,12 +321,14 @@ struct LhsLoopStruct
Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap) Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap)
{ {
constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_LHS_DIMENSION]; constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_LHS_DIMENSION];
constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_RHS_DIMENSION];
DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, IDX, IDX> depthLS; DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, IDX, IDX> depthLS;
for(;rowIdx + lhsProgress <= rows; rowIdx+=lhsProgress) for(;rowIdx + lhsProgress <= rows; rowIdx+=lhsProgress)
{ {
lhsPackMap.moveTo(rowIdx); lhsPackMap.moveTo(rowIdx);
rhsPackMap.moveTo(colIdx); rhsPackMap.moveTo(colIdx);
//prefetch(lhsPackMap.pCur + 2*lhsProgress);
//prefetch(rhsPackMap.pCur + 2*rhsProgress);
depthLS(rowIdx, colIdx, 0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap); depthLS(rowIdx, colIdx, 0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
} }
lhsLS(rowIdx, colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap); lhsLS(rowIdx, colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
@ -403,7 +405,7 @@ EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const LhsScalar* blockA, co
rhsLS(0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap); rhsLS(0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
asm __volatile__("#END_GEBP\n\t"); asm __volatile__("#END_GEBP\n\t");
} }
/*
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode> template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
{ {
@ -459,7 +461,7 @@ void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Con
lhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, ColMajor> pack; lhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, ColMajor> pack;
pack(blockA, lhs, depth, rows, stride, offset); pack(blockA, lhs, depth, rows, stride, offset);
} }
*/
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
{ {

View File

@ -14,6 +14,54 @@ namespace Eigen {
namespace internal { namespace internal {
template<int CPU, typename Index, typename Scalar, typename DataMapper, bool isLhs>
struct PackMap<0, CPU, Index, Scalar, DataMapper, isLhs>
{
const Scalar *pBase;
const Scalar *pCur;
Index stride;
Index offset;
Index d2Size;
Index shift;
Index jump;
PackMap(const Scalar *base, Index d2Size, Index stride, Index offset) : pBase(base), pCur(base), d2Size(d2Size), stride(stride), offset(offset)
{
shift = (d2Size / 4) * 4;
jump = shift;
}
EIGEN_STRONG_INLINE void resetCur() { pCur = pBase; }
EIGEN_STRONG_INLINE void moveTo(Index p1)
{
Index offset;
if(isLhs)
{
if(p1 >= shift)
{
offset = static_cast<Index>(shift*d2Size + ((p1%4))*d2Size);
jump = 1;
} else {
offset = p1;
jump = shift;
}
} else {
offset = static_cast<Index>(4*d2Size*(p1/4));
pCur = pBase + offset;
}
pCur = pBase + offset;
}
EIGEN_STRONG_INLINE void advance(int progress)
{
Index offset = static_cast<Index>(isLhs ? jump : progress);
pCur += offset;
}
};
/*
template<int CPU, typename Scalar, bool isLhs> template<int CPU, typename Scalar, bool isLhs>
constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs> = 3; constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs> = 3;
@ -180,7 +228,7 @@ struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, Pane
return c; return c;
} }
}; };
*/
} // end namespace internal } // end namespace internal