mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-10-17 02:21:29 +08:00
WIP2
This commit is contained in:
parent
b2cd094863
commit
70c0363c28
@ -14,494 +14,470 @@ namespace Eigen {
|
|||||||
|
|
||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
template<int CPU, typename LhsScalar, typename RhsScalar>
|
// template<int CPU, typename LhsScalar, typename RhsScalar>
|
||||||
constexpr int SHAPES_COUNT<0, CPU, LhsScalar, RhsScalar> = 9;
|
// constexpr int SHAPES_COUNT<0, CPU, LhsScalar, RhsScalar> = 9;
|
||||||
|
|
||||||
template<int CPU, typename LhsScalar, typename RhsScalar>
|
// template<int CPU, typename LhsScalar, typename RhsScalar>
|
||||||
constexpr int SHAPES<0, CPU, LhsScalar, RhsScalar>[SHAPES_COUNT<0, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
|
// constexpr int SHAPES<0, CPU, LhsScalar, RhsScalar>[SHAPES_COUNT<0, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
|
||||||
{ /*0*/ {1,1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
// { /*0*/ {1,1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
||||||
/*1*/ {4,1,1, 0, 0, SHAPES_POINTER_END},
|
// /*1*/ {4,1,1, 0, 0, SHAPES_POINTER_END},
|
||||||
/*2*/ {8,1,1, 0, 1, SHAPES_POINTER_END},
|
// /*2*/ {8,1,1, 0, 1, SHAPES_POINTER_END},
|
||||||
/*3*/ {1,1,4, 2, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
// /*3*/ {1,1,4, 2, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
||||||
/*4*/ {4,1,4, 2, 2, SHAPES_POINTER_END},
|
// /*4*/ {4,1,4, 2, 2, SHAPES_POINTER_END},
|
||||||
/*5*/ {4,4,4, 2, 2, 3},
|
// /*5*/ {4,4,4, 2, 2, 3},
|
||||||
/*6*/ {4,8,4, 2, 2, 4},
|
// /*6*/ {4,8,4, 2, 2, 4},
|
||||||
/*7*/ {8,1,4, 2, 4, SHAPES_POINTER_END},
|
// /*7*/ {8,1,4, 2, 4, SHAPES_POINTER_END},
|
||||||
/*8*/ {8,4,4, 2, 4, 6}};
|
// /*8*/ {8,4,4, 2, 4, 6}};
|
||||||
|
|
||||||
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
// template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||||
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1>
|
// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1>
|
||||||
{
|
// {
|
||||||
using LinearMapper = typename DataMapper::LinearMapper;
|
// using LinearMapper = typename DataMapper::LinearMapper;
|
||||||
using AccPacket = typename packet_traits<Scalar>::type;
|
// using AccPacket = typename packet_traits<Scalar>::type;
|
||||||
using ResPacket = typename packet_traits<ResScalar>::type;
|
// using ResPacket = typename packet_traits<ResScalar>::type;
|
||||||
|
|
||||||
PacketBlock<AccPacket,2> _acc;
|
// PacketBlock<AccPacket,2> _acc;
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE void zero()
|
// EIGEN_STRONG_INLINE void zero()
|
||||||
{
|
// {
|
||||||
_acc.packet[0] = pset1<AccPacket>(0);
|
// _acc.packet[0] = pset1<AccPacket>(0);
|
||||||
_acc.packet[1] = pset1<AccPacket>(0);
|
// _acc.packet[1] = pset1<AccPacket>(0);
|
||||||
}
|
// }
|
||||||
|
|
||||||
template<typename ResPacket_>
|
// template<typename ResPacket_>
|
||||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||||
{
|
// {
|
||||||
_acc.packet[0] *= pAlpha;
|
// _acc.packet[0] *= pAlpha;
|
||||||
_acc.packet[1] *= pAlpha;
|
// _acc.packet[1] *= pAlpha;
|
||||||
}
|
// }
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||||
{
|
// {
|
||||||
//eigen_assert(false && "4x1");
|
// PacketBlock<ResPacket, 1> block;
|
||||||
//LinearMapper r0 = dest.getLinearMapper(row, col + 0);
|
// block.packet[0] = dest.template loadPacket<ResPacket>(row, col) + _acc.packet[0];
|
||||||
|
// dest.template storePacketBlock<AccPacket, 1>(row, col, block);
|
||||||
//r0.storePacket(0, r0.template loadPacket<ResPacket>(0) + _acc);
|
// block.packet[0] = dest.template loadPacket<ResPacket>(row + 4, col) + _acc.packet[1];
|
||||||
PacketBlock<ResPacket, 1> block;
|
// dest.template storePacketBlock<AccPacket, 1>(row + 4, col, block);
|
||||||
block.packet[0] = dest.template loadPacket<ResPacket>(row, col) + _acc.packet[0];
|
// }
|
||||||
dest.template storePacketBlock<AccPacket, 1>(row, col, block);
|
// };
|
||||||
block.packet[0] = dest.template loadPacket<ResPacket>(row + 4, col) + _acc.packet[1];
|
|
||||||
dest.template storePacketBlock<AccPacket, 1>(row + 4, col, block);
|
// template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||||
/*
|
// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1>
|
||||||
dest(row + 0, col + 0) += _acc.packet[0][0];
|
// {
|
||||||
dest(row + 1, col + 0) += _acc.packet[0][1];
|
// using LinearMapper = typename DataMapper::LinearMapper;
|
||||||
dest(row + 2, col + 0) += _acc.packet[0][2];
|
// using AccPacket = typename packet_traits<Scalar>::type;
|
||||||
dest(row + 3, col + 0) += _acc.packet[0][3];
|
// using ResPacket = typename packet_traits<ResScalar>::type;
|
||||||
|
|
||||||
dest(row + 4, col + 0) += _acc.packet[1][0];
|
// AccPacket _acc;
|
||||||
dest(row + 5, col + 0) += _acc.packet[1][1];
|
|
||||||
dest(row + 6, col + 0) += _acc.packet[1][2];
|
// EIGEN_STRONG_INLINE void zero()
|
||||||
dest(row + 7, col + 0) += _acc.packet[1][3];
|
// {
|
||||||
*/
|
// _acc = pset1<AccPacket>(0);
|
||||||
}
|
// }
|
||||||
};
|
|
||||||
|
// template<typename ResPacket_>
|
||||||
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||||
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1>
|
// {
|
||||||
{
|
// _acc *= pAlpha;
|
||||||
using LinearMapper = typename DataMapper::LinearMapper;
|
// }
|
||||||
using AccPacket = typename packet_traits<Scalar>::type;
|
|
||||||
using ResPacket = typename packet_traits<ResScalar>::type;
|
// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||||
|
// {
|
||||||
AccPacket _acc;
|
// PacketBlock<ResPacket, 1> block;
|
||||||
|
// block.packet[0] = dest.template loadPacket<ResPacket>(row, col) + _acc;
|
||||||
EIGEN_STRONG_INLINE void zero()
|
// dest.template storePacketBlock<AccPacket, 1>(row, col, block);
|
||||||
{
|
// }
|
||||||
_acc = pset1<AccPacket>(0);
|
// };
|
||||||
}
|
|
||||||
|
// template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||||
template<typename ResPacket_>
|
// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4>
|
||||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
// {
|
||||||
{
|
// using LinearMapper = typename DataMapper::LinearMapper;
|
||||||
_acc *= pAlpha;
|
// using AccPacket = typename packet_traits<Scalar>::type;
|
||||||
}
|
// using ResPacket = typename packet_traits<ResScalar>::type;
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
// AccPacket _acc;
|
||||||
{
|
|
||||||
//eigen_assert(false && "4x1");
|
// EIGEN_STRONG_INLINE void zero()
|
||||||
//LinearMapper r0 = dest.getLinearMapper(row, col + 0);
|
// {
|
||||||
|
// _acc = pset1<AccPacket>(0);
|
||||||
//r0.storePacket(0, r0.template loadPacket<ResPacket>(0) + _acc);
|
// }
|
||||||
PacketBlock<ResPacket, 1> block;
|
|
||||||
block.packet[0] = dest.template loadPacket<ResPacket>(row, col) + _acc;
|
// template<typename ResPacket_>
|
||||||
dest.template storePacketBlock<AccPacket, 1>(row, col, block);
|
// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||||
/*
|
// {
|
||||||
dest(row + 0, col) += _acc[0];
|
// _acc *= pAlpha;
|
||||||
dest(row + 1, col) += _acc[1];
|
// }
|
||||||
dest(row + 2, col) += _acc[2];
|
|
||||||
dest(row + 3, col) += _acc[3];
|
// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||||
*/
|
// {
|
||||||
}
|
// ResPacket r = dest.template gatherPacket<ResPacket>(row, col) + _acc;
|
||||||
};
|
// dest.template scatterPacket<ResPacket>(row, col, r);
|
||||||
|
// }
|
||||||
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
// };
|
||||||
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4>
|
|
||||||
{
|
// template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||||
using LinearMapper = typename DataMapper::LinearMapper;
|
// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
|
||||||
using AccPacket = typename packet_traits<Scalar>::type;
|
// {
|
||||||
using ResPacket = typename packet_traits<ResScalar>::type;
|
// using LinearMapper = typename DataMapper::LinearMapper;
|
||||||
|
// using AccPacket = typename packet_traits<Scalar>::type;
|
||||||
AccPacket _acc;
|
// using ResPacket = typename packet_traits<ResScalar>::type;
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE void zero()
|
// PacketBlock<AccPacket, 4> _acc;
|
||||||
{
|
|
||||||
_acc = pset1<AccPacket>(0);
|
// EIGEN_STRONG_INLINE void zero()
|
||||||
}
|
// {
|
||||||
|
// _acc.packet[0] = pset1<AccPacket>(0);
|
||||||
template<typename ResPacket_>
|
// _acc.packet[1] = pset1<AccPacket>(0);
|
||||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
// _acc.packet[2] = pset1<AccPacket>(0);
|
||||||
{
|
// _acc.packet[3] = pset1<AccPacket>(0);
|
||||||
_acc *= pAlpha;
|
// }
|
||||||
}
|
|
||||||
|
// template<typename ResPacket_>
|
||||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||||
{
|
// {
|
||||||
ResPacket r = dest.template gatherPacket<ResPacket>(row, col) + _acc;
|
// _acc.packet[0] *= pAlpha;
|
||||||
dest.template scatterPacket<ResPacket>(row, col, r);
|
// _acc.packet[1] *= pAlpha;
|
||||||
}
|
// _acc.packet[2] *= pAlpha;
|
||||||
};
|
// _acc.packet[3] *= pAlpha;
|
||||||
|
// }
|
||||||
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
|
||||||
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
|
// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||||
{
|
// {
|
||||||
using LinearMapper = typename DataMapper::LinearMapper;
|
// LinearMapper r0 = dest.getLinearMapper(row, col + 0);
|
||||||
using AccPacket = typename packet_traits<Scalar>::type;
|
// LinearMapper r1 = dest.getLinearMapper(row, col + 1);
|
||||||
using ResPacket = typename packet_traits<ResScalar>::type;
|
// LinearMapper r2 = dest.getLinearMapper(row, col + 2);
|
||||||
|
// LinearMapper r3 = dest.getLinearMapper(row, col + 3);
|
||||||
PacketBlock<AccPacket, 4> _acc;
|
|
||||||
|
// r0.storePacket(0, r0.template loadPacket<ResPacket>(0) + _acc.packet[0]);
|
||||||
EIGEN_STRONG_INLINE void zero()
|
// r1.storePacket(0, r1.template loadPacket<ResPacket>(0) + _acc.packet[1]);
|
||||||
{
|
// r2.storePacket(0, r2.template loadPacket<ResPacket>(0) + _acc.packet[2]);
|
||||||
_acc.packet[0] = pset1<AccPacket>(0);
|
// r3.storePacket(0, r3.template loadPacket<ResPacket>(0) + _acc.packet[3]);
|
||||||
_acc.packet[1] = pset1<AccPacket>(0);
|
// }
|
||||||
_acc.packet[2] = pset1<AccPacket>(0);
|
// };
|
||||||
_acc.packet[3] = pset1<AccPacket>(0);
|
|
||||||
}
|
// template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||||
|
// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
|
||||||
template<typename ResPacket_>
|
// {
|
||||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
// using LinearMapper = typename DataMapper::LinearMapper;
|
||||||
{
|
// using AccPacket = typename packet_traits<Scalar>::type;
|
||||||
_acc.packet[0] *= pAlpha;
|
// using ResPacket = typename packet_traits<ResScalar>::type;
|
||||||
_acc.packet[1] *= pAlpha;
|
|
||||||
_acc.packet[2] *= pAlpha;
|
// PacketBlock<AccPacket, 4> _acc1;
|
||||||
_acc.packet[3] *= pAlpha;
|
// PacketBlock<AccPacket, 4> _acc2;
|
||||||
}
|
|
||||||
|
// EIGEN_STRONG_INLINE void zero()
|
||||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
// {
|
||||||
{
|
// _acc1.packet[0] = pset1<AccPacket>(0);
|
||||||
LinearMapper r0 = dest.getLinearMapper(row, col + 0);
|
// _acc1.packet[1] = pset1<AccPacket>(0);
|
||||||
LinearMapper r1 = dest.getLinearMapper(row, col + 1);
|
// _acc1.packet[2] = pset1<AccPacket>(0);
|
||||||
LinearMapper r2 = dest.getLinearMapper(row, col + 2);
|
// _acc1.packet[3] = pset1<AccPacket>(0);
|
||||||
LinearMapper r3 = dest.getLinearMapper(row, col + 3);
|
|
||||||
|
// _acc2.packet[0] = pset1<AccPacket>(0);
|
||||||
r0.storePacket(0, r0.template loadPacket<ResPacket>(0) + _acc.packet[0]);
|
// _acc2.packet[1] = pset1<AccPacket>(0);
|
||||||
r1.storePacket(0, r1.template loadPacket<ResPacket>(0) + _acc.packet[1]);
|
// _acc2.packet[2] = pset1<AccPacket>(0);
|
||||||
r2.storePacket(0, r2.template loadPacket<ResPacket>(0) + _acc.packet[2]);
|
// _acc2.packet[3] = pset1<AccPacket>(0);
|
||||||
r3.storePacket(0, r3.template loadPacket<ResPacket>(0) + _acc.packet[3]);
|
// }
|
||||||
}
|
|
||||||
};
|
// template<typename ResPacket_>
|
||||||
|
// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||||
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
// {
|
||||||
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
|
// _acc1.packet[0] *= pAlpha;
|
||||||
{
|
// _acc1.packet[1] *= pAlpha;
|
||||||
using LinearMapper = typename DataMapper::LinearMapper;
|
// _acc1.packet[2] *= pAlpha;
|
||||||
using AccPacket = typename packet_traits<Scalar>::type;
|
// _acc1.packet[3] *= pAlpha;
|
||||||
using ResPacket = typename packet_traits<ResScalar>::type;
|
|
||||||
|
// _acc2.packet[0] *= pAlpha;
|
||||||
PacketBlock<AccPacket, 4> _acc1;
|
// _acc2.packet[1] *= pAlpha;
|
||||||
PacketBlock<AccPacket, 4> _acc2;
|
// _acc2.packet[2] *= pAlpha;
|
||||||
|
// _acc2.packet[3] *= pAlpha;
|
||||||
EIGEN_STRONG_INLINE void zero()
|
// }
|
||||||
{
|
|
||||||
_acc1.packet[0] = pset1<AccPacket>(0);
|
// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||||
_acc1.packet[1] = pset1<AccPacket>(0);
|
// {
|
||||||
_acc1.packet[2] = pset1<AccPacket>(0);
|
// LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0);
|
||||||
_acc1.packet[3] = pset1<AccPacket>(0);
|
// LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1);
|
||||||
|
// LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2);
|
||||||
_acc2.packet[0] = pset1<AccPacket>(0);
|
// LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3);
|
||||||
_acc2.packet[1] = pset1<AccPacket>(0);
|
|
||||||
_acc2.packet[2] = pset1<AccPacket>(0);
|
// LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0);
|
||||||
_acc2.packet[3] = pset1<AccPacket>(0);
|
// LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1);
|
||||||
}
|
// LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2);
|
||||||
|
// LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3);
|
||||||
template<typename ResPacket_>
|
|
||||||
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
|
||||||
{
|
// r00.storePacket(0, r00.template loadPacket<ResPacket>(0) + _acc1.packet[0]);
|
||||||
_acc1.packet[0] *= pAlpha;
|
// r01.storePacket(0, r01.template loadPacket<ResPacket>(0) + _acc1.packet[1]);
|
||||||
_acc1.packet[1] *= pAlpha;
|
// r02.storePacket(0, r02.template loadPacket<ResPacket>(0) + _acc1.packet[2]);
|
||||||
_acc1.packet[2] *= pAlpha;
|
// r03.storePacket(0, r03.template loadPacket<ResPacket>(0) + _acc1.packet[3]);
|
||||||
_acc1.packet[3] *= pAlpha;
|
|
||||||
|
// r10.storePacket(0, r10.template loadPacket<ResPacket>(0) + _acc2.packet[0]);
|
||||||
_acc2.packet[0] *= pAlpha;
|
// r11.storePacket(0, r11.template loadPacket<ResPacket>(0) + _acc2.packet[1]);
|
||||||
_acc2.packet[1] *= pAlpha;
|
// r12.storePacket(0, r12.template loadPacket<ResPacket>(0) + _acc2.packet[2]);
|
||||||
_acc2.packet[2] *= pAlpha;
|
// r13.storePacket(0, r13.template loadPacket<ResPacket>(0) + _acc2.packet[3]);
|
||||||
_acc2.packet[3] *= pAlpha;
|
// }
|
||||||
}
|
// };
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
// #define MICRO_8x1x4() \
|
||||||
{
|
// pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||||
LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0);
|
// lhsPackMap.advance(4*1); \
|
||||||
LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1);
|
// pLhs2 = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||||
LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2);
|
// pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
|
||||||
LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3);
|
// pRhs0 = pset1<RhsPacket>(pRhs[0]); \
|
||||||
|
// pRhs1 = pset1<RhsPacket>(pRhs[1]); \
|
||||||
LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0);
|
// pRhs2 = pset1<RhsPacket>(pRhs[2]); \
|
||||||
LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1);
|
// pRhs3 = pset1<RhsPacket>(pRhs[3]); \
|
||||||
LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2);
|
// acc._acc1.packet[0] += pLhs*pRhs0; \
|
||||||
LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3);
|
// acc._acc1.packet[1] += pLhs*pRhs1; \
|
||||||
|
// acc._acc1.packet[2] += pLhs*pRhs2; \
|
||||||
|
// acc._acc1.packet[3] += pLhs*pRhs3; \
|
||||||
r00.storePacket(0, r00.template loadPacket<ResPacket>(0) + _acc1.packet[0]);
|
// acc._acc2.packet[0] += pLhs2*pRhs0; \
|
||||||
r01.storePacket(0, r01.template loadPacket<ResPacket>(0) + _acc1.packet[1]);
|
// acc._acc2.packet[1] += pLhs2*pRhs1; \
|
||||||
r02.storePacket(0, r02.template loadPacket<ResPacket>(0) + _acc1.packet[2]);
|
// acc._acc2.packet[2] += pLhs2*pRhs2; \
|
||||||
r03.storePacket(0, r03.template loadPacket<ResPacket>(0) + _acc1.packet[3]);
|
// acc._acc2.packet[3] += pLhs2*pRhs3; \
|
||||||
|
// lhsPackMap.advance(4*1); \
|
||||||
r10.storePacket(0, r10.template loadPacket<ResPacket>(0) + _acc2.packet[0]);
|
// rhsPackMap.advance(1*4);
|
||||||
r11.storePacket(0, r11.template loadPacket<ResPacket>(0) + _acc2.packet[1]);
|
|
||||||
r12.storePacket(0, r12.template loadPacket<ResPacket>(0) + _acc2.packet[2]);
|
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
r13.storePacket(0, r13.template loadPacket<ResPacket>(0) + _acc2.packet[3]);
|
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 8, 4>
|
||||||
}
|
// {
|
||||||
};
|
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
|
// RhsPackMap& rhsPackMap,
|
||||||
#define MICRO_8x1x4() \
|
// Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
// Accumulator& acc)
|
||||||
lhsPackMap.advance(4*1); \
|
// {
|
||||||
pLhs2 = pload<LhsPacket>(lhsPackMap.pCur); \
|
// using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
|
// using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
pRhs0 = pset1<RhsPacket>(pRhs[0]); \
|
|
||||||
pRhs1 = pset1<RhsPacket>(pRhs[1]); \
|
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x4\n\t");
|
||||||
pRhs2 = pset1<RhsPacket>(pRhs[2]); \
|
|
||||||
pRhs3 = pset1<RhsPacket>(pRhs[3]); \
|
// LhsPacket pLhs, pLhs2;
|
||||||
acc._acc1.packet[0] += pLhs*pRhs0; \
|
// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||||
acc._acc1.packet[1] += pLhs*pRhs1; \
|
|
||||||
acc._acc1.packet[2] += pLhs*pRhs2; \
|
// MICRO_8x1x4();
|
||||||
acc._acc1.packet[3] += pLhs*pRhs3; \
|
// MICRO_8x1x4();
|
||||||
acc._acc2.packet[0] += pLhs2*pRhs0; \
|
// MICRO_8x1x4();
|
||||||
acc._acc2.packet[1] += pLhs2*pRhs1; \
|
// MICRO_8x1x4();
|
||||||
acc._acc2.packet[2] += pLhs2*pRhs2; \
|
// MICRO_8x1x4();
|
||||||
acc._acc2.packet[3] += pLhs2*pRhs3; \
|
// MICRO_8x1x4();
|
||||||
lhsPackMap.advance(4*1); \
|
// MICRO_8x1x4();
|
||||||
rhsPackMap.advance(1*4);
|
// MICRO_8x1x4();
|
||||||
|
|
||||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
// asm __volatile__("#END_NEON_MICROKERNEL_8x8x4\n\t");
|
||||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 8, 4>
|
// };
|
||||||
{
|
// };
|
||||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
|
||||||
RhsPackMap& rhsPackMap,
|
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
Index rowIdx, Index colIdx, Index depthIdx,
|
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 4, 4>
|
||||||
Accumulator& acc)
|
// {
|
||||||
{
|
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
// RhsPackMap& rhsPackMap,
|
||||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
// Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
|
// Accumulator& acc)
|
||||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x8x4\n\t");
|
// {
|
||||||
|
// using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
LhsPacket pLhs, pLhs2;
|
// using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
|
||||||
|
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x4x4\n\t");
|
||||||
MICRO_8x1x4();
|
|
||||||
MICRO_8x1x4();
|
// LhsPacket pLhs, pLhs2;
|
||||||
MICRO_8x1x4();
|
// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||||
MICRO_8x1x4();
|
|
||||||
MICRO_8x1x4();
|
// prefetch(lhsPackMap.pCur + 2*32);
|
||||||
MICRO_8x1x4();
|
// prefetch(rhsPackMap.pCur + 2*16);
|
||||||
MICRO_8x1x4();
|
// MICRO_8x1x4();
|
||||||
MICRO_8x1x4();
|
// MICRO_8x1x4();
|
||||||
|
// MICRO_8x1x4();
|
||||||
asm __volatile__("#END_NEON_MICROKERNEL_8x8x4\n\t");
|
// MICRO_8x1x4();
|
||||||
};
|
|
||||||
};
|
// asm __volatile__("#END_NEON_MICROKERNEL_8x4x4\n\t");
|
||||||
|
// };
|
||||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
// };
|
||||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 4, 4>
|
|
||||||
{
|
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 4>
|
||||||
RhsPackMap& rhsPackMap,
|
// {
|
||||||
Index rowIdx, Index colIdx, Index depthIdx,
|
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
Accumulator& acc)
|
// RhsPackMap& rhsPackMap,
|
||||||
{
|
// Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
// Accumulator& acc)
|
||||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
// {
|
||||||
|
// using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x4x4\n\t");
|
// using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
|
|
||||||
LhsPacket pLhs, pLhs2;
|
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t");
|
||||||
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
|
||||||
|
// LhsPacket pLhs, pLhs2;
|
||||||
MICRO_8x1x4();
|
// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||||
MICRO_8x1x4();
|
|
||||||
MICRO_8x1x4();
|
// MICRO_8x1x4();
|
||||||
MICRO_8x1x4();
|
|
||||||
|
// asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t");
|
||||||
asm __volatile__("#END_NEON_MICROKERNEL_8x4x4\n\t");
|
// };
|
||||||
};
|
// };
|
||||||
};
|
|
||||||
|
// #define MICRO_4x1x4() \
|
||||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
// pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 4>
|
// pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
|
||||||
{
|
// pRhs0 = pset1<RhsPacket>(pRhs[0]); \
|
||||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
// pRhs1 = pset1<RhsPacket>(pRhs[1]); \
|
||||||
RhsPackMap& rhsPackMap,
|
// pRhs2 = pset1<RhsPacket>(pRhs[2]); \
|
||||||
Index rowIdx, Index colIdx, Index depthIdx,
|
// pRhs3 = pset1<RhsPacket>(pRhs[3]); \
|
||||||
Accumulator& acc)
|
// acc._acc.packet[0] += pLhs*pRhs0; \
|
||||||
{
|
// acc._acc.packet[1] += pLhs*pRhs1; \
|
||||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
// acc._acc.packet[2] += pLhs*pRhs2; \
|
||||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
// acc._acc.packet[3] += pLhs*pRhs3; \
|
||||||
|
// lhsPackMap.advance(4*1); \
|
||||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t");
|
// rhsPackMap.advance(1*4);
|
||||||
|
|
||||||
LhsPacket pLhs, pLhs2;
|
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 8, 4>
|
||||||
|
// {
|
||||||
MICRO_8x1x4();
|
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
|
// RhsPackMap& rhsPackMap,
|
||||||
asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t");
|
// Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
};
|
// Accumulator& acc)
|
||||||
};
|
// {
|
||||||
|
// using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
#define MICRO_4x1x4() \
|
// using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
|
||||||
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
|
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
|
||||||
pRhs0 = pset1<RhsPacket>(pRhs[0]); \
|
// LhsPacket pLhs;
|
||||||
pRhs1 = pset1<RhsPacket>(pRhs[1]); \
|
// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||||
pRhs2 = pset1<RhsPacket>(pRhs[2]); \
|
|
||||||
pRhs3 = pset1<RhsPacket>(pRhs[3]); \
|
// MICRO_4x1x4();
|
||||||
acc._acc.packet[0] += pLhs*pRhs0; \
|
// MICRO_4x1x4();
|
||||||
acc._acc.packet[1] += pLhs*pRhs1; \
|
// MICRO_4x1x4();
|
||||||
acc._acc.packet[2] += pLhs*pRhs2; \
|
// MICRO_4x1x4();
|
||||||
acc._acc.packet[3] += pLhs*pRhs3; \
|
// MICRO_4x1x4();
|
||||||
lhsPackMap.advance(4*1); \
|
// MICRO_4x1x4();
|
||||||
rhsPackMap.advance(1*4);
|
// MICRO_4x1x4();
|
||||||
|
// MICRO_4x1x4();
|
||||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
|
||||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 8, 4>
|
// asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
|
||||||
{
|
// };
|
||||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
// };
|
||||||
RhsPackMap& rhsPackMap,
|
|
||||||
Index rowIdx, Index colIdx, Index depthIdx,
|
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
Accumulator& acc)
|
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 4, 4>
|
||||||
{
|
// {
|
||||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
// RhsPackMap& rhsPackMap,
|
||||||
|
// Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
|
// Accumulator& acc)
|
||||||
LhsPacket pLhs;
|
// {
|
||||||
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
// using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
|
// using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
MICRO_4x1x4();
|
|
||||||
MICRO_4x1x4();
|
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
|
||||||
MICRO_4x1x4();
|
// LhsPacket pLhs;
|
||||||
MICRO_4x1x4();
|
// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||||
MICRO_4x1x4();
|
|
||||||
MICRO_4x1x4();
|
// MICRO_4x1x4();
|
||||||
MICRO_4x1x4();
|
// MICRO_4x1x4();
|
||||||
MICRO_4x1x4();
|
// MICRO_4x1x4();
|
||||||
|
// MICRO_4x1x4();
|
||||||
asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
|
// asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
|
||||||
};
|
// };
|
||||||
};
|
// };
|
||||||
|
|
||||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 4, 4>
|
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 4>
|
||||||
{
|
// {
|
||||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
RhsPackMap& rhsPackMap,
|
// RhsPackMap& rhsPackMap,
|
||||||
Index rowIdx, Index colIdx, Index depthIdx,
|
// Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
Accumulator& acc)
|
// Accumulator& acc)
|
||||||
{
|
// {
|
||||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
// using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
// using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
|
|
||||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x4x4\n\t");
|
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x4\n\t");
|
||||||
LhsPacket pLhs;
|
|
||||||
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
// LhsPacket pLhs;
|
||||||
|
// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||||
MICRO_4x1x4();
|
|
||||||
MICRO_4x1x4();
|
// MICRO_4x1x4();
|
||||||
MICRO_4x1x4();
|
|
||||||
MICRO_4x1x4();
|
// asm __volatile__("#END_NEON_MICROKERNEL_4x1x4\n\t");
|
||||||
asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
|
// };
|
||||||
};
|
// };
|
||||||
};
|
|
||||||
|
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 1>
|
||||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 4>
|
// {
|
||||||
{
|
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
// RhsPackMap& rhsPackMap,
|
||||||
RhsPackMap& rhsPackMap,
|
// Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
Index rowIdx, Index colIdx, Index depthIdx,
|
// Accumulator& acc)
|
||||||
Accumulator& acc)
|
// {
|
||||||
{
|
// using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
// using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
|
||||||
|
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x4\n\t");
|
|
||||||
|
// LhsPacket pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
||||||
LhsPacket pLhs;
|
// RhsPacket pRhs = pset1<RhsPacket>(*rhsPackMap.pCur);
|
||||||
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
|
||||||
|
// acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]);
|
||||||
MICRO_4x1x4();
|
// lhsPackMap.advance(4*1);
|
||||||
|
// pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
||||||
asm __volatile__("#END_NEON_MICROKERNEL_4x1x4\n\t");
|
// acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]);
|
||||||
};
|
|
||||||
};
|
// lhsPackMap.advance(4*1);
|
||||||
|
// rhsPackMap.advance(1);
|
||||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
// asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 1>
|
// };
|
||||||
{
|
// };
|
||||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
|
||||||
RhsPackMap& rhsPackMap,
|
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
Index rowIdx, Index colIdx, Index depthIdx,
|
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 1>
|
||||||
Accumulator& acc)
|
// {
|
||||||
{
|
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
// RhsPackMap& rhsPackMap,
|
||||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
// Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
|
// Accumulator& acc)
|
||||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
// {
|
||||||
|
// using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
LhsPacket pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
// using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
RhsPacket pRhs = pset1<RhsPacket>(*rhsPackMap.pCur);
|
|
||||||
|
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]);
|
|
||||||
lhsPackMap.advance(4*1);
|
// LhsPacket pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
||||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
// RhsPacket pRhs = pset1<RhsPacket>(*rhsPackMap.pCur);
|
||||||
acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]);
|
|
||||||
|
// //acc._acc += pRhs*pLhs;
|
||||||
lhsPackMap.advance(4*1);
|
// acc._acc = pmadd(pRhs, pLhs, acc._acc);
|
||||||
rhsPackMap.advance(1);
|
|
||||||
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
// lhsPackMap.advance(4*1);
|
||||||
};
|
// rhsPackMap.advance(1);
|
||||||
};
|
// asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
|
// };
|
||||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
// };
|
||||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 1>
|
|
||||||
{
|
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 1, 1, 4>
|
||||||
RhsPackMap& rhsPackMap,
|
// {
|
||||||
Index rowIdx, Index colIdx, Index depthIdx,
|
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
Accumulator& acc)
|
// RhsPackMap& rhsPackMap,
|
||||||
{
|
// Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
// Accumulator& acc)
|
||||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
// {
|
||||||
|
// using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
// using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
|
|
||||||
LhsPacket pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_1x1x4\n\t");
|
||||||
RhsPacket pRhs = pset1<RhsPacket>(*rhsPackMap.pCur);
|
|
||||||
|
// LhsPacket pLhs = pset1<LhsPacket>(*lhsPackMap.pCur);
|
||||||
//acc._acc += pRhs*pLhs;
|
// RhsPacket pRhs = pload<RhsPacket>(rhsPackMap.pCur);
|
||||||
acc._acc = pmadd(pRhs, pLhs, acc._acc);
|
|
||||||
|
// acc._acc += pLhs*pRhs;
|
||||||
lhsPackMap.advance(4*1);
|
|
||||||
rhsPackMap.advance(1);
|
// lhsPackMap.advance(1);
|
||||||
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
// rhsPackMap.advance(4*1);
|
||||||
};
|
// asm __volatile__("#END_NEON_MICROKERNEL_1x1x4\n\t");
|
||||||
};
|
// };
|
||||||
|
// };
|
||||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
|
||||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 1, 1, 4>
|
|
||||||
{
|
|
||||||
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
|
||||||
RhsPackMap& rhsPackMap,
|
|
||||||
Index rowIdx, Index colIdx, Index depthIdx,
|
|
||||||
Accumulator& acc)
|
|
||||||
{
|
|
||||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
|
||||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
|
||||||
|
|
||||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_1x1x4\n\t");
|
|
||||||
|
|
||||||
LhsPacket pLhs = pset1<LhsPacket>(*lhsPackMap.pCur);
|
|
||||||
RhsPacket pRhs = pload<RhsPacket>(rhsPackMap.pCur);
|
|
||||||
|
|
||||||
acc._acc += pLhs*pRhs;
|
|
||||||
|
|
||||||
lhsPackMap.advance(1);
|
|
||||||
rhsPackMap.advance(4*1);
|
|
||||||
asm __volatile__("#END_NEON_MICROKERNEL_1x1x4\n\t");
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
@ -321,12 +321,14 @@ struct LhsLoopStruct
|
|||||||
Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap)
|
Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap)
|
||||||
{
|
{
|
||||||
constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_LHS_DIMENSION];
|
constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_LHS_DIMENSION];
|
||||||
|
constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_RHS_DIMENSION];
|
||||||
DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, IDX, IDX> depthLS;
|
DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, IDX, IDX> depthLS;
|
||||||
for(;rowIdx + lhsProgress <= rows; rowIdx+=lhsProgress)
|
for(;rowIdx + lhsProgress <= rows; rowIdx+=lhsProgress)
|
||||||
{
|
{
|
||||||
lhsPackMap.moveTo(rowIdx);
|
lhsPackMap.moveTo(rowIdx);
|
||||||
rhsPackMap.moveTo(colIdx);
|
rhsPackMap.moveTo(colIdx);
|
||||||
|
//prefetch(lhsPackMap.pCur + 2*lhsProgress);
|
||||||
|
//prefetch(rhsPackMap.pCur + 2*rhsProgress);
|
||||||
depthLS(rowIdx, colIdx, 0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
|
depthLS(rowIdx, colIdx, 0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
|
||||||
}
|
}
|
||||||
lhsLS(rowIdx, colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
|
lhsLS(rowIdx, colIdx, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
|
||||||
@ -403,7 +405,7 @@ EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const LhsScalar* blockA, co
|
|||||||
rhsLS(0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
|
rhsLS(0, res, rows, depth, cols, alpha, pAlpha, lhsPackMap, rhsPackMap);
|
||||||
asm __volatile__("#END_GEBP\n\t");
|
asm __volatile__("#END_GEBP\n\t");
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
|
||||||
struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
|
struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
|
||||||
{
|
{
|
||||||
@ -459,7 +461,7 @@ void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Con
|
|||||||
lhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, ColMajor> pack;
|
lhs_pack<0, 0, Index, float, DataMapper, Conjugate, PanelMode, ColMajor> pack;
|
||||||
pack(blockA, lhs, depth, rows, stride, offset);
|
pack(blockA, lhs, depth, rows, stride, offset);
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
|
template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
|
||||||
struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
|
struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
|
||||||
{
|
{
|
||||||
|
@ -14,6 +14,54 @@ namespace Eigen {
|
|||||||
|
|
||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
|
template<int CPU, typename Index, typename Scalar, typename DataMapper, bool isLhs>
|
||||||
|
struct PackMap<0, CPU, Index, Scalar, DataMapper, isLhs>
|
||||||
|
{
|
||||||
|
const Scalar *pBase;
|
||||||
|
const Scalar *pCur;
|
||||||
|
Index stride;
|
||||||
|
Index offset;
|
||||||
|
Index d2Size;
|
||||||
|
|
||||||
|
Index shift;
|
||||||
|
Index jump;
|
||||||
|
|
||||||
|
PackMap(const Scalar *base, Index d2Size, Index stride, Index offset) : pBase(base), pCur(base), d2Size(d2Size), stride(stride), offset(offset)
|
||||||
|
{
|
||||||
|
shift = (d2Size / 4) * 4;
|
||||||
|
jump = shift;
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE void resetCur() { pCur = pBase; }
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE void moveTo(Index p1)
|
||||||
|
{
|
||||||
|
Index offset;
|
||||||
|
if(isLhs)
|
||||||
|
{
|
||||||
|
if(p1 >= shift)
|
||||||
|
{
|
||||||
|
offset = static_cast<Index>(shift*d2Size + ((p1%4))*d2Size);
|
||||||
|
jump = 1;
|
||||||
|
} else {
|
||||||
|
offset = p1;
|
||||||
|
jump = shift;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
offset = static_cast<Index>(4*d2Size*(p1/4));
|
||||||
|
pCur = pBase + offset;
|
||||||
|
}
|
||||||
|
pCur = pBase + offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE void advance(int progress)
|
||||||
|
{
|
||||||
|
Index offset = static_cast<Index>(isLhs ? jump : progress);
|
||||||
|
pCur += offset;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
template<int CPU, typename Scalar, bool isLhs>
|
template<int CPU, typename Scalar, bool isLhs>
|
||||||
constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs> = 3;
|
constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs> = 3;
|
||||||
|
|
||||||
@ -180,7 +228,7 @@ struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, Pane
|
|||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
*/
|
||||||
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user