mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-07-26 06:44:27 +08:00
WIP with tests
This commit is contained in:
parent
54f80f442d
commit
a8ec6d6a36
@ -14,52 +14,135 @@ namespace Eigen {
|
|||||||
|
|
||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
// template<int CPU, typename LhsScalar, typename RhsScalar>
|
#ifdef __ENABLE_VECTOR_KERNELS__
|
||||||
// constexpr int SHAPES_COUNT<0, CPU, LhsScalar, RhsScalar> = 9;
|
|
||||||
|
|
||||||
// template<int CPU, typename LhsScalar, typename RhsScalar>
|
#define MICRO_12x1x4() \
|
||||||
// constexpr int SHAPES<0, CPU, LhsScalar, RhsScalar>[SHAPES_COUNT<0, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
|
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||||
// { /*0*/ {1,1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
lhsPackMap.advance(4*1); \
|
||||||
// /*1*/ {4,1,1, 0, 0, SHAPES_POINTER_END},
|
pLhs2 = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||||
// /*2*/ {8,1,1, 0, 1, SHAPES_POINTER_END},
|
lhsPackMap.advance(4*1); \
|
||||||
// /*3*/ {1,1,4, 2, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
pLhs3 = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||||
// /*4*/ {4,1,4, 2, 2, SHAPES_POINTER_END},
|
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
|
||||||
// /*5*/ {4,4,4, 2, 2, 3},
|
pRhs0 = pset1<RhsPacket>(pRhs[0]); \
|
||||||
// /*6*/ {4,8,4, 2, 2, 4},
|
pRhs1 = pset1<RhsPacket>(pRhs[1]); \
|
||||||
// /*7*/ {8,1,4, 2, 4, SHAPES_POINTER_END},
|
pRhs2 = pset1<RhsPacket>(pRhs[2]); \
|
||||||
// /*8*/ {8,4,4, 2, 4, 6}};
|
pRhs3 = pset1<RhsPacket>(pRhs[3]); \
|
||||||
|
acc._acc1.packet[0] += pLhs*pRhs0; \
|
||||||
|
acc._acc1.packet[1] += pLhs*pRhs1; \
|
||||||
|
acc._acc1.packet[2] += pLhs*pRhs2; \
|
||||||
|
acc._acc1.packet[3] += pLhs*pRhs3; \
|
||||||
|
acc._acc2.packet[0] += pLhs2*pRhs0; \
|
||||||
|
acc._acc2.packet[1] += pLhs2*pRhs1; \
|
||||||
|
acc._acc2.packet[2] += pLhs2*pRhs2; \
|
||||||
|
acc._acc2.packet[3] += pLhs2*pRhs3; \
|
||||||
|
acc._acc3.packet[0] += pLhs3*pRhs0; \
|
||||||
|
acc._acc3.packet[1] += pLhs3*pRhs1; \
|
||||||
|
acc._acc3.packet[2] += pLhs3*pRhs2; \
|
||||||
|
acc._acc3.packet[3] += pLhs3*pRhs3; \
|
||||||
|
lhsPackMap.advance(4*1); \
|
||||||
|
rhsPackMap.advance(1*4);
|
||||||
|
|
||||||
// template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
#define MICRO_8x1x4() \
|
||||||
// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1>
|
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||||
// {
|
lhsPackMap.advance(4*1); \
|
||||||
// using LinearMapper = typename DataMapper::LinearMapper;
|
pLhs2 = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||||
// using AccPacket = typename packet_traits<Scalar>::type;
|
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
|
||||||
// using ResPacket = typename packet_traits<ResScalar>::type;
|
pRhs0 = pset1<RhsPacket>(pRhs[0]); \
|
||||||
|
pRhs1 = pset1<RhsPacket>(pRhs[1]); \
|
||||||
|
pRhs2 = pset1<RhsPacket>(pRhs[2]); \
|
||||||
|
pRhs3 = pset1<RhsPacket>(pRhs[3]); \
|
||||||
|
acc._acc1.packet[0] += pLhs*pRhs0; \
|
||||||
|
acc._acc1.packet[1] += pLhs*pRhs1; \
|
||||||
|
acc._acc1.packet[2] += pLhs*pRhs2; \
|
||||||
|
acc._acc1.packet[3] += pLhs*pRhs3; \
|
||||||
|
acc._acc2.packet[0] += pLhs2*pRhs0; \
|
||||||
|
acc._acc2.packet[1] += pLhs2*pRhs1; \
|
||||||
|
acc._acc2.packet[2] += pLhs2*pRhs2; \
|
||||||
|
acc._acc2.packet[3] += pLhs2*pRhs3; \
|
||||||
|
lhsPackMap.advance(4*1); \
|
||||||
|
rhsPackMap.advance(1*4);
|
||||||
|
|
||||||
// PacketBlock<AccPacket,2> _acc;
|
// One depth step of the 4-row x 4-column micro kernel.
// The expansion site must declare pLhs (LhsPacket), pRhs and pRhs0..pRhs3 (RhsPacket),
// and have lhsPackMap, rhsPackMap and acc in scope; acc._acc must expose packet[0..3].
// For each of the four lanes of the loaded rhs packet, the lane is broadcast and
// lhs * broadcast is added to the matching accumulator packet. Both pack cursors are
// advanced by 4 afterwards.
// NOTE: expands to a plain statement sequence (already terminated by ';'), not a single statement.
#define MICRO_4x1x4() \
  pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
  pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
  pRhs0 = pset1<RhsPacket>(pRhs[0]); \
  acc._acc.packet[0] += pLhs*pRhs0; \
  pRhs1 = pset1<RhsPacket>(pRhs[1]); \
  acc._acc.packet[1] += pLhs*pRhs1; \
  pRhs2 = pset1<RhsPacket>(pRhs[2]); \
  acc._acc.packet[2] += pLhs*pRhs2; \
  pRhs3 = pset1<RhsPacket>(pRhs[3]); \
  acc._acc.packet[3] += pLhs*pRhs3; \
  lhsPackMap.advance(4*1); \
  rhsPackMap.advance(1*4);
|
||||||
|
|
||||||
// EIGEN_STRONG_INLINE void zero()
|
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||||
// {
|
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 1>
|
||||||
// _acc.packet[0] = pset1<AccPacket>(0);
|
{
|
||||||
// _acc.packet[1] = pset1<AccPacket>(0);
|
using LinearMapper = typename DataMapper::LinearMapper;
|
||||||
// }
|
using AccPacket = typename packet_traits<Scalar>::type;
|
||||||
|
using ResPacket = typename packet_traits<ResScalar>::type;
|
||||||
|
|
||||||
// template<typename ResPacket_>
|
PacketBlock<AccPacket,3> _acc;
|
||||||
// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
|
||||||
// {
|
|
||||||
// _acc.packet[0] *= pAlpha;
|
|
||||||
// _acc.packet[1] *= pAlpha;
|
|
||||||
// }
|
|
||||||
|
|
||||||
// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
EIGEN_STRONG_INLINE void zero()
|
||||||
// {
|
{
|
||||||
// PacketBlock<ResPacket, 1> block;
|
_acc.packet[0] = pset1<AccPacket>(0);
|
||||||
// block.packet[0] = dest.template loadPacket<ResPacket>(row, col) + _acc.packet[0];
|
_acc.packet[1] = pset1<AccPacket>(0);
|
||||||
// dest.template storePacketBlock<AccPacket, 1>(row, col, block);
|
_acc.packet[2] = pset1<AccPacket>(0);
|
||||||
// block.packet[0] = dest.template loadPacket<ResPacket>(row + 4, col) + _acc.packet[1];
|
}
|
||||||
// dest.template storePacketBlock<AccPacket, 1>(row + 4, col, block);
|
|
||||||
// }
|
template<typename ResPacket_>
|
||||||
// };
|
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||||
|
{
|
||||||
|
_acc.packet[0] *= pAlpha;
|
||||||
|
_acc.packet[1] *= pAlpha;
|
||||||
|
_acc.packet[2] *= pAlpha;
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||||
|
{
|
||||||
|
PacketBlock<ResPacket, 1> block;
|
||||||
|
block.packet[0] = dest.template loadPacket<ResPacket>(row + 0, col) + _acc.packet[0];
|
||||||
|
dest.template storePacketBlock<AccPacket, 1>(row + 0, col, block);
|
||||||
|
block.packet[0] = dest.template loadPacket<ResPacket>(row + 4, col) + _acc.packet[1];
|
||||||
|
dest.template storePacketBlock<AccPacket, 1>(row + 4, col, block);
|
||||||
|
block.packet[0] = dest.template loadPacket<ResPacket>(row + 8, col) + _acc.packet[2];
|
||||||
|
dest.template storePacketBlock<AccPacket, 1>(row + 8, col, block);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||||
|
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1>
|
||||||
|
{
|
||||||
|
using LinearMapper = typename DataMapper::LinearMapper;
|
||||||
|
using AccPacket = typename packet_traits<Scalar>::type;
|
||||||
|
using ResPacket = typename packet_traits<ResScalar>::type;
|
||||||
|
|
||||||
|
PacketBlock<AccPacket,2> _acc;
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE void zero()
|
||||||
|
{
|
||||||
|
_acc.packet[0] = pset1<AccPacket>(0);
|
||||||
|
_acc.packet[1] = pset1<AccPacket>(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename ResPacket_>
|
||||||
|
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||||
|
{
|
||||||
|
_acc.packet[0] *= pAlpha;
|
||||||
|
_acc.packet[1] *= pAlpha;
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||||
|
{
|
||||||
|
PacketBlock<ResPacket, 1> block;
|
||||||
|
block.packet[0] = dest.template loadPacket<ResPacket>(row, col) + _acc.packet[0];
|
||||||
|
dest.template storePacketBlock<AccPacket, 1>(row, col, block);
|
||||||
|
block.packet[0] = dest.template loadPacket<ResPacket>(row + 4, col) + _acc.packet[1];
|
||||||
|
dest.template storePacketBlock<AccPacket, 1>(row + 4, col, block);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||||
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1>
|
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1>
|
||||||
@ -156,87 +239,150 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||||
// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
|
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
|
||||||
// {
|
{
|
||||||
// using LinearMapper = typename DataMapper::LinearMapper;
|
using LinearMapper = typename DataMapper::LinearMapper;
|
||||||
// using AccPacket = typename packet_traits<Scalar>::type;
|
using AccPacket = typename packet_traits<Scalar>::type;
|
||||||
// using ResPacket = typename packet_traits<ResScalar>::type;
|
using ResPacket = typename packet_traits<ResScalar>::type;
|
||||||
|
|
||||||
// PacketBlock<AccPacket, 4> _acc1;
|
PacketBlock<AccPacket, 4> _acc1;
|
||||||
// PacketBlock<AccPacket, 4> _acc2;
|
PacketBlock<AccPacket, 4> _acc2;
|
||||||
|
|
||||||
// EIGEN_STRONG_INLINE void zero()
|
EIGEN_STRONG_INLINE void zero()
|
||||||
// {
|
{
|
||||||
// _acc1.packet[0] = pset1<AccPacket>(0);
|
_acc1.packet[0] = pset1<AccPacket>(0);
|
||||||
// _acc1.packet[1] = pset1<AccPacket>(0);
|
_acc1.packet[1] = pset1<AccPacket>(0);
|
||||||
// _acc1.packet[2] = pset1<AccPacket>(0);
|
_acc1.packet[2] = pset1<AccPacket>(0);
|
||||||
// _acc1.packet[3] = pset1<AccPacket>(0);
|
_acc1.packet[3] = pset1<AccPacket>(0);
|
||||||
|
|
||||||
// _acc2.packet[0] = pset1<AccPacket>(0);
|
_acc2.packet[0] = pset1<AccPacket>(0);
|
||||||
// _acc2.packet[1] = pset1<AccPacket>(0);
|
_acc2.packet[1] = pset1<AccPacket>(0);
|
||||||
// _acc2.packet[2] = pset1<AccPacket>(0);
|
_acc2.packet[2] = pset1<AccPacket>(0);
|
||||||
// _acc2.packet[3] = pset1<AccPacket>(0);
|
_acc2.packet[3] = pset1<AccPacket>(0);
|
||||||
// }
|
}
|
||||||
|
|
||||||
// template<typename ResPacket_>
|
template<typename ResPacket_>
|
||||||
// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||||
// {
|
{
|
||||||
// _acc1.packet[0] *= pAlpha;
|
_acc1.packet[0] *= pAlpha;
|
||||||
// _acc1.packet[1] *= pAlpha;
|
_acc1.packet[1] *= pAlpha;
|
||||||
// _acc1.packet[2] *= pAlpha;
|
_acc1.packet[2] *= pAlpha;
|
||||||
// _acc1.packet[3] *= pAlpha;
|
_acc1.packet[3] *= pAlpha;
|
||||||
|
|
||||||
// _acc2.packet[0] *= pAlpha;
|
_acc2.packet[0] *= pAlpha;
|
||||||
// _acc2.packet[1] *= pAlpha;
|
_acc2.packet[1] *= pAlpha;
|
||||||
// _acc2.packet[2] *= pAlpha;
|
_acc2.packet[2] *= pAlpha;
|
||||||
// _acc2.packet[3] *= pAlpha;
|
_acc2.packet[3] *= pAlpha;
|
||||||
// }
|
}
|
||||||
|
|
||||||
// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
|
||||||
// {
|
{
|
||||||
// LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0);
|
LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0);
|
||||||
// LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1);
|
LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1);
|
||||||
// LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2);
|
LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2);
|
||||||
// LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3);
|
LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3);
|
||||||
|
|
||||||
// LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0);
|
LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0);
|
||||||
// LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1);
|
LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1);
|
||||||
// LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2);
|
LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2);
|
||||||
// LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3);
|
LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3);
|
||||||
|
|
||||||
|
|
||||||
// r00.storePacket(0, r00.template loadPacket<ResPacket>(0) + _acc1.packet[0]);
|
r00.storePacket(0, r00.template loadPacket<ResPacket>(0) + _acc1.packet[0]);
|
||||||
// r01.storePacket(0, r01.template loadPacket<ResPacket>(0) + _acc1.packet[1]);
|
r01.storePacket(0, r01.template loadPacket<ResPacket>(0) + _acc1.packet[1]);
|
||||||
// r02.storePacket(0, r02.template loadPacket<ResPacket>(0) + _acc1.packet[2]);
|
r02.storePacket(0, r02.template loadPacket<ResPacket>(0) + _acc1.packet[2]);
|
||||||
// r03.storePacket(0, r03.template loadPacket<ResPacket>(0) + _acc1.packet[3]);
|
r03.storePacket(0, r03.template loadPacket<ResPacket>(0) + _acc1.packet[3]);
|
||||||
|
|
||||||
// r10.storePacket(0, r10.template loadPacket<ResPacket>(0) + _acc2.packet[0]);
|
r10.storePacket(0, r10.template loadPacket<ResPacket>(0) + _acc2.packet[0]);
|
||||||
// r11.storePacket(0, r11.template loadPacket<ResPacket>(0) + _acc2.packet[1]);
|
r11.storePacket(0, r11.template loadPacket<ResPacket>(0) + _acc2.packet[1]);
|
||||||
// r12.storePacket(0, r12.template loadPacket<ResPacket>(0) + _acc2.packet[2]);
|
r12.storePacket(0, r12.template loadPacket<ResPacket>(0) + _acc2.packet[2]);
|
||||||
// r13.storePacket(0, r13.template loadPacket<ResPacket>(0) + _acc2.packet[3]);
|
r13.storePacket(0, r13.template loadPacket<ResPacket>(0) + _acc2.packet[3]);
|
||||||
// }
|
}
|
||||||
// };
|
};
|
||||||
|
|
||||||
// #define MICRO_8x1x4() \
|
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||||
// pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 4>
|
||||||
// lhsPackMap.advance(4*1); \
|
{
|
||||||
// pLhs2 = pload<LhsPacket>(lhsPackMap.pCur); \
|
using LinearMapper = typename DataMapper::LinearMapper;
|
||||||
// pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
|
using AccPacket = typename packet_traits<Scalar>::type;
|
||||||
// pRhs0 = pset1<RhsPacket>(pRhs[0]); \
|
using ResPacket = typename packet_traits<ResScalar>::type;
|
||||||
// pRhs1 = pset1<RhsPacket>(pRhs[1]); \
|
|
||||||
// pRhs2 = pset1<RhsPacket>(pRhs[2]); \
|
PacketBlock<AccPacket, 4> _acc1;
|
||||||
// pRhs3 = pset1<RhsPacket>(pRhs[3]); \
|
PacketBlock<AccPacket, 4> _acc2;
|
||||||
// acc._acc1.packet[0] += pLhs*pRhs0; \
|
PacketBlock<AccPacket, 4> _acc3;
|
||||||
// acc._acc1.packet[1] += pLhs*pRhs1; \
|
|
||||||
// acc._acc1.packet[2] += pLhs*pRhs2; \
|
EIGEN_STRONG_INLINE void zero()
|
||||||
// acc._acc1.packet[3] += pLhs*pRhs3; \
|
{
|
||||||
// acc._acc2.packet[0] += pLhs2*pRhs0; \
|
_acc1.packet[0] = pset1<AccPacket>(0);
|
||||||
// acc._acc2.packet[1] += pLhs2*pRhs1; \
|
_acc1.packet[1] = pset1<AccPacket>(0);
|
||||||
// acc._acc2.packet[2] += pLhs2*pRhs2; \
|
_acc1.packet[2] = pset1<AccPacket>(0);
|
||||||
// acc._acc2.packet[3] += pLhs2*pRhs3; \
|
_acc1.packet[3] = pset1<AccPacket>(0);
|
||||||
// lhsPackMap.advance(4*1); \
|
|
||||||
// rhsPackMap.advance(1*4);
|
_acc2.packet[0] = pset1<AccPacket>(0);
|
||||||
|
_acc2.packet[1] = pset1<AccPacket>(0);
|
||||||
|
_acc2.packet[2] = pset1<AccPacket>(0);
|
||||||
|
_acc2.packet[3] = pset1<AccPacket>(0);
|
||||||
|
|
||||||
|
_acc3.packet[0] = pset1<AccPacket>(0);
|
||||||
|
_acc3.packet[1] = pset1<AccPacket>(0);
|
||||||
|
_acc3.packet[2] = pset1<AccPacket>(0);
|
||||||
|
_acc3.packet[3] = pset1<AccPacket>(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename ResPacket_>
|
||||||
|
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||||
|
{
|
||||||
|
_acc1.packet[0] *= pAlpha;
|
||||||
|
_acc1.packet[1] *= pAlpha;
|
||||||
|
_acc1.packet[2] *= pAlpha;
|
||||||
|
_acc1.packet[3] *= pAlpha;
|
||||||
|
|
||||||
|
_acc2.packet[0] *= pAlpha;
|
||||||
|
_acc2.packet[1] *= pAlpha;
|
||||||
|
_acc2.packet[2] *= pAlpha;
|
||||||
|
_acc2.packet[3] *= pAlpha;
|
||||||
|
|
||||||
|
_acc3.packet[0] *= pAlpha;
|
||||||
|
_acc3.packet[1] *= pAlpha;
|
||||||
|
_acc3.packet[2] *= pAlpha;
|
||||||
|
_acc3.packet[3] *= pAlpha;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Adds the accumulated tile to the destination: for each of the 4 columns and each
// of the 3 row bands, the destination packet is loaded, summed with the matching
// accumulator packet and written back in place.
//
// Row bands: _acc1 -> row + 0, _acc2 -> row + 4, _acc3 -> row + 8.
// The third band previously used row + 12, which skipped rows 8..11 of the 12-row
// tile and wrote past its end; it now follows the +0/+4/+8 layout used by the
// 12x1 accumulator's store.
//
// dest : destination mapper; must provide getLinearMapper(row, col).
// row  : first row of the tile in dest.
// col  : first column of the tile in dest.
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col)
{
  // First row band (rows row+0 ...), one mapper per column.
  LinearMapper r00 = dest.getLinearMapper(row + 0, col + 0);
  LinearMapper r01 = dest.getLinearMapper(row + 0, col + 1);
  LinearMapper r02 = dest.getLinearMapper(row + 0, col + 2);
  LinearMapper r03 = dest.getLinearMapper(row + 0, col + 3);

  // Second row band (rows row+4 ...).
  LinearMapper r10 = dest.getLinearMapper(row + 4, col + 0);
  LinearMapper r11 = dest.getLinearMapper(row + 4, col + 1);
  LinearMapper r12 = dest.getLinearMapper(row + 4, col + 2);
  LinearMapper r13 = dest.getLinearMapper(row + 4, col + 3);

  // Third row band (rows row+8 ...).
  LinearMapper r20 = dest.getLinearMapper(row + 8, col + 0);
  LinearMapper r21 = dest.getLinearMapper(row + 8, col + 1);
  LinearMapper r22 = dest.getLinearMapper(row + 8, col + 2);
  LinearMapper r23 = dest.getLinearMapper(row + 8, col + 3);

  r00.storePacket(0, r00.template loadPacket<ResPacket>(0) + _acc1.packet[0]);
  r01.storePacket(0, r01.template loadPacket<ResPacket>(0) + _acc1.packet[1]);
  r02.storePacket(0, r02.template loadPacket<ResPacket>(0) + _acc1.packet[2]);
  r03.storePacket(0, r03.template loadPacket<ResPacket>(0) + _acc1.packet[3]);

  r10.storePacket(0, r10.template loadPacket<ResPacket>(0) + _acc2.packet[0]);
  r11.storePacket(0, r11.template loadPacket<ResPacket>(0) + _acc2.packet[1]);
  r12.storePacket(0, r12.template loadPacket<ResPacket>(0) + _acc2.packet[2]);
  r13.storePacket(0, r13.template loadPacket<ResPacket>(0) + _acc2.packet[3]);

  r20.storePacket(0, r20.template loadPacket<ResPacket>(0) + _acc3.packet[0]);
  r21.storePacket(0, r21.template loadPacket<ResPacket>(0) + _acc3.packet[1]);
  r22.storePacket(0, r22.template loadPacket<ResPacket>(0) + _acc3.packet[2]);
  r23.storePacket(0, r23.template loadPacket<ResPacket>(0) + _acc3.packet[3]);
}
|
||||||
|
};
|
||||||
|
|
||||||
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 8, 4>
|
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 8, 4>
|
||||||
@ -294,41 +440,49 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
|
|||||||
// };
|
// };
|
||||||
// };
|
// };
|
||||||
|
|
||||||
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 4>
|
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, 1, 4>
|
||||||
// {
|
{
|
||||||
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
// RhsPackMap& rhsPackMap,
|
RhsPackMap& rhsPackMap,
|
||||||
// Index rowIdx, Index colIdx, Index depthIdx,
|
Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
// Accumulator& acc)
|
Accumulator& acc)
|
||||||
// {
|
{
|
||||||
// using LhsPacket = typename packet_traits<LhsScalar>::type;
|
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
// using RhsPacket = typename packet_traits<RhsScalar>::type;
|
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
|
|
||||||
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t");
|
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t");
|
||||||
|
|
||||||
// LhsPacket pLhs, pLhs2;
|
LhsPacket pLhs, pLhs2, pLhs3;
|
||||||
// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||||
|
|
||||||
// MICRO_8x1x4();
|
MICRO_12x1x4();
|
||||||
|
|
||||||
// asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t");
|
asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t");
|
||||||
// };
|
};
|
||||||
// };
|
};
|
||||||
|
|
||||||
#define MICRO_4x1x4() \
|
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 4>
|
||||||
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
|
{
|
||||||
pRhs0 = pset1<RhsPacket>(pRhs[0]); \
|
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
pRhs1 = pset1<RhsPacket>(pRhs[1]); \
|
RhsPackMap& rhsPackMap,
|
||||||
pRhs2 = pset1<RhsPacket>(pRhs[2]); \
|
Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
pRhs3 = pset1<RhsPacket>(pRhs[3]); \
|
Accumulator& acc)
|
||||||
acc._acc.packet[0] += pLhs*pRhs0; \
|
{
|
||||||
acc._acc.packet[1] += pLhs*pRhs1; \
|
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
acc._acc.packet[2] += pLhs*pRhs2; \
|
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
acc._acc.packet[3] += pLhs*pRhs3; \
|
|
||||||
lhsPackMap.advance(4*1); \
|
asm __volatile__("#BEGIN_NEON_MICROKERNEL_8x1x4\n\t");
|
||||||
rhsPackMap.advance(1*4);
|
|
||||||
|
LhsPacket pLhs, pLhs2;
|
||||||
|
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||||
|
|
||||||
|
MICRO_8x1x4();
|
||||||
|
|
||||||
|
asm __volatile__("#END_NEON_MICROKERNEL_8x1x4\n\t");
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 8, 4>
|
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 8, 4>
|
||||||
@ -377,6 +531,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
|
|||||||
// MICRO_4x1x4();
|
// MICRO_4x1x4();
|
||||||
// MICRO_4x1x4();
|
// MICRO_4x1x4();
|
||||||
// MICRO_4x1x4();
|
// MICRO_4x1x4();
|
||||||
|
|
||||||
// asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
|
// asm __volatile__("#END_NEON_MICROKERNEL_4x4x4\n\t");
|
||||||
// };
|
// };
|
||||||
// };
|
// };
|
||||||
@ -403,32 +558,62 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
// template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 1>
|
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, 1, 1>
|
||||||
// {
|
{
|
||||||
// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
// RhsPackMap& rhsPackMap,
|
RhsPackMap& rhsPackMap,
|
||||||
// Index rowIdx, Index colIdx, Index depthIdx,
|
Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
// Accumulator& acc)
|
Accumulator& acc)
|
||||||
// {
|
{
|
||||||
// using LhsPacket = typename packet_traits<LhsScalar>::type;
|
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
// using RhsPacket = typename packet_traits<RhsScalar>::type;
|
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
|
|
||||||
// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
|
|
||||||
// LhsPacket pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
LhsPacket pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
||||||
// RhsPacket pRhs = pset1<RhsPacket>(*rhsPackMap.pCur);
|
RhsPacket pRhs = pset1<RhsPacket>(*rhsPackMap.pCur);
|
||||||
|
|
||||||
// acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]);
|
acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]);
|
||||||
// lhsPackMap.advance(4*1);
|
lhsPackMap.advance(4*1);
|
||||||
// pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
||||||
// acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]);
|
acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]);
|
||||||
|
lhsPackMap.advance(4*1);
|
||||||
|
pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
||||||
|
acc._acc.packet[2] = pmadd(pRhs, pLhs, acc._acc.packet[2]);
|
||||||
|
|
||||||
// lhsPackMap.advance(4*1);
|
lhsPackMap.advance(4*1);
|
||||||
// rhsPackMap.advance(1);
|
rhsPackMap.advance(1);
|
||||||
// asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
// };
|
};
|
||||||
// };
|
};
|
||||||
|
|
||||||
|
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
|
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 1>
|
||||||
|
{
|
||||||
|
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
|
RhsPackMap& rhsPackMap,
|
||||||
|
Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
|
Accumulator& acc)
|
||||||
|
{
|
||||||
|
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
|
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
|
|
||||||
|
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
|
|
||||||
|
LhsPacket pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
||||||
|
RhsPacket pRhs = pset1<RhsPacket>(*rhsPackMap.pCur);
|
||||||
|
|
||||||
|
acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]);
|
||||||
|
lhsPackMap.advance(4*1);
|
||||||
|
pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
||||||
|
acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]);
|
||||||
|
|
||||||
|
lhsPackMap.advance(4*1);
|
||||||
|
rhsPackMap.advance(1);
|
||||||
|
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 1>
|
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 1>
|
||||||
@ -446,8 +631,7 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
|
|||||||
LhsPacket pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
LhsPacket pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
||||||
RhsPacket pRhs = pset1<RhsPacket>(*rhsPackMap.pCur);
|
RhsPacket pRhs = pset1<RhsPacket>(*rhsPackMap.pCur);
|
||||||
|
|
||||||
//acc._acc += pRhs*pLhs;
|
acc._acc += pRhs*pLhs;
|
||||||
acc._acc = pmadd(pRhs, pLhs, acc._acc);
|
|
||||||
|
|
||||||
lhsPackMap.advance(4*1);
|
lhsPackMap.advance(4*1);
|
||||||
rhsPackMap.advance(1);
|
rhsPackMap.advance(1);
|
||||||
@ -478,6 +662,9 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
|
|||||||
asm __volatile__("#END_NEON_MICROKERNEL_1x1x4\n\t");
|
asm __volatile__("#END_NEON_MICROKERNEL_1x1x4\n\t");
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#endif // __ENABLE_VECTOR_KERNELS__
|
||||||
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
@ -19,7 +19,7 @@ namespace Eigen {
|
|||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
|
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
|
||||||
constexpr int SHAPES_COUNT = 4;
|
constexpr int SHAPES_COUNT = 8;
|
||||||
|
|
||||||
constexpr int SHAPES_DIMENSION = 6;
|
constexpr int SHAPES_DIMENSION = 6;
|
||||||
constexpr int SHAPES_LHS_DIMENSION = 0;
|
constexpr int SHAPES_LHS_DIMENSION = 0;
|
||||||
@ -32,6 +32,10 @@ constexpr int SHAPES_POINTER_END = -1;
|
|||||||
|
|
||||||
template<int Architecture, int CPU, typename Scalar, bool isLhs>
|
template<int Architecture, int CPU, typename Scalar, bool isLhs>
|
||||||
constexpr int PACK_SHAPES_COUNT = 2;
|
constexpr int PACK_SHAPES_COUNT = 2;
|
||||||
|
|
||||||
|
template<int Architecture, int CPU, typename Scalar>
|
||||||
|
constexpr int PACK_SHAPES_COUNT<Architecture, CPU, Scalar, true> = 4;
|
||||||
|
|
||||||
constexpr int PACK_SHAPES_DIMENSION = 3;
|
constexpr int PACK_SHAPES_DIMENSION = 3;
|
||||||
constexpr int PACK_SHAPES_POINTER = 2;
|
constexpr int PACK_SHAPES_POINTER = 2;
|
||||||
constexpr int PACK_SHAPES_END = -1;
|
constexpr int PACK_SHAPES_END = -1;
|
||||||
@ -39,14 +43,27 @@ constexpr int PACK_SHAPES_END = -1;
|
|||||||
// lhs_progress x depth_progress x rhs_progress (depth_progress > 1 matrix ops) x pointer to next rhs_progress on the shapes map
|
// lhs_progress x depth_progress x rhs_progress (depth_progress > 1 matrix ops) x pointer to next rhs_progress on the shapes map
|
||||||
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
|
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
|
||||||
constexpr int SHAPES[SHAPES_COUNT<Architecture, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
|
constexpr int SHAPES[SHAPES_COUNT<Architecture, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
|
||||||
{ {1,1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
{ /* 0 */{ 1,1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
||||||
{4,1,1, 0, 0, SHAPES_POINTER_END},
|
/* 1 */{1*packet_traits<RhsScalar>::size,1,1, 0, 0, SHAPES_POINTER_END},
|
||||||
{1,1,4, 1, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
/* 2 */{2*packet_traits<RhsScalar>::size,1,1, 0, 1, SHAPES_POINTER_END},
|
||||||
{4,1,4, 1, 2, SHAPES_POINTER_END}};
|
/* 3 */{3*packet_traits<RhsScalar>::size,1,1, 0, 2, SHAPES_POINTER_END},
|
||||||
|
/* 4 */{ 1,1,4, 3, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
||||||
|
/* 5 */{1*packet_traits<RhsScalar>::size,1,4, 3, 4, SHAPES_POINTER_END},
|
||||||
|
/* 6 */{2*packet_traits<RhsScalar>::size,1,4, 3, 5, SHAPES_POINTER_END},
|
||||||
|
/* 7 */{3*packet_traits<RhsScalar>::size,1,4, 3, 6, SHAPES_POINTER_END}};
|
||||||
|
|
||||||
// d1progress x d2progress
|
// d1progress x d2progress
|
||||||
template<int Architecture, int CPU, typename Scalar, bool isLhs>
|
template<int Architecture, int CPU, typename Scalar, bool isLhs>
|
||||||
constexpr int PACK_SHAPES[PACK_SHAPES_COUNT<Architecture, CPU, Scalar, isLhs>][PACK_SHAPES_DIMENSION] = {{1,1,PACK_SHAPES_END},{4,1,0}};
|
constexpr int PACK_SHAPES[PACK_SHAPES_COUNT<Architecture, CPU, Scalar, isLhs>][PACK_SHAPES_DIMENSION] =
|
||||||
|
{{ 1, 1, PACK_SHAPES_END},
|
||||||
|
{ 4, 1, 0}};
|
||||||
|
|
||||||
|
template<int Architecture, int CPU, typename Scalar>
|
||||||
|
constexpr int PACK_SHAPES<Architecture, CPU, Scalar, true>[PACK_SHAPES_COUNT<Architecture, CPU, Scalar, true>][PACK_SHAPES_DIMENSION] =
|
||||||
|
{{ 1, 1, PACK_SHAPES_END},
|
||||||
|
{1*packet_traits<Scalar>::size, 1, 0},
|
||||||
|
{2*packet_traits<Scalar>::size, 1, 1},
|
||||||
|
{3*packet_traits<Scalar>::size, 1, 2}};
|
||||||
|
|
||||||
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder, int M, int N>
|
template<int Architecture, int CPU, typename Index, typename Scalar, bool isLhs, typename DataMapper, bool Conjugate, bool PanelMode, int StorageOrder, int M, int N>
|
||||||
struct PackingOperator
|
struct PackingOperator
|
||||||
|
@ -14,54 +14,8 @@ namespace Eigen {
|
|||||||
|
|
||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
// Partial specialization of PackMap for architecture id 0. It keeps a fixed base
// pointer (pBase) and a moving cursor (pCur) into a buffer of Scalar, plus the
// step sizes used to move that cursor.
template<int CPU, typename Index, typename Scalar, typename DataMapper, bool isLhs>
|
#ifdef __ENABLE_CUSTOM_PACKING__
|
||||||
struct PackMap<0, CPU, Index, Scalar, DataMapper, isLhs>
|
|
||||||
{
|
|
||||||
const Scalar *pBase;
|
|
||||||
const Scalar *pCur;
|
|
||||||
Index stride;
|
|
||||||
Index offset;
|
|
||||||
Index d2Size;
|
|
||||||
|
|
||||||
// shift: d2Size rounded down to a multiple of 4 (set in the constructor).
Index shift;
|
|
||||||
// jump: step applied by advance() when isLhs is true; updated by moveTo().
Index jump;
|
|
||||||
|
|
||||||
// NOTE(review): the initializer list order (d2Size, stride, offset) differs from the
// declaration order (stride, offset, d2Size). Members are initialized in declaration
// order; that is harmless here because each one is set from a constructor parameter.
PackMap(const Scalar *base, Index d2Size, Index stride, Index offset) : pBase(base), pCur(base), d2Size(d2Size), stride(stride), offset(offset)
|
|
||||||
{
|
|
||||||
shift = (d2Size / 4) * 4;
|
|
||||||
jump = shift;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Rewinds the cursor to the base pointer.
EIGEN_STRONG_INLINE void resetCur() { pCur = pBase; }
|
|
||||||
|
|
||||||
// Repositions the cursor at pBase plus an element offset derived from p1.
// For lhs maps it also selects the jump that advance() will use afterwards.
EIGEN_STRONG_INLINE void moveTo(Index p1)
|
|
||||||
{
|
|
||||||
// Local variable; it shadows the member `offset`, which is left untouched here.
Index offset;
|
|
||||||
if(isLhs)
|
|
||||||
{
|
|
||||||
// Indices at or past `shift` use a separate addressing formula and a unit jump.
if(p1 >= shift)
|
|
||||||
{
|
|
||||||
offset = static_cast<Index>(shift*d2Size + ((p1%4))*d2Size);
|
|
||||||
jump = 1;
|
|
||||||
} else {
|
|
||||||
offset = p1;
|
|
||||||
jump = shift;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// p1 is rounded down to a multiple of 4 (p1/4 then *4) before scaling by d2Size.
offset = static_cast<Index>(4*d2Size*(p1/4));
|
|
||||||
// Redundant store: the same assignment is repeated right after the if/else.
pCur = pBase + offset;
|
|
||||||
}
|
|
||||||
pCur = pBase + offset;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Moves the cursor forward: by `jump` when isLhs is true (the `progress` argument is
// ignored in that case), otherwise by `progress` elements.
EIGEN_STRONG_INLINE void advance(int progress)
|
|
||||||
{
|
|
||||||
Index offset = static_cast<Index>(isLhs ? jump : progress);
|
|
||||||
pCur += offset;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
/*
|
|
||||||
template<int CPU, typename Scalar, bool isLhs>
|
template<int CPU, typename Scalar, bool isLhs>
|
||||||
constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs> = 3;
|
constexpr int PACK_SHAPES_COUNT<0, CPU, Scalar, isLhs> = 3;
|
||||||
|
|
||||||
@ -228,7 +182,8 @@ struct PackingOperator<0, CPU, Index, Scalar, isLhs, DataMapper, Conjugate, Pane
|
|||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
*/
|
|
||||||
|
#endif // __ENABLE_CUSTOM_PACKING__
|
||||||
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
|
5
compile.sh
Executable file
5
compile.sh
Executable file
@ -0,0 +1,5 @@
|
|||||||
|
#!/bin/bash
# Builds new_gemm_test.cpp twice so the two binaries can be timed against each other:
#   gto - compiled against the reference checkout in ../eigen-master
#   gt  - compiled against this tree with __ENABLE_VECTOR_KERNELS__ defined
#
# Stop at the first failing command so a stale or missing binary is never
# mistaken for a fresh build.
set -euo pipefail

echo 'Compiling with master'
g++ -O3 -I../eigen-master -std=c++11 new_gemm_test.cpp -o gto

echo 'Compiling current'
g++ -O3 -I. -std=c++14 new_gemm_test.cpp -D__ENABLE_VECTOR_KERNELS__ -o gt
|
92
new_gemm_test.cpp
Normal file
92
new_gemm_test.cpp
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
#include <Eigen/Dense>
|
||||||
|
#include <iostream>
|
||||||
|
#include <ctime>
|
||||||
|
#include <cmath>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// Fills A(i,j) for 0 <= i < m, 0 <= j < n with a value that encodes its own
// position: id*10^(2*digits) + i*10^digits + j. With enough digits the matrix id,
// row and column can be read directly from the printed number.
//
// A      - matrix to fill; must be at least m x n (not checked here).
// m, n   - number of rows / columns to write.
// id     - tag placed in the highest decimal digits.
// digits - number of decimal digits reserved for each of the row and column index.
void set(MatrixXf& A, int m, int n, int id, int digits)
{
  // Both scale factors are loop-invariant, so evaluate std::pow once instead of
  // once per element. The summation order matches the per-element formula.
  const double rowScale = std::pow(10,digits);
  const double idTerm = id*std::pow(10,(2*digits));

  for(int i = 0; i < m; i++)
    for(int j = 0; j < n; j++)
      A(i,j) = idTerm + i*rowScale + j;
}
|
||||||
|
|
||||||
|
int main(int argc, char* argv[])
|
||||||
|
{
|
||||||
|
#ifdef __DEBUG__
|
||||||
|
int m = 9, k = 9, n = 9, max = std::max(std::max(m,k),n);
|
||||||
|
MatrixXf A = MatrixXf::Zero(m, k);
|
||||||
|
MatrixXf B = MatrixXf::Zero(k, n);
|
||||||
|
MatrixXf C = MatrixXf::Zero(m, n);
|
||||||
|
|
||||||
|
set(A, m, k, 1, static_cast<int>(std::log10(max)) + 1);
|
||||||
|
set(B, k, n, 2, static_cast<int>(std::log10(max)) + 1);
|
||||||
|
|
||||||
|
C = A*B;
|
||||||
|
|
||||||
|
std::cout << A << std::endl;
|
||||||
|
std::cout << B << std::endl;
|
||||||
|
std::cout << C << std::endl;
|
||||||
|
|
||||||
|
std::cout << std::endl;
|
||||||
|
|
||||||
|
for(auto i = 0; i < m; i++)
|
||||||
|
{
|
||||||
|
for(auto j = 0; j < n; j++)
|
||||||
|
{
|
||||||
|
float acc=0;
|
||||||
|
for(auto kk = 0; kk < k; kk++)
|
||||||
|
{
|
||||||
|
acc += A(i,kk)*B(kk,j);
|
||||||
|
}
|
||||||
|
C(i,j) = acc;
|
||||||
|
//std::cout << acc << " ";
|
||||||
|
}
|
||||||
|
//std::cout << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cout << C << std::endl;
|
||||||
|
#else
|
||||||
|
int sz = 128;
|
||||||
|
int m = sz, k = sz, n = sz;
|
||||||
|
int RUNS = 500;
|
||||||
|
double time = 0;
|
||||||
|
|
||||||
|
for(auto i = 0; i < RUNS; i++)
|
||||||
|
{
|
||||||
|
MatrixXf A = MatrixXf::Random(m,k);
|
||||||
|
MatrixXf B = MatrixXf::Random(k,n);
|
||||||
|
//set(A,m, k, 1);
|
||||||
|
//set(B,k, n, 2);
|
||||||
|
MatrixXf C = MatrixXf::Zero(m, n);
|
||||||
|
|
||||||
|
std::clock_t start,end;
|
||||||
|
start = std::clock();
|
||||||
|
C = A*B;
|
||||||
|
end = std::clock();
|
||||||
|
|
||||||
|
time += 1000.0*(end-start) / CLOCKS_PER_SEC;
|
||||||
|
}
|
||||||
|
std::cout << time << std::endl;
|
||||||
|
#ifdef TEST_SCALAR
|
||||||
|
start = std::clock();
|
||||||
|
for(auto i = 0; i < m; i++)
|
||||||
|
{
|
||||||
|
for(auto j = 0; j < n; j++)
|
||||||
|
{
|
||||||
|
float acc=0;
|
||||||
|
for(auto kk = 0; kk < k; kk++)
|
||||||
|
{
|
||||||
|
acc += A(i,kk)*B(kk,j);
|
||||||
|
}
|
||||||
|
C(i,j) = acc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
end = std::clock();
|
||||||
|
|
||||||
|
std::cout << 1000.0*(end-start) / CLOCKS_PER_SEC << std::endl;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
return 0;
|
||||||
|
}
|
32
run.sh
Executable file
32
run.sh
Executable file
@ -0,0 +1,32 @@
|
|||||||
|
#!/bin/bash
# Runs the two benchmark binaries (./gto = master build, ./gt = current build)
# five times each and prints the ratio of the summed timings, old / new.
# A ratio above 1 means the current build was faster.
#
# Stop on the first failure: an empty timing would otherwise produce a
# malformed expression for bc.
set -euo pipefail

old_times=()
new_times=()

# Joins its arguments as "a + b + c".
join_plus() {
  local expr=$1
  shift
  local term
  for term in "$@"; do
    expr+=" + $term"
  done
  echo "$expr"
}

# The two builds are run in a fixed interleaved order rather than back to back
# (presumably to spread warm-up and system noise over both; confirm with author).
for build in old new old old new old new new old new; do
  if [[ $build == old ]]; then
    echo 'Running with master'
    t=$(./gto)
    old_times+=("$t")
  else
    echo 'Running current'
    t=$(./gt)
    new_times+=("$t")
  fi
  echo "$t"
done

echo "($(join_plus "${old_times[@]}")) / ($(join_plus "${new_times[@]}"))" | bc -l
|
Loading…
x
Reference in New Issue
Block a user