mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-14 12:46:00 +08:00
WIP 2 - need to implement 2x1x1
This commit is contained in:
parent
029f78abf0
commit
6533187280
@ -74,6 +74,21 @@ namespace internal {
|
|||||||
lhsPackMap.advance(4*1); \
|
lhsPackMap.advance(4*1); \
|
||||||
rhsPackMap.advance(1*4);
|
rhsPackMap.advance(1*4);
|
||||||
|
|
||||||
|
// MICRO_2x1x4: one step of the 2x1x4 micro kernel (the macro takes no arguments).
// It must be expanded in a scope that already declares pLhs, pRhs, pRhs0..pRhs3,
// acc, lhsPackMap and rhsPackMap, because it refers to all of them by name.
// Steps performed by the expansion:
//  - load one LhsPacket from lhsPackMap.pCur;
//  - load an RhsPacket from rhsPackMap.pCur and a second one from pCur + 2, and
//    broadcast elements [0] and [1] of each into pRhs0..pRhs3 with pset1;
//  - add pLhs * pRhsN into acc._acc.packet[N] for N = 0..3;
//  - advance the lhs cursor by 2 and the rhs cursor by 4.
// The last line has no trailing backslash, so the expansion already ends in ';'.
// NOTE(review): taking elements [0]/[1] at offsets 0 and +2 presumably assumes a
// two-lane RhsPacket — confirm against the packet type used at the call site.
#define MICRO_2x1x4() \
|
||||||
|
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||||
|
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
|
||||||
|
pRhs0 = pset1<RhsPacket>(pRhs[0]); \
|
||||||
|
pRhs1 = pset1<RhsPacket>(pRhs[1]); \
|
||||||
|
pRhs = pload<RhsPacket>(rhsPackMap.pCur + 2); \
|
||||||
|
pRhs2 = pset1<RhsPacket>(pRhs[0]); \
|
||||||
|
pRhs3 = pset1<RhsPacket>(pRhs[1]); \
|
||||||
|
acc._acc.packet[0] += pLhs*pRhs0; \
|
||||||
|
acc._acc.packet[1] += pLhs*pRhs1; \
|
||||||
|
acc._acc.packet[2] += pLhs*pRhs2; \
|
||||||
|
acc._acc.packet[3] += pLhs*pRhs3; \
|
||||||
|
lhsPackMap.advance(2*1); \
|
||||||
|
rhsPackMap.advance(1*4);
|
||||||
|
|
||||||
#define MICRO_12x1x1(K) \
|
#define MICRO_12x1x1(K) \
|
||||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur + (0 + 3*K)*4); \
|
pLhs = pload<LhsPacket>(lhsPackMap.pCur + (0 + 3*K)*4); \
|
||||||
pLhs2 = pload<LhsPacket>(lhsPackMap.pCur + (1 + 3*K)*4); \
|
pLhs2 = pload<LhsPacket>(lhsPackMap.pCur + (1 + 3*K)*4); \
|
||||||
@ -116,6 +131,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 1>
|
|||||||
_acc.packet[2] = pset1<AccPacket>(0);
|
_acc.packet[2] = pset1<AccPacket>(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<int LhsProgress, int DepthProgress, int RhsProgress>
|
||||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||||
|
|
||||||
template<typename ResPacket_>
|
template<typename ResPacket_>
|
||||||
@ -156,6 +172,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 1>
|
|||||||
_acc.packet[1] = pset1<AccPacket>(0);
|
_acc.packet[1] = pset1<AccPacket>(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<int LhsProgress, int DepthProgress, int RhsProgress>
|
||||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||||
|
|
||||||
template<typename ResPacket_>
|
template<typename ResPacket_>
|
||||||
@ -189,7 +206,8 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1>
|
|||||||
{
|
{
|
||||||
_acc = pset1<AccPacket>(0);
|
_acc = pset1<AccPacket>(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<int LhsProgress, int DepthProgress, int RhsProgress>
|
||||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||||
|
|
||||||
template<typename ResPacket_>
|
template<typename ResPacket_>
|
||||||
@ -221,6 +239,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4>
|
|||||||
_acc = pset1<AccPacket>(0);
|
_acc = pset1<AccPacket>(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<int LhsProgress, int DepthProgress, int RhsProgress>
|
||||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||||
|
|
||||||
template<typename ResPacket_>
|
template<typename ResPacket_>
|
||||||
@ -237,6 +256,80 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4>
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
//[TODO] Implement this properly
|
||||||
|
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||||
|
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 2, 4>
|
||||||
|
{
|
||||||
|
using LinearMapper = typename DataMapper::LinearMapper;
|
||||||
|
using AccPacket = typename packet_traits<Scalar>::half;
|
||||||
|
using ResPacket = typename packet_traits<ResScalar>::type;
|
||||||
|
|
||||||
|
LinearMapper r0{nullptr};
|
||||||
|
LinearMapper r1{nullptr};
|
||||||
|
LinearMapper r2{nullptr};
|
||||||
|
LinearMapper r3{nullptr};
|
||||||
|
|
||||||
|
PacketBlock<AccPacket, 4> _acc;
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE void zero()
|
||||||
|
{
|
||||||
|
_acc.packet[0] = pset1<AccPacket>(0);
|
||||||
|
_acc.packet[1] = pset1<AccPacket>(0);
|
||||||
|
_acc.packet[2] = pset1<AccPacket>(0);
|
||||||
|
_acc.packet[3] = pset1<AccPacket>(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<int LhsProgress, int DepthProgress, int RhsProgress>
|
||||||
|
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
|
||||||
|
{
|
||||||
|
asm __volatile__("#BEGIN_PREFETCH_2x4\n\t");
|
||||||
|
r0 = dest.getLinearMapper(row + 0, col + 0);
|
||||||
|
r1 = dest.getLinearMapper(row + 0, col + 1);
|
||||||
|
r2 = dest.getLinearMapper(row + 0, col + 2);
|
||||||
|
r3 = dest.getLinearMapper(row + 0, col + 3);
|
||||||
|
|
||||||
|
#ifdef __ENABLE_PREFETCH__
|
||||||
|
r0.prefetch(0);
|
||||||
|
r1.prefetch(0);
|
||||||
|
r2.prefetch(0);
|
||||||
|
r3.prefetch(0);
|
||||||
|
#endif
|
||||||
|
asm __volatile__("#END_PREFETCH_2x4\n\t");
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename ResPacket_>
|
||||||
|
EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha)
|
||||||
|
{
|
||||||
|
// _acc.packet[0] *= pAlpha;
|
||||||
|
// _acc.packet[1] *= pAlpha;
|
||||||
|
// _acc.packet[2] *= pAlpha;
|
||||||
|
// _acc.packet[3] *= pAlpha;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename ResPacket_>
|
||||||
|
EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col, ResScalar alpha, const ResPacket_& pAlpha)
|
||||||
|
{
|
||||||
|
asm __volatile__("#BEGIN_STORE_2x4\n\t");
|
||||||
|
constexpr auto PacketSize = unpacket_traits<AccPacket>::size;
|
||||||
|
AccPacket ppAlpha = pset1<AccPacket>(alpha);
|
||||||
|
AccPacket R00 = r0.template loadPacket<AccPacket>(0*PacketSize);
|
||||||
|
AccPacket R01 = r1.template loadPacket<AccPacket>(0*PacketSize);
|
||||||
|
AccPacket R02 = r2.template loadPacket<AccPacket>(0*PacketSize);
|
||||||
|
AccPacket R03 = r3.template loadPacket<AccPacket>(0*PacketSize);
|
||||||
|
|
||||||
|
R00 += ppAlpha*_acc.packet[0];
|
||||||
|
R01 += ppAlpha*_acc.packet[1];
|
||||||
|
R02 += ppAlpha*_acc.packet[2];
|
||||||
|
R03 += ppAlpha*_acc.packet[3];
|
||||||
|
|
||||||
|
r0.storePacket(0*PacketSize, R00);
|
||||||
|
r1.storePacket(0*PacketSize, R01);
|
||||||
|
r2.storePacket(0*PacketSize, R02);
|
||||||
|
r3.storePacket(0*PacketSize, R03);
|
||||||
|
asm __volatile__("#END_STORE_2x4\n\t");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||||
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
|
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
|
||||||
{
|
{
|
||||||
@ -259,6 +352,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4>
|
|||||||
_acc.packet[3] = pset1<AccPacket>(0);
|
_acc.packet[3] = pset1<AccPacket>(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<int LhsProgress, int DepthProgress, int RhsProgress>
|
||||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
|
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
|
||||||
{
|
{
|
||||||
asm __volatile__("#BEGIN_PREFETCH_4x4\n\t");
|
asm __volatile__("#BEGIN_PREFETCH_4x4\n\t");
|
||||||
@ -336,6 +430,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4>
|
|||||||
_acc2.packet[3] = pset1<AccPacket>(0);
|
_acc2.packet[3] = pset1<AccPacket>(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<int LhsProgress, int DepthProgress, int RhsProgress>
|
||||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
|
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
|
||||||
{
|
{
|
||||||
constexpr Index offset = 32 / sizeof(ResScalar);
|
constexpr Index offset = 32 / sizeof(ResScalar);
|
||||||
@ -437,6 +532,7 @@ struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 4>
|
|||||||
_acc3.packet[3] = pset1<AccPacket>(0);
|
_acc3.packet[3] = pset1<AccPacket>(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<int LhsProgress, int DepthProgress, int RhsProgress>
|
||||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
|
EIGEN_STRONG_INLINE void prefetch(const DataMapper& dest, Index row, Index col)
|
||||||
{
|
{
|
||||||
asm __volatile__("#BEGIN_PREFETCH_12x4\n\t");
|
asm __volatile__("#BEGIN_PREFETCH_12x4\n\t");
|
||||||
@ -710,6 +806,28 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
|
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 2, 1, 4>
|
||||||
|
{
|
||||||
|
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
|
RhsPackMap& rhsPackMap,
|
||||||
|
Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
|
Accumulator& acc)
|
||||||
|
{
|
||||||
|
using LhsPacket = typename packet_traits<LhsScalar>::half;
|
||||||
|
using RhsPacket = typename packet_traits<RhsScalar>::half;
|
||||||
|
|
||||||
|
asm __volatile__("#BEGIN_NEON_MICROKERNEL_2x1x4\n\t");
|
||||||
|
|
||||||
|
LhsPacket pLhs;
|
||||||
|
RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3;
|
||||||
|
|
||||||
|
MICRO_2x1x4();
|
||||||
|
|
||||||
|
asm __volatile__("#END_NEON_MICROKERNEL_2x1x4\n\t");
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, __UNROLL__, 1>
|
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, __UNROLL__, 1>
|
||||||
{
|
{
|
||||||
|
@ -23,7 +23,7 @@ namespace internal {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
|
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
|
||||||
constexpr int SHAPES_COUNT = 14;
|
constexpr int SHAPES_COUNT = 16;
|
||||||
|
|
||||||
constexpr int SHAPES_DIMENSION = 6;
|
constexpr int SHAPES_DIMENSION = 6;
|
||||||
constexpr int SHAPES_LHS_DIMENSION = 0;
|
constexpr int SHAPES_LHS_DIMENSION = 0;
|
||||||
@ -44,23 +44,35 @@ constexpr int PACK_SHAPES_DIMENSION = 3;
|
|||||||
constexpr int PACK_SHAPES_POINTER = 2;
|
constexpr int PACK_SHAPES_POINTER = 2;
|
||||||
constexpr int PACK_SHAPES_END = -1;
|
constexpr int PACK_SHAPES_END = -1;
|
||||||
|
|
||||||
|
template<typename Scalar>
|
||||||
|
struct PacketMultiples
|
||||||
|
{
|
||||||
|
enum
|
||||||
|
{
|
||||||
|
half = unpacket_traits<typename packet_traits<Scalar>::half>::size,
|
||||||
|
quarter = unpacket_traits<typename packet_traits<Scalar>::half>::size // Is this used?
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
// lhs_progress x depth_progress x rhs_progress (depth_progress > 1 matrix ops) x pointer to next rhs_progress on the shapes map
|
// lhs_progress x depth_progress x rhs_progress (depth_progress > 1 matrix ops) x pointer to next rhs_progress on the shapes map
|
||||||
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
|
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
|
||||||
constexpr int SHAPES[SHAPES_COUNT<Architecture, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
|
constexpr int SHAPES[SHAPES_COUNT<Architecture, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
|
||||||
{ /* 00 */{ 1, 1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
{ /* 00 */{ 1, 1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
||||||
/* 01 */{1*packet_traits<RhsScalar>::size, 1,1, 0, 0, SHAPES_POINTER_END},
|
/* 01 */{PacketMultiples<RhsScalar>::half, 1,1, 0, 0, SHAPES_POINTER_END},
|
||||||
/* 02 */{1*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 0, 1},
|
/* 02 */{1*packet_traits<RhsScalar>::size, 1,1, 0, 1, SHAPES_POINTER_END},
|
||||||
/* 03 */{2*packet_traits<RhsScalar>::size, 1,1, 0, 2, SHAPES_POINTER_END},
|
/* 03 */{1*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 1, 2},
|
||||||
/* 04 */{2*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 2, 3},
|
/* 04 */{2*packet_traits<RhsScalar>::size, 1,1, 0, 3, SHAPES_POINTER_END},
|
||||||
/* 05 */{3*packet_traits<RhsScalar>::size, 1,1, 0, 4, SHAPES_POINTER_END},
|
/* 05 */{2*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 3, 4},
|
||||||
/* 06 */{3*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 4, 5},
|
/* 06 */{3*packet_traits<RhsScalar>::size, 1,1, 0, 5, SHAPES_POINTER_END},
|
||||||
/* 07 */{ 1, 1,4, 6, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
/* 07 */{3*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 5, 6},
|
||||||
/* 08 */{1*packet_traits<RhsScalar>::size, 1,4, 6, 7, SHAPES_POINTER_END},
|
/* 08 */{ 1, 1,4, 7, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
||||||
/* 09 */{1*packet_traits<RhsScalar>::size,__UNROLL__,4, 6, 7, 8},
|
/* 09 */{PacketMultiples<RhsScalar>::half, 1,4, 7, 8, SHAPES_POINTER_END},
|
||||||
/* 10 */{2*packet_traits<RhsScalar>::size, 1,4, 6, 9, SHAPES_POINTER_END},
|
/* 10 */{1*packet_traits<RhsScalar>::size, 1,4, 7, 9, SHAPES_POINTER_END},
|
||||||
/* 11 */{2*packet_traits<RhsScalar>::size,__UNROLL__,4, 6, 9, 10},
|
/* 11 */{1*packet_traits<RhsScalar>::size,__UNROLL__,4, 7, 9, 10},
|
||||||
/* 12 */{3*packet_traits<RhsScalar>::size, 1,4, 6, 11, SHAPES_POINTER_END},
|
/* 12 */{2*packet_traits<RhsScalar>::size, 1,4, 7, 11, SHAPES_POINTER_END},
|
||||||
/* 13 */{3*packet_traits<RhsScalar>::size,__UNROLL__,4, 6, 11, 12}};
|
/* 13 */{2*packet_traits<RhsScalar>::size,__UNROLL__,4, 7, 11, 12},
|
||||||
|
/* 14 */{3*packet_traits<RhsScalar>::size, 1,4, 7, 13, SHAPES_POINTER_END},
|
||||||
|
/* 15 */{3*packet_traits<RhsScalar>::size,__UNROLL__,4, 7, 13, 14}};
|
||||||
|
|
||||||
// d1progress x d2progress
|
// d1progress x d2progress
|
||||||
template<int Architecture, int CPU, typename Scalar, bool isLhs>
|
template<int Architecture, int CPU, typename Scalar, bool isLhs>
|
||||||
@ -218,6 +230,8 @@ struct PackMap
|
|||||||
EIGEN_STRONG_INLINE void updateBase() { pBase = pCur; }
|
EIGEN_STRONG_INLINE void updateBase() { pBase = pCur; }
|
||||||
EIGEN_STRONG_INLINE void moveTo(Index p1) { pCur = pBase + pmc.getPosition(p1, d2Size); }
|
EIGEN_STRONG_INLINE void moveTo(Index p1) { pCur = pBase + pmc.getPosition(p1, d2Size); }
|
||||||
EIGEN_STRONG_INLINE void advance(Index progress) { pCur += progress; }
|
EIGEN_STRONG_INLINE void advance(Index progress) { pCur += progress; }
|
||||||
|
|
||||||
|
template<int D1Progress=-1, int D2Progress=-1>
|
||||||
EIGEN_STRONG_INLINE void prefetch(Index amnt)
|
EIGEN_STRONG_INLINE void prefetch(Index amnt)
|
||||||
{
|
{
|
||||||
#ifdef __ENABLE_PREFETCH__
|
#ifdef __ENABLE_PREFETCH__
|
||||||
@ -242,6 +256,7 @@ struct Accumulator
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<int LhsProgress=-1, int DepthProgress=-1, int RhsProgress=-1>
|
||||||
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
EIGEN_STRONG_INLINE void prefetch(const DataMapper&, Index, Index) {}
|
||||||
|
|
||||||
template<typename ResPacket>
|
template<typename ResPacket>
|
||||||
@ -321,9 +336,9 @@ struct DepthLoopStruct
|
|||||||
EIGEN_STRONG_INLINE void operator()(Index rowIdx, Index colIdx, Index depthIdx, const DataMapper& res,
|
EIGEN_STRONG_INLINE void operator()(Index rowIdx, Index colIdx, Index depthIdx, const DataMapper& res,
|
||||||
Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap)
|
Index rows, Index depth, Index cols, ResScalar alpha, const ResPacket& pAlpha, LhsPackMap& lhsPackMap, RhsPackMap& rhsPackMap)
|
||||||
{
|
{
|
||||||
constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[RHS_SHAPE_IDX][SHAPES_RHS_DIMENSION];
|
constexpr int rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[RHS_SHAPE_IDX][SHAPES_RHS_DIMENSION];
|
||||||
constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[LHS_SHAPE_IDX][SHAPES_LHS_DIMENSION];
|
constexpr int lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[LHS_SHAPE_IDX][SHAPES_LHS_DIMENSION];
|
||||||
constexpr auto depthProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_DEP_DIMENSION];
|
constexpr int depthProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_DEP_DIMENSION];
|
||||||
|
|
||||||
typedef Accumulator<Architecture, CPU, AccScalar, ResScalar, DataMapper, lhsProgress, rhsProgress> AccumulatorType;
|
typedef Accumulator<Architecture, CPU, AccScalar, ResScalar, DataMapper, lhsProgress, rhsProgress> AccumulatorType;
|
||||||
|
|
||||||
@ -332,11 +347,10 @@ struct DepthLoopStruct
|
|||||||
|
|
||||||
acc.zero();
|
acc.zero();
|
||||||
|
|
||||||
acc.prefetch(res, rowIdx, colIdx);
|
acc.template prefetch<lhsProgress, depthProgress, rhsProgress>(res, rowIdx, colIdx);
|
||||||
|
|
||||||
lhsPackMap.prefetch(0);
|
lhsPackMap.template prefetch<lhsProgress, depthProgress>(0);
|
||||||
if(rhsProgress > 1)
|
rhsPackMap.template prefetch<rhsProgress, depthProgress>(0);
|
||||||
rhsPackMap.prefetch(0);
|
|
||||||
|
|
||||||
for(; depthIdx + depthProgress <= depth; depthIdx+=depthProgress)
|
for(; depthIdx + depthProgress <= depth; depthIdx+=depthProgress)
|
||||||
{
|
{
|
||||||
@ -367,9 +381,10 @@ struct LhsLoopStruct
|
|||||||
constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_LHS_DIMENSION];
|
constexpr auto lhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_LHS_DIMENSION];
|
||||||
constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_RHS_DIMENSION];
|
constexpr auto rhsProgress = SHAPES<Architecture, CPU, LhsScalar, RhsScalar>[IDX][SHAPES_RHS_DIMENSION];
|
||||||
DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, IDX, IDX> depthLS;
|
DepthLoopStruct<Architecture, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, ResPacket, DataMapper, RHS_SHAPE_IDX, IDX, IDX> depthLS;
|
||||||
rhsPackMap.resetCur();
|
//rhsPackMap.resetCur();
|
||||||
for(;rowIdx + lhsProgress <= rows; rowIdx+=lhsProgress)
|
for(;rowIdx + lhsProgress <= rows; rowIdx+=lhsProgress)
|
||||||
{
|
{
|
||||||
|
rhsPackMap.resetCur();
|
||||||
//lhsPackMap.moveTo(rowIdx);
|
//lhsPackMap.moveTo(rowIdx);
|
||||||
//rhsPackMap.moveTo(colIdx);
|
//rhsPackMap.moveTo(colIdx);
|
||||||
|
|
||||||
|
@ -15,7 +15,8 @@ void set(MatrixXf& A, int m, int n, int id, int digits)
|
|||||||
int main(int argc, char* argv[])
|
int main(int argc, char* argv[])
|
||||||
{
|
{
|
||||||
#ifdef __DEBUG__
|
#ifdef __DEBUG__
|
||||||
int m = 9, k = 9, n = 9, max = std::max(std::max(m,k),n);
|
int m = std::atoi(argv[1]), k = std::atoi(argv[1]), n = std::atoi(argv[1]);
|
||||||
|
int max = std::max(std::max(m,k),n);
|
||||||
MatrixXf A = MatrixXf::Zero(m, k);
|
MatrixXf A = MatrixXf::Zero(m, k);
|
||||||
MatrixXf B = MatrixXf::Zero(k, n);
|
MatrixXf B = MatrixXf::Zero(k, n);
|
||||||
MatrixXf C = MatrixXf::Zero(m, n);
|
MatrixXf C = MatrixXf::Zero(m, n);
|
||||||
@ -24,7 +25,8 @@ int main(int argc, char* argv[])
|
|||||||
set(A, m, k, 1, static_cast<int>(std::log10(max)) + 1);
|
set(A, m, k, 1, static_cast<int>(std::log10(max)) + 1);
|
||||||
set(B, k, n, 2, static_cast<int>(std::log10(max)) + 1);
|
set(B, k, n, 2, static_cast<int>(std::log10(max)) + 1);
|
||||||
|
|
||||||
C = A*B;
|
for(auto i = 0; i < 2; i++)
|
||||||
|
C = A*B;
|
||||||
|
|
||||||
std::cout << A << std::endl;
|
std::cout << A << std::endl;
|
||||||
std::cout << B << std::endl;
|
std::cout << B << std::endl;
|
||||||
|
8
run.sh
8
run.sh
@ -27,8 +27,14 @@ function run() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run $1 16 500
|
run $1 16 500
|
||||||
|
run $1 21 500
|
||||||
run $1 32 500
|
run $1 32 500
|
||||||
|
run $1 53 500
|
||||||
run $1 64 100
|
run $1 64 100
|
||||||
|
run $1 97 100
|
||||||
run $1 128 50
|
run $1 128 50
|
||||||
|
run $1 203 50
|
||||||
run $1 256 10
|
run $1 256 10
|
||||||
run $1 1024 10
|
run $1 673 10
|
||||||
|
run $1 1024 5
|
||||||
|
run $1 2048 2
|
||||||
|
Loading…
x
Reference in New Issue
Block a user