mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-07-05 12:45:11 +08:00
WIP 2
This commit is contained in:
parent
bfadb56107
commit
58db05afbc
@ -17,30 +17,30 @@ namespace internal {
|
|||||||
#ifdef __ENABLE_VECTOR_KERNELS__
|
#ifdef __ENABLE_VECTOR_KERNELS__
|
||||||
|
|
||||||
#define MICRO_12x1x4() \
|
#define MICRO_12x1x4() \
|
||||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
|
||||||
lhsPackMap.advance(4*1); \
|
|
||||||
pLhs2 = pload<LhsPacket>(lhsPackMap.pCur); \
|
|
||||||
lhsPackMap.advance(4*1); \
|
|
||||||
pLhs3 = pload<LhsPacket>(lhsPackMap.pCur); \
|
|
||||||
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
|
pRhs = pload<RhsPacket>(rhsPackMap.pCur); \
|
||||||
|
rhsPackMap.advance(1*4); \
|
||||||
pRhs0 = pset1<RhsPacket>(pRhs[0]); \
|
pRhs0 = pset1<RhsPacket>(pRhs[0]); \
|
||||||
pRhs1 = pset1<RhsPacket>(pRhs[1]); \
|
pRhs1 = pset1<RhsPacket>(pRhs[1]); \
|
||||||
pRhs2 = pset1<RhsPacket>(pRhs[2]); \
|
pRhs2 = pset1<RhsPacket>(pRhs[2]); \
|
||||||
pRhs3 = pset1<RhsPacket>(pRhs[3]); \
|
pRhs3 = pset1<RhsPacket>(pRhs[3]); \
|
||||||
|
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||||
|
lhsPackMap.advance(4*1); \
|
||||||
acc._acc1.packet[0] += pLhs*pRhs0; \
|
acc._acc1.packet[0] += pLhs*pRhs0; \
|
||||||
acc._acc1.packet[1] += pLhs*pRhs1; \
|
acc._acc1.packet[1] += pLhs*pRhs1; \
|
||||||
acc._acc1.packet[2] += pLhs*pRhs2; \
|
acc._acc1.packet[2] += pLhs*pRhs2; \
|
||||||
acc._acc1.packet[3] += pLhs*pRhs3; \
|
acc._acc1.packet[3] += pLhs*pRhs3; \
|
||||||
|
pLhs2 = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||||
|
lhsPackMap.advance(4*1); \
|
||||||
acc._acc2.packet[0] += pLhs2*pRhs0; \
|
acc._acc2.packet[0] += pLhs2*pRhs0; \
|
||||||
acc._acc2.packet[1] += pLhs2*pRhs1; \
|
acc._acc2.packet[1] += pLhs2*pRhs1; \
|
||||||
acc._acc2.packet[2] += pLhs2*pRhs2; \
|
acc._acc2.packet[2] += pLhs2*pRhs2; \
|
||||||
acc._acc2.packet[3] += pLhs2*pRhs3; \
|
acc._acc2.packet[3] += pLhs2*pRhs3; \
|
||||||
|
pLhs3 = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||||
acc._acc3.packet[0] += pLhs3*pRhs0; \
|
acc._acc3.packet[0] += pLhs3*pRhs0; \
|
||||||
acc._acc3.packet[1] += pLhs3*pRhs1; \
|
acc._acc3.packet[1] += pLhs3*pRhs1; \
|
||||||
acc._acc3.packet[2] += pLhs3*pRhs2; \
|
acc._acc3.packet[2] += pLhs3*pRhs2; \
|
||||||
acc._acc3.packet[3] += pLhs3*pRhs3; \
|
acc._acc3.packet[3] += pLhs3*pRhs3; \
|
||||||
lhsPackMap.advance(4*1); \
|
lhsPackMap.advance(4*1);
|
||||||
rhsPackMap.advance(1*4);
|
|
||||||
|
|
||||||
#define MICRO_8x1x4() \
|
#define MICRO_8x1x4() \
|
||||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||||
@ -76,6 +76,36 @@ namespace internal {
|
|||||||
lhsPackMap.advance(4*1); \
|
lhsPackMap.advance(4*1); \
|
||||||
rhsPackMap.advance(1*4);
|
rhsPackMap.advance(1*4);
|
||||||
|
|
||||||
|
#define MICRO_12x1x1() \
|
||||||
|
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||||
|
pRhs = pset1<RhsPacket>(*rhsPackMap.pCur); \
|
||||||
|
acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]); \
|
||||||
|
lhsPackMap.advance(4*1); \
|
||||||
|
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||||
|
acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]); \
|
||||||
|
lhsPackMap.advance(4*1); \
|
||||||
|
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||||
|
acc._acc.packet[2] = pmadd(pRhs, pLhs, acc._acc.packet[2]); \
|
||||||
|
lhsPackMap.advance(4*1); \
|
||||||
|
rhsPackMap.advance(1);
|
||||||
|
|
||||||
|
#define MICRO_8x1x1() \
|
||||||
|
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||||
|
pRhs = pset1<RhsPacket>(*rhsPackMap.pCur); \
|
||||||
|
acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]); \
|
||||||
|
lhsPackMap.advance(4*1); \
|
||||||
|
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||||
|
acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]); \
|
||||||
|
lhsPackMap.advance(4*1); \
|
||||||
|
rhsPackMap.advance(1);
|
||||||
|
|
||||||
|
#define MICRO_4x1x1() \
|
||||||
|
pLhs = pload<LhsPacket>(lhsPackMap.pCur); \
|
||||||
|
pRhs = pset1<RhsPacket>(*rhsPackMap.pCur); \
|
||||||
|
acc._acc += pRhs*pLhs; \
|
||||||
|
lhsPackMap.advance(4*1); \
|
||||||
|
rhsPackMap.advance(1);
|
||||||
|
|
||||||
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
template<int CPU, typename Scalar, typename ResScalar, typename DataMapper>
|
||||||
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 1>
|
struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 12, 1>
|
||||||
{
|
{
|
||||||
@ -581,6 +611,35 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
|
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, __UNROLL__, 1>
|
||||||
|
{
|
||||||
|
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
|
RhsPackMap& rhsPackMap,
|
||||||
|
Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
|
Accumulator& acc)
|
||||||
|
{
|
||||||
|
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
|
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
|
|
||||||
|
LhsPacket pLhs;
|
||||||
|
RhsPacket pRhs;
|
||||||
|
|
||||||
|
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
|
|
||||||
|
MICRO_12x1x1();
|
||||||
|
MICRO_12x1x1();
|
||||||
|
MICRO_12x1x1();
|
||||||
|
MICRO_12x1x1();
|
||||||
|
MICRO_12x1x1();
|
||||||
|
MICRO_12x1x1();
|
||||||
|
MICRO_12x1x1();
|
||||||
|
MICRO_12x1x1();
|
||||||
|
|
||||||
|
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, 1, 1>
|
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 12, 1, 1>
|
||||||
{
|
{
|
||||||
@ -592,21 +651,42 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
|
|||||||
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
|
|
||||||
|
LhsPacket pLhs;
|
||||||
|
RhsPacket pRhs;
|
||||||
|
|
||||||
|
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
|
|
||||||
|
MICRO_12x1x1();
|
||||||
|
|
||||||
|
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
|
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, __UNROLL__, 1>
|
||||||
|
{
|
||||||
|
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
|
RhsPackMap& rhsPackMap,
|
||||||
|
Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
|
Accumulator& acc)
|
||||||
|
{
|
||||||
|
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
|
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
|
|
||||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
|
|
||||||
LhsPacket pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
LhsPacket pLhs;
|
||||||
RhsPacket pRhs = pset1<RhsPacket>(*rhsPackMap.pCur);
|
RhsPacket pRhs;
|
||||||
|
|
||||||
acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]);
|
MICRO_8x1x1();
|
||||||
lhsPackMap.advance(4*1);
|
MICRO_8x1x1();
|
||||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
MICRO_8x1x1();
|
||||||
acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]);
|
MICRO_8x1x1();
|
||||||
lhsPackMap.advance(4*1);
|
MICRO_8x1x1();
|
||||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
MICRO_8x1x1();
|
||||||
acc._acc.packet[2] = pmadd(pRhs, pLhs, acc._acc.packet[2]);
|
MICRO_8x1x1();
|
||||||
|
MICRO_8x1x1();
|
||||||
|
|
||||||
lhsPackMap.advance(4*1);
|
|
||||||
rhsPackMap.advance(1);
|
|
||||||
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
@ -624,16 +704,40 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
|
|||||||
|
|
||||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
|
|
||||||
LhsPacket pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
LhsPacket pLhs;
|
||||||
RhsPacket pRhs = pset1<RhsPacket>(*rhsPackMap.pCur);
|
RhsPacket pRhs;
|
||||||
|
|
||||||
acc._acc.packet[0] = pmadd(pRhs, pLhs, acc._acc.packet[0]);
|
MICRO_8x1x1();
|
||||||
lhsPackMap.advance(4*1);
|
|
||||||
pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
acc._acc.packet[1] = pmadd(pRhs, pLhs, acc._acc.packet[1]);
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
template<int CPU, typename Index, typename LhsScalar, typename LhsPackMap, typename RhsScalar, typename RhsPackMap, typename AccScalar, typename ResScalar, typename Accumulator>
|
||||||
|
struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, __UNROLL__, 1>
|
||||||
|
{
|
||||||
|
EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap,
|
||||||
|
RhsPackMap& rhsPackMap,
|
||||||
|
Index rowIdx, Index colIdx, Index depthIdx,
|
||||||
|
Accumulator& acc)
|
||||||
|
{
|
||||||
|
using LhsPacket = typename packet_traits<LhsScalar>::type;
|
||||||
|
using RhsPacket = typename packet_traits<RhsScalar>::type;
|
||||||
|
|
||||||
|
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
|
|
||||||
|
LhsPacket pLhs;
|
||||||
|
RhsPacket pRhs;
|
||||||
|
|
||||||
|
MICRO_4x1x1();
|
||||||
|
MICRO_4x1x1();
|
||||||
|
MICRO_4x1x1();
|
||||||
|
MICRO_4x1x1();
|
||||||
|
MICRO_4x1x1();
|
||||||
|
MICRO_4x1x1();
|
||||||
|
MICRO_4x1x1();
|
||||||
|
MICRO_4x1x1();
|
||||||
|
|
||||||
lhsPackMap.advance(4*1);
|
|
||||||
rhsPackMap.advance(1);
|
|
||||||
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
@ -651,13 +755,11 @@ struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap,
|
|||||||
|
|
||||||
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
|
|
||||||
LhsPacket pLhs = pload<LhsPacket>(lhsPackMap.pCur);
|
LhsPacket pLhs;
|
||||||
RhsPacket pRhs = pset1<RhsPacket>(*rhsPackMap.pCur);
|
RhsPacket pRhs;
|
||||||
|
|
||||||
acc._acc += pRhs*pLhs;
|
MICRO_4x1x1();
|
||||||
|
|
||||||
lhsPackMap.advance(4*1);
|
|
||||||
rhsPackMap.advance(1);
|
|
||||||
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t");
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
@ -23,7 +23,7 @@ namespace internal {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
|
template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
|
||||||
constexpr int SHAPES_COUNT = 11;
|
constexpr int SHAPES_COUNT = 14;
|
||||||
|
|
||||||
constexpr int SHAPES_DIMENSION = 6;
|
constexpr int SHAPES_DIMENSION = 6;
|
||||||
constexpr int SHAPES_LHS_DIMENSION = 0;
|
constexpr int SHAPES_LHS_DIMENSION = 0;
|
||||||
@ -49,15 +49,18 @@ template<int Architecture, int CPU, typename LhsScalar, typename RhsScalar>
|
|||||||
constexpr int SHAPES[SHAPES_COUNT<Architecture, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
|
constexpr int SHAPES[SHAPES_COUNT<Architecture, CPU, LhsScalar,RhsScalar>][SHAPES_DIMENSION] =
|
||||||
{ /* 00 */{ 1, 1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
{ /* 00 */{ 1, 1,1,SHAPES_POINTER_END, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
||||||
/* 01 */{1*packet_traits<RhsScalar>::size, 1,1, 0, 0, SHAPES_POINTER_END},
|
/* 01 */{1*packet_traits<RhsScalar>::size, 1,1, 0, 0, SHAPES_POINTER_END},
|
||||||
/* 02 */{2*packet_traits<RhsScalar>::size, 1,1, 0, 1, SHAPES_POINTER_END},
|
/* 02 */{1*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 0, 1},
|
||||||
/* 03 */{3*packet_traits<RhsScalar>::size, 1,1, 0, 2, SHAPES_POINTER_END},
|
/* 03 */{2*packet_traits<RhsScalar>::size, 1,1, 0, 2, SHAPES_POINTER_END},
|
||||||
/* 04 */{ 1, 1,4, 3, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
/* 04 */{2*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 2, 3},
|
||||||
/* 05 */{1*packet_traits<RhsScalar>::size, 1,4, 3, 4, SHAPES_POINTER_END},
|
/* 05 */{3*packet_traits<RhsScalar>::size, 1,1, 0, 4, SHAPES_POINTER_END},
|
||||||
/* 06 */{1*packet_traits<RhsScalar>::size,__UNROLL__,4, 3, 4, SHAPES_POINTER_END},
|
/* 06 */{3*packet_traits<RhsScalar>::size,__UNROLL__,1, 0, 4, 5},
|
||||||
/* 07 */{2*packet_traits<RhsScalar>::size, 1,4, 3, 6, SHAPES_POINTER_END},
|
/* 07 */{ 1, 1,4, 6, SHAPES_POINTER_END, SHAPES_POINTER_END},
|
||||||
/* 08 */{2*packet_traits<RhsScalar>::size,__UNROLL__,4, 3, 6, 7},
|
/* 08 */{1*packet_traits<RhsScalar>::size, 1,4, 6, 7, SHAPES_POINTER_END},
|
||||||
/* 09 */{3*packet_traits<RhsScalar>::size, 1,4, 3, 8, SHAPES_POINTER_END},
|
/* 09 */{1*packet_traits<RhsScalar>::size,__UNROLL__,4, 6, 7, 8},
|
||||||
/* 10 */{3*packet_traits<RhsScalar>::size,__UNROLL__,4, 3, 8, 9}};
|
/* 10 */{2*packet_traits<RhsScalar>::size, 1,4, 6, 9, SHAPES_POINTER_END},
|
||||||
|
/* 11 */{2*packet_traits<RhsScalar>::size,__UNROLL__,4, 6, 9, 10},
|
||||||
|
/* 12 */{3*packet_traits<RhsScalar>::size, 1,4, 6, 11, SHAPES_POINTER_END},
|
||||||
|
/* 13 */{3*packet_traits<RhsScalar>::size,__UNROLL__,4, 6, 11, 12}};
|
||||||
|
|
||||||
// d1progress x d2progress
|
// d1progress x d2progress
|
||||||
template<int Architecture, int CPU, typename Scalar, bool isLhs>
|
template<int Architecture, int CPU, typename Scalar, bool isLhs>
|
||||||
|
@ -28,7 +28,6 @@ int main(int argc, char* argv[])
|
|||||||
|
|
||||||
std::cout << A << std::endl;
|
std::cout << A << std::endl;
|
||||||
std::cout << B << std::endl;
|
std::cout << B << std::endl;
|
||||||
std::cout << C << std::endl;
|
|
||||||
|
|
||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
|
|
||||||
@ -48,8 +47,6 @@ int main(int argc, char* argv[])
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout << C << std::endl;
|
|
||||||
#else
|
#else
|
||||||
if(argc < 3)
|
if(argc < 3)
|
||||||
{
|
{
|
||||||
|
Loading…
x
Reference in New Issue
Block a user