diff --git a/Eigen/src/Core/arch/NEON/Kernels.h b/Eigen/src/Core/arch/NEON/Kernels.h index 011e5d866..c7f56594a 100644 --- a/Eigen/src/Core/arch/NEON/Kernels.h +++ b/Eigen/src/Core/arch/NEON/Kernels.h @@ -61,100 +61,100 @@ namespace internal { // } // }; -// template -// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1> -// { -// using LinearMapper = typename DataMapper::LinearMapper; -// using AccPacket = typename packet_traits::type; -// using ResPacket = typename packet_traits::type; +template +struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 1> +{ + using LinearMapper = typename DataMapper::LinearMapper; + using AccPacket = typename packet_traits::type; + using ResPacket = typename packet_traits::type; -// AccPacket _acc; + AccPacket _acc; -// EIGEN_STRONG_INLINE void zero() -// { -// _acc = pset1(0); -// } + EIGEN_STRONG_INLINE void zero() + { + _acc = pset1(0); + } -// template -// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) -// { -// _acc *= pAlpha; -// } + template + EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) + { + _acc *= pAlpha; + } -// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) -// { -// PacketBlock block; -// block.packet[0] = dest.template loadPacket(row, col) + _acc; -// dest.template storePacketBlock(row, col, block); -// } -// }; + EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) + { + PacketBlock block; + block.packet[0] = dest.template loadPacket(row, col) + _acc; + dest.template storePacketBlock(row, col, block); + } +}; -// template -// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4> -// { -// using LinearMapper = typename DataMapper::LinearMapper; -// using AccPacket = typename packet_traits::type; -// using ResPacket = typename packet_traits::type; +template +struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 1, 4> +{ + using LinearMapper = typename DataMapper::LinearMapper; + using AccPacket = typename packet_traits::type; + using ResPacket = typename packet_traits::type; -// AccPacket _acc; + AccPacket _acc; -// EIGEN_STRONG_INLINE void zero() -// { -// _acc = pset1(0); -// } + EIGEN_STRONG_INLINE void zero() + { + _acc = pset1(0); + } -// template -// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) -// { -// _acc *= pAlpha; -// } + template + EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) + { + _acc *= pAlpha; + } -// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) -// { -// ResPacket r = dest.template gatherPacket(row, col) + _acc; -// dest.template scatterPacket(row, col, r); -// } -// }; + EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) + { + ResPacket r = dest.template gatherPacket(row, col) + _acc; + dest.template scatterPacket(row, col, r); + } +}; -// template -// struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4> -// { -// using LinearMapper = typename DataMapper::LinearMapper; -// using AccPacket = typename packet_traits::type; -// using ResPacket = typename packet_traits::type; +template +struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 4, 4> +{ + using LinearMapper = typename DataMapper::LinearMapper; + using AccPacket = typename packet_traits::type; + using ResPacket = typename packet_traits::type; -// PacketBlock _acc; + PacketBlock _acc; -// EIGEN_STRONG_INLINE void zero() -// { -// _acc.packet[0] = pset1(0); -// _acc.packet[1] = pset1(0); -// _acc.packet[2] = pset1(0); -// _acc.packet[3] = pset1(0); -// } + EIGEN_STRONG_INLINE void zero() + { + _acc.packet[0] = pset1(0); + _acc.packet[1] = pset1(0); + _acc.packet[2] = pset1(0); + _acc.packet[3] = pset1(0); + } -// template -// EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) -// { -// _acc.packet[0] *= pAlpha; -// _acc.packet[1] *= pAlpha; -// _acc.packet[2] *= pAlpha; -// _acc.packet[3] *= pAlpha; -// } + template + EIGEN_STRONG_INLINE void scale(ResScalar alpha, const ResPacket_& pAlpha) + { + _acc.packet[0] *= pAlpha; + _acc.packet[1] *= pAlpha; + _acc.packet[2] *= pAlpha; + _acc.packet[3] *= pAlpha; + } -// EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) -// { -// LinearMapper r0 = dest.getLinearMapper(row, col + 0); -// LinearMapper r1 = dest.getLinearMapper(row, col + 1); -// LinearMapper r2 = dest.getLinearMapper(row, col + 2); -// LinearMapper r3 = dest.getLinearMapper(row, col + 3); + EIGEN_STRONG_INLINE void store(const DataMapper& dest, Index row, Index col) + { + LinearMapper r0 = dest.getLinearMapper(row, col + 0); + LinearMapper r1 = dest.getLinearMapper(row, col + 1); + LinearMapper r2 = dest.getLinearMapper(row, col + 2); + LinearMapper r3 = dest.getLinearMapper(row, col + 3); -// r0.storePacket(0, r0.template loadPacket(0) + _acc.packet[0]); -// r1.storePacket(0, r1.template loadPacket(0) + _acc.packet[1]); -// r2.storePacket(0, r2.template loadPacket(0) + _acc.packet[2]); -// r3.storePacket(0, r3.template loadPacket(0) + _acc.packet[3]); -// } -// }; + r0.storePacket(0, r0.template loadPacket(0) + _acc.packet[0]); + r1.storePacket(0, r1.template loadPacket(0) + _acc.packet[1]); + r2.storePacket(0, r2.template loadPacket(0) + _acc.packet[2]); + r3.storePacket(0, r3.template loadPacket(0) + _acc.packet[3]); + } +}; // template // struct Accumulator<0, CPU, Scalar, ResScalar, DataMapper, 8, 4> @@ -316,19 +316,19 @@ namespace internal { // }; // }; -// #define MICRO_4x1x4() \ -// pLhs = pload(lhsPackMap.pCur); \ -// pRhs = pload(rhsPackMap.pCur); \ -// pRhs0 = pset1(pRhs[0]); \ -// pRhs1 = pset1(pRhs[1]); \ -// pRhs2 = pset1(pRhs[2]); \ -// pRhs3 = pset1(pRhs[3]); \ -// acc._acc.packet[0] += pLhs*pRhs0; \ -// acc._acc.packet[1] += pLhs*pRhs1; \ -// acc._acc.packet[2] += pLhs*pRhs2; \ -// acc._acc.packet[3] += pLhs*pRhs3; \ -// lhsPackMap.advance(4*1); \ -// rhsPackMap.advance(1*4); +#define MICRO_4x1x4() \ + pLhs = pload(lhsPackMap.pCur); \ + pRhs = pload(rhsPackMap.pCur); \ + pRhs0 = pset1(pRhs[0]); \ + pRhs1 = pset1(pRhs[1]); \ + pRhs2 = pset1(pRhs[2]); \ + pRhs3 = pset1(pRhs[3]); \ + acc._acc.packet[0] += pLhs*pRhs0; \ + acc._acc.packet[1] += pLhs*pRhs1; \ + acc._acc.packet[2] += pLhs*pRhs2; \ + acc._acc.packet[3] += pLhs*pRhs3; \ + lhsPackMap.advance(4*1); \ + rhsPackMap.advance(1*4); // template // struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 8, 4> @@ -381,27 +381,27 @@ namespace internal { // }; // }; -// template -// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 4> -// { -// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, -// RhsPackMap& rhsPackMap, -// Index rowIdx, Index colIdx, Index depthIdx, -// Accumulator& acc) -// { -// using LhsPacket = typename packet_traits::type; -// using RhsPacket = typename packet_traits::type; +template +struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 4> +{ + EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, + RhsPackMap& rhsPackMap, + Index rowIdx, Index colIdx, Index depthIdx, + Accumulator& acc) + { + using LhsPacket = typename packet_traits::type; + using RhsPacket = typename packet_traits::type; -// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x4\n\t"); + asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x4\n\t"); -// LhsPacket pLhs; -// RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; + LhsPacket pLhs; + RhsPacket pRhs, pRhs0, pRhs1, pRhs2, pRhs3; -// MICRO_4x1x4(); + MICRO_4x1x4(); -// asm __volatile__("#END_NEON_MICROKERNEL_4x1x4\n\t"); -// }; -// }; + asm __volatile__("#END_NEON_MICROKERNEL_4x1x4\n\t"); + }; +}; // template // struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 8, 1, 1> @@ -430,54 +430,54 @@ namespace internal { // }; // }; -// template -// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 1> -// { -// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, -// RhsPackMap& rhsPackMap, -// Index rowIdx, Index colIdx, Index depthIdx, -// Accumulator& acc) -// { -// using LhsPacket = typename packet_traits::type; -// using RhsPacket = typename packet_traits::type; +template +struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 4, 1, 1> +{ + EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, + RhsPackMap& rhsPackMap, + Index rowIdx, Index colIdx, Index depthIdx, + Accumulator& acc) + { + using LhsPacket = typename packet_traits::type; + using RhsPacket = typename packet_traits::type; -// asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t"); + asm __volatile__("#BEGIN_NEON_MICROKERNEL_4x1x1\n\t"); -// LhsPacket pLhs = pload(lhsPackMap.pCur); -// RhsPacket pRhs = pset1(*rhsPackMap.pCur); + LhsPacket pLhs = pload(lhsPackMap.pCur); + RhsPacket pRhs = pset1(*rhsPackMap.pCur); -// //acc._acc += pRhs*pLhs; -// acc._acc = pmadd(pRhs, pLhs, acc._acc); + //acc._acc += pRhs*pLhs; + acc._acc = pmadd(pRhs, pLhs, acc._acc); -// lhsPackMap.advance(4*1); -// rhsPackMap.advance(1); -// asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t"); -// }; -// }; + lhsPackMap.advance(4*1); + rhsPackMap.advance(1); + asm __volatile__("#END_NEON_MICROKERNEL_4x1x1\n\t"); + }; +}; -// template -// struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 1, 1, 4> -// { -// EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, -// RhsPackMap& rhsPackMap, -// Index rowIdx, Index colIdx, Index depthIdx, -// Accumulator& acc) -// { -// using RhsPacket = typename packet_traits::type; -// using LhsPacket = typename packet_traits::type; +template +struct MicroKernel<0, CPU, Index, LhsScalar, LhsPackMap, RhsScalar, RhsPackMap, AccScalar, ResScalar, Accumulator, 1, 1, 4> +{ + EIGEN_STRONG_INLINE void operator()(LhsPackMap& lhsPackMap, + RhsPackMap& rhsPackMap, + Index rowIdx, Index colIdx, Index depthIdx, + Accumulator& acc) + { + using RhsPacket = typename packet_traits::type; + using LhsPacket = typename packet_traits::type; -// asm __volatile__("#BEGIN_NEON_MICROKERNEL_1x1x4\n\t"); + asm __volatile__("#BEGIN_NEON_MICROKERNEL_1x1x4\n\t"); -// LhsPacket pLhs = pset1(*lhsPackMap.pCur); -// RhsPacket pRhs = pload(rhsPackMap.pCur); + LhsPacket pLhs = pset1(*lhsPackMap.pCur); + RhsPacket pRhs = pload(rhsPackMap.pCur); -// acc._acc += pLhs*pRhs; + acc._acc += pLhs*pRhs; -// lhsPackMap.advance(1); -// rhsPackMap.advance(4*1); -// asm __volatile__("#END_NEON_MICROKERNEL_1x1x4\n\t"); -// }; -// }; + lhsPackMap.advance(1); + rhsPackMap.advance(4*1); + asm __volatile__("#END_NEON_MICROKERNEL_1x1x4\n\t"); + }; +}; } // end namespace internal } // end namespace Eigen