Fix conflicts and merge

This commit is contained in:
Gael Guennebaud 2019-01-30 15:57:08 +01:00
commit eb4c6bb22d
20 changed files with 525 additions and 293 deletions

View File

@ -47,7 +47,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
};
#endif
template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; };
template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32, vectorizable=true}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
@ -263,7 +263,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
};
#endif
template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; };
template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32, vectorizable=true}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }

View File

@ -117,14 +117,14 @@ template<> struct unpacket_traits<Packet8f> {
typedef float type;
typedef Packet4f half;
typedef Packet8i integer_packet;
enum {size=8, alignment=Aligned32};
enum {size=8, alignment=Aligned32, vectorizable=true};
};
template<> struct unpacket_traits<Packet4d> {
typedef double type;
typedef Packet2d half;
enum {size=4, alignment=Aligned32};
enum {size=4, alignment=Aligned32, vectorizable=true};
};
template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; };
template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false}; };
template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }

View File

@ -50,7 +50,8 @@ template<> struct unpacket_traits<Packet8cf> {
typedef std::complex<float> type;
enum {
size = 8,
alignment=unpacket_traits<Packet16f>::alignment
alignment=unpacket_traits<Packet16f>::alignment,
vectorizable=true
};
typedef Packet4cf half;
};
@ -245,7 +246,8 @@ template<> struct unpacket_traits<Packet4cd> {
typedef std::complex<double> type;
enum {
size = 4,
alignment = unpacket_traits<Packet8d>::alignment
alignment = unpacket_traits<Packet8d>::alignment,
vectorizable=true
};
typedef Packet2cd half;
};

View File

@ -102,19 +102,19 @@ struct unpacket_traits<Packet16f> {
typedef float type;
typedef Packet8f half;
typedef Packet16i integer_packet;
enum { size = 16, alignment=Aligned64 };
enum { size = 16, alignment=Aligned64, vectorizable=true };
};
template <>
struct unpacket_traits<Packet8d> {
typedef double type;
typedef Packet4d half;
enum { size = 8, alignment=Aligned64 };
enum { size = 8, alignment=Aligned64, vectorizable=true };
};
template <>
struct unpacket_traits<Packet16i> {
typedef int type;
typedef Packet8i half;
enum { size = 16, alignment=Aligned64 };
enum { size = 16, alignment=Aligned64, vectorizable=false };
};
template <>

View File

@ -60,7 +60,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
};
};
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
@ -286,7 +286,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
};
};
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }

View File

@ -192,13 +192,13 @@ template<> struct unpacket_traits<Packet4f>
typedef float type;
typedef Packet4f half;
typedef Packet4i integer_packet;
enum {size=4, alignment=Aligned16};
enum {size=4, alignment=Aligned16, vectorizable=true};
};
template<> struct unpacket_traits<Packet4i>
{
typedef int type;
typedef Packet4i half;
enum {size=4, alignment=Aligned16};
enum {size=4, alignment=Aligned16, vectorizable=false};
};
inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
@ -921,7 +921,7 @@ template<> struct packet_traits<double> : default_packet_traits
};
};
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; };
inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
{

View File

@ -90,8 +90,8 @@ template<> struct packet_traits<double> : default_packet_traits
};
template<> struct unpacket_traits<float4> { typedef float type; enum {size=4, alignment=Aligned16}; typedef float4 half; };
template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; };
template<> struct unpacket_traits<float4> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef float4 half; };
template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef double2 half; };
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
return make_float4(from, from, from, from);

View File

@ -41,7 +41,7 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
};
};
template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef half2 half; };
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
@ -521,7 +521,7 @@ struct packet_traits<half> : default_packet_traits {
};
template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; };
template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32, vectorizable=true}; typedef Packet16h half; };
template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
Packet16h result;
@ -1003,7 +1003,7 @@ struct packet_traits<Eigen::half> : default_packet_traits {
};
template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; };
template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true}; typedef Packet8h half; };
template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
Packet8h result;
@ -1359,7 +1359,7 @@ struct packet_traits<Eigen::half> : default_packet_traits {
};
template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; };
template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4h half; };
template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
Packet4h result;

View File

@ -127,7 +127,7 @@ struct packet_traits<std::complex<float> > : default_packet_traits {
template <>
struct unpacket_traits<Packet2cf> {
typedef std::complex<float> type;
enum { size = 2, alignment = Aligned16 };
enum { size = 2, alignment = Aligned16, vectorizable=true };
typedef Packet2cf half;
};
@ -500,7 +500,7 @@ struct packet_traits<std::complex<double> > : default_packet_traits {
template <>
struct unpacket_traits<Packet1cd> {
typedef std::complex<double> type;
enum { size = 1, alignment = Aligned16 };
enum { size = 1, alignment = Aligned16, vectorizable=true };
typedef Packet1cd half;
};

View File

@ -117,14 +117,14 @@ struct packet_traits<int32_t> : default_packet_traits {
template <>
struct unpacket_traits<Packet4f> {
typedef float type;
enum { size = 4, alignment = Aligned16 };
enum { size = 4, alignment = Aligned16, vectorizable=true };
typedef Packet4f half;
};
template <>
struct unpacket_traits<Packet4i> {
typedef int32_t type;
enum { size = 4, alignment = Aligned16 };
enum { size = 4, alignment = Aligned16, vectorizable=true };
typedef Packet4i half;
};
@ -925,7 +925,7 @@ struct packet_traits<double> : default_packet_traits {
template <>
struct unpacket_traits<Packet2d> {
typedef double type;
enum { size = 2, alignment = Aligned16 };
enum { size = 2, alignment = Aligned16, vectorizable=true };
typedef Packet2d half;
};

View File

@ -62,7 +62,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
};
};
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
@ -328,7 +328,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
};
};
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }

View File

@ -145,13 +145,13 @@ template<> struct unpacket_traits<Packet4f>
typedef float type;
typedef Packet4f half;
typedef Packet4i integer_packet;
enum {size=4, alignment=Aligned16};
enum {size=4, alignment=Aligned16, vectorizable=true};
};
template<> struct unpacket_traits<Packet4i>
{
typedef int32_t type;
typedef Packet4i half;
enum {size=4, alignment=Aligned16};
enum {size=4, alignment=Aligned16, vectorizable=true};
};
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
@ -657,7 +657,7 @@ template<> struct packet_traits<double> : default_packet_traits
};
};
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; };
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return vdupq_n_f64(from); }

View File

@ -50,7 +50,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
};
#endif
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; };
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
@ -283,7 +283,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
};
#endif
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; };
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }

View File

@ -166,17 +166,17 @@ template<> struct unpacket_traits<Packet4f> {
typedef float type;
typedef Packet4f half;
typedef Packet4i integer_packet;
enum {size=4, alignment=Aligned16};
enum {size=4, alignment=Aligned16, vectorizable=true};
};
template<> struct unpacket_traits<Packet2d> {
typedef double type;
typedef Packet2d half;
enum {size=2, alignment=Aligned16};
enum {size=2, alignment=Aligned16, vectorizable=true};
};
template<> struct unpacket_traits<Packet4i> {
typedef int type;
typedef Packet4i half;
enum {size=4, alignment=Aligned16};
enum {size=4, alignment=Aligned16, vectorizable=false};
};
#ifndef EIGEN_VECTORIZE_AVX

View File

@ -88,7 +88,7 @@ SYCL_ARITHMETIC(cl::sycl::cl_double2)
#define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)\
template<> struct unpacket_traits<packet_type> {\
typedef unpacket_type type;\
enum {size=lengths, alignment=Aligned16};\
enum {size=lengths, alignment=Aligned16, vectorizable=true};\
typedef packet_type half;\
};
SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)

View File

@ -91,8 +91,8 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
};
};
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2cf half; };
template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true}; typedef Packet1cd half; };
/* Forward declaration */
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel);

View File

@ -239,9 +239,9 @@ template<> struct packet_traits<double> : default_packet_traits
};
};
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4i half; };
template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true}; typedef Packet2d half; };
/* Forward declaration */
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);

View File

@ -15,7 +15,13 @@ namespace Eigen {
namespace internal {
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target>
enum PacketSizeType {
PacketFull = 0,
PacketHalf,
PacketQuarter
};
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=PacketFull>
class gebp_traits;
@ -365,6 +371,43 @@ struct QuadPacket
const Packet& get(const FixedInt<3>&) const { return B3; }
};
template <int N, typename T1, typename T2, typename T3>
struct packet_conditional { typedef T3 type; };
template <typename T1, typename T2, typename T3>
struct packet_conditional<PacketFull, T1, T2, T3> { typedef T1 type; };
template <typename T1, typename T2, typename T3>
struct packet_conditional<PacketHalf, T1, T2, T3> { typedef T2 type; };
#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
typedef typename packet_conditional<packet_size, \
typename packet_traits<name ## Scalar>::type, \
typename packet_traits<name ## Scalar>::half, \
typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
prefix ## name ## Packet
#define PACKET_DECL_COND(name, packet_size) \
typedef typename packet_conditional<packet_size, \
typename packet_traits<name ## Scalar>::type, \
typename packet_traits<name ## Scalar>::half, \
typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
name ## Packet
#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \
typedef typename packet_conditional<packet_size, \
typename packet_traits<Scalar>::type, \
typename packet_traits<Scalar>::half, \
typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
prefix ## ScalarPacket
#define PACKET_DECL_COND_SCALAR(packet_size) \
typedef typename packet_conditional<packet_size, \
typename packet_traits<Scalar>::type, \
typename packet_traits<Scalar>::half, \
typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
ScalarPacket
/* Vectorization logic
* real*real: unpack rhs to constant packets, ...
*
@ -375,7 +418,7 @@ struct QuadPacket
* cplx*real : unpack rhs to constant packets, ...
* real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
*/
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch>
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
class gebp_traits
{
public:
@ -383,13 +426,17 @@ public:
typedef _RhsScalar RhsScalar;
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
enum {
ConjLhs = _ConjLhs,
ConjRhs = _ConjRhs,
Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
@ -413,9 +460,6 @@ public:
RhsProgress = 1
};
typedef typename packet_traits<LhsScalar>::type _LhsPacket;
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
typedef typename packet_traits<ResScalar>::type _ResPacket;
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
@ -503,21 +547,25 @@ public:
};
template<typename RealScalar, bool _ConjLhs, int Arch>
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch>
template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize>
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
{
public:
typedef std::complex<RealScalar> LhsScalar;
typedef RealScalar RhsScalar;
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
enum {
ConjLhs = _ConjLhs,
ConjRhs = false,
Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
nr = 4,
@ -532,10 +580,6 @@ public:
RhsProgress = 1
};
typedef typename packet_traits<LhsScalar>::type _LhsPacket;
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
typedef typename packet_traits<ResScalar>::type _ResPacket;
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
@ -714,8 +758,8 @@ template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
// return res;
// }
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs,Arch>
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize >
{
public:
typedef std::complex<RealScalar> Scalar;
@ -723,15 +767,21 @@ public:
typedef std::complex<RealScalar> RhsScalar;
typedef std::complex<RealScalar> ResScalar;
PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
PACKET_DECL_COND(Real, _PacketSize);
PACKET_DECL_COND_SCALAR(_PacketSize);
enum {
ConjLhs = _ConjLhs,
ConjRhs = _ConjRhs,
Vectorizable = packet_traits<RealScalar>::Vectorizable
&& packet_traits<Scalar>::Vectorizable,
ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
Vectorizable = unpacket_traits<RealPacket>::vectorizable
&& unpacket_traits<ScalarPacket>::vectorizable,
ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
// FIXME: should depend on NumberOfRegisters
nr = 4,
@ -741,8 +791,6 @@ public:
RhsProgress = 1
};
typedef typename packet_traits<RealScalar>::type RealPacket;
typedef typename packet_traits<Scalar>::type ScalarPacket;
typedef DoublePacket<RealPacket> DoublePacketType;
typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing;
@ -876,8 +924,8 @@ protected:
conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
};
template<typename RealScalar, bool _ConjRhs, int Arch>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch>
template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize >
{
public:
typedef std::complex<RealScalar> Scalar;
@ -885,14 +933,25 @@ public:
typedef Scalar RhsScalar;
typedef Scalar ResScalar;
PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
PACKET_DECL_COND_PREFIX(_, Real, _PacketSize);
PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize);
#undef PACKET_DECL_COND_SCALAR_PREFIX
#undef PACKET_DECL_COND_PREFIX
#undef PACKET_DECL_COND_SCALAR
#undef PACKET_DECL_COND
enum {
ConjLhs = false,
ConjRhs = _ConjRhs,
Vectorizable = packet_traits<RealScalar>::Vectorizable
&& packet_traits<Scalar>::Vectorizable,
LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
Vectorizable = unpacket_traits<_RealPacket>::vectorizable
&& unpacket_traits<_ScalarPacket>::vectorizable,
LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
// FIXME: should depend on NumberOfRegisters
@ -903,10 +962,6 @@ public:
RhsProgress = 1
};
typedef typename packet_traits<LhsScalar>::type _LhsPacket;
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
typedef typename packet_traits<ResScalar>::type _ResPacket;
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
@ -998,9 +1053,9 @@ protected:
#if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON
template<>
struct gebp_traits <float, float, false, false,Architecture::NEON>
: gebp_traits<float,float,false,false,Architecture::Generic>
template<int _PacketSize>
struct gebp_traits <float, float, false, false,Architecture::NEON,_PacketSize>
: gebp_traits<float,float,false,false,Architecture::Generic,_PacketSize>
{
typedef float RhsPacket;
@ -1121,7 +1176,10 @@ struct gebp_traits <double, double, false, false,Architecture::NEON>
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,PacketHalf> HalfTraits;
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,PacketQuarter> QuarterTraits;
typedef typename Traits::ResScalar ResScalar;
typedef typename Traits::LhsPacket LhsPacket;
typedef typename Traits::RhsPacket RhsPacket;
@ -1131,19 +1189,34 @@ struct gebp_kernel
typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
typedef typename SwappedTraits::ResScalar SResScalar;
typedef typename SwappedTraits::LhsPacket SLhsPacket;
typedef typename SwappedTraits::RhsPacket SRhsPacket;
typedef typename SwappedTraits::ResPacket SResPacket;
typedef typename SwappedTraits::AccPacket SAccPacket;
typedef typename HalfTraits::LhsPacket LhsPacketHalf;
typedef typename HalfTraits::RhsPacket RhsPacketHalf;
typedef typename HalfTraits::ResPacket ResPacketHalf;
typedef typename HalfTraits::AccPacket AccPacketHalf;
typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
typedef typename QuarterTraits::ResPacket ResPacketQuarter;
typedef typename QuarterTraits::AccPacket AccPacketQuarter;
typedef typename DataMapper::LinearMapper LinearMapper;
enum {
Vectorizable = Traits::Vectorizable,
LhsProgress = Traits::LhsProgress,
LhsProgressHalf = HalfTraits::LhsProgress,
LhsProgressQuarter = QuarterTraits::LhsProgress,
RhsProgress = Traits::RhsProgress,
RhsProgressHalf = HalfTraits::RhsProgress,
RhsProgressQuarter = QuarterTraits::RhsProgress,
ResPacketSize = Traits::ResPacketSize
};
@ -1154,11 +1227,11 @@ struct gebp_kernel
};
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs,
int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs>::LhsProgress>
int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
struct last_row_process_16_packets
{
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
typedef typename Traits::ResScalar ResScalar;
typedef typename SwappedTraits::LhsPacket SLhsPacket;
@ -1186,8 +1259,8 @@ struct last_row_process_16_packets
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
typedef typename Traits::ResScalar ResScalar;
typedef typename SwappedTraits::LhsPacket SLhsPacket;
@ -1233,6 +1306,202 @@ struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr,
}
};
template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
struct lhs_process_one_packet
{
typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
{
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
}
EIGEN_STRONG_INLINE void operator()(
const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
{
GEBPTraits traits;
// loops on each largest micro horizontal panel of lhs
// (LhsProgress x depth)
for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
{
// loops on each largest micro vertical panel of rhs (depth * nr)
for(Index j2=0; j2<packet_cols4; j2+=nr)
{
// We select a LhsProgress x nr micro block of res
// which is entirely stored into 1 x nr registers.
const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
prefetch(&blA[0]);
// gets res block as register
AccPacket C0, C1, C2, C3;
traits.initAcc(C0);
traits.initAcc(C1);
traits.initAcc(C2);
traits.initAcc(C3);
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
r0.prefetch(prefetch_res_offset);
r1.prefetch(prefetch_res_offset);
r2.prefetch(prefetch_res_offset);
r3.prefetch(prefetch_res_offset);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
prefetch(&blB[0]);
LhsPacket A0;
for(Index k=0; k<peeled_kc; k+=pk)
{
EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
RhsPacketx4 rhs_panel;
RhsPacket T0;
internal::prefetch(blB+(48+0));
peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
peeled_kc_onestep(1, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
peeled_kc_onestep(3, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
internal::prefetch(blB+(48+16));
peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
peeled_kc_onestep(5, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
peeled_kc_onestep(7, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
blB += pk*4*RhsProgress;
blA += pk*LhsProgress;
EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
}
// process remaining peeled loop
for(Index k=peeled_kc; k<depth; k++)
{
RhsPacketx4 rhs_panel;
RhsPacket T0;
peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
blB += 4*RhsProgress;
blA += LhsProgress;
}
ResPacket R0, R1;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = r0.template loadPacket<ResPacket>(0);
R1 = r1.template loadPacket<ResPacket>(0);
traits.acc(C0, alphav, R0);
traits.acc(C1, alphav, R1);
r0.storePacket(0, R0);
r1.storePacket(0, R1);
R0 = r2.template loadPacket<ResPacket>(0);
R1 = r3.template loadPacket<ResPacket>(0);
traits.acc(C2, alphav, R0);
traits.acc(C3, alphav, R1);
r2.storePacket(0, R0);
r3.storePacket(0, R1);
}
// Deal with remaining columns of the rhs
for(Index j2=packet_cols4; j2<cols; j2++)
{
// One column at a time
const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
prefetch(&blA[0]);
// gets res block as register
AccPacket C0;
traits.initAcc(C0);
LinearMapper r0 = res.getLinearMapper(i, j2);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
LhsPacket A0;
for(Index k= 0; k<peeled_kc; k+=pk)
{
EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
RhsPacket B_0;
#define EIGEN_GEBGP_ONESTEP(K) \
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
/* FIXME: why unaligned???? */ \
traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
traits.madd(A0, B_0, C0, B_0, fix<0>); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
} while(false);
EIGEN_GEBGP_ONESTEP(0);
EIGEN_GEBGP_ONESTEP(1);
EIGEN_GEBGP_ONESTEP(2);
EIGEN_GEBGP_ONESTEP(3);
EIGEN_GEBGP_ONESTEP(4);
EIGEN_GEBGP_ONESTEP(5);
EIGEN_GEBGP_ONESTEP(6);
EIGEN_GEBGP_ONESTEP(7);
blB += pk*RhsProgress;
blA += pk*LhsProgress;
EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
}
// process remaining peeled loop
for(Index k=peeled_kc; k<depth; k++)
{
RhsPacket B_0;
EIGEN_GEBGP_ONESTEP(0);
blB += RhsProgress;
blA += LhsProgress;
}
#undef EIGEN_GEBGP_ONESTEP
ResPacket R0;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = r0.template loadPacket<ResPacket>(0);
traits.acc(C0, alphav, R0);
r0.storePacket(0, R0);
}
}
}
};
template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
{
EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
{
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
traits.madd(*A0, *B_0, *C0, *B_0);
traits.madd(*A0, *B1, *C1, *B1);
traits.madd(*A0, *B2, *C2, *B2);
traits.madd(*A0, *B3, *C3, *B3);
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
}
};
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
@ -1249,10 +1518,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
const Index peeled_kc = depth & ~(pk-1);
const Index prefetch_res_offset = 32/sizeof(ResScalar);
const int prefetch_res_offset = 32/sizeof(ResScalar);
// const Index depth2 = depth & ~1;
//---------- Process 3 * LhsProgress rows at once ----------
@ -1718,176 +1989,29 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
//---------- Process 1 * LhsProgress rows at once ----------
if(mr>=1*Traits::LhsProgress)
{
// loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
{
// loops on each largest micro vertical panel of rhs (depth * nr)
for(Index j2=0; j2<packet_cols4; j2+=nr)
{
// We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
// stored into 1 x nr registers.
const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
prefetch(&blA[0]);
// gets res block as register
AccPacket C0, C1, C2, C3;
traits.initAcc(C0);
traits.initAcc(C1);
traits.initAcc(C2);
traits.initAcc(C3);
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
r0.prefetch(prefetch_res_offset);
r1.prefetch(prefetch_res_offset);
r2.prefetch(prefetch_res_offset);
r3.prefetch(prefetch_res_offset);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
prefetch(&blB[0]);
LhsPacket A0;
for(Index k=0; k<peeled_kc; k+=pk)
{
EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
RhsPacketx4 rhs_panel;
RhsPacket T0;
#define EIGEN_GEBGP_ONESTEP(K) \
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
traits.loadRhs(&blB[(0+4*K)*RhsProgress], rhs_panel); \
traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \
} while(false)
internal::prefetch(blB+(48+0));
EIGEN_GEBGP_ONESTEP(0);
EIGEN_GEBGP_ONESTEP(1);
EIGEN_GEBGP_ONESTEP(2);
EIGEN_GEBGP_ONESTEP(3);
internal::prefetch(blB+(48+16));
EIGEN_GEBGP_ONESTEP(4);
EIGEN_GEBGP_ONESTEP(5);
EIGEN_GEBGP_ONESTEP(6);
EIGEN_GEBGP_ONESTEP(7);
blB += pk*4*RhsProgress;
blA += pk*1*LhsProgress;
EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
}
// process remaining peeled loop
for(Index k=peeled_kc; k<depth; k++)
{
RhsPacketx4 rhs_panel;
RhsPacket T0;
EIGEN_GEBGP_ONESTEP(0);
blB += 4*RhsProgress;
blA += 1*LhsProgress;
}
#undef EIGEN_GEBGP_ONESTEP
ResPacket R0, R1;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C1, alphav, R1);
r0.storePacket(0 * Traits::ResPacketSize, R0);
r1.storePacket(0 * Traits::ResPacketSize, R1);
R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
traits.acc(C2, alphav, R0);
traits.acc(C3, alphav, R1);
r2.storePacket(0 * Traits::ResPacketSize, R0);
r3.storePacket(0 * Traits::ResPacketSize, R1);
}
// Deal with remaining columns of the rhs
for(Index j2=packet_cols4; j2<cols; j2++)
{
// One column at a time
const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
prefetch(&blA[0]);
// gets res block as register
AccPacket C0;
traits.initAcc(C0);
LinearMapper r0 = res.getLinearMapper(i, j2);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
LhsPacket A0;
for(Index k=0; k<peeled_kc; k+=pk)
{
EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
RhsPacket B_0;
#define EIGEN_GEBGP_ONESTEP(K) \
do { \
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \
traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
traits.madd(A0, B_0, C0, B_0, fix<0>); \
EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \
} while(false);
EIGEN_GEBGP_ONESTEP(0);
EIGEN_GEBGP_ONESTEP(1);
EIGEN_GEBGP_ONESTEP(2);
EIGEN_GEBGP_ONESTEP(3);
EIGEN_GEBGP_ONESTEP(4);
EIGEN_GEBGP_ONESTEP(5);
EIGEN_GEBGP_ONESTEP(6);
EIGEN_GEBGP_ONESTEP(7);
blB += pk*RhsProgress;
blA += pk*1*Traits::LhsProgress;
EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
}
// process remaining peeled loop
for(Index k=peeled_kc; k<depth; k++)
{
RhsPacket B_0;
EIGEN_GEBGP_ONESTEP(0);
blB += RhsProgress;
blA += 1*Traits::LhsProgress;
}
#undef EIGEN_GEBGP_ONESTEP
ResPacket R0;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
r0.storePacket(0 * Traits::ResPacketSize, R0);
}
}
lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
}
//---------- Process LhsProgressHalf rows at once ----------
if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
{
lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
}
//---------- Process LhsProgressQuarter rows at once ----------
if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
{
lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
}
//---------- Process remaining rows, 1 at once ----------
if(peeled_mc1<rows)
if(peeled_mc_quarter<rows)
{
// loop on each panel of the rhs
for(Index j2=0; j2<packet_cols4; j2+=nr)
{
// loop on each row of the lhs (1*LhsProgress x depth)
for(Index i=peeled_mc1; i<rows; i+=1)
for(Index i=peeled_mc_quarter; i<rows; i+=1)
{
const LhsScalar* blA = &blockA[i*strideA+offsetA];
prefetch(&blA[0]);
@ -2030,7 +2154,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
for(Index j2=packet_cols4; j2<cols; j2++)
{
// loop on each row of the lhs (1*LhsProgress x depth)
for(Index i=peeled_mc1; i<rows; i+=1)
for(Index i=peeled_mc_quarter; i<rows; i+=1)
{
const LhsScalar* blA = &blockA[i*strideA+offsetA];
prefetch(&blA[0]);
@ -2077,7 +2201,13 @@ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pa
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
enum { PacketSize = unpacket_traits<Packet>::size };
typedef typename unpacket_traits<Packet>::half HalfPacket;
typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
enum { PacketSize = unpacket_traits<Packet>::size,
HalfPacketSize = unpacket_traits<HalfPacket>::size,
QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
HasHalf = (int)HalfPacketSize < (int)PacketSize,
HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
EIGEN_UNUSED_VARIABLE(stride);
@ -2089,9 +2219,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
: Pack2>1 ? (rows/Pack2)*Pack2 : 0;
const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
: Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
Index i=0;
@ -2150,20 +2283,60 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
}
}
// Pack scalars
if(Pack2<PacketSize && Pack2>1)
// Pack half packets
if(HasHalf && Pack1>=HalfPacketSize)
{
for(; i<peeled_mc0; i+=Pack2)
for(; i<peeled_mc_half; i+=HalfPacketSize)
{
if(PanelMode) count += Pack2 * offset;
if(PanelMode) count += (HalfPacketSize) * offset;
for(Index k=0; k<depth; k++)
for(Index w=0; w<Pack2; w++)
blockA[count++] = cj(lhs(i+w, k));
if(PanelMode) count += Pack2 * (stride-offset-depth);
{
HalfPacket A;
A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
pstoreu(blockA+count, cj.pconj(A));
count+=HalfPacketSize;
}
if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
}
}
// Pack quarter packets
if(HasQuarter && Pack1>=QuarterPacketSize)
{
for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
{
if(PanelMode) count += (QuarterPacketSize) * offset;
for(Index k=0; k<depth; k++)
{
QuarterPacket A;
A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
pstoreu(blockA+count, cj.pconj(A));
count+=QuarterPacketSize;
}
if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
}
}
// Pack2 may be *smaller* than PacketSize—that happens for
// products like real * complex, where we have to go half the
// progress on the lhs in order to duplicate those operands to
// address both real & imaginary parts on the rhs. This portion will
// pack those half ones until they match the number expected on the
// last peeling loop at this point (for the rhs).
if(Pack2<PacketSize && Pack2>1)
{
for(; i<peeled_mc0; i+=last_lhs_progress)
{
if(PanelMode) count += last_lhs_progress * offset;
for(Index k=0; k<depth; k++)
for(Index w=0; w<last_lhs_progress; w++)
blockA[count++] = cj(lhs(i+w, k));
if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
}
}
// Pack scalars
for(; i<rows; i++)
{
if(PanelMode) count += offset;
@ -2184,7 +2357,13 @@ template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pa
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
enum { PacketSize = unpacket_traits<Packet>::size };
typedef typename unpacket_traits<Packet>::half HalfPacket;
typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
enum { PacketSize = unpacket_traits<Packet>::size,
HalfPacketSize = unpacket_traits<HalfPacket>::size,
QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
HasHalf = (int)HalfPacketSize < (int)PacketSize,
HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
EIGEN_UNUSED_VARIABLE(stride);
@ -2192,37 +2371,51 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
Index count = 0;
bool gone_half = false, gone_quarter = false, gone_last = false;
// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
int pack = Pack1;
Index i = 0;
int pack = Pack1;
int psize = PacketSize;
while(pack>0)
{
Index remaining_rows = rows-i;
Index peeled_mc = i+(remaining_rows/pack)*pack;
Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
Index starting_pos = i;
for(; i<peeled_mc; i+=pack)
{
if(PanelMode) count += pack * offset;
const Index peeled_k = (depth/PacketSize)*PacketSize;
Index k=0;
if(pack>=PacketSize)
if(pack>=psize && psize >= QuarterPacketSize)
{
for(; k<peeled_k; k+=PacketSize)
const Index peeled_k = (depth/psize)*psize;
for(; k<peeled_k; k+=psize)
{
for (Index m = 0; m < pack; m += PacketSize)
for (Index m = 0; m < pack; m += psize)
{
PacketBlock<Packet> kernel;
for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
ptranspose(kernel);
for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
if (psize == PacketSize) {
PacketBlock<Packet> kernel;
for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
ptranspose(kernel);
for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
} else if (HasHalf && psize == HalfPacketSize) {
gone_half = true;
PacketBlock<HalfPacket> kernel_half;
for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
ptranspose(kernel_half);
for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
} else if (HasQuarter && psize == QuarterPacketSize) {
gone_quarter = true;
PacketBlock<QuarterPacket> kernel_quarter;
for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
ptranspose(kernel_quarter);
for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
}
}
count += PacketSize*pack;
count += psize*pack;
}
}
for(; k<depth; k++)
{
Index w=0;
@ -2245,9 +2438,28 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
if(PanelMode) count += pack * (stride-offset-depth);
}
pack -= PacketSize;
if(pack<Pack2 && (pack+PacketSize)!=Pack2)
pack = Pack2;
pack -= psize;
Index left = rows - i;
if (pack <= 0) {
if (!gone_last &&
(starting_pos == i || left >= psize/2 || left >= psize/4) &&
((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
(psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
psize /= 2;
pack = psize;
continue;
}
// Pack2 may be *smaller* than PacketSize—that happens for
// products like real * complex, where we have to go half the
// progress on the lhs in order to duplicate those operands to
// address both real & imaginary parts on the rhs. This portion will
// pack those half ones until they match the number expected on the
// last peeling loop at this point (for the rhs).
if (Pack2 < PacketSize && !gone_last) {
gone_last = true;
psize = pack = left & ~1;
}
}
}
for(; i<rows; i++)

View File

@ -45,14 +45,23 @@ struct symm_pack_lhs
}
void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
{
enum { PacketSize = packet_traits<Scalar>::size };
typedef typename unpacket_traits<typename packet_traits<Scalar>::type>::half HalfPacket;
typedef typename unpacket_traits<typename unpacket_traits<typename packet_traits<Scalar>::type>::half>::half QuarterPacket;
enum { PacketSize = packet_traits<Scalar>::size,
HalfPacketSize = unpacket_traits<HalfPacket>::size,
QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
HasHalf = (int)HalfPacketSize < (int)PacketSize,
HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
Index count = 0;
//Index peeled_mc3 = (rows/Pack1)*Pack1;
const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0;
if(Pack1>=3*PacketSize)
for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
@ -66,8 +75,16 @@ struct symm_pack_lhs
for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
pack<1*PacketSize>(blockA, lhs, cols, i, count);
if(HasHalf && Pack1>=HalfPacketSize)
for(Index i=peeled_mc1; i<peeled_mc_half; i+=HalfPacketSize)
pack<HalfPacketSize>(blockA, lhs, cols, i, count);
if(HasQuarter && Pack1>=QuarterPacketSize)
for(Index i=peeled_mc_half; i<peeled_mc_quarter; i+=QuarterPacketSize)
pack<QuarterPacketSize>(blockA, lhs, cols, i, count);
// do the same with mr==1
for(Index i=peeled_mc1; i<rows; i++)
for(Index i=peeled_mc_quarter; i<rows; i++)
{
for(Index k=0; k<i; k++)
blockA[count++] = lhs(i, k); // normal

View File

@ -184,7 +184,8 @@ template<typename T> struct unpacket_traits
enum
{
size = 1,
alignment = 1
alignment = 1,
vectorizable = false
};
};