mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-08 17:59:00 +08:00
Created a NEON version of the ptranspose packet primitives
This commit is contained in:
parent
82b09fcb91
commit
ccb4dec719
@ -263,6 +263,14 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
|
|||||||
return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
|
return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// In-place transpose of a block of two Packet2cf values.
//
// Each packet's `.v` member is a 128-bit float32x4_t, handled here as two
// 64-bit halves. After the call:
//   packet[0].v = { low half of old packet[0],  low half of old packet[1]  }
//   packet[1].v = { high half of old packet[0], high half of old packet[1] }
template<> EIGEN_DEVICE_FUNC inline void
ptranspose(Kernel<Packet2cf>& kernel) {
  // Snapshot both inputs first so neither output depends on a packet that
  // has already been overwritten.
  const float32x4_t row0 = kernel.packet[0].v;
  const float32x4_t row1 = kernel.packet[1].v;

  kernel.packet[0].v = vcombine_f32(vget_low_f32(row0), vget_low_f32(row1));
  kernel.packet[1].v = vcombine_f32(vget_high_f32(row0), vget_high_f32(row1));
}
|
||||||
|
|
||||||
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
@ -447,9 +447,30 @@ PALIGN_NEON(0,Packet4i,vextq_s32)
|
|||||||
PALIGN_NEON(1,Packet4i,vextq_s32)
|
PALIGN_NEON(1,Packet4i,vextq_s32)
|
||||||
PALIGN_NEON(2,Packet4i,vextq_s32)
|
PALIGN_NEON(2,Packet4i,vextq_s32)
|
||||||
PALIGN_NEON(3,Packet4i,vextq_s32)
|
PALIGN_NEON(3,Packet4i,vextq_s32)
|
||||||
|
|
||||||
#undef PALIGN_NEON
|
#undef PALIGN_NEON
|
||||||
|
|
||||||
|
// In-place 4x4 transpose of a block of four Packet4f values.
//
// With input packets a, b, c, d (four float lanes each):
//   vzipq_f32(a, b) -> val[0] = {a0, b0, a1, b1}, val[1] = {a2, b2, a3, b3}
//   vzipq_f32(c, d) -> val[0] = {c0, d0, c1, d1}, val[1] = {c2, d2, c3, d3}
// Pairing matching 64-bit halves of the two zip results yields output
// packet i = {a_i, b_i, c_i, d_i}.
template<> EIGEN_DEVICE_FUNC inline void
ptranspose(Kernel<Packet4f>& kernel) {
  // Interleave the packets pairwise.
  const float32x4x2_t zip01 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
  const float32x4x2_t zip23 = vzipq_f32(kernel.packet[2], kernel.packet[3]);

  // Lanes 0 and 1 of every input live in val[0]; lanes 2 and 3 in val[1].
  const float32x4_t lanes01_ab = zip01.val[0];
  const float32x4_t lanes01_cd = zip23.val[0];
  const float32x4_t lanes23_ab = zip01.val[1];
  const float32x4_t lanes23_cd = zip23.val[1];

  kernel.packet[0] = vcombine_f32(vget_low_f32(lanes01_ab), vget_low_f32(lanes01_cd));
  kernel.packet[1] = vcombine_f32(vget_high_f32(lanes01_ab), vget_high_f32(lanes01_cd));
  kernel.packet[2] = vcombine_f32(vget_low_f32(lanes23_ab), vget_low_f32(lanes23_cd));
  kernel.packet[3] = vcombine_f32(vget_high_f32(lanes23_ab), vget_high_f32(lanes23_cd));
}
|
||||||
|
|
||||||
|
// In-place 4x4 transpose of a block of four Packet4i values.
//
// With input packets a, b, c, d (four 32-bit integer lanes each):
//   vzipq_s32(a, b) -> val[0] = {a0, b0, a1, b1}, val[1] = {a2, b2, a3, b3}
//   vzipq_s32(c, d) -> val[0] = {c0, d0, c1, d1}, val[1] = {c2, d2, c3, d3}
// Pairing matching 64-bit halves of the two zip results yields output
// packet i = {a_i, b_i, c_i, d_i}.
template<> EIGEN_DEVICE_FUNC inline void
ptranspose(Kernel<Packet4i>& kernel) {
  // Interleave the packets pairwise.
  const int32x4x2_t zip01 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
  const int32x4x2_t zip23 = vzipq_s32(kernel.packet[2], kernel.packet[3]);

  // Lanes 0 and 1 of every input live in val[0]; lanes 2 and 3 in val[1].
  const int32x4_t lanes01_ab = zip01.val[0];
  const int32x4_t lanes01_cd = zip23.val[0];
  const int32x4_t lanes23_ab = zip01.val[1];
  const int32x4_t lanes23_cd = zip23.val[1];

  kernel.packet[0] = vcombine_s32(vget_low_s32(lanes01_ab), vget_low_s32(lanes01_cd));
  kernel.packet[1] = vcombine_s32(vget_high_s32(lanes01_ab), vget_high_s32(lanes01_cd));
  kernel.packet[2] = vcombine_s32(vget_low_s32(lanes23_ab), vget_low_s32(lanes23_cd));
  kernel.packet[3] = vcombine_s32(vget_high_s32(lanes23_ab), vget_high_s32(lanes23_cd));
}
|
||||||
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
Loading…
x
Reference in New Issue
Block a user