Optimize ApplyOnTheRight for Jacobi rotations when FMA is available

This commit is contained in:
Rasmus Munk Larsen 2025-10-12 00:04:19 +00:00
parent 3bd0bfe0e0
commit 565db1c603
2 changed files with 28 additions and 10 deletions

View File

@ -17,6 +17,9 @@
EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const { \
return padd(c, this->pmul(x, y)); \
} \
EIGEN_STRONG_INLINE PACKET_CPLX pmsub(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const { \
return psub(this->pmul(x, y), c); \
} \
EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const { \
return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v)); \
} \
@ -27,6 +30,9 @@
EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const { \
return padd(c, this->pmul(x, y)); \
} \
EIGEN_STRONG_INLINE PACKET_CPLX pmsub(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const { \
return psub(this->pmul(x, y), c); \
} \
EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const { \
return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y)); \
} \
@ -76,6 +82,11 @@ struct conj_helper {
return this->pmul(x, y) + c;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmsub(const LhsType& x, const RhsType& y,
const ResultType& c) const {
return this->pmul(x, y) - c;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmul(const LhsType& x, const RhsType& y) const {
return conj_if<ConjLhs>()(x) * conj_if<ConjRhs>()(y);
}
@ -104,6 +115,10 @@ struct conj_helper<Packet, Packet, ConjLhs, ConjRhs> {
return Eigen::internal::pmadd(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y), c);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmsub(const Packet& x, const Packet& y, const Packet& c) const {
return Eigen::internal::pmsub(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y), c);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const {
return Eigen::internal::pmul(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y));
}
@ -116,6 +131,9 @@ struct conj_helper<Packet, Packet, true, true> {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const {
return Eigen::internal::pmadd(pconj(x), pconj(y), c);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmsub(const Packet& x, const Packet& y, const Packet& c) const {
return Eigen::internal::pmsub(pconj(x), pconj(y), c);
}
// We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b).
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const {
return pconj(Eigen::internal::pmul(x, y));

View File

@ -335,8 +335,8 @@ struct apply_rotation_in_the_plane_selector<Scalar, OtherScalar, SizeAtCompileTi
for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
Packet xi = pload<Packet>(px);
Packet yi = pload<Packet>(py);
pstore(px, padd(pm.pmul(pc, xi), pcj.pmul(ps, yi)));
pstore(py, psub(pcj.pmul(pc, yi), pm.pmul(ps, xi)));
pstore(px, pm.pmadd(pc, xi, pcj.pmul(ps, yi)));
pstore(py, pcj.pmsub(pc, yi, pm.pmul(ps, xi)));
px += PacketSize;
py += PacketSize;
}
@ -347,18 +347,18 @@ struct apply_rotation_in_the_plane_selector<Scalar, OtherScalar, SizeAtCompileTi
Packet xi1 = ploadu<Packet>(px + PacketSize);
Packet yi = pload<Packet>(py);
Packet yi1 = pload<Packet>(py + PacketSize);
pstoreu(px, padd(pm.pmul(pc, xi), pcj.pmul(ps, yi)));
pstoreu(px + PacketSize, padd(pm.pmul(pc, xi1), pcj.pmul(ps, yi1)));
pstore(py, psub(pcj.pmul(pc, yi), pm.pmul(ps, xi)));
pstore(py + PacketSize, psub(pcj.pmul(pc, yi1), pm.pmul(ps, xi1)));
pstoreu(px, pm.pmadd(pc, xi, pcj.pmul(ps, yi)));
pstoreu(px + PacketSize, pm.pmadd(pc, xi1, pcj.pmul(ps, yi1)));
pstore(py, pcj.pmsub(pc, yi, pm.pmul(ps, xi)));
pstore(py + PacketSize, pcj.pmsub(pc, yi1, pm.pmul(ps, xi1)));
px += Peeling * PacketSize;
py += Peeling * PacketSize;
}
if (alignedEnd != peelingEnd) {
Packet xi = ploadu<Packet>(x + peelingEnd);
Packet yi = pload<Packet>(y + peelingEnd);
pstoreu(x + peelingEnd, padd(pm.pmul(pc, xi), pcj.pmul(ps, yi)));
pstore(y + peelingEnd, psub(pcj.pmul(pc, yi), pm.pmul(ps, xi)));
pstoreu(x + peelingEnd, pm.pmadd(pc, xi, pcj.pmul(ps, yi)));
pstore(y + peelingEnd, pcj.pmsub(pc, yi, pm.pmul(ps, xi)));
}
}
@ -381,8 +381,8 @@ struct apply_rotation_in_the_plane_selector<Scalar, OtherScalar, SizeAtCompileTi
for (Index i = 0; i < size; i += PacketSize) {
Packet xi = pload<Packet>(px);
Packet yi = pload<Packet>(py);
pstore(px, padd(pm.pmul(pc, xi), pcj.pmul(ps, yi)));
pstore(py, psub(pcj.pmul(pc, yi), pm.pmul(ps, xi)));
pstore(px, pm.pmadd(pc, xi, pcj.pmul(ps, yi)));
pstore(py, pcj.pmsub(pc, yi, pm.pmul(ps, xi)));
px += PacketSize;
py += PacketSize;
}