mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-12 11:49:02 +08:00
Use a blocked implementation of transposeInPlace() when the matrix is real and square. This yields a large speedup because each PacketSize x PacketSize block is transposed in registers (or in L1 cache if registers spill), instead of moving one packet at a time, which in the worst case makes the code write to the same cache line PacketSize times instead of once.
rmlarsen@rmlarsen4:.../eigen_bench/google3$ benchy --benchmarks=.*TransposeInPlace.*float.* --reference=srcfs experimental/users/rmlarsen/bench:matmul_bench 10 / 10 [====================================================================================================================================================================================================================] 100.00% 2m50s (Generated by http://go/benchy. Settings: --runs 5 --benchtime 1s --reference "srcfs" --benchmarks ".*TransposeInPlace.*float.*" experimental/users/rmlarsen/bench:matmul_bench) name old time/op new time/op delta BM_TransposeInPlace<float>/4 9.84ns ± 0% 6.51ns ± 0% -33.80% (p=0.008 n=5+5) BM_TransposeInPlace<float>/8 23.6ns ± 1% 17.6ns ± 0% -25.26% (p=0.016 n=5+4) BM_TransposeInPlace<float>/16 78.8ns ± 0% 60.3ns ± 0% -23.50% (p=0.029 n=4+4) BM_TransposeInPlace<float>/32 302ns ± 0% 229ns ± 0% -24.40% (p=0.008 n=5+5) BM_TransposeInPlace<float>/59 1.03µs ± 0% 0.84µs ± 1% -17.87% (p=0.016 n=5+4) BM_TransposeInPlace<float>/64 1.20µs ± 0% 0.89µs ± 1% -25.81% (p=0.008 n=5+5) BM_TransposeInPlace<float>/128 8.96µs ± 0% 3.82µs ± 2% -57.33% (p=0.008 n=5+5) BM_TransposeInPlace<float>/256 152µs ± 3% 17µs ± 2% -89.06% (p=0.008 n=5+5) BM_TransposeInPlace<float>/512 837µs ± 1% 208µs ± 0% -75.15% (p=0.008 n=5+5) BM_TransposeInPlace<float>/1k 4.28ms ± 2% 1.08ms ± 2% -74.72% (p=0.008 n=5+5)
This commit is contained in:
parent
29f0917a43
commit
b47c777993
@ -243,7 +243,6 @@ struct inplace_transpose_selector<MatrixType,true,false> { // square matrix
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only.
|
|
||||||
template<typename MatrixType>
|
template<typename MatrixType>
|
||||||
struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x PacketSize
|
struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x PacketSize
|
||||||
static void run(MatrixType& m) {
|
static void run(MatrixType& m) {
|
||||||
@ -260,16 +259,65 @@ struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x Packet
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
template <typename MatrixType, Index Alignment>
|
||||||
|
void BlockedInPlaceTranspose(MatrixType& m) {
|
||||||
|
typedef typename MatrixType::Scalar Scalar;
|
||||||
|
typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
|
||||||
|
const Index PacketSize = internal::packet_traits<Scalar>::size;
|
||||||
|
eigen_assert(m.rows() == m.cols());
|
||||||
|
int row_start = 0;
|
||||||
|
for (; row_start + PacketSize <= m.rows(); row_start += PacketSize) {
|
||||||
|
for (int col_start = row_start; col_start + PacketSize <= m.cols(); col_start += PacketSize) {
|
||||||
|
PacketBlock<Packet> A;
|
||||||
|
if (row_start == col_start) {
|
||||||
|
for (Index i=0; i<PacketSize; ++i)
|
||||||
|
A.packet[i] = m.template packetByOuterInner<Alignment>(row_start + i,col_start);
|
||||||
|
internal::ptranspose(A);
|
||||||
|
for (Index i=0; i<PacketSize; ++i)
|
||||||
|
m.template writePacket<Alignment>(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), A.packet[i]);
|
||||||
|
} else {
|
||||||
|
PacketBlock<Packet> B;
|
||||||
|
for (Index i=0; i<PacketSize; ++i) {
|
||||||
|
A.packet[i] = m.template packetByOuterInner<Alignment>(row_start + i,col_start);
|
||||||
|
B.packet[i] = m.template packetByOuterInner<Alignment>(col_start + i, row_start);
|
||||||
|
}
|
||||||
|
internal::ptranspose(A);
|
||||||
|
internal::ptranspose(B);
|
||||||
|
for (Index i=0; i<PacketSize; ++i) {
|
||||||
|
m.template writePacket<Alignment>(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), A.packet[i]);
|
||||||
|
m.template writePacket<Alignment>(m.rowIndexByOuterInner(col_start + i, row_start), m.colIndexByOuterInner(col_start + i,row_start), B.packet[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (Index row = row_start; row < m.rows(); ++row) {
|
||||||
|
m.matrix().row(row).swap(m.matrix().col(row));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template<typename MatrixType,bool MatchPacketSize>
|
template<typename MatrixType,bool MatchPacketSize>
|
||||||
struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square matrix
|
struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square or dynamic matrix
|
||||||
static void run(MatrixType& m) {
|
static void run(MatrixType& m) {
|
||||||
if (m.rows()==m.cols())
|
typedef typename MatrixType::Scalar Scalar;
|
||||||
m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose().template triangularView<StrictlyUpper>());
|
if (m.rows() == m.cols()) {
|
||||||
else
|
const Index PacketSize = internal::packet_traits<Scalar>::size;
|
||||||
|
if (!NumTraits<Scalar>::IsComplex && m.rows() >= PacketSize) {
|
||||||
|
if ((m.rows() % PacketSize) == 0)
|
||||||
|
BlockedInPlaceTranspose<MatrixType,internal::evaluator<MatrixType>::Alignment>(m);
|
||||||
|
else
|
||||||
|
BlockedInPlaceTranspose<MatrixType,Unaligned>(m);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose().template triangularView<StrictlyUpper>());
|
||||||
|
}
|
||||||
|
} else {
|
||||||
m = m.transpose().eval();
|
m = m.transpose().eval();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
/** This is the "in place" version of transpose(): it replaces \c *this by its own transpose.
|
/** This is the "in place" version of transpose(): it replaces \c *this by its own transpose.
|
||||||
|
@ -495,7 +495,11 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
|
|||||||
VERIFY_IS_APPROX(m2, m1.transpose());
|
VERIFY_IS_APPROX(m2, m1.transpose());
|
||||||
m2.transposeInPlace();
|
m2.transposeInPlace();
|
||||||
VERIFY_IS_APPROX(m2, m1);
|
VERIFY_IS_APPROX(m2, m1);
|
||||||
|
// Check vectorized inplace transpose.
|
||||||
|
ArrayType m5 = ArrayType::Random(130, 130);
|
||||||
|
ArrayType m6 = m5;
|
||||||
|
m6.transposeInPlace();
|
||||||
|
VERIFY_IS_APPROX(m6, m5.transpose());
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename ArrayType> void min_max(const ArrayType& m)
|
template<typename ArrayType> void min_max(const ArrayType& m)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user