From b47c7779937c1984b7cd2f1d2f8df33d67c396f7 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 27 Apr 2020 18:55:15 -0700 Subject: [PATCH] Block transposeInPlace() when the matrix is real and square. This yields a large speedup because we transpose in registers (or L1 if we spill), instead of one packet at a time, which in the worst case makes the code write to the same cache line PacketSize times instead of once. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rmlarsen@rmlarsen4:.../eigen_bench/google3$ benchy --benchmarks=.*TransposeInPlace.*float.* --reference=srcfs experimental/users/rmlarsen/bench:matmul_bench 10 / 10 [====================================================================================================================================================================================================================] 100.00% 2m50s (Generated by http://go/benchy. Settings: --runs 5 --benchtime 1s --reference "srcfs" --benchmarks ".*TransposeInPlace.*float.*" experimental/users/rmlarsen/bench:matmul_bench) name old time/op new time/op delta BM_TransposeInPlace/4 9.84ns ± 0% 6.51ns ± 0% -33.80% (p=0.008 n=5+5) BM_TransposeInPlace/8 23.6ns ± 1% 17.6ns ± 0% -25.26% (p=0.016 n=5+4) BM_TransposeInPlace/16 78.8ns ± 0% 60.3ns ± 0% -23.50% (p=0.029 n=4+4) BM_TransposeInPlace/32 302ns ± 0% 229ns ± 0% -24.40% (p=0.008 n=5+5) BM_TransposeInPlace/59 1.03µs ± 0% 0.84µs ± 1% -17.87% (p=0.016 n=5+4) BM_TransposeInPlace/64 1.20µs ± 0% 0.89µs ± 1% -25.81% (p=0.008 n=5+5) BM_TransposeInPlace/128 8.96µs ± 0% 3.82µs ± 2% -57.33% (p=0.008 n=5+5) BM_TransposeInPlace/256 152µs ± 3% 17µs ± 2% -89.06% (p=0.008 n=5+5) BM_TransposeInPlace/512 837µs ± 1% 208µs ± 0% -75.15% (p=0.008 n=5+5) BM_TransposeInPlace/1k 4.28ms ± 2% 1.08ms ± 2% -74.72% (p=0.008 n=5+5) --- Eigen/src/Core/Transpose.h | 58 ++++++++++++++++++++++++++++++++++---- test/array_cwise.cpp | 6 +++- 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index 6dafe1b1e..a3f402cf3 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -243,7 +243,6 @@ struct inplace_transpose_selector { // square matrix } }; -// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only. template struct inplace_transpose_selector { // PacketSize x PacketSize static void run(MatrixType& m) { @@ -260,16 +259,65 @@ struct inplace_transpose_selector { // PacketSize x Packet } }; + +template +void BlockedInPlaceTranspose(MatrixType& m) { + typedef typename MatrixType::Scalar Scalar; + typedef typename internal::packet_traits::type Packet; + const Index PacketSize = internal::packet_traits::size; + eigen_assert(m.rows() == m.cols()); + int row_start = 0; + for (; row_start + PacketSize <= m.rows(); row_start += PacketSize) { + for (int col_start = row_start; col_start + PacketSize <= m.cols(); col_start += PacketSize) { + PacketBlock A; + if (row_start == col_start) { + for (Index i=0; i(row_start + i,col_start); + internal::ptranspose(A); + for (Index i=0; i(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), A.packet[i]); + } else { + PacketBlock B; + for (Index i=0; i(row_start + i,col_start); + B.packet[i] = m.template packetByOuterInner(col_start + i, row_start); + } + internal::ptranspose(A); + internal::ptranspose(B); + for (Index i=0; i(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), A.packet[i]); + m.template writePacket(m.rowIndexByOuterInner(col_start + i, row_start), m.colIndexByOuterInner(col_start + i,row_start), B.packet[i]); + } + } + } + } + for (Index row = row_start; row < m.rows(); ++row) { + m.matrix().row(row).swap(m.matrix().col(row)); + } +} + template -struct inplace_transpose_selector { // non square matrix +struct inplace_transpose_selector { // non square or dynamic matrix static void run(MatrixType& m) { - if (m.rows()==m.cols()) - m.matrix().template triangularView().swap(m.matrix().transpose().template triangularView()); - else + typedef typename MatrixType::Scalar Scalar; + if (m.rows() == m.cols()) { + const Index PacketSize = internal::packet_traits::size; + if (!NumTraits::IsComplex && m.rows() >= PacketSize) { + if ((m.rows() % PacketSize) == 0) + BlockedInPlaceTranspose::Alignment>(m); + else + BlockedInPlaceTranspose(m); + } + else { + m.matrix().template triangularView().swap(m.matrix().transpose().template triangularView()); + } + } else { m = m.transpose().eval(); + } } }; + } // end namespace internal /** This is the "in place" version of transpose(): it replaces \c *this by its own transpose. diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp index e7af1a92f..950cc9650 100644 --- a/test/array_cwise.cpp +++ b/test/array_cwise.cpp @@ -495,7 +495,11 @@ template void array_complex(const ArrayType& m) VERIFY_IS_APPROX(m2, m1.transpose()); m2.transposeInPlace(); VERIFY_IS_APPROX(m2, m1); - + // Check vectorized inplace transpose. + ArrayType m5 = ArrayType::Random(130, 130); + ArrayType m6 = m5; + m6.transposeInPlace(); + VERIFY_IS_APPROX(m6, m5.transpose()); } template void min_max(const ArrayType& m)