From f0577a2bfd5267a3eb216f4c8797ad0054dec34b Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Tue, 7 Apr 2020 22:09:51 +0000
Subject: [PATCH] Speed up matrix multiplication for small to medium size
 matrices by using half- or quarter-packet vectorized loads in gemm_pack_rhs
 if they have size 4, instead of dropping down the the scalar path.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Benchmark measurements below are for computing ```c.noalias() = a.transpose() * b;``` for square RowMajor matrices of varying size.

Measured improvement with AVX+FMA:

name                           old time/op             new time/op             delta
BM_MatMul_ATB/8                 139ns ± 1%              129ns ± 1%   -7.49%          (p=0.008 n=5+5)
BM_MatMul_ATB/32               1.46µs ± 1%             1.22µs ± 0%  -16.72%          (p=0.008 n=5+5)
BM_MatMul_ATB/64               8.43µs ± 1%             7.41µs ± 0%  -12.04%          (p=0.008 n=5+5)
BM_MatMul_ATB/128              56.8µs ± 1%             52.9µs ± 1%   -6.83%          (p=0.008 n=5+5)
BM_MatMul_ATB/256               407µs ± 1%              395µs ± 3%   -2.94%          (p=0.032 n=5+5)
BM_MatMul_ATB/512              3.27ms ± 3%             3.18ms ± 1%     ~             (p=0.056 n=5+5)


Measured improvement for AVX512:

name                          old time/op             new time/op             delta
BM_MatMul_ATB/8                167ns ± 1%              154ns ± 1%   -7.63%          (p=0.008 n=5+5)
BM_MatMul_ATB/32              1.08µs ± 1%             0.83µs ± 3%  -23.58%          (p=0.008 n=5+5)
BM_MatMul_ATB/64              6.21µs ± 1%             5.06µs ± 1%  -18.47%          (p=0.008 n=5+5)
BM_MatMul_ATB/128             36.1µs ± 2%             31.3µs ± 1%  -13.32%          (p=0.008 n=5+5)
BM_MatMul_ATB/256              263µs ± 2%              242µs ± 2%   -7.92%          (p=0.008 n=5+5)
BM_MatMul_ATB/512             1.95ms ± 2%             1.91ms ± 2%     ~             (p=0.095 n=5+5)
BM_MatMul_ATB/1k              15.4ms ± 4%             14.8ms ± 2%     ~             (p=0.095 n=5+5)
---
 .../src/Core/products/GeneralBlockPanelKernel.h  | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index d0dc14dbd..4b6a3ee6f 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -2653,8 +2653,14 @@ template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conj
 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
 {
   typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
   typedef typename DataMapper::LinearMapper LinearMapper;
-  enum { PacketSize = packet_traits<Scalar>::size };
+  enum { PacketSize = packet_traits<Scalar>::size,
+         HalfPacketSize = unpacket_traits<HalfPacket>::size,
+         QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+         HasHalf = (int)HalfPacketSize < (int)PacketSize,
+         HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize };
   EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
 };
 
@@ -2716,6 +2722,14 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Co
           Packet A = rhs.template loadPacket<Packet>(k, j2);
           pstoreu(blockB+count, cj.pconj(A));
           count += PacketSize;
+        } else if (HasHalf && HalfPacketSize==4) {
+          HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
+          pstoreu(blockB+count, cj.pconj(A));
+          count += HalfPacketSize;
+        } else if (HasQuarter && QuarterPacketSize==4) {
+          QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
+          pstoreu(blockB+count, cj.pconj(A));
+          count += QuarterPacketSize;
         } else {
           const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
           blockB[count+0] = cj(dm0(0));