mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-05-02 00:34:14 +08:00
Speed up row-major matrix-vector product on ARM
The row-major matrix-vector multiplication code uses a threshold to check if processing 8 rows at a time would thrash the cache. This change introduces two modifications to this logic. 1. A smaller threshold for ARM and ARM64 devices. The value of this threshold was determined empirically using a Pixel2 phone, by benchmarking a large number of matrix-vector products in the range [1..4096]x[1..4096] and measuring performance separately on the big and little cores with frequency pinning. On the big (out-of-order) cores, this change has little to no impact. But on the little (in-order) cores, the matrix-vector products are up to 700% faster, especially on large matrices. The motivation for this change was some internal code at Google which was using hand-written NEON for implementing similar functionality, processing the matrix one row at a time, which exhibited substantially better performance than Eigen. With the current change, Eigen handily beats that code. 2. Make the logic for choosing the number of simultaneous rows apply uniformly to 8, 4 and 2 rows instead of just 8 rows. Since the default threshold for non-ARM devices is essentially unchanged (32000 -> 32 * 1024), this change has no impact on non-ARM performance. This was verified by running the same set of benchmarks on a Xeon desktop.
This commit is contained in:
parent
7ef879f6bf
commit
b55b5c7280
@ -255,11 +255,20 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
|
||||
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
|
||||
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
|
||||
|
||||
// TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
|
||||
// processing 8 rows at once might be counter productive wrt cache.
|
||||
const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7;
|
||||
const Index n4 = rows-3;
|
||||
const Index n2 = rows-1;
|
||||
// TODO: fine tune the following heuristic. The rationale is that if the
|
||||
// matrix is very large, processing multiple rows at once might be counter
|
||||
// productive wrt cache.
|
||||
#if EIGEN_ARCH_ARM_OR_ARM64
|
||||
// This threshold was empirically determined using a Pixel2.
|
||||
// The little cores are a lot more sensitive to this number
|
||||
// than the big cores.
|
||||
const Index cache_threshold = 1024;
|
||||
#else
|
||||
const Index cache_threshold = 1024 * 256;
|
||||
#endif
|
||||
|
||||
const Index row_bytes = lhs.stride() * sizeof(LhsScalar);
|
||||
const Index n8 = (8 * row_bytes > cache_threshold) ? 0 : (rows - 7);
|
||||
|
||||
// TODO: for padded aligned inputs, we could enable aligned reads
|
||||
enum { LhsAlignment = Unaligned };
|
||||
@ -320,6 +329,9 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
|
||||
res[(i+6)*resIncr] += alpha*cc6;
|
||||
res[(i+7)*resIncr] += alpha*cc7;
|
||||
}
|
||||
|
||||
if (i == rows) return;
|
||||
const Index n4 = (4 * row_bytes > cache_threshold) ? 0 : (rows - 3);
|
||||
for(; i<n4; i+=4)
|
||||
{
|
||||
ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
|
||||
@ -355,6 +367,9 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
|
||||
res[(i+2)*resIncr] += alpha*cc2;
|
||||
res[(i+3)*resIncr] += alpha*cc3;
|
||||
}
|
||||
|
||||
if (i == rows) return;
|
||||
const Index n2 = (2 * row_bytes > cache_threshold) ? 0 : (rows - 1);
|
||||
for(; i<n2; i+=2)
|
||||
{
|
||||
ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
|
||||
|
Loading…
x
Reference in New Issue
Block a user