diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 767feb99d..e7dc25478 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -255,11 +255,20 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product cj; conj_helper pcj; - // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large, - // processing 8 rows at once might be counter productive wrt cache. - const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7; - const Index n4 = rows-3; - const Index n2 = rows-1; + // TODO: fine tune the following heuristic. The rationale is that if the + // matrix is very large, processing multiple rows at once might be counter + // productive wrt cache. +#if EIGEN_ARCH_ARM_OR_ARM64 + // This threshold was empirically determined using a Pixel2. + // The little cores are a lot more sensitive to this number + // than the big cores. + const Index cache_threshold = 1024; +#else + const Index cache_threshold = 1024 * 256; +#endif + + const Index row_bytes = lhs.stride() * sizeof(LhsScalar); + const Index n8 = (8 * row_bytes > cache_threshold) ? 0 : (rows - 7); // TODO: for padded aligned inputs, we could enable aligned reads enum { LhsAlignment = Unaligned }; @@ -320,6 +329,9 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product cache_threshold) ? 0 : (rows - 3); for(; i(ResScalar(0)), @@ -355,6 +367,9 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product cache_threshold) ? 0 : (rows - 1); for(; i(ResScalar(0)),