diff --git a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h index 00bdb9b8a..b97a090dc 100644 --- a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h @@ -2,7 +2,7 @@ namespace Eigen { namespace internal { - + #if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG // Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm. @@ -218,7 +218,9 @@ struct gebp_traits EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - loadRhs(b,dest); + // If LHS is a Packet8h, we cannot correctly mimic a ploadquad of the RHS + // using a single scalar value. + eigen_assert(false && "Cannot loadRhsQuad for a scalar RHS."); } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index bfae650d4..4a6cef543 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -751,6 +751,9 @@ void loadQuadToDoublePacket(const Scalar* b, DoublePacket& dest, template struct unpacket_traits > { typedef DoublePacket::half> half; + enum{ + size = 2 * unpacket_traits::size + }; }; // template // DoublePacket pmadd(const DoublePacket &a, const DoublePacket &b) @@ -2490,7 +2493,13 @@ void gebp_kernel::half>::size; const int SResPacketQuarterSize = unpacket_traits::half>::half>::size; - if ((SwappedTraits::LhsProgress % 4) == 0 && + // The following code assumes we can load SRhsPacket in such a way that + // it multiplies blocks of 4 elements in SLhsPacket. This is not the + // case for some customized kernels (i.e. NEON fp16). If the assumption + // fails, drop down to the scalar path. + constexpr bool kCanLoadSRhsQuad = (unpacket_traits::size < 4) || (unpacket_traits::size % (unpacket_traits::size / 4)) == 0; + if (kCanLoadSRhsQuad && + (SwappedTraits::LhsProgress % 4) == 0 && (SwappedTraits::LhsProgress<=16) && (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) && (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr)) diff --git a/test/product_small.cpp b/test/product_small.cpp index 02d5f35b8..a3a04dc0a 100644 --- a/test/product_small.cpp +++ b/test/product_small.cpp @@ -281,6 +281,25 @@ void product_small_regressions() } } +template +void product_sweep(int max_m, int max_k, int max_n) { + using Matrix = Eigen::Matrix; + for (int m = 1; m < max_m; ++m) { + for (int n = 1; n < max_n; ++n) { + Matrix C = Matrix::Zero(m, n); + Matrix Cref = Matrix::Zero(m, n); + for (int k = 1; k < max_k; ++k) { + Matrix A = Matrix::Random(m, k); + Matrix B = Matrix::Random(k, n); + C = A * B; + Cref.setZero(); + ref_prod(Cref, A, B); + VERIFY_IS_APPROX(C, Cref); + } + } + } +} + EIGEN_DECLARE_TEST(product_small) { for(int i = 0; i < g_repeat; i++) { @@ -290,7 +309,7 @@ EIGEN_DECLARE_TEST(product_small) CALL_SUBTEST_3( product(Matrix3d()) ); CALL_SUBTEST_4( product(Matrix4d()) ); CALL_SUBTEST_5( product(Matrix4f()) ); - CALL_SUBTEST_50( product(Matrix()) ); + CALL_SUBTEST_10( product(Matrix()) ); CALL_SUBTEST_6( product1x1<0>() ); CALL_SUBTEST_11( test_lazy_l1() ); @@ -317,6 +336,12 @@ EIGEN_DECLARE_TEST(product_small) CALL_SUBTEST_6( bug_1311<5>() ); CALL_SUBTEST_9( test_dynamic_bool() ); + + // Commonly specialized vectorized types. + CALL_SUBTEST_50( product_sweep(10, 10, 10) ); + CALL_SUBTEST_51( product_sweep(10, 10, 10) ); + CALL_SUBTEST_52( product_sweep(10, 10, 10) ); + CALL_SUBTEST_53( product_sweep(10, 10, 10) ); } CALL_SUBTEST_6( product_small_regressions<0>() );