From 2cf6d3050c42d14819160e2642a5c3dfc449e44a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 20 Sep 2018 11:38:19 +0200 Subject: [PATCH 01/11] Disable ignoring attributes warning --- test/vectorization_logic.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index 01b55e192..c15f75103 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -22,6 +22,14 @@ #include "main.h" #include +// Disable "ignoring attributes on template argument" +// for packet_traits +// => The only workaround would be to wrap _m128 and the likes +// within wrappers. +#if EIGEN_GNUC_AT_LEAST(6,0) + #pragma GCC diagnostic ignored "-Wignored-attributes" +#endif + using internal::demangle_flags; using internal::demangle_traversal; using internal::demangle_unrolling; From 5a30eed17e170af4aedca1a3ff0c10a8e65bf47e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 20 Sep 2018 16:58:51 +0200 Subject: [PATCH 02/11] Fix warnings in AVX512 --- Eigen/src/Core/arch/AVX512/PacketMath.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 9e66575a9..9fbb256a1 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -551,7 +551,7 @@ EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet16i& from) { template <> EIGEN_DEVICE_FUNC inline Packet16f pgather(const float* from, Index stride) { - Packet16i stride_vector = _mm512_set1_epi32(stride); + Packet16i stride_vector = _mm512_set1_epi32(convert_index(stride)); Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); @@ -561,7 +561,7 @@ EIGEN_DEVICE_FUNC inline Packet16f pgather(const float* from, template <> EIGEN_DEVICE_FUNC inline Packet8d pgather(const double* from, Index stride) { - Packet8i stride_vector = _mm256_set1_epi32(stride); + Packet8i stride_vector = _mm256_set1_epi32(convert_index(stride)); Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); @@ -572,7 +572,7 @@ template <> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet16f& from, Index stride) { - Packet16i stride_vector = _mm512_set1_epi32(stride); + Packet16i stride_vector = _mm512_set1_epi32(convert_index(stride)); Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); @@ -582,7 +582,7 @@ template <> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet8d& from, Index stride) { - Packet8i stride_vector = _mm256_set1_epi32(stride); + Packet8i stride_vector = _mm256_set1_epi32(convert_index(stride)); Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); _mm512_i32scatter_pd(to, indices, from, 8); @@ -660,8 +660,7 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) { #ifdef EIGEN_VECTORIZE_AVX512DQ #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ - OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0); \ - OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1); + OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1); #else #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \ @@ -855,7 +854,7 @@ template<> EIGEN_STRONG_INLINE Packet8d preduxp(const Packet8d* vecs) final_1 = _mm256_add_pd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC)); - __m512d final_output = _mm512_insertf64x4(final_output, final_0, 0); + __m512d final_output = _mm512_castpd256_pd512(final_0); return _mm512_insertf64x4(final_output, final_1, 1); } From 71496b0e25158fe6c47dd8c959748b74be4ca94c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 20 Sep 2018 17:01:24 +0200 Subject: [PATCH 03/11] Fix gebp kernel for real+complex in case only reals are vectorized (e.g., AVX512). This commit also removes "half-packet" from data-mappers: it was not used and conceptually broken anyways. --- .../Core/products/GeneralBlockPanelKernel.h | 106 +++++++++--------- Eigen/src/Core/products/GeneralMatrixMatrix.h | 2 +- .../products/GeneralMatrixMatrixTriangular.h | 6 +- .../Core/products/SelfadjointMatrixMatrix.h | 6 +- .../Core/products/TriangularMatrixMatrix.h | 6 +- .../Core/products/TriangularSolverMatrix.h | 4 +- Eigen/src/Core/util/BlasUtil.h | 45 +++----- .../CXX11/src/Tensor/TensorContraction.h | 2 +- .../src/Tensor/TensorContractionMapper.h | 75 ++++--------- 9 files changed, 106 insertions(+), 146 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 3ec8eb082..fa844815d 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1025,9 +1025,9 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); - R2 = r0.loadPacket(2 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); + R2 = r0.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); @@ -1035,9 +1035,9 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r1.template loadPacket(1 * Traits::ResPacketSize); + R2 = r1.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C1, alphav, R0); traits.acc(C5, alphav, R1); traits.acc(C9, alphav, R2); @@ -1045,9 +1045,9 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r2.template loadPacket(1 * Traits::ResPacketSize); + R2 = r2.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C10, alphav, R2); @@ -1055,9 +1055,9 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r3.template loadPacket(1 * Traits::ResPacketSize); + R2 = r3.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C3, alphav, R0); traits.acc(C7, alphav, R1); traits.acc(C11, alphav, R2); @@ -1134,9 +1134,9 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); - R2 = r0.loadPacket(2 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); + R2 = r0.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); @@ -1244,10 +1244,10 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); - R2 = r1.loadPacket(0 * Traits::ResPacketSize); - R3 = r1.loadPacket(1 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); + R2 = r1.template loadPacket(0 * Traits::ResPacketSize); + R3 = r1.template loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C1, alphav, R2); @@ -1257,10 +1257,10 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r2.template loadPacket(1 * Traits::ResPacketSize); + R2 = r3.template loadPacket(0 * Traits::ResPacketSize); + R3 = r3.template loadPacket(1 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C3, alphav, R2); @@ -1337,8 +1337,8 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); r0.storePacket(0 * Traits::ResPacketSize, R0); @@ -1431,15 +1431,15 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r1.loadPacket(0 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r1.template loadPacket(0 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C1, alphav, R1); r0.storePacket(0 * Traits::ResPacketSize, R0); r1.storePacket(0 * Traits::ResPacketSize, R1); - R0 = r2.loadPacket(0 * Traits::ResPacketSize); - R1 = r3.loadPacket(0 * Traits::ResPacketSize); + R0 = r2.template loadPacket(0 * Traits::ResPacketSize); + R1 = r3.template loadPacket(0 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C3, alphav, R1); r2.storePacket(0 * Traits::ResPacketSize, R0); @@ -1504,7 +1504,7 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); r0.storePacket(0 * Traits::ResPacketSize, R0); } @@ -1685,19 +1685,18 @@ void gebp_kernel -struct gemm_pack_lhs +template +struct gemm_pack_lhs { typedef typename DataMapper::LinearMapper LinearMapper; EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs +template +EIGEN_DONT_INLINE void gemm_pack_lhs ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - typedef typename packet_traits::type Packet; - enum { PacketSize = packet_traits::size }; + enum { PacketSize = unpacket_traits::size }; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -1725,9 +1724,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(i+0*PacketSize, k); + B = lhs.template loadPacket(i+1*PacketSize, k); + C = lhs.template loadPacket(i+2*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; pstore(blockA+count, cj.pconj(C)); count+=PacketSize; @@ -1745,8 +1744,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(i+0*PacketSize, k); + B = lhs.template loadPacket(i+1*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } @@ -1763,7 +1762,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(i+0*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } @@ -1793,19 +1792,18 @@ EIGEN_DONT_INLINE void gemm_pack_lhs -struct gemm_pack_lhs +template +struct gemm_pack_lhs { typedef typename DataMapper::LinearMapper LinearMapper; EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs +template +EIGEN_DONT_INLINE void gemm_pack_lhs ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - typedef typename packet_traits::type Packet; - enum { PacketSize = packet_traits::size }; + enum { PacketSize = unpacket_traits::size }; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -1837,7 +1835,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs kernel; - for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k); + for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); ptranspose(kernel); for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); } @@ -1971,10 +1969,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs kernel; - kernel.packet[0] = dm0.loadPacket(k); - kernel.packet[1%PacketSize] = dm1.loadPacket(k); - kernel.packet[2%PacketSize] = dm2.loadPacket(k); - kernel.packet[3%PacketSize] = dm3.loadPacket(k); + kernel.packet[0 ] = dm0.template loadPacket(k); + kernel.packet[1%PacketSize] = dm1.template loadPacket(k); + kernel.packet[2%PacketSize] = dm2.template loadPacket(k); + kernel.packet[3%PacketSize] = dm3.template loadPacket(k); ptranspose(kernel); pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize])); @@ -2075,7 +2073,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs(k, j2); pstoreu(blockB+count, cj.pconj(A)); count += PacketSize; } else { diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index bd7b6ff2a..a010e150f 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -75,7 +75,7 @@ static void run(Index rows, Index cols, Index depth, Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gebp_kernel gebp; diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index e436c50a4..27a7921fa 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -84,7 +84,7 @@ struct general_matrix_matrix_triangular_product pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gebp_kernel gebp; tribb_kernel sybb; @@ -110,7 +110,6 @@ struct general_matrix_matrix_triangular_product gebp_kernel; symm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; - gemm_pack_lhs pack_lhs_transposed; + gemm_pack_lhs pack_lhs_transposed; for(Index k2=0; k2() + gemm_pack_lhs() (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); @@ -437,7 +437,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; symm_pack_rhs pack_rhs; for(Index k2=0; k2 gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; for(Index k2=IsLower ? depth : 0; @@ -222,7 +222,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix() + gemm_pack_lhs() (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, @@ -299,7 +299,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gemm_pack_rhs pack_rhs_panel; diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index 223c38b86..96d1b38a9 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -76,7 +76,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; gebp_kernel gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; // the goal here is to subdivise the Rhs panels such that we keep some cache @@ -229,7 +229,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix gebp_kernel; gemm_pack_rhs pack_rhs; gemm_pack_rhs pack_rhs_panel; - gemm_pack_lhs pack_lhs_panel; + gemm_pack_lhs pack_lhs_panel; for(Index k2=IsLower ? size : 0; IsLower ? k2>0 : k2 struct gemm_pack_rhs; -template +template struct gemm_pack_lhs; template< @@ -156,11 +156,9 @@ class BlasVectorMapper { }; template -class BlasLinearMapper { - public: - typedef typename packet_traits::type Packet; - typedef typename packet_traits::half HalfPacket; - +class BlasLinearMapper +{ +public: EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { @@ -171,29 +169,25 @@ class BlasLinearMapper { return m_data[i]; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { - return ploadt(m_data + i); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const { + return ploadt(m_data + i); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { - return ploadt(m_data + i); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const { + pstoret(m_data + i, p); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const { - pstoret(m_data + i, p); - } - - protected: +protected: Scalar *m_data; }; // Lightweight helper class to access matrix coefficients. template -class blas_data_mapper { - public: - typedef typename packet_traits::type Packet; - typedef typename packet_traits::half HalfPacket; - +class blas_data_mapper +{ +public: typedef BlasLinearMapper LinearMapper; typedef BlasVectorMapper VectorMapper; @@ -218,8 +212,9 @@ class blas_data_mapper { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { - return ploadt(&operator()(i, j)); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const { + return ploadt(&operator()(i, j)); } template @@ -227,10 +222,6 @@ class blas_data_mapper { return ploadt(&operator()(i, j)); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { - return ploadt(&operator()(i, j)); - } - template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { pscatter(&operator()(i, j), p, m_stride); @@ -251,7 +242,7 @@ class blas_data_mapper { return internal::first_default_aligned(m_data, size); } - protected: +protected: Scalar* EIGEN_RESTRICT m_data; const Index m_stride; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index f0f61fade..9614aeefe 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -620,7 +620,7 @@ struct TensorContractionEvaluatorBase typedef internal::blas_data_mapper OutputMapper; // Declare GEBP packing and kernel structs - internal::gemm_pack_lhs pack_lhs; + internal::gemm_pack_lhs pack_lhs; internal::gemm_pack_rhs pack_rhs; internal::gebp_kernel gebp; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index ab320a50d..dbb0f76bb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -238,9 +238,6 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper::half HalfPacket; - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const { @@ -284,27 +281,10 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper(data); } - template + template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { - return this->load(i,j); - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { - // whole method makes column major assumption - - // don't need to add offsets for now (because operator handles that) - const Index half_packet_size = unpacket_traits::size; - if (half_packet_size == packet_size) { - return loadPacket(i, j); - } - EIGEN_ALIGN_MAX Scalar data[half_packet_size]; - for (Index k = 0; k < half_packet_size; k++) { - data[k] = operator()(i + k, j); - } - return pload(data); + EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const { + return this->load(i,j); } }; @@ -314,7 +294,8 @@ template class MakePointer_> -class BaseTensorContractionMapper : public SimpleTensorContractionMapper +class BaseTensorContractionMapper + : public SimpleTensorContractionMapper { public: typedef SimpleTensorContractionMapper ParentMapper; @@ -327,12 +308,11 @@ class BaseTensorContractionMapper EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const { EIGEN_ALIGN_MAX Scalar data[1]; data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); - return pload(data); + return pload(data); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const { @@ -340,10 +320,6 @@ class BaseTensorContractionMapperm_tensor.coeff(this->computeIndex(i, j)); return pload(data); } - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { - return loadPacket(i, j); - } }; @@ -354,8 +330,6 @@ template class MakePointer_=MakePointer> class TensorContractionSubMapper { public: - typedef typename Tensor::PacketReturnType Packet; - typedef typename unpacket_traits::half HalfPacket; typedef BaseTensorContractionMapper ParentMapper; typedef TensorContractionSubMapper Self; @@ -390,17 +364,20 @@ class TensorContractionSubMapper { return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i) const { if (UseDirectOffsets) { - return m_base_mapper.template loadPacket(i, 0); + return m_base_mapper.template loadPacket(i, 0); } - return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const { if (UseDirectOffsets) { - return m_base_mapper.template loadPacket(i, j); + return m_base_mapper.template loadPacket(i, j); } - return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); } template @@ -411,14 +388,8 @@ class TensorContractionSubMapper { return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { - if (UseDirectOffsets) { - return m_base_mapper.template loadHalfPacket(i, 0); - } - return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet& p) const { + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const { if (UseDirectOffsets) { m_base_mapper.storePacket(i, 0, p); } @@ -434,15 +405,15 @@ class TensorContractionSubMapper { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { - EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned; if (UseDirectOffsets) { - return m_base_mapper.template loadPacket(i, 0); + return m_base_mapper.template loadPacket(i, 0); } - return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); } - template + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { return false; } From c50250cb241abb3ea90bac86bc1c27dfadd0862c Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Thu, 20 Sep 2018 17:03:42 +0200 Subject: [PATCH 04/11] Avoid warning "suggest braces around initialization of subobject". This test is not run in C++03 mode, so no compatibility is lost. --- unsupported/test/cxx11_tensor_shuffling.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index 062dd1c0f..2ec85d2d4 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -83,10 +83,10 @@ static void test_expr_shuffling() Tensor result(5, 7, 3, 2); - array src_slice_dim({2, 3, 1, 7}); - array src_slice_start({0, 0, 0, 0}); - array dst_slice_dim({1, 7, 3, 2}); - array dst_slice_start({0, 0, 0, 0}); + array src_slice_dim{{2, 3, 1, 7}}; + array src_slice_start{{0, 0, 0, 0}}; + array dst_slice_dim{{1, 7, 3, 2}}; + array dst_slice_start{{0, 0, 0, 0}}; for (int i = 0; i < 5; ++i) { result.slice(dst_slice_start, dst_slice_dim) = From e38d1ab4d12720751644ed89ff77c4465be70db8 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 20 Sep 2018 17:07:33 +0200 Subject: [PATCH 05/11] Workaround increases required alignment warning --- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index c4feda87d..8787adcde 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -542,11 +542,15 @@ template<> EIGEN_STRONG_INLINE Packet16h ploadu(const Eigen::half* fr } template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet16h& from) { - _mm256_store_si256((__m256i*)to, from.x); + // (void*) -> workaround clang warning: + // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 + _mm256_store_si256((__m256i*)(void*)to, from.x); } template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet16h& from) { - _mm256_storeu_si256((__m256i*)to, from.x); + // (void*) -> workaround clang warning: + // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 + _mm256_storeu_si256((__m256i*)(void*)to, from.x); } template<> EIGEN_STRONG_INLINE Packet16h From a0166ab6514d47bad8db2502c460953af811ea38 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Thu, 20 Sep 2018 17:08:43 +0200 Subject: [PATCH 06/11] Workaround for spurious "array subscript is above array bounds" warnings with g++4.x --- unsupported/Eigen/CXX11/src/util/EmulateArray.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h index d5c000e08..39c255791 100644 --- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h @@ -21,9 +21,9 @@ namespace Eigen { template class array { public: EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; } + EIGEN_STRONG_INLINE T& operator[] (size_t index) { eigen_internal_assert(index < size()); return values[index]; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; } + EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { eigen_internal_assert(index < size()); return values[index]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& at(size_t index) { eigen_assert(index < size()); return values[index]; } From 9419f506d0a01a279454f41906ab27c3fe36fe80 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 20 Sep 2018 17:32:34 +0200 Subject: [PATCH 07/11] Fix regression introduced by the previous fix for AVX512. It brokes the complex-complex case on SSE. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 4 ++++ Eigen/src/Core/products/GeneralMatrixMatrix.h | 2 +- Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h | 2 +- Eigen/src/Core/products/SelfadjointMatrixMatrix.h | 6 +++--- Eigen/src/Core/products/TriangularMatrixMatrix.h | 6 +++--- Eigen/src/Core/products/TriangularSolverMatrix.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 2 +- 8 files changed, 16 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index fa844815d..b012691c1 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -390,6 +390,7 @@ public: typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; + typedef LhsPacket LhsPacket4Packing; typedef ResPacket AccPacket; @@ -496,6 +497,7 @@ public: typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; + typedef LhsPacket LhsPacket4Packing; typedef ResPacket AccPacket; @@ -626,6 +628,7 @@ public: typedef typename packet_traits::type ScalarPacket; typedef DoublePacket DoublePacketType; + typedef typename conditional::type LhsPacket4Packing; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; @@ -777,6 +780,7 @@ public: typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; + typedef LhsPacket LhsPacket4Packing; typedef ResPacket AccPacket; diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index a010e150f..f49abcad5 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -75,7 +75,7 @@ static void run(Index rows, Index cols, Index depth, Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gebp_kernel gebp; diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 27a7921fa..ec2825bf0 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -84,7 +84,7 @@ struct general_matrix_matrix_triangular_product pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gebp_kernel gebp; tribb_kernel sybb; diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index c43a53b92..c84c71609 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -352,7 +352,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix gebp_kernel; symm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; - gemm_pack_lhs pack_lhs_transposed; + gemm_pack_lhs pack_lhs_transposed; for(Index k2=0; k2() + gemm_pack_lhs() (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); @@ -437,7 +437,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; symm_pack_rhs pack_rhs; for(Index k2=0; k2 gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; for(Index k2=IsLower ? depth : 0; @@ -222,7 +222,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix() + gemm_pack_lhs() (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, @@ -299,7 +299,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gemm_pack_rhs pack_rhs_panel; diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index 96d1b38a9..8ff2e9d9d 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -76,7 +76,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; gebp_kernel gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; // the goal here is to subdivise the Rhs panels such that we keep some cache @@ -229,7 +229,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix gebp_kernel; gemm_pack_rhs pack_rhs; gemm_pack_rhs pack_rhs_panel; - gemm_pack_lhs pack_lhs_panel; + gemm_pack_lhs pack_lhs_panel; for(Index k2=IsLower ? size : 0; IsLower ? k2>0 : k2 OutputMapper; // Declare GEBP packing and kernel structs - internal::gemm_pack_lhs pack_lhs; + internal::gemm_pack_lhs pack_lhs; internal::gemm_pack_rhs pack_rhs; internal::gebp_kernel gebp; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 1d145c4b1..0980854b4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -244,7 +244,7 @@ struct TensorEvaluator + Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor> LhsPacker; typedef internal::gemm_pack_rhs< RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> From eeeb18814feafcc7a5a2de27e0b8b25554bf7685 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 20 Sep 2018 17:48:56 +0200 Subject: [PATCH 08/11] Fix warning --- test/packetmath.cpp | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 58a1c60bf..2b0dda573 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -148,24 +148,25 @@ template void packetmath() for (int offset=0; offset(data1); packets[1] = internal::pload(data1+PacketSize); if (offset==0) internal::palign<0>(packets[0], packets[1]); - else if (offset==1) internal::palign<1>(packets[0], packets[1]); - else if (offset==2) internal::palign<2>(packets[0], packets[1]); - else if (offset==3) internal::palign<3>(packets[0], packets[1]); - else if (offset==4) internal::palign<4>(packets[0], packets[1]); - else if (offset==5) internal::palign<5>(packets[0], packets[1]); - else if (offset==6) internal::palign<6>(packets[0], packets[1]); - else if (offset==7) internal::palign<7>(packets[0], packets[1]); - else if (offset==8) internal::palign<8>(packets[0], packets[1]); - else if (offset==9) internal::palign<9>(packets[0], packets[1]); - else if (offset==10) internal::palign<10>(packets[0], packets[1]); - else if (offset==11) internal::palign<11>(packets[0], packets[1]); - else if (offset==12) internal::palign<12>(packets[0], packets[1]); - else if (offset==13) internal::palign<13>(packets[0], packets[1]); - else if (offset==14) internal::palign<14>(packets[0], packets[1]); - else if (offset==15) internal::palign<15>(packets[0], packets[1]); + else if (offset==1) internal::palign(packets[0], packets[1]); + else if (offset==2) internal::palign(packets[0], packets[1]); + else if (offset==3) internal::palign(packets[0], packets[1]); + else if (offset==4) internal::palign(packets[0], packets[1]); + else if (offset==5) internal::palign(packets[0], packets[1]); + else if (offset==6) internal::palign(packets[0], packets[1]); + else if (offset==7) internal::palign(packets[0], packets[1]); + else if (offset==8) internal::palign(packets[0], packets[1]); + else if (offset==9) internal::palign(packets[0], packets[1]); + else if (offset==10) internal::palign(packets[0], packets[1]); + else if (offset==11) internal::palign(packets[0], packets[1]); + else if (offset==12) internal::palign(packets[0], packets[1]); + else if (offset==13) internal::palign(packets[0], packets[1]); + else if (offset==14) internal::palign(packets[0], packets[1]); + else if (offset==15) internal::palign(packets[0], packets[1]); internal::pstore(data2, packets[0]); for (int i=0; i Date: Thu, 20 Sep 2018 18:07:32 +0200 Subject: [PATCH 09/11] Rename test/array.cpp to test/array_cwise.cpp to avoid conflicts with the array header. --- test/CMakeLists.txt | 2 +- test/{array.cpp => array_cwise.cpp} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename test/{array.cpp => array_cwise.cpp} (99%) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b4730cff0..c215bfbb2 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -198,7 +198,7 @@ ei_add_test(smallvectors) ei_add_test(mapped_matrix) ei_add_test(mapstride) ei_add_test(mapstaticmethods) -ei_add_test(array) +ei_add_test(array_cwise) ei_add_test(array_for_matrix) ei_add_test(array_replicate) ei_add_test(array_reverse) diff --git a/test/array.cpp b/test/array_cwise.cpp similarity index 99% rename from test/array.cpp rename to test/array_cwise.cpp index d9c4626c0..84e46665b 100644 --- a/test/array.cpp +++ b/test/array_cwise.cpp @@ -453,7 +453,7 @@ template void min_max(const ArrayType& m) } -EIGEN_DECLARE_TEST(array) +EIGEN_DECLARE_TEST(array_cwise) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( array(Array()) ); From 3c6dc93f998f2efd52754805458b0fe7172ab5ed Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 20 Sep 2018 18:29:21 +0200 Subject: [PATCH 10/11] Fix GPU support. --- .../CXX11/src/Tensor/TensorContractionGpu.h | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h index b5e186d21..056665749 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h @@ -549,12 +549,12 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh #define prefetch_lhs(reg, row, col) \ if (!CHECK_LHS_BOUNDARY) { \ if (col < k_size) { \ - reg =lhs.template loadPacket(row, col); \ + reg =lhs.template loadPacket(row, col); \ } \ } else { \ if (col < k_size) { \ if (row + 3 < m_size) { \ - reg =lhs.template loadPacket(row, col); \ + reg =lhs.template loadPacket(row, col); \ } else if (row + 2 < m_size) { \ reg.x =lhs(row + 0, col); \ reg.y =lhs(row + 1, col); \ @@ -584,7 +584,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh if (!CHECK_RHS_BOUNDARY) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -599,7 +599,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh } else { if (rhs_horiz0 < n_size) { if ((rhs_vert + 3) < k_size) { - rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); } else if ((rhs_vert + 2) < k_size) { rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); @@ -799,37 +799,37 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (!CHECK_LHS_BOUNDARY) { if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); } } else { // just CHECK_LHS_BOUNDARY if (lhs_vert + 3 < m_size) { if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); } } else if (lhs_vert + 2 < m_size) { if ((threadIdx.y/4+k+24) < k_size) { @@ -918,8 +918,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (!CHECK_RHS_BOUNDARY) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -941,8 +941,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (rhs_horiz1 < n_size) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -963,7 +963,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, } else if (rhs_horiz0 < n_size) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); } else if ((rhs_vert + 2) < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); From d37188b9c193aaf087f67e21e654f36855b88ac0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 20 Sep 2018 18:30:10 +0200 Subject: [PATCH 11/11] Fix MPrealSupport --- unsupported/Eigen/MPRealSupport | 1 + 1 file changed, 1 insertion(+) diff --git a/unsupported/Eigen/MPRealSupport b/unsupported/Eigen/MPRealSupport index 6392bea91..c4ea4ec5f 100644 --- a/unsupported/Eigen/MPRealSupport +++ b/unsupported/Eigen/MPRealSupport @@ -162,6 +162,7 @@ int main() typedef ResScalar LhsPacket; typedef ResScalar RhsPacket; typedef ResScalar ResPacket; + typedef LhsPacket LhsPacket4Packing; };