From 1eff6cf8a77f1b8699671d31f8f307a6fd9170ea Mon Sep 17 00:00:00 2001 From: Yuefeng Zhou Date: Tue, 20 Feb 2018 16:50:05 -0800 Subject: [PATCH 01/24] Use device's allocate function instead of internal::aligned_malloc. This would make it easier to track memory usage in device instances. --- .../Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index d30cc96ab..6fb69910e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -377,7 +377,7 @@ struct TensorEvaluator(bm_ * bk_ * sizeof(LhsScalar), align) * align; size_t rhs_size = divup(bn_ * bk_ * sizeof(RhsScalar), align) * align; - packed_mem_ = static_cast(internal::aligned_malloc( + packed_mem_ = static_cast(device_.allocate( (nm0_ * lhs_size + nn0_ * rhs_size) * std::min(nk_, P - 1))); char* mem = static_cast(packed_mem_); for (Index x = 0; x < numext::mini(nk_, P - 1); x++) { @@ -399,7 +399,7 @@ struct TensorEvaluator Date: Mon, 25 Jun 2018 05:05:02 -0700 Subject: [PATCH 02/24] Fix AVX512 implementations of psqrt This commit fixes the AVX512 implementations of psqrt in the same way that 3ed67cb0bb4af65fbf243df598604a8c7630bf7d fixed the AVX2 version of this function. The AVX512 versions of psqrt incorrectly return -0.0 for negative values, instead of NaN. Fixing the issues requires adding some additional instructions that slow down the algorithms. A similar test to the one used in 3ed67cb0bb4af65fbf243df598604a8c7630bf7d shows that the corrected Packet16f code runs at 73% of the speed of the existing code, while the corrected Packed8d function runs at 68% of the original. --- Eigen/src/Core/arch/AVX512/MathFunctions.h | 47 +++++++++------------- 1 file changed, 19 insertions(+), 28 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index 9c1717f76..81a3b4f62 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -258,48 +258,39 @@ pexp(const Packet8d& _x) { template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f psqrt(const Packet16f& _x) { - _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f); - _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000); + Packet16f half = pmul(_x, pset1(.5f)); + __mmask16 denormal_mask = _mm512_kand( + _mm512_cmp_ps_mask(_x, pset1((std::numeric_limits::min)()), + _CMP_LT_OQ), + _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ)); - Packet16f neg_half = pmul(_x, p16f_minus_half); - - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ); - Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_setzero_ps(), _mm512_rsqrt14_ps(_x)); + Packet16f x = _mm512_rsqrt14_ps(_x); // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); + x = pmul(x, psub(pset1(1.5f), pmul(half, pmul(x,x)))); - // Multiply the original _x by it's reciprocal square root to extract the - // square root. - return pmul(_x, x); + // Flush results for denormals to zero. + return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps()); } template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d psqrt(const Packet8d& _x) { - _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5); - _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5); - _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL); + Packet8d half = pmul(_x, pset1(.5f)); + __mmask16 denormal_mask = _mm512_kand( + _mm512_cmp_pd_mask(_x, pset1((std::numeric_limits::min)()), + _CMP_LT_OQ), + _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ)); - Packet8d neg_half = pmul(_x, p8d_minus_half); + Packet8d x = _mm512_rsqrt14_pd(_x); - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ); - Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_setzero_pd(), _mm512_rsqrt14_pd(_x)); - - // Do a first step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + // Do a single step of Newton's iteration. + x = pmul(x, psub(pset1(1.5f), pmul(half, pmul(x,x)))); // Do a second step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + x = pmul(x, psub(pset1(1.5f), pmul(half, pmul(x,x)))); - // Multiply the original _x by it's reciprocal square root to extract the - // square root. - return pmul(_x, x); + return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd()); } #else template <> From 636126ef406f03ad830ff9bb9927b7794b19973d Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Tue, 24 Jul 2018 20:12:49 +0200 Subject: [PATCH 03/24] Allow to filter out build-error messages --- CTestCustom.cmake.in | 1 + cmake/EigenConfigureTesting.cmake | 1 + 2 files changed, 2 insertions(+) diff --git a/CTestCustom.cmake.in b/CTestCustom.cmake.in index 9fed9d327..89e487f05 100644 --- a/CTestCustom.cmake.in +++ b/CTestCustom.cmake.in @@ -1,3 +1,4 @@ set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "2000") set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "2000") +list(APPEND CTEST_CUSTOM_ERROR_EXCEPTION @EIGEN_CTEST_ERROR_EXCEPTION@) diff --git a/cmake/EigenConfigureTesting.cmake b/cmake/EigenConfigureTesting.cmake index b02729fc8..ba88228a0 100644 --- a/cmake/EigenConfigureTesting.cmake +++ b/cmake/EigenConfigureTesting.cmake @@ -19,6 +19,7 @@ include(CTest) set(EIGEN_TEST_BUILD_FLAGS "" CACHE STRING "Options passed to the build command of unit tests") set(EIGEN_DASHBOARD_BUILD_TARGET "buildtests" CACHE STRING "Target to be built in dashboard mode, default is buildtests") +set(EIGEN_CTEST_ERROR_EXCEPTION "" CACHE STRING "Regular expression for build error messages to be filtered out") # Overwrite default DartConfiguration.tcl such that ctest can build our unit tests. # Recall that our unit tests are not in the "all" target, so we have to explicitly ask ctest to build our custom 'buildtests' target. From fd4fe7cbc5559b0928661dab6d178921b50a15de Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Tue, 24 Jul 2018 22:56:15 +0200 Subject: [PATCH 04/24] Fixed issue which made documentation not getting built anymore --- Eigen/src/Core/util/Macros.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 73a9883b9..b15819f7d 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -518,6 +518,8 @@ #endif // Does the compiler support C99? +// Need to include to make sure _GLIBCXX_USE_C99 gets defined +#include #ifndef EIGEN_HAS_C99_MATH #if EIGEN_MAX_CPP_VER>=11 && \ ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \ From 705f66a9caa90a32d424270d3bc99ae31925c9f2 Mon Sep 17 00:00:00 2001 From: Gustavo Lima Chaves Date: Mon, 23 Jul 2018 16:29:09 -0700 Subject: [PATCH 05/24] Account for missing change on commit "Remove SimpleThreadPool and..." "... always use {NonBlocking}ThreadPool". It seems the non-blocking implementation was me the default/only one, but a reference to the old name was left unmodified. Fix that. --- unsupported/test/cxx11_non_blocking_thread_pool.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp index 5a8080ea3..e73a034b1 100644 --- a/unsupported/test/cxx11_non_blocking_thread_pool.cpp +++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp @@ -18,7 +18,7 @@ static void test_create_destroy_empty_pool() // Just create and destroy the pool. This will wind up and tear down worker // threads. Ensure there are no issues in that logic. for (int i = 0; i < 16; ++i) { - NonBlockingThreadPool tp(i); + ThreadPool tp(i); } } @@ -27,7 +27,7 @@ static void test_parallelism(bool allow_spinning) { // Test we never-ever fail to match available tasks with idle threads. const int kThreads = 16; // code below expects that this is a multiple of 4 - NonBlockingThreadPool tp(kThreads, allow_spinning); + ThreadPool tp(kThreads, allow_spinning); VERIFY_IS_EQUAL(tp.NumThreads(), kThreads); VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1); for (int iter = 0; iter < 100; ++iter) { @@ -104,7 +104,7 @@ static void test_parallelism(bool allow_spinning) static void test_cancel() { - NonBlockingThreadPool tp(2); + ThreadPool tp(2); // Schedule a large number of closure that each sleeps for one second. This // will keep the thread pool busy for much longer than the default test timeout. From 44ee201337113eeebb1018ba8bebf110afada796 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Wed, 25 Jul 2018 20:26:15 +0200 Subject: [PATCH 06/24] Rename variable which shadows class name --- unsupported/Eigen/src/Polynomials/Companion.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/src/Polynomials/Companion.h b/unsupported/Eigen/src/Polynomials/Companion.h index 41a4efc2f..126be783b 100644 --- a/unsupported/Eigen/src/Polynomials/Companion.h +++ b/unsupported/Eigen/src/Polynomials/Companion.h @@ -89,13 +89,13 @@ class companion { const Index deg = m_monic.size(); const Index deg_1 = deg-1; - DenseCompanionMatrixType companion(deg,deg); - companion << + DenseCompanionMatrixType companMat(deg,deg); + companMat << ( LeftBlock(deg,deg_1) << LeftBlockFirstRow::Zero(1,deg_1), BottomLeftBlock::Identity(deg-1,deg-1)*m_bl_diag.asDiagonal() ).finished() , m_monic; - return companion; + return companMat; } From 5f79b7f9a9ec8addba78a28a120a4ab84e8164c3 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Wed, 25 Jul 2018 21:47:45 +0200 Subject: [PATCH 07/24] Removed several shadowing types and use global Index typedef everywhere --- .../src/MatrixFunctions/MatrixExponential.h | 1 - .../src/MatrixFunctions/MatrixFunction.h | 22 +++++-------------- .../src/MatrixFunctions/MatrixLogarithm.h | 4 +--- .../Eigen/src/MatrixFunctions/MatrixPower.h | 5 ----- .../src/MatrixFunctions/MatrixSquareRoot.h | 16 +++++--------- 5 files changed, 12 insertions(+), 36 deletions(-) diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h index 03356998b..54037d58d 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h @@ -395,7 +395,6 @@ void matrix_exp_compute(const ArgType& arg, ResultType &result, false_type) // d template struct MatrixExponentialReturnValue : public ReturnByValue > { - typedef typename Derived::Index Index; public: /** \brief Constructor. * diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h index ef50c46a9..133d78625 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h @@ -53,7 +53,7 @@ template typename NumTraits::Real matrix_function_compute_mu(const MatrixType& A) { typedef typename plain_col_type::type VectorType; - typename MatrixType::Index rows = A.rows(); + Index rows = A.rows(); const MatrixType N = MatrixType::Identity(rows, rows) - A; VectorType e = VectorType::Ones(rows); N.template triangularView().solveInPlace(e); @@ -65,7 +65,6 @@ MatrixType MatrixFunctionAtomic::compute(const MatrixType& A) { // TODO: Use that A is upper triangular typedef typename NumTraits::Real RealScalar; - typedef typename MatrixType::Index Index; Index rows = A.rows(); Scalar avgEival = A.trace() / Scalar(RealScalar(rows)); MatrixType Ashifted = A - avgEival * MatrixType::Identity(rows, rows); @@ -131,7 +130,6 @@ typename ListOfClusters::iterator matrix_function_find_cluster(Index key, ListOf template void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list& clusters) { - typedef typename EivalsType::Index Index; typedef typename EivalsType::RealScalar RealScalar; for (Index i=0; i void matrix_function_compute_map(const EivalsType& eivals, const ListOfClusters& clusters, VectorType& eivalToCluster) { - typedef typename EivalsType::Index Index; eivalToCluster.resize(eivals.rows()); Index clusterIndex = 0; for (typename ListOfClusters::const_iterator cluster = clusters.begin(); cluster != clusters.end(); ++cluster) { @@ -205,7 +202,6 @@ void matrix_function_compute_map(const EivalsType& eivals, const ListOfClusters& template void matrix_function_compute_permutation(const DynVectorType& blockStart, const DynVectorType& eivalToCluster, VectorType& permutation) { - typedef typename VectorType::Index Index; DynVectorType indexNextEntry = blockStart; permutation.resize(eivalToCluster.rows()); for (Index i = 0; i < eivalToCluster.rows(); i++) { @@ -219,7 +215,6 @@ void matrix_function_compute_permutation(const DynVectorType& blockStart, const template void matrix_function_permute_schur(VectorType& permutation, MatrixType& U, MatrixType& T) { - typedef typename VectorType::Index Index; for (Index i = 0; i < permutation.rows() - 1; i++) { Index j; for (j = i; j < permutation.rows(); j++) { @@ -247,7 +242,7 @@ template void matrix_function_compute_block_atomic(const MatrixType& T, AtomicType& atomic, const VectorType& blockStart, const VectorType& clusterSize, MatrixType& fT) { fT.setZero(T.rows(), T.cols()); - for (typename VectorType::Index i = 0; i < clusterSize.rows(); ++i) { + for (Index i = 0; i < clusterSize.rows(); ++i) { fT.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i)) = atomic.compute(T.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i))); } @@ -285,7 +280,6 @@ MatrixType matrix_function_solve_triangular_sylvester(const MatrixType& A, const eigen_assert(C.rows() == A.rows()); eigen_assert(C.cols() == B.rows()); - typedef typename MatrixType::Index Index; typedef typename MatrixType::Scalar Scalar; Index m = A.rows(); @@ -330,11 +324,8 @@ void matrix_function_compute_above_diagonal(const MatrixType& T, const VectorTyp { typedef internal::traits Traits; typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::Index Index; - static const int RowsAtCompileTime = Traits::RowsAtCompileTime; - static const int ColsAtCompileTime = Traits::ColsAtCompileTime; static const int Options = MatrixType::Options; - typedef Matrix DynMatrixType; + typedef Matrix DynMatrixType; for (Index k = 1; k < clusterSize.rows(); k++) { for (Index i = 0; i < clusterSize.rows() - k; i++) { @@ -481,7 +472,6 @@ template class MatrixFunctionReturnValue { public: typedef typename Derived::Scalar Scalar; - typedef typename Derived::Index Index; typedef typename internal::stem_function::type StemFunction; protected: @@ -506,10 +496,8 @@ template class MatrixFunctionReturnValue typedef typename internal::nested_eval::type NestedEvalType; typedef typename internal::remove_all::type NestedEvalTypeClean; typedef internal::traits Traits; - static const int RowsAtCompileTime = Traits::RowsAtCompileTime; - static const int ColsAtCompileTime = Traits::ColsAtCompileTime; typedef std::complex::Real> ComplexScalar; - typedef Matrix DynMatrixType; + typedef Matrix DynMatrixType; typedef internal::MatrixFunctionAtomic AtomicType; AtomicType atomic(m_f); diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h index ff8f6e732..a8d879a12 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h @@ -332,10 +332,8 @@ public: typedef typename internal::nested_eval::type DerivedEvalType; typedef typename internal::remove_all::type DerivedEvalTypeClean; typedef internal::traits Traits; - static const int RowsAtCompileTime = Traits::RowsAtCompileTime; - static const int ColsAtCompileTime = Traits::ColsAtCompileTime; typedef std::complex::Real> ComplexScalar; - typedef Matrix DynMatrixType; + typedef Matrix DynMatrixType; typedef internal::MatrixLogarithmAtomic AtomicType; AtomicType atomic; diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h index 33609aea9..1ceb5cf39 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h @@ -40,7 +40,6 @@ class MatrixPowerParenthesesReturnValue : public ReturnByValue< MatrixPowerParen { public: typedef typename MatrixType::RealScalar RealScalar; - typedef typename MatrixType::Index Index; /** * \brief Constructor. @@ -94,7 +93,6 @@ class MatrixPowerAtomic : internal::noncopyable typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; typedef std::complex ComplexScalar; - typedef typename MatrixType::Index Index; typedef Block ResultType; const MatrixType& m_A; @@ -340,7 +338,6 @@ class MatrixPower : internal::noncopyable private: typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; - typedef typename MatrixType::Index Index; public: /** @@ -600,7 +597,6 @@ class MatrixPowerReturnValue : public ReturnByValue< MatrixPowerReturnValue ComplexScalar; - typedef typename Derived::Index Index; /** * \brief Constructor. diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h index afd88ec4d..34bf78913 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h @@ -17,7 +17,7 @@ namespace internal { // pre: T.block(i,i,2,2) has complex conjugate eigenvalues // post: sqrtT.block(i,i,2,2) is square root of T.block(i,i,2,2) template -void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, typename MatrixType::Index i, ResultType& sqrtT) +void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, Index i, ResultType& sqrtT) { // TODO: This case (2-by-2 blocks with complex conjugate eigenvalues) is probably hidden somewhere // in EigenSolver. If we expose it, we could call it directly from here. @@ -32,7 +32,7 @@ void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, typena // all blocks of sqrtT to left of and below (i,j) are correct // post: sqrtT(i,j) has the correct value template -void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT) +void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT) { typedef typename traits::Scalar Scalar; Scalar tmp = (sqrtT.row(i).segment(i+1,j-i-1) * sqrtT.col(j).segment(i+1,j-i-1)).value(); @@ -41,7 +41,7 @@ void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, ty // similar to compute1x1offDiagonalBlock() template -void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT) +void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT) { typedef typename traits::Scalar Scalar; Matrix rhs = T.template block<1,2>(i,j); @@ -54,7 +54,7 @@ void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, ty // similar to compute1x1offDiagonalBlock() template -void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT) +void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT) { typedef typename traits::Scalar Scalar; Matrix rhs = T.template block<2,1>(i,j); @@ -101,7 +101,7 @@ void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const // similar to compute1x1offDiagonalBlock() template -void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT) +void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT) { typedef typename traits::Scalar Scalar; Matrix A = sqrtT.template block<2,2>(i,i); @@ -120,7 +120,6 @@ template void matrix_sqrt_quasi_triangular_diagonal(const MatrixType& T, ResultType& sqrtT) { using std::sqrt; - typedef typename MatrixType::Index Index; const Index size = T.rows(); for (Index i = 0; i < size; i++) { if (i == size - 1 || T.coeff(i+1, i) == 0) { @@ -139,7 +138,6 @@ void matrix_sqrt_quasi_triangular_diagonal(const MatrixType& T, ResultType& sqrt template void matrix_sqrt_quasi_triangular_off_diagonal(const MatrixType& T, ResultType& sqrtT) { - typedef typename MatrixType::Index Index; const Index size = T.rows(); for (Index j = 1; j < size; j++) { if (T.coeff(j, j-1) != 0) // if T(j-1:j, j-1:j) is a 2-by-2 block @@ -206,8 +204,7 @@ template void matrix_sqrt_triangular(const MatrixType &arg, ResultType &result) { using std::sqrt; - typedef typename MatrixType::Index Index; - typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::Scalar Scalar; eigen_assert(arg.rows() == arg.cols()); @@ -318,7 +315,6 @@ template class MatrixSquareRootReturnValue : public ReturnByValue > { protected: - typedef typename Derived::Index Index; typedef typename internal::ref_selector::type DerivedNested; public: From 5e79402b4a742ef33574a568689c70be4f3d8549 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Tue, 24 Jul 2018 21:59:15 +0200 Subject: [PATCH 08/24] fix warnings for doc-eigen-prerequisites --- doc/snippets/MatrixBase_cwiseEqual.cpp | 2 +- doc/snippets/MatrixBase_cwiseNotEqual.cpp | 2 +- unsupported/doc/examples/FFT.cpp | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/snippets/MatrixBase_cwiseEqual.cpp b/doc/snippets/MatrixBase_cwiseEqual.cpp index eb3656f4c..469af642c 100644 --- a/doc/snippets/MatrixBase_cwiseEqual.cpp +++ b/doc/snippets/MatrixBase_cwiseEqual.cpp @@ -3,5 +3,5 @@ m << 1, 0, 1, 1; cout << "Comparing m with identity matrix:" << endl; cout << m.cwiseEqual(MatrixXi::Identity(2,2)) << endl; -int count = m.cwiseEqual(MatrixXi::Identity(2,2)).count(); +Index count = m.cwiseEqual(MatrixXi::Identity(2,2)).count(); cout << "Number of coefficients that are equal: " << count << endl; diff --git a/doc/snippets/MatrixBase_cwiseNotEqual.cpp b/doc/snippets/MatrixBase_cwiseNotEqual.cpp index 6a2e4fb6c..7f0a105d6 100644 --- a/doc/snippets/MatrixBase_cwiseNotEqual.cpp +++ b/doc/snippets/MatrixBase_cwiseNotEqual.cpp @@ -3,5 +3,5 @@ m << 1, 0, 1, 1; cout << "Comparing m with identity matrix:" << endl; cout << m.cwiseNotEqual(MatrixXi::Identity(2,2)) << endl; -int count = m.cwiseNotEqual(MatrixXi::Identity(2,2)).count(); +Index count = m.cwiseNotEqual(MatrixXi::Identity(2,2)).count(); cout << "Number of coefficients that are not equal: " << count << endl; diff --git a/unsupported/doc/examples/FFT.cpp b/unsupported/doc/examples/FFT.cpp index fcbf81276..85e8a0241 100644 --- a/unsupported/doc/examples/FFT.cpp +++ b/unsupported/doc/examples/FFT.cpp @@ -61,14 +61,14 @@ template void RandomFill(std::vector & vec) { for (size_t k=0;k void RandomFill(std::vector > & vec) { for (size_t k=0;k ( T( rand() )/T(RAND_MAX) - .5, T( rand() )/T(RAND_MAX) - .5); + vec[k] = std::complex ( T( rand() )/T(RAND_MAX) - T(.5), T( rand() )/T(RAND_MAX) - T(.5)); } template @@ -85,7 +85,7 @@ void fwd_inv(size_t nfft) vector timebuf2; fft.inv(timebuf2,freqbuf); - long double rmse = mag2(timebuf - timebuf2) / mag2(timebuf); + T_time rmse = mag2(timebuf - timebuf2) / mag2(timebuf); cout << "roundtrip rmse: " << rmse << endl; } From 397b0547e1e3151baa64c9677bef5882ad24d1ea Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Thu, 26 Jul 2018 00:01:24 +0200 Subject: [PATCH 09/24] DIsable static assertions only when necessary and disable double-promotion warnings in that case as well --- test/mixingtypes.cpp | 58 ++++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp index 38f062f1e..aad63ec2b 100644 --- a/test/mixingtypes.cpp +++ b/test/mixingtypes.cpp @@ -8,13 +8,27 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -// work around "uninitialized" warnings and give that option some testing -#define EIGEN_INITIALIZE_MATRICES_BY_ZERO +#if defined(EIGEN_TEST_PART_7) #ifndef EIGEN_NO_STATIC_ASSERT #define EIGEN_NO_STATIC_ASSERT // turn static asserts into runtime asserts in order to check them #endif +// ignore double-promotion diagnostic for clang and gcc, if we check for static assertion anyway: +// TODO do the same for MSVC? +#if defined(__clang__) +# if (__clang_major__ * 100 + __clang_minor__) >= 308 +# pragma clang diagnostic ignored "-Wdouble-promotion" +# endif +#elif defined(__GNUC__) + // TODO is there a minimal GCC version for this? At least g++-4.7 seems to be fine with this. +# pragma GCC diagnostic ignored "-Wdouble-promotion" +#endif + +#endif + + + #if defined(EIGEN_TEST_PART_1) || defined(EIGEN_TEST_PART_2) || defined(EIGEN_TEST_PART_3) #ifndef EIGEN_DONT_VECTORIZE @@ -35,6 +49,28 @@ using namespace std; VERIFY_IS_APPROX(XPR,REF); \ VERIFY( g_called && #XPR" not properly optimized"); +template +void raise_assertion(Index size = SizeAtCompileType) +{ + // VERIFY_RAISES_ASSERT(mf+md); // does not even compile + Matrix vf; vf.setRandom(size); + Matrix vd; vd.setRandom(size); + VERIFY_RAISES_ASSERT(vf=vd); + VERIFY_RAISES_ASSERT(vf+=vd); + VERIFY_RAISES_ASSERT(vf-=vd); + VERIFY_RAISES_ASSERT(vd=vf); + VERIFY_RAISES_ASSERT(vd+=vf); + VERIFY_RAISES_ASSERT(vd-=vf); + + // vd.asDiagonal() * mf; // does not even compile + // vcd.asDiagonal() * mf; // does not even compile + +#if 0 // we get other compilation errors here than just static asserts + VERIFY_RAISES_ASSERT(vd.dot(vf)); +#endif +} + + template void mixingtypes(int size = SizeAtCompileType) { typedef std::complex CF; @@ -73,13 +109,6 @@ template void mixingtypes(int size = SizeAtCompileType) while(std::abs(scf)(); while(std::abs(scd)(); -// VERIFY_RAISES_ASSERT(mf+md); // does not even compile - -#ifdef EIGEN_DONT_VECTORIZE - VERIFY_RAISES_ASSERT(vf=vd); - VERIFY_RAISES_ASSERT(vf+=vd); -#endif - // check scalar products VERIFY_MIX_SCALAR(vcf * sf , vcf * complex(sf)); VERIFY_MIX_SCALAR(sd * vcd , complex(sd) * vcd); @@ -119,9 +148,6 @@ template void mixingtypes(int size = SizeAtCompileType) // check dot product vf.dot(vf); -#if 0 // we get other compilation errors here than just static asserts - VERIFY_RAISES_ASSERT(vd.dot(vf)); -#endif VERIFY_IS_APPROX(vcf.dot(vf), vcf.dot(vf.template cast >())); // check diagonal product @@ -130,9 +156,6 @@ template void mixingtypes(int size = SizeAtCompileType) VERIFY_IS_APPROX(mcf * vf.asDiagonal(), mcf * vf.template cast >().asDiagonal()); VERIFY_IS_APPROX(md * vcd.asDiagonal(), md.template cast >() * vcd.asDiagonal()); -// vd.asDiagonal() * mf; // does not even compile -// vcd.asDiagonal() * mf; // does not even compile - // check inner product VERIFY_IS_APPROX((vf.transpose() * vcf).value(), (vf.template cast >().transpose() * vcf).value()); @@ -296,5 +319,10 @@ EIGEN_DECLARE_TEST(mixingtypes) CALL_SUBTEST_4(mixingtypes<3>()); CALL_SUBTEST_5(mixingtypes<4>()); CALL_SUBTEST_6(mixingtypes(internal::random(1,EIGEN_TEST_MAX_SIZE))); + CALL_SUBTEST_7(raise_assertion(internal::random(1,EIGEN_TEST_MAX_SIZE))); } + CALL_SUBTEST_7(raise_assertion<0>()); + CALL_SUBTEST_7(raise_assertion<3>()); + CALL_SUBTEST_7(raise_assertion<4>()); + CALL_SUBTEST_7(raise_assertion(0)); } From 2ebcb911b27174c5402e4c7af3d2738fd042a5e2 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 26 Jul 2018 14:28:48 -0700 Subject: [PATCH 10/24] Add pcast packet op for NEON. --- Eigen/Core | 1 + Eigen/src/Core/arch/NEON/TypeCasting.h | 48 ++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 Eigen/src/Core/arch/NEON/TypeCasting.h diff --git a/Eigen/Core b/Eigen/Core index f336d407b..864bde551 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -179,6 +179,7 @@ using std::ptrdiff_t; #include "src/Core/arch/NEON/PacketMath.h" #include "src/Core/arch/NEON/MathFunctions.h" #include "src/Core/arch/NEON/Complex.h" + #include "src/Core/arch/NEON/TypeCasting.h" #elif defined EIGEN_VECTORIZE_ZVECTOR #include "src/Core/arch/ZVector/PacketMath.h" #include "src/Core/arch/ZVector/MathFunctions.h" diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h new file mode 100644 index 000000000..95d1fd0e4 --- /dev/null +++ b/Eigen/src/Core/arch/NEON/TypeCasting.h @@ -0,0 +1,48 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TYPE_CASTING_NEON_H +#define EIGEN_TYPE_CASTING_NEON_H + +namespace Eigen { + +namespace internal { + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + + +template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { + return vcvtq_s32_f32(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { + return vcvtq_f32_s32(a); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_NEON_H From e4785326255c536214d2cead384477c35e3bdcc6 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 27 Jul 2018 12:36:34 -0700 Subject: [PATCH 11/24] Reduce the number of template specializations of classes related to tensor contraction to reduce binary size. --- .../CXX11/src/Tensor/TensorContraction.h | 105 +++++++++------- .../src/Tensor/TensorContractionBlocking.h | 7 +- .../src/Tensor/TensorContractionThreadPool.h | 118 ++++++++---------- 3 files changed, 109 insertions(+), 121 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 0e69cd40c..57b5339d1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -177,9 +177,9 @@ struct NoOpOutputKernel { */ template EIGEN_ALWAYS_INLINE void operator()( - const OutputKernel::OutputMapper& output_mapper, - const TensorContractionParams& params, Index i, Index j, Index num_rows, - Index num_cols) const {} + const OutputKernel::OutputMapper& /*output_mapper*/, + const TensorContractionParams& /*params*/, Index /*i*/, + Index /*j*/, Index /*num_rows*/, Index /*num_cols*/) const {} }; template @@ -467,42 +467,58 @@ struct TensorContractionEvaluatorBase } } - EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { - if (this->m_lhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } +#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \ + if (this->m_lhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHODARGS; \ + } \ + else { \ + METHODARGS; \ + } \ + } \ + else { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHODARGS; \ + } \ + else { \ + METHODARGS; \ + } \ + } \ + } \ + else { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHODARGS; \ + } \ + else { \ + METHODARGS; \ + } \ + } \ + else { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHODARGS; \ + } \ + else { \ + METHODARGS; \ + } \ + } \ } - else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } + + EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { + static_cast(this)->template evalProduct(buffer); + } + + template + void evalProductSequential(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv(buffer); + } else { + this->template evalGemm(buffer); } } @@ -623,7 +639,7 @@ struct TensorContractionEvaluatorBase OutputMapper output(buffer, m); // Sizes of the blocks to load in cache. See the Goto paper for details. - internal::TensorContractionBlocking blocking(k, m, n, 1); + internal::TensorContractionBlocking blocking(k, m, n, 1); const Index kc = blocking.kc(); const Index mc = numext::mini(m, blocking.mc()); const Index nc = numext::mini(n, blocking.nc()); @@ -976,14 +992,9 @@ struct TensorEvaluator - EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const { - if (this->m_j_size == 1) { - this->template evalGemv(buffer); - return; - } - - this->template evalGemm(buffer); + template + void evalProduct(Scalar* buffer) const { + TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Alignment, (buffer)); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index 8c1af1da8..cf281192c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -21,13 +21,10 @@ enum { // Default Blocking Strategy -template +template class TensorContractionBlocking { public: - typedef typename LhsMapper::Scalar LhsScalar; - typedef typename RhsMapper::Scalar RhsScalar; - /* adding EIGEN_DEVICE_FUNC unconditionally to 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h` requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h` @@ -41,7 +38,7 @@ class TensorContractionBlocking { ../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901: dynamic initialization is not supported for function-scope static variables within a __device__/__global__ function */ - + #if !defined(EIGEN_HIPCC) EIGEN_DEVICE_FUNC #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 8b86d7aaf..182c5f7f9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -71,8 +71,7 @@ struct TensorEvaluator + template void evalProduct(Scalar* buffer) const { const Index m = this->m_i_size; const Index n = this->m_j_size; @@ -96,39 +95,6 @@ struct TensorEvaluator::type - LhsScalar; - typedef - typename internal::remove_const::type - RhsScalar; - typedef typename internal::gebp_traits Traits; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - typedef internal::TensorContractionInputMapper< - LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, - contract_t, internal::packet_traits::size, - lhs_inner_dim_contiguous, false, Unaligned> - LhsMapper; - typedef internal::TensorContractionInputMapper< - RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t, - contract_t, internal::packet_traits::size, - rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> - RhsMapper; - typedef internal::blas_data_mapper OutputMapper; - typedef internal::gemm_pack_lhs - LhsPacker; - typedef internal::gemm_pack_rhs< - RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> - RhsPacker; - typedef internal::gebp_kernel - GebpKernel; - - - // Compute a set of algorithm parameters: // - kernel block sizes (bm, bn, bk) // - task grain sizes (number of kernels executed per task: gm, gn) @@ -158,14 +124,14 @@ struct TensorEvaluator blocking(k, m, n, 2); bm = blocking.mc(); bn = blocking.nc(); bk = blocking.kc(); } else { - internal::TensorContractionBlocking blocking(k, m, n, 2); bm = blocking.mc(); @@ -187,29 +153,22 @@ struct TensorEvaluatortemplate evalGemv(buffer); - else - this->template evalGemm(buffer); + TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, + Unaligned, (buffer)); return; } // Now that we know number of threads, recalculate sharding and blocking. shard_by_col = shardByCol(m, n, num_threads); if (shard_by_col) { - internal::TensorContractionBlocking blocking(k, m, n, num_threads); bm = blocking.mc(); bn = blocking.nc(); bk = blocking.kc(); } else { - internal::TensorContractionBlocking blocking(k, m, n, num_threads); bm = blocking.mc(); @@ -257,34 +216,55 @@ struct TensorEvaluatorm_leftImpl, this->m_left_nocontract_strides, - this->m_i_strides, this->m_left_contracting_strides, - this->m_k_strides); + #define CONTEXT_ARGS \ + (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \ + nn0, shard_by_col, parallel_pack) \ + .run() - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, - this->m_j_strides, this->m_right_contracting_strides, - this->m_k_strides); + TENSOR_CONTRACTION_DISPATCH(Context, Alignment, CONTEXT_ARGS); + +#undef CONTEXT_ARGS - Context(this, num_threads, lhs, rhs, buffer, m, n, - k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, - shard_by_col, parallel_pack) - .run(); } // Context coordinates a single parallel gemm operation. - template + template class Context { public: - Context(const Self* self, int num_threads, LhsMapper& lhs, - RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm, - Index bn, Index bk, Index nm, Index nn, Index nk, Index gm, - Index gn, Index nm0, Index nn0, bool shard_by_col, + typedef internal::TensorContractionInputMapper< + LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, + contract_t, internal::packet_traits::size, + lhs_inner_dim_contiguous, false, Unaligned> + LhsMapper; + typedef internal::TensorContractionInputMapper< + RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t, + contract_t, internal::packet_traits::size, + rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> + RhsMapper; + typedef internal::gemm_pack_lhs + LhsPacker; + typedef internal::gemm_pack_rhs< + RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> + RhsPacker; + typedef internal::blas_data_mapper OutputMapper; + typedef internal::gebp_kernel + GebpKernel; + + Context(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn, + Index tk, Index bm, Index bn, Index bk, Index nm, Index nn, Index nk, + Index gm, Index gn, Index nm0, Index nn0, bool shard_by_col, bool parallel_pack) : device_(self->m_device), - lhs_(lhs), - rhs_(rhs), + lhs_(self->m_leftImpl, self->m_left_nocontract_strides, + self->m_i_strides, self->m_left_contracting_strides, + self->m_k_strides), + rhs_(self->m_rightImpl, self->m_right_nocontract_strides, + self->m_j_strides, self->m_right_contracting_strides, + self->m_k_strides), buffer_(buffer), output_(buffer, tm), output_kernel_(self->m_output_kernel), @@ -376,8 +356,8 @@ struct TensorEvaluator Date: Mon, 30 Jul 2018 10:19:51 +0100 Subject: [PATCH 12/24] Re-enable FMA for fast sqrt functions This commit re-enables the use of FMA for the FAST sqrt functions. Doing so improves the performance of both algorithms. The float32 version is now 88% the speed of the original function, while the double version is 90%. From bc615e458559480a95a6b6de32295b4aa646c72f Mon Sep 17 00:00:00 2001 From: Mark D Ryan Date: Mon, 30 Jul 2018 13:21:00 +0200 Subject: [PATCH 13/24] Re-enable FMA for fast sqrt functions --- Eigen/src/Core/arch/AVX512/MathFunctions.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index 81a3b4f62..ba1246722 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -258,7 +258,7 @@ pexp(const Packet8d& _x) { template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f psqrt(const Packet16f& _x) { - Packet16f half = pmul(_x, pset1(.5f)); + Packet16f neg_half = pmul(_x, pset1(-.5f)); __mmask16 denormal_mask = _mm512_kand( _mm512_cmp_ps_mask(_x, pset1((std::numeric_limits::min)()), _CMP_LT_OQ), @@ -267,7 +267,7 @@ psqrt(const Packet16f& _x) { Packet16f x = _mm512_rsqrt14_ps(_x); // Do a single step of Newton's iteration. - x = pmul(x, psub(pset1(1.5f), pmul(half, pmul(x,x)))); + x = pmul(x, pmadd(neg_half, pmul(x, x), pset1(1.5f))); // Flush results for denormals to zero. return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps()); @@ -276,7 +276,7 @@ psqrt(const Packet16f& _x) { template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d psqrt(const Packet8d& _x) { - Packet8d half = pmul(_x, pset1(.5f)); + Packet8d neg_half = pmul(_x, pset1(-.5f)); __mmask16 denormal_mask = _mm512_kand( _mm512_cmp_pd_mask(_x, pset1((std::numeric_limits::min)()), _CMP_LT_OQ), @@ -285,10 +285,10 @@ psqrt(const Packet8d& _x) { Packet8d x = _mm512_rsqrt14_pd(_x); // Do a single step of Newton's iteration. - x = pmul(x, psub(pset1(1.5f), pmul(half, pmul(x,x)))); + x = pmul(x, pmadd(neg_half, pmul(x, x), pset1(1.5f))); // Do a second step of Newton's iteration. - x = pmul(x, psub(pset1(1.5f), pmul(half, pmul(x,x)))); + x = pmul(x, pmadd(neg_half, pmul(x, x), pset1(1.5f))); return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd()); } From f5cace5e9fc734a9f26b9cf14e806bd1b115e443 Mon Sep 17 00:00:00 2001 From: Patrik Huber Date: Thu, 26 Jul 2018 19:55:19 +0000 Subject: [PATCH 14/24] Fix two small typos in the documentation --- doc/FunctionsTakingEigenTypes.dox | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/FunctionsTakingEigenTypes.dox b/doc/FunctionsTakingEigenTypes.dox index e054714f9..6b4e49214 100644 --- a/doc/FunctionsTakingEigenTypes.dox +++ b/doc/FunctionsTakingEigenTypes.dox @@ -79,7 +79,7 @@ These examples are just intended to give the reader a first impression of how fu \section TopicUsingRefClass How to write generic, but non-templated function? -In all the previous examples, the functions had to be template functions. This approach allows to write very generic code, but it is often desirable to write non templated function and still keep some level of genericity to avoid stupid copies of the arguments. The typical example is to write functions accepting both a MatrixXf or a block of a MatrixXf. This exactly the purpose of the Ref class. Here is a simple example: +In all the previous examples, the functions had to be template functions. This approach allows to write very generic code, but it is often desirable to write non templated functions and still keep some level of genericity to avoid stupid copies of the arguments. The typical example is to write functions accepting both a MatrixXf or a block of a MatrixXf. This is exactly the purpose of the Ref class. Here is a simple example: From 7b91c11207ee94e6a113c2a2c36f4a0797fc1c3e Mon Sep 17 00:00:00 2001 From: Alexey Frunze Date: Tue, 24 Jul 2018 18:36:44 -0700 Subject: [PATCH 15/24] bug #1578: Improve prefetching in matrix multiplication on MIPS. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 1890efd4d..3ec8eb082 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -972,7 +972,7 @@ void gebp_kernel Date: Mon, 30 Jul 2018 14:52:15 +0200 Subject: [PATCH 16/24] bug #1577: fix msvc compilation of unit test, msvc defines ptrdiff_t as long long --- test/AnnoyingScalar.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/AnnoyingScalar.h b/test/AnnoyingScalar.h index 7d1df2253..2b6544a6a 100644 --- a/test/AnnoyingScalar.h +++ b/test/AnnoyingScalar.h @@ -33,6 +33,9 @@ class AnnoyingScalar AnnoyingScalar(float _v) { init(); *v = _v; } AnnoyingScalar(int _v) { init(); *v = _v; } AnnoyingScalar(long _v) { init(); *v = _v; } + #if EIGEN_HAS_CXX11 + AnnoyingScalar(long long _v) { init(); *v = _v; } + #endif AnnoyingScalar(const AnnoyingScalar& other) { init(); *v = *(other.v); } ~AnnoyingScalar() { if(v!=&data) From 679eece8760ce9b9ff09e48b6ee8673afcf94caa Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 31 Jul 2018 10:10:14 +0200 Subject: [PATCH 17/24] Speedup trivial tensor broadcasting on GPU by enforcing unaligned loads. See PR 437. --- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 278689915..e647b3609 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -284,7 +284,13 @@ struct TensorEvaluator, Device> if (static_cast(Layout) == static_cast(ColMajor)) { if (isCopy) { + #ifdef EIGEN_GPU_COMPILE_PHASE + // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing + // unaligned loads here. The reason is unclear though. + return m_impl.template packet(index); + #else return m_impl.template packet(index); + #endif } else if (oneByN && !nByOne) { return packetNByOne(index); } else if (!oneByN && nByOne) { @@ -296,7 +302,12 @@ struct TensorEvaluator, Device> } } else { if (isCopy) { + #ifdef EIGEN_GPU_COMPILE_PHASE + // See above. + return m_impl.template packet(index); + #else return m_impl.template packet(index); + #endif } else if (oneByN && !nByOne) { return packetOneByN(index); } else if (!oneByN && nByOne) { From 9e219bb3d3f0f3a3157dcf8c2a27895e9f85035b Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 1 Aug 2018 10:47:49 +0100 Subject: [PATCH 18/24] Converting ad-hoc inline keyword to EIGEN_STRONG_INLINE MACRO. --- unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 14 ++-- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 14 ++-- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 8 +- unsupported/Eigen/CXX11/src/util/CXX11Meta.h | 74 +++++++++---------- .../SpecialFunctionsArrayAPI.h | 18 ++--- .../SpecialFunctionsFunctors.h | 32 ++++---- 8 files changed, 83 insertions(+), 83 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index e3f6e37f0..aed71b265 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -112,7 +112,7 @@ class Tensor : public TensorBase - EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 5ca47cca7..4f973a5b7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -41,7 +41,7 @@ template struct fixed_size_tensor_index_linearization_helper { template EIGEN_DEVICE_FUNC - static inline Index run(array const& indices, + static EIGEN_STRONG_INLINE Index run(array const& indices, const Dimensions& dimensions) { return array_get(indices) + @@ -54,7 +54,7 @@ template struct fixed_size_tensor_index_linearization_helper { template EIGEN_DEVICE_FUNC - static inline Index run(array const&, const Dimensions&) + static EIGEN_STRONG_INLINE Index run(array const&, const Dimensions&) { return 0; } @@ -64,7 +64,7 @@ template struct fixed_size_tensor_index_extraction_helper { template EIGEN_DEVICE_FUNC - static inline Index run(const Index index, + static EIGEN_STRONG_INLINE Index run(const Index index, const Dimensions& dimensions) { const Index mult = (index == n-1) ? 1 : 0; @@ -77,7 +77,7 @@ template struct fixed_size_tensor_index_extraction_helper { template EIGEN_DEVICE_FUNC - static inline Index run(const Index, + static EIGEN_STRONG_INLINE Index run(const Index, const Dimensions&) { return 0; @@ -421,20 +421,20 @@ template struct sizes_match_below_dim { - static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { return false; } }; template struct sizes_match_below_dim { - static EIGEN_DEVICE_FUNC inline bool run(Dims1& dims1, Dims2& dims2) { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) { return (array_get(dims1) == array_get(dims2)) & sizes_match_below_dim::run(dims1, dims2); } }; template struct sizes_match_below_dim { - static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { return true; } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index ac5afd891..17008917a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -39,7 +39,7 @@ class TensorExecutor { using StorageIndex = typename Expression::Index; EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, + static EIGEN_STRONG_INLINE void run(const Expression& expr, const Device& device = Device()) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); @@ -63,7 +63,7 @@ class TensorExecutor evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); @@ -111,7 +111,7 @@ class TensorExecutor::NumDimensions; EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, + static EIGEN_STRONG_INLINE void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) { using TensorBlock = TensorBlock; @@ -223,7 +223,7 @@ class TensorExecutor { public: using StorageIndex = typename Expression::Index; - static inline void run(const Expression& expr, + static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { typedef TensorEvaluator Evaluator; typedef EvalRange EvalRange; @@ -257,7 +257,7 @@ class TensorExecutor::NumDimensions; - static inline void run(const Expression& expr, + static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) { using TensorBlock = TensorBlock; @@ -376,7 +376,7 @@ EigenMetaKernel(Evaluator eval, StorageIndex size) { /*static*/ template -inline void TensorExecutor::run( +EIGEN_STRONG_INLINE void TensorExecutor::run( const Expression& expr, const GpuDevice& device) { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); @@ -405,7 +405,7 @@ inline void TensorExecutor::run( template class TensorExecutor { public: - static inline void run(const Expression &expr, const SyclDevice &device) { + static EIGEN_STRONG_INLINE void run(const Expression &expr, const SyclDevice &device) { // call TensorSYCL module TensorSycl::run(expr, device); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 3d0e4035a..7ecd4d1ac 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -20,7 +20,7 @@ namespace internal { template struct scalar_mod_op { EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {} - EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a % m_divisor; } const Scalar m_divisor; }; template @@ -34,7 +34,7 @@ struct functor_traits > template struct scalar_mod2_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op) - EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; } }; template struct functor_traits > diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 8810d78cf..98ad661ca 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -75,10 +75,10 @@ template struct NumTraits > MulCost = 1 }; - EIGEN_DEVICE_FUNC static inline Real epsilon() { return 0; } - EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return 0; } - EIGEN_DEVICE_FUNC static inline Real highest() { return n; } - EIGEN_DEVICE_FUNC static inline Real lowest() { return n; } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Real epsilon() { return 0; } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Real dummy_precision() { return 0; } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Real highest() { return n; } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Real lowest() { return n; } }; namespace internal { diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h index 8de3bbcab..6c95d0a6c 100644 --- a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h @@ -104,9 +104,9 @@ template<> struct h_skip_helper_type<0> template struct h_skip { template - constexpr static inline typename h_skip_helper_numeric::type helper(numeric_list) { return typename h_skip_helper_numeric::type(); } + constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_numeric::type helper(numeric_list) { return typename h_skip_helper_numeric::type(); } template - constexpr static inline typename h_skip_helper_type::type helper(type_list) { return typename h_skip_helper_type::type(); } + constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_type::type helper(type_list) { return typename h_skip_helper_type::type(); } }; template struct skip { typedef decltype(h_skip::helper(a())) type; }; @@ -268,7 +268,7 @@ template< typename Reducer > struct reduce { - EIGEN_DEVICE_FUNC constexpr static inline int run() { return Reducer::Identity; } + EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE int run() { return Reducer::Identity; } }; template< @@ -276,7 +276,7 @@ template< typename A > struct reduce { - EIGEN_DEVICE_FUNC constexpr static inline A run(A a) { return a; } + EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE A run(A a) { return a; } }; template< @@ -285,7 +285,7 @@ template< typename... Ts > struct reduce { - EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce::run(ts...))) { + EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce::run(ts...))) { return Reducer::run(a, reduce::run(ts...)); } }; @@ -293,29 +293,29 @@ template< /* generic binary operations */ struct sum_op { - template EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a + b) { return a + b; } + template EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a + b) { return a + b; } static constexpr int Identity = 0; }; struct product_op { - template EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a * b) { return a * b; } + template EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a * b) { return a * b; } static constexpr int Identity = 1; }; -struct logical_and_op { template constexpr static inline auto run(A a, B b) -> decltype(a && b) { return a && b; } }; -struct logical_or_op { template constexpr static inline auto run(A a, B b) -> decltype(a || b) { return a || b; } }; +struct logical_and_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a && b) { return a && b; } }; +struct logical_or_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a || b) { return a || b; } }; -struct equal_op { template constexpr static inline auto run(A a, B b) -> decltype(a == b) { return a == b; } }; -struct not_equal_op { template constexpr static inline auto run(A a, B b) -> decltype(a != b) { return a != b; } }; -struct lesser_op { template constexpr static inline auto run(A a, B b) -> decltype(a < b) { return a < b; } }; -struct lesser_equal_op { template constexpr static inline auto run(A a, B b) -> decltype(a <= b) { return a <= b; } }; -struct greater_op { template constexpr static inline auto run(A a, B b) -> decltype(a > b) { return a > b; } }; -struct greater_equal_op { template constexpr static inline auto run(A a, B b) -> decltype(a >= b) { return a >= b; } }; +struct equal_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a == b) { return a == b; } }; +struct not_equal_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a != b) { return a != b; } }; +struct lesser_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a < b) { return a < b; } }; +struct lesser_equal_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a <= b) { return a <= b; } }; +struct greater_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a > b) { return a > b; } }; +struct greater_equal_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a >= b) { return a >= b; } }; /* generic unary operations */ -struct not_op { template constexpr static inline auto run(A a) -> decltype(!a) { return !a; } }; -struct negation_op { template constexpr static inline auto run(A a) -> decltype(-a) { return -a; } }; -struct greater_equal_zero_op { template constexpr static inline auto run(A a) -> decltype(a >= 0) { return a >= 0; } }; +struct not_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(!a) { return !a; } }; +struct negation_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(-a) { return -a; } }; +struct greater_equal_zero_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(a >= 0) { return a >= 0; } }; /* reductions for lists */ @@ -324,13 +324,13 @@ struct greater_equal_zero_op { template constexpr static inline auto // together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1 // does... template -EIGEN_DEVICE_FUNC constexpr inline decltype(reduce::run((*((Ts*)0))...)) arg_prod(Ts... ts) +EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE decltype(reduce::run((*((Ts*)0))...)) arg_prod(Ts... ts) { return reduce::run(ts...); } template -constexpr inline decltype(reduce::run((*((Ts*)0))...)) arg_sum(Ts... ts) +constexpr EIGEN_STRONG_INLINE decltype(reduce::run((*((Ts*)0))...)) arg_sum(Ts... ts) { return reduce::run(ts...); } @@ -338,13 +338,13 @@ constexpr inline decltype(reduce::run((*((Ts*)0))...)) arg_sum(Ts /* reverse arrays */ template -constexpr inline Array h_array_reverse(Array arr, numeric_list) +constexpr EIGEN_STRONG_INLINE Array h_array_reverse(Array arr, numeric_list) { return {{array_get(arr)...}}; } template -constexpr inline array array_reverse(array arr) +constexpr EIGEN_STRONG_INLINE array array_reverse(array arr) { return h_array_reverse(arr, typename gen_numeric_list::type()); } @@ -359,7 +359,7 @@ constexpr inline array array_reverse(array arr) // an infinite loop) template struct h_array_reduce { - EIGEN_DEVICE_FUNC constexpr static inline auto run(array arr, T identity) -> decltype(Reducer::run(h_array_reduce::run(arr, identity), array_get(arr))) + EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(array arr, T identity) -> decltype(Reducer::run(h_array_reduce::run(arr, identity), array_get(arr))) { return Reducer::run(h_array_reduce::run(arr, identity), array_get(arr)); } @@ -368,7 +368,7 @@ struct h_array_reduce { template struct h_array_reduce { - EIGEN_DEVICE_FUNC constexpr static inline T run(const array& arr, T) + EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array& arr, T) { return array_get<0>(arr); } @@ -377,14 +377,14 @@ struct h_array_reduce template struct h_array_reduce { - EIGEN_DEVICE_FUNC constexpr static inline T run(const array&, T identity) + EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array&, T identity) { return identity; } }; template -EIGEN_DEVICE_FUNC constexpr inline auto array_reduce(const array& arr, T identity) -> decltype(h_array_reduce::run(arr, identity)) +EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_reduce(const array& arr, T identity) -> decltype(h_array_reduce::run(arr, identity)) { return h_array_reduce::run(arr, identity); } @@ -392,13 +392,13 @@ EIGEN_DEVICE_FUNC constexpr inline auto array_reduce(const array& arr, T i /* standard array reductions */ template -EIGEN_DEVICE_FUNC constexpr inline auto array_sum(const array& arr) -> decltype(array_reduce(arr, static_cast(0))) +EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_sum(const array& arr) -> decltype(array_reduce(arr, static_cast(0))) { return array_reduce(arr, static_cast(0)); } template -EIGEN_DEVICE_FUNC constexpr inline auto array_prod(const array& arr) -> decltype(array_reduce(arr, static_cast(1))) +EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_prod(const array& arr) -> decltype(array_reduce(arr, static_cast(1))) { return array_reduce(arr, static_cast(1)); } @@ -414,13 +414,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector& a) { /* zip an array */ template -constexpr inline array h_array_zip(array a, array b, numeric_list) +constexpr EIGEN_STRONG_INLINE array h_array_zip(array a, array b, numeric_list) { return array{{ Op::run(array_get(a), array_get(b))... }}; } template -constexpr inline array array_zip(array a, array b) +constexpr EIGEN_STRONG_INLINE array array_zip(array a, array b) { return h_array_zip(a, b, typename gen_numeric_list::type()); } @@ -428,13 +428,13 @@ constexpr inline array array_zip(array a, a /* zip an array and reduce the result */ template -constexpr inline auto h_array_zip_and_reduce(array a, array b, numeric_list) -> decltype(reduce::type...>::run(Op::run(array_get(a), array_get(b))...)) +constexpr EIGEN_STRONG_INLINE auto h_array_zip_and_reduce(array a, array b, numeric_list) -> decltype(reduce::type...>::run(Op::run(array_get(a), array_get(b))...)) { return reduce::type...>::run(Op::run(array_get(a), array_get(b))...); } template -constexpr inline auto array_zip_and_reduce(array a, array b) -> decltype(h_array_zip_and_reduce(a, b, typename gen_numeric_list::type())) +constexpr EIGEN_STRONG_INLINE auto array_zip_and_reduce(array a, array b) -> decltype(h_array_zip_and_reduce(a, b, typename gen_numeric_list::type())) { return h_array_zip_and_reduce(a, b, typename gen_numeric_list::type()); } @@ -442,13 +442,13 @@ constexpr inline auto array_zip_and_reduce(array a, array b) -> decl /* apply stuff to an array */ template -constexpr inline array h_array_apply(array a, numeric_list) +constexpr EIGEN_STRONG_INLINE array h_array_apply(array a, numeric_list) { return array{{ Op::run(array_get(a))... }}; } template -constexpr inline array array_apply(array a) +constexpr EIGEN_STRONG_INLINE array array_apply(array a) { return h_array_apply(a, typename gen_numeric_list::type()); } @@ -456,13 +456,13 @@ constexpr inline array array_apply(array a) /* apply stuff to an array and reduce */ template -constexpr inline auto h_array_apply_and_reduce(array arr, numeric_list) -> decltype(reduce::type...>::run(Op::run(array_get(arr))...)) +constexpr EIGEN_STRONG_INLINE auto h_array_apply_and_reduce(array arr, numeric_list) -> decltype(reduce::type...>::run(Op::run(array_get(arr))...)) { return reduce::type...>::run(Op::run(array_get(arr))...); } template -constexpr inline auto array_apply_and_reduce(array a) -> decltype(h_array_apply_and_reduce(a, typename gen_numeric_list::type())) +constexpr EIGEN_STRONG_INLINE auto array_apply_and_reduce(array a) -> decltype(h_array_apply_and_reduce(a, typename gen_numeric_list::type())) { return h_array_apply_and_reduce(a, typename gen_numeric_list::type()); } @@ -476,7 +476,7 @@ template struct h_repeat { template - constexpr static inline array run(t v, numeric_list) + constexpr static EIGEN_STRONG_INLINE array run(t v, numeric_list) { return {{ typename id_numeric::type(v)... }}; } diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h index 30cdf4751..ed6d83251 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h @@ -24,7 +24,7 @@ namespace Eigen { * \sa Eigen::igammac(), Eigen::lgamma() */ template -inline const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> +EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> igamma(const Eigen::ArrayBase& a, const Eigen::ArrayBase& x) { return Eigen::CwiseBinaryOp, const Derived, const ExponentDerived>( @@ -47,7 +47,7 @@ igamma(const Eigen::ArrayBase& a, const Eigen::ArrayBase -inline const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> +EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> igamma_der_a(const Eigen::ArrayBase& a, const Eigen::ArrayBase& x) { return Eigen::CwiseBinaryOp, const Derived, const ExponentDerived>( a.derived(), @@ -68,7 +68,7 @@ igamma_der_a(const Eigen::ArrayBase& a, const Eigen::ArrayBase -inline const Eigen::CwiseBinaryOp, const AlphaDerived, const SampleDerived> +EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp, const AlphaDerived, const SampleDerived> gamma_sample_der_alpha(const Eigen::ArrayBase& alpha, const Eigen::ArrayBase& sample) { return Eigen::CwiseBinaryOp, const AlphaDerived, const SampleDerived>( alpha.derived(), @@ -86,7 +86,7 @@ gamma_sample_der_alpha(const Eigen::ArrayBase& alpha, const Eigen: * \sa Eigen::igamma(), Eigen::lgamma() */ template -inline const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> +EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> igammac(const Eigen::ArrayBase& a, const Eigen::ArrayBase& x) { return Eigen::CwiseBinaryOp, const Derived, const ExponentDerived>( @@ -108,7 +108,7 @@ igammac(const Eigen::ArrayBase& a, const Eigen::ArrayBase -inline const Eigen::CwiseBinaryOp, const DerivedN, const DerivedX> +EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp, const DerivedN, const DerivedX> polygamma(const Eigen::ArrayBase& n, const Eigen::ArrayBase& x) { return Eigen::CwiseBinaryOp, const DerivedN, const DerivedX>( @@ -128,7 +128,7 @@ polygamma(const Eigen::ArrayBase& n, const Eigen::ArrayBase& * \sa Eigen::betainc(), Eigen::lgamma() */ template -inline const Eigen::CwiseTernaryOp, const ArgADerived, const ArgBDerived, const ArgXDerived> +EIGEN_STRONG_INLINE const Eigen::CwiseTernaryOp, const ArgADerived, const ArgBDerived, const ArgXDerived> betainc(const Eigen::ArrayBase& a, const Eigen::ArrayBase& b, const Eigen::ArrayBase& x) { return Eigen::CwiseTernaryOp, const ArgADerived, const ArgBDerived, const ArgXDerived>( @@ -152,7 +152,7 @@ betainc(const Eigen::ArrayBase& a, const Eigen::ArrayBase -inline const Eigen::CwiseBinaryOp, const DerivedX, const DerivedQ> +EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp, const DerivedX, const DerivedQ> zeta(const Eigen::ArrayBase& x, const Eigen::ArrayBase& q) { return Eigen::CwiseBinaryOp, const DerivedX, const DerivedQ>( @@ -176,7 +176,7 @@ zeta(const Eigen::ArrayBase& x, const Eigen::ArrayBase& q) * \sa ArrayBase::i0e() */ template -inline const Eigen::CwiseUnaryOp< +EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp< Eigen::internal::scalar_i0e_op, const Derived> i0e(const Eigen::ArrayBase& x) { return Eigen::CwiseUnaryOp< @@ -199,7 +199,7 @@ i0e(const Eigen::ArrayBase& x) { * \sa ArrayBase::i1e() */ template -inline const Eigen::CwiseUnaryOp< +EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp< Eigen::internal::scalar_i1e_op, const Derived> i1e(const Eigen::ArrayBase& x) { return Eigen::CwiseUnaryOp< diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h index 3a63dcdd6..c6fac91bb 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h @@ -155,11 +155,11 @@ struct functor_traits > { */ template struct scalar_lgamma_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::lgamma; return lgamma(a); } typedef typename packet_traits::type Packet; - EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::plgamma(a); } }; template struct functor_traits > @@ -177,11 +177,11 @@ struct functor_traits > */ template struct scalar_digamma_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::digamma; return digamma(a); } typedef typename packet_traits::type Packet; - EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::pdigamma(a); } }; template struct functor_traits > @@ -199,11 +199,11 @@ struct functor_traits > */ template struct scalar_zeta_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x, const Scalar& q) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x, const Scalar& q) const { using numext::zeta; return zeta(x, q); } typedef typename packet_traits::type Packet; - EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); } }; template struct functor_traits > @@ -221,11 +221,11 @@ struct functor_traits > */ template struct scalar_polygamma_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& n, const Scalar& x) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& n, const Scalar& x) const { using numext::polygamma; return polygamma(n, x); } typedef typename packet_traits::type Packet; - EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); } }; template struct functor_traits > @@ -244,11 +244,11 @@ struct functor_traits > */ template struct scalar_erf_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::erf; return erf(a); } typedef typename packet_traits::type Packet; - EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perf(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::perf(a); } }; template struct functor_traits > @@ -267,11 +267,11 @@ struct functor_traits > */ template struct scalar_erfc_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::erfc; return erfc(a); } typedef typename packet_traits::type Packet; - EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perfc(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::perfc(a); } }; template struct functor_traits > @@ -291,12 +291,12 @@ struct functor_traits > template struct scalar_i0e_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_i0e_op) - EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& x) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const { using numext::i0e; return i0e(x); } typedef typename packet_traits::type Packet; - EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { return internal::pi0e(x); } }; @@ -318,12 +318,12 @@ struct functor_traits > { template struct scalar_i1e_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_i1e_op) - EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& x) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const { using numext::i1e; return i1e(x); } typedef typename packet_traits::type Packet; - EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { return internal::pi1e(x); } }; From d7a84148483b1a11b993c037a2cea5b43f2c052f Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 1 Aug 2018 11:56:30 +0100 Subject: [PATCH 19/24] Distinguishing between internal memory allocation/deallocation from explicit user memory allocation/deallocation. --- .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 8 ++++---- .../Eigen/CXX11/src/Tensor/TensorDeviceDefault.h | 6 ++++++ .../Eigen/CXX11/src/Tensor/TensorDeviceGpu.h | 7 +++++++ .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 8 ++++++++ .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 4 ++-- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 14 +++++++------- 6 files changed, 34 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index 0e4db46de..7b4d56e4b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -112,7 +112,7 @@ struct TensorEvaluator, Devi return false; } else { m_result = static_cast( - m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))); evalTo(m_result); return true; } @@ -120,7 +120,7 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { if (m_result != NULL) { - m_device.deallocate(m_result); + m_device.deallocate_temp(m_result); m_result = NULL; } } @@ -273,7 +273,7 @@ struct TensorEvaluator(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + m_result = static_cast(m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))); evalTo(m_result); return true; } @@ -281,7 +281,7 @@ struct TensorEvaluatorallocate(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { + stream_->deallocate(buffer); + } virtual void* scratchpad() const { if (scratch_ == NULL) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index 90fd99027..5a16ebe50 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -105,6 +105,14 @@ struct ThreadPoolDevice { internal::aligned_free(buffer); } + EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { + return allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { + deallocate(buffer); + } + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { ::memcpy(dst, src, n); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index fdb31928f..09f6f2067 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -115,7 +115,7 @@ struct TensorEvaluator, Device> #endif EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { const Index numValues = internal::array_prod(m_impl.dimensions()); - m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); + m_buffer = (CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType)); // Should initialize the memory in case we're dealing with non POD types. if (NumTraits::RequireInitialization) { for (Index i = 0; i < numValues; ++i) { @@ -129,7 +129,7 @@ struct TensorEvaluator, Device> return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_device.deallocate(m_buffer); + m_device.deallocate_temp(m_buffer); m_buffer = NULL; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 375fc0802..c1cbdebc6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -513,7 +513,7 @@ struct TensorEvaluator, !RunningOnGPU))) { bool need_assign = false; if (!data) { - m_result = static_cast(m_device.allocate(sizeof(CoeffReturnType))); + m_result = static_cast(m_device.allocate_temp(sizeof(CoeffReturnType))); data = m_result; need_assign = true; } @@ -525,7 +525,7 @@ struct TensorEvaluator, const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); if (!data) { - data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + data = static_cast(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); m_result = data; } Op reducer(m_reducer); @@ -549,7 +549,7 @@ struct TensorEvaluator, const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); if (!data) { if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) { - data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + data = static_cast(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); m_result = data; } else { @@ -559,7 +559,7 @@ struct TensorEvaluator, Op reducer(m_reducer); if (internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { if (m_result) { - m_device.deallocate(m_result); + m_device.deallocate_temp(m_result); m_result = NULL; } return true; @@ -582,7 +582,7 @@ struct TensorEvaluator, const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); if (!data) { if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) { - data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + data = static_cast(m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); m_result = data; } else { @@ -592,7 +592,7 @@ struct TensorEvaluator, Op reducer(m_reducer); if (internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { if (m_result) { - m_device.deallocate(m_result); + m_device.deallocate_temp(m_result); m_result = NULL; } return true; @@ -607,7 +607,7 @@ struct TensorEvaluator, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); if (m_result) { - m_device.deallocate(m_result); + m_device.deallocate_temp(m_result); m_result = NULL; } } From 3a197a60e602ea0cd836438ab717810803dc9074 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 1 Aug 2018 12:19:14 +0100 Subject: [PATCH 20/24] variadic version of assert which can take a parameter pack as its input. --- Eigen/src/Core/util/Macros.h | 24 +++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorMap.h | 2 ++ 2 files changed, 26 insertions(+) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index b15819f7d..3255b8351 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -1076,4 +1076,28 @@ namespace Eigen { # endif #endif +#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +// Provide a variadic version of assert which can take a parameter pack as its input +// The eigen_assert macro used here might have been redefined to use other macros such as EIGEN_THROW, such as used in Eigen's test suite, therefore this needs to be defined after the other macros. +// Note that this does not provide as nice a string to assert as a straight forward call to eigen_assert, so we add a message to the assert. +#if defined(EIGEN_NO_DEBUG) +#define eigen_variadic_assert(x) +#else +namespace Eigen { +namespace internal { +inline void variadic_assert(const char*) {} +template inline void variadic_assert(const char* message, bool first, Bools ... others) { + eigen_assert(first && message); + variadic_assert(message, others...); + EIGEN_UNUSED_VARIABLE(first); +} +} +} +#define EIGEN_VARIADIC_ASSERT_MESSAGE(x) EIGEN_MAKESTRING(x) " in " __FILE__ ":" EIGEN_MAKESTRING(__LINE__) +#define eigen_variadic_assert(x) \ + do { Eigen::internal::variadic_assert(EIGEN_VARIADIC_ASSERT_MESSAGE(x), x); } while(false); +#endif +#endif + + #endif // EIGEN_MACROS_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index a8e55757e..f69f8f24a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -150,6 +150,7 @@ template class MakePoin EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_variadic_assert((Eigen::NumTraits::highest() >= otherIndices)...); if (PlainObjectType::Options&RowMajor) { const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); return m_data[index]; @@ -237,6 +238,7 @@ template class MakePoin EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + eigen_variadic_assert((Eigen::NumTraits::highest() >= otherIndices)...); const std::size_t NumDims = sizeof...(otherIndices) + 2; if (PlainObjectType::Options&RowMajor) { const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); From c84509d7cc5fa3e032da8cfdcd5e82b2897cc5d9 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 1 Aug 2018 12:40:54 +0100 Subject: [PATCH 21/24] Adding new arch/SYCL headers, used for SYCL vectorization. --- Eigen/src/Core/arch/SYCL/InteropHeaders.h | 104 +++++ Eigen/src/Core/arch/SYCL/MathFunctions.h | 221 +++++++++++ Eigen/src/Core/arch/SYCL/PacketMath.h | 458 ++++++++++++++++++++++ Eigen/src/Core/arch/SYCL/TypeCasting.h | 89 +++++ 4 files changed, 872 insertions(+) create mode 100644 Eigen/src/Core/arch/SYCL/InteropHeaders.h create mode 100644 Eigen/src/Core/arch/SYCL/MathFunctions.h create mode 100644 Eigen/src/Core/arch/SYCL/PacketMath.h create mode 100644 Eigen/src/Core/arch/SYCL/TypeCasting.h diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h new file mode 100644 index 000000000..c1da40d14 --- /dev/null +++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h @@ -0,0 +1,104 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * InteropHeaders.h + * + * \brief: + * InteropHeaders + * +*****************************************************************/ + +#ifndef EIGEN_INTEROP_HEADERS_SYCL_H +#define EIGEN_INTEROP_HEADERS_SYCL_H +#if defined EIGEN_USE_SYCL +namespace Eigen { + +namespace internal { +#define SYCL_PACKET_TRAITS(packet_type, val, unpacket_type, lengths)\ + template<> struct packet_traits : default_packet_traits\ + {\ + typedef packet_type type;\ + typedef packet_type half;\ + enum {\ + Vectorizable = 1,\ + AlignedOnScalar = 1,\ + size=lengths,\ + HasHalfPacket = 0,\ + HasDiv = 1,\ + HasLog = 1,\ + HasExp = 1,\ + HasSqrt = 1,\ + HasRsqrt = 1,\ + HasSin = 1,\ + HasCos = 1,\ + HasTan = 1,\ + HasASin = 1,\ + HasACos = 1,\ + HasATan = 1,\ + HasSinh = 1,\ + HasCosh = 1,\ + HasTanh = 1,\ + HasLGamma = 0,\ + HasDiGamma = 0,\ + HasZeta = 0,\ + HasPolygamma = 0,\ + HasErf = 0,\ + HasErfc = 0,\ + HasIGamma = 0,\ + HasIGammac = 0,\ + HasBetaInc = 0,\ + HasBlend = val,\ + HasMax=1,\ + HasMin=1,\ + HasMul=1,\ + HasAdd=1,\ + HasFloor=1,\ + HasRound=1,\ + HasLog1p=1,\ + HasExpm1=1,\ + HasCeil=1,\ + };\ + }; + +SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4) +SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4) +SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2) +SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2) +#undef SYCL_PACKET_TRAITS + + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) +#define SYCL_ARITHMETIC(packet_type) template<> struct is_arithmetic { enum { value = true }; }; +SYCL_ARITHMETIC(cl::sycl::cl_float4) +SYCL_ARITHMETIC(cl::sycl::cl_double2) +#undef SYCL_ARITHMETIC + +#define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)\ +template<> struct unpacket_traits {\ + typedef unpacket_type type;\ + enum {size=lengths, alignment=Aligned16};\ + typedef packet_type half;\ +}; +SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4) +SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2) + +#undef SYCL_UNPACKET_TRAITS + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_USE_SYCL +#endif // EIGEN_INTEROP_HEADERS_SYCL_H diff --git a/Eigen/src/Core/arch/SYCL/MathFunctions.h b/Eigen/src/Core/arch/SYCL/MathFunctions.h new file mode 100644 index 000000000..422839c6c --- /dev/null +++ b/Eigen/src/Core/arch/SYCL/MathFunctions.h @@ -0,0 +1,221 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * MathFunctions.h + * + * \brief: + * MathFunctions + * +*****************************************************************/ + +#ifndef EIGEN_MATH_FUNCTIONS_SYCL_H +#define EIGEN_MATH_FUNCTIONS_SYCL_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) +//#if defined(__SYCL_DEVICE_ONLY__) && defined(EIGEN_USE_SYCL) +#define SYCL_PLOG(packet_type) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type plog(const packet_type& a) { return cl::sycl::log(a); } + +SYCL_PLOG(cl::sycl::cl_float4) +SYCL_PLOG(cl::sycl::cl_double2) +#undef SYCL_PLOG + +#define SYCL_PLOG1P(packet_type) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type plog1p(const packet_type& a) { return cl::sycl::log1p(a); } + +SYCL_PLOG1P(cl::sycl::cl_float4) +SYCL_PLOG1P(cl::sycl::cl_double2) +#undef SYCL_PLOG1P + +#define SYCL_PLOG10(packet_type) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type plog10(const packet_type& a) { return cl::sycl::log10(a); } + +SYCL_PLOG10(cl::sycl::cl_float4) +SYCL_PLOG10(cl::sycl::cl_double2) +#undef SYCL_PLOG10 + +#define SYCL_PEXP(packet_type) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type pexp(const packet_type& a) { return cl::sycl::exp(a); } + +SYCL_PEXP(cl::sycl::cl_float4) +SYCL_PEXP(cl::sycl::cl_double2) +#undef SYCL_PEXP + +#define SYCL_PEXPM1(packet_type) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type pexpm1(const packet_type& a) { return cl::sycl::expm1(a); } + +SYCL_PEXPM1(cl::sycl::cl_float4) +SYCL_PEXPM1(cl::sycl::cl_double2) +#undef SYCL_PEXPM1 + +#define SYCL_PSQRT(packet_type) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type psqrt(const packet_type& a) { return cl::sycl::sqrt(a); } + +SYCL_PSQRT(cl::sycl::cl_float4) +SYCL_PSQRT(cl::sycl::cl_double2) +#undef SYCL_PSQRT + + +#define SYCL_PRSQRT(packet_type) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type prsqrt(const packet_type& a) { return cl::sycl::rsqrt(a); } + +SYCL_PRSQRT(cl::sycl::cl_float4) +SYCL_PRSQRT(cl::sycl::cl_double2) +#undef SYCL_PRSQRT + + +/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */ +#define SYCL_PSIN(packet_type) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type psin(const packet_type& a) { return cl::sycl::sin(a); } + +SYCL_PSIN(cl::sycl::cl_float4) +SYCL_PSIN(cl::sycl::cl_double2) +#undef SYCL_PSIN + + +/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */ +#define SYCL_PCOS(packet_type) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type pcos(const packet_type& a) { return cl::sycl::cos(a); } + +SYCL_PCOS(cl::sycl::cl_float4) +SYCL_PCOS(cl::sycl::cl_double2) +#undef SYCL_PCOS + +/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */ +#define SYCL_PTAN(packet_type) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type ptan(const packet_type& a) { return cl::sycl::tan(a); } + +SYCL_PTAN(cl::sycl::cl_float4) +SYCL_PTAN(cl::sycl::cl_double2) +#undef SYCL_PTAN + +/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */ +#define SYCL_PASIN(packet_type) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type pasin(const packet_type& a) { return cl::sycl::asin(a); } + +SYCL_PASIN(cl::sycl::cl_float4) +SYCL_PASIN(cl::sycl::cl_double2) +#undef SYCL_PASIN + + +/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */ +#define SYCL_PACOS(packet_type) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type pacos(const packet_type& a) { return cl::sycl::acos(a); } + +SYCL_PACOS(cl::sycl::cl_float4) +SYCL_PACOS(cl::sycl::cl_double2) +#undef SYCL_PACOS + +/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */ +#define SYCL_PATAN(packet_type) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type patan(const packet_type& a) { return cl::sycl::atan(a); } + +SYCL_PATAN(cl::sycl::cl_float4) +SYCL_PATAN(cl::sycl::cl_double2) +#undef SYCL_PATAN + +/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */ +#define SYCL_PSINH(packet_type) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type psinh(const packet_type& a) { return cl::sycl::sinh(a); } + +SYCL_PSINH(cl::sycl::cl_float4) +SYCL_PSINH(cl::sycl::cl_double2) +#undef SYCL_PSINH + +/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */ +#define SYCL_PCOSH(packet_type) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type pcosh(const packet_type& a) { return cl::sycl::cosh(a); } + +SYCL_PCOSH(cl::sycl::cl_float4) +SYCL_PCOSH(cl::sycl::cl_double2) +#undef SYCL_PCOSH + +/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */ +#define SYCL_PTANH(packet_type) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type ptanh(const packet_type& a) { return cl::sycl::tanh(a); } + +SYCL_PTANH(cl::sycl::cl_float4) +SYCL_PTANH(cl::sycl::cl_double2) +#undef SYCL_PTANH + +#define SYCL_PCEIL(packet_type) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type pceil(const packet_type& a) { return cl::sycl::ceil(a); } + +SYCL_PCEIL(cl::sycl::cl_float4) +SYCL_PCEIL(cl::sycl::cl_double2) +#undef SYCL_PCEIL + + +#define SYCL_PROUND(packet_type) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type pround(const packet_type& a) { return cl::sycl::round(a); } + +SYCL_PROUND(cl::sycl::cl_float4) +SYCL_PROUND(cl::sycl::cl_double2) +#undef SYCL_PROUND + +#define SYCL_FLOOR(packet_type) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type pfloor(const packet_type& a) { return cl::sycl::floor(a); } + +SYCL_FLOOR(cl::sycl::cl_float4) +SYCL_FLOOR(cl::sycl::cl_double2) +#undef SYCL_FLOOR + + +#define SYCL_PMIN(packet_type, expr) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type pmin(const packet_type& a, const packet_type& b) { return expr; } + +SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b)) +SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b)) +#undef SYCL_PMIN + +#define SYCL_PMAX(packet_type, expr) \ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +packet_type pmax(const packet_type& a, const packet_type& b) { return expr; } + +SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b)) +SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b)) +#undef SYCL_PMAX + +//#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_CUDA_H diff --git a/Eigen/src/Core/arch/SYCL/PacketMath.h b/Eigen/src/Core/arch/SYCL/PacketMath.h new file mode 100644 index 000000000..820a83311 --- /dev/null +++ b/Eigen/src/Core/arch/SYCL/PacketMath.h @@ -0,0 +1,458 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * PacketMath.h + * + * \brief: + * PacketMath + * +*****************************************************************/ + +#ifndef EIGEN_PACKET_MATH_SYCL_H +#define EIGEN_PACKET_MATH_SYCL_H +#include +#if defined EIGEN_USE_SYCL +namespace Eigen { + +namespace internal { + +#define SYCL_PLOADT_RO(address_space_target)\ +template\ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\ + ploadt_ro(typename cl::sycl::multi_ptr::type,\ + cl::sycl::access::address_space::address_space_target>::pointer_t from) {\ + typedef typename unpacket_traits::type scalar;\ + typedef cl::sycl::multi_ptr multi_ptr;\ + auto res=packet_type(static_cast::type>(0));\ + res.load(0, multi_ptr(const_cast(from)));\ + return res;\ +} + +SYCL_PLOADT_RO(global_space) +SYCL_PLOADT_RO(local_space) + +#undef SYCL_PLOADT_RO + + +#define SYCL_PLOAD(address_space_target, Alignment, AlignedType)\ +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\ + pload##AlignedType(typename cl::sycl::multi_ptr::type,\ + cl::sycl::access::address_space::address_space_target>::pointer_t from) {\ + return ploadt_ro(from);\ + } + +// global space +SYCL_PLOAD(global_space, Unaligned, u) +SYCL_PLOAD(global_space, Aligned, ) + +// local space +SYCL_PLOAD(local_space, Unaligned, u) +SYCL_PLOAD(local_space, Aligned, ) + +// private space +//SYCL_PLOAD(private_space, Unaligned, u) +//SYCL_PLOAD(private_space, Aligned, ) + +#undef SYCL_PLOAD + + +/** \internal \returns a packet version of \a *from. + * The pointer \a from must be aligned on a \a Alignment bytes boundary. */ +#define SYCL_PLOADT(address_space_target)\ +template\ +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt(\ + typename cl::sycl::multi_ptr::type,\ + cl::sycl::access::address_space::address_space_target>::pointer_t from)\ +{\ + if(Alignment >= unpacket_traits::alignment)\ + return pload(from);\ + else\ + return ploadu(from);\ +} + +// global space +SYCL_PLOADT(global_space) +// local space +SYCL_PLOADT(local_space) + +//private_space +// There is no need to specialise it for private space as it can use the GenericPacketMath version + +#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment)\ + template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\ + ploadt_ro(const typename unpacket_traits::type * from) { \ + typedef typename unpacket_traits::type scalar;\ + auto res=packet_type(static_cast(0));\ + res. template load(0, const_cast(from));\ + return res;\ + } + +SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned) +SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned) +SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned) +SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned) + + +#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type)\ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\ + pload##alignment_type(const typename unpacket_traits::type * from) { \ + typedef typename unpacket_traits::type scalar;\ + auto res=packet_type(static_cast(0));\ + res. template load(0, const_cast(from));\ + return res;\ + } +SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4,) +SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2,) +SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u) +SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u) + +#undef SYCL_PLOAD_SPECIAL + +#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment)\ +template<>\ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \ + typename cl::sycl::multi_ptr::pointer_t to, \ + const packet_type& from) {\ + typedef cl::sycl::multi_ptr multi_ptr;\ + from.store(0, multi_ptr(to));\ +} + +// global space +SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, ) +SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u) +SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, ) +SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u) + +SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, ) +SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u) +SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, ) +SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u) + +SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, ) +SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u) +SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, ) +SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u) + + +#define SYCL_PSTORE_T(scalar, packet_type, Alignment)\ +template<>\ +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(\ + scalar* to,\ + const packet_type& from) {\ + if(Alignment)\ + pstore(to, from);\ + else\ + pstoreu(to,from);\ +} + + +SYCL_PSTORE_T(float, cl::sycl::cl_float4, Aligned) + +SYCL_PSTORE_T(float, cl::sycl::cl_float4, Unaligned) + +SYCL_PSTORE_T(double, cl::sycl::cl_double2, Aligned) + +SYCL_PSTORE_T(double, cl::sycl::cl_double2, Unaligned) + + +#undef SYCL_PSTORE_T + +#define SYCL_PSET1(packet_type)\ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1(\ + const typename unpacket_traits::type& from) {\ + return packet_type(from);\ +} + +// global space +SYCL_PSET1(cl::sycl::cl_float4) +SYCL_PSET1(cl::sycl::cl_double2) + +#undef SYCL_PSET1 + + +template struct get_base_packet { +template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_ploaddup(sycl_multi_pointer ) {} + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_pgather(sycl_multi_pointer , Index ) {} +}; + +template <> struct get_base_packet { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup(sycl_multi_pointer from) { + return cl::sycl::cl_float4(from[0], from[0], from[1], from[1]); + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_pgather(sycl_multi_pointer from, Index stride) { + return cl::sycl::cl_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to , const cl::sycl::cl_float4& from, Index stride) { + auto tmp = stride; + to[0] = from.x(); + to[tmp] = from.y(); + to[tmp += stride] = from.z(); + to[tmp += stride] = from.w(); + } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 set_plset(const float& a) { + return cl::sycl::cl_float4(static_cast(a), static_cast(a+1), static_cast(a+2), static_cast(a+3)); + } +}; + +template <> struct get_base_packet { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_ploaddup(const sycl_multi_pointer from) { + return cl::sycl::cl_double2(from[0], from[0]); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_pgather(const sycl_multi_pointer from, Index stride) { + return cl::sycl::cl_double2(from[0*stride], from[1*stride]); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to , const cl::sycl::cl_double2& from, Index stride) { + to[0] = from.x(); + to[stride] = from.y(); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 set_plset(const double& a) { + return cl::sycl::cl_double2(static_cast(a), static_cast(a + 1)); + } +}; + +#define SYCL_PLOAD_DUP(address_space_target)\ +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \ +ploaddup(typename cl::sycl::multi_ptr::type,\ + cl::sycl::access::address_space::address_space_target>::pointer_t from)\ +{\ + return get_base_packet::get_ploaddup(from); \ +} + +// global space +SYCL_PLOAD_DUP(global_space) +// local_space +SYCL_PLOAD_DUP(local_space) +// private_space +//SYCL_PLOAD_DUP(private_space) +#undef SYCL_PLOAD_DUP + +#define SYCL_PLOAD_DUP_SPECILIZE(packet_type)\ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \ +ploaddup(const typename unpacket_traits::type * from)\ +{ \ + return get_base_packet::get_ploaddup(from); \ +} + +SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4) +SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2) + +#undef SYCL_PLOAD_DUP_SPECILIZE + +#define SYCL_PLSET(packet_type)\ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type plset(const typename unpacket_traits::type& a) {\ + return get_base_packet::set_plset(a);\ +} + +SYCL_PLSET(cl::sycl::cl_float4) +SYCL_PLSET(cl::sycl::cl_double2) + +#undef SYCL_PLSET + + +#define SYCL_PGATHER(address_space_target)\ +template EIGEN_DEVICE_FUNC inline packet_type pgather(\ + typename cl::sycl::multi_ptr::type,\ + cl::sycl::access::address_space::address_space_target>::pointer_t from, Index stride) {\ + return get_base_packet::get_pgather(from, stride); \ +} + +// global space +SYCL_PGATHER(global_space) +// local space +SYCL_PGATHER(local_space) +// private space +//SYCL_PGATHER(private_space) + +#undef SYCL_PGATHER + + +#define SYCL_PGATHER_SPECILIZE(scalar, packet_type)\ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \ +pgather(const typename unpacket_traits::type * from, Index stride)\ +{ \ + return get_base_packet::get_pgather(from, stride); \ +} + +SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4) +SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2) + +#undef SYCL_PGATHER_SPECILIZE + +#define SYCL_PSCATTER(address_space_target)\ +template EIGEN_DEVICE_FUNC inline void pscatter(\ + typename cl::sycl::multi_ptr::type,\ + cl::sycl::access::address_space::address_space_target>::pointer_t to,\ + const packet_type& from, Index stride) {\ + get_base_packet::set_pscatter(to, from, stride);\ +} + +// global space +SYCL_PSCATTER(global_space) +// local space +SYCL_PSCATTER(local_space) +// private space +//SYCL_PSCATTER(private_space) + +#undef SYCL_PSCATTER + + + +#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type)\ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void \ +pscatter(typename unpacket_traits::type * to, const packet_type& from, Index stride)\ +{ \ + get_base_packet::set_pscatter(to, from, stride);\ +} + +SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4) +SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2) + +#undef SYCL_PSCATTER_SPECILIZE + + +#define SYCL_PMAD(packet_type)\ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pmadd( const packet_type& a,\ + const packet_type& b, const packet_type& c){\ + return cl::sycl::mad(a,b,c);\ +} + +SYCL_PMAD(cl::sycl::cl_float4) +SYCL_PMAD(cl::sycl::cl_double2) +#undef SYCL_PMAD + + + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst(const cl::sycl::cl_float4& a) { + return a.x(); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst(const cl::sycl::cl_double2& a) { + return a.x(); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux(const cl::sycl::cl_float4& a) { + return a.x() + a.y() + a.z() + a.w(); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux(const cl::sycl::cl_double2& a) { + return a.x() + a.y(); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max(const cl::sycl::cl_float4& a) { + return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()), cl::sycl::fmax(a.z(), a.w())); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max(const cl::sycl::cl_double2& a) { + return cl::sycl::fmax(a.x(), a.y()); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min(const cl::sycl::cl_float4& a) { + return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()), cl::sycl::fmin(a.z(), a.w())); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min(const cl::sycl::cl_double2& a) { + return cl::sycl::fmin(a.x(), a.y()); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul(const cl::sycl::cl_float4& a) { + return a.x() * a.y() * a.z() * a.w(); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul(const cl::sycl::cl_double2& a) { + return a.x() * a.y(); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pabs(const cl::sycl::cl_float4& a) { + return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()), cl::sycl::fabs(a.z()), cl::sycl::fabs(a.w())); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 pabs(const cl::sycl::cl_double2& a) { + return cl::sycl::cl_double2(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y())); +} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void +ptranspose(PacketBlock& kernel) { + float tmp = kernel.packet[0].y(); + kernel.packet[0].y() = kernel.packet[1].x(); + kernel.packet[1].x() = tmp; +// std::swap(kernel.packet[0].y(), kernel.packet[1].x()); + + tmp = kernel.packet[0].z(); + kernel.packet[0].z() = kernel.packet[2].x(); + kernel.packet[2].x() = tmp; + //std::swap(kernel.packet[0].z(), kernel.packet[2].x()); + + tmp = kernel.packet[0].w(); + kernel.packet[0].w() = kernel.packet[3].x(); + kernel.packet[3].x() = tmp; + + //std::swap(kernel.packet[0].w(), kernel.packet[3].x()); + + tmp = kernel.packet[1].z(); + kernel.packet[1].z() = kernel.packet[2].y(); + kernel.packet[2].y() = tmp; +// std::swap(kernel.packet[1].z(), kernel.packet[2].y()); + + tmp = kernel.packet[1].w(); + kernel.packet[1].w() = kernel.packet[3].y(); + kernel.packet[3].y() = tmp; +// std::swap(kernel.packet[1].w(), kernel.packet[3].y()); + + tmp = kernel.packet[2].w(); + kernel.packet[2].w() = kernel.packet[3].z(); + kernel.packet[3].z() = tmp; +// std::swap(kernel.packet[2].w(), kernel.packet[3].z()); + +} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void +ptranspose(PacketBlock& kernel) { + double tmp = kernel.packet[0].y(); + kernel.packet[0].y() = kernel.packet[1].x(); + kernel.packet[1].x() = tmp; +//std::swap(kernel.packet[0].y(), kernel.packet[1].x()); +} + + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 +pblend(const Selector::size>& ifPacket, + const cl::sycl::cl_float4& thenPacket, const cl::sycl::cl_float4& elsePacket) { + cl::sycl::cl_int4 condition(ifPacket.select[0] ? 0 : -1, + ifPacket.select[1] ? 0 : -1, + ifPacket.select[2] ? 0 : -1, + ifPacket.select[3] ? 0 : -1); + return cl::sycl::select(thenPacket, elsePacket, condition); +} + +template<> inline cl::sycl::cl_double2 +pblend(const Selector::size>& ifPacket, + const cl::sycl::cl_double2& thenPacket, const cl::sycl::cl_double2& elsePacket) { + cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1, + ifPacket.select[1] ? 0 : -1); + return cl::sycl::select(thenPacket, elsePacket, condition); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_USE_SYCL +#endif // EIGEN_PACKET_MATH_SYCL_H diff --git a/Eigen/src/Core/arch/SYCL/TypeCasting.h b/Eigen/src/Core/arch/SYCL/TypeCasting.h new file mode 100644 index 000000000..dedd5c84a --- /dev/null +++ b/Eigen/src/Core/arch/SYCL/TypeCasting.h @@ -0,0 +1,89 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TypeCasting.h + * + * \brief: + * TypeCasting + * +*****************************************************************/ + +#ifndef EIGEN_TYPE_CASTING_SYCL_H +#define EIGEN_TYPE_CASTING_SYCL_H + +namespace Eigen { + +namespace internal { +#ifdef __SYCL_DEVICE_ONLY__ +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_int4 pcast(const cl::sycl::cl_float4& a) { + return a. template convert(); +} + + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pcast(const cl::sycl::cl_int4& a) { + return a. template convert(); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 2, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pcast(const cl::sycl::cl_double2& a, const cl::sycl::cl_double2& b) { + auto a1=a. template convert(); + auto b1=b. template convert(); + return cl::sycl::float4(a1.x(), a1.y(), b1.x(), b1.y()); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 2 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 pcast(const cl::sycl::cl_float4& a) { + // Simply discard the second half of the input + return cl::sycl::cl_double2(a.x(), a.y()); +} + +#endif +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_SYCL_H From b512a9536f4b6260fd7af1d39f337eea8c6932cb Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 1 Aug 2018 13:39:13 +0100 Subject: [PATCH 22/24] Enabling per device specialisation of packetsize. --- .../Eigen/CXX11/src/Tensor/TensorAssign.h | 2 +- .../CXX11/src/Tensor/TensorBroadcasting.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 4 ++-- .../CXX11/src/Tensor/TensorConcatenation.h | 4 ++-- .../CXX11/src/Tensor/TensorContraction.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorConversion.h | 2 +- .../CXX11/src/Tensor/TensorConvolution.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 8 +++---- .../Eigen/CXX11/src/Tensor/TensorEvalTo.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 24 ++++++++++--------- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 4 ++-- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 4 ++-- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorInflation.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 4 ++-- .../Eigen/CXX11/src/Tensor/TensorPadding.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorPatch.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorReverse.h | 4 ++-- .../Eigen/CXX11/src/Tensor/TensorScan.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 8 +++---- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 4 ++-- .../CXX11/src/Tensor/TensorVolumePatch.h | 2 +- 23 files changed, 48 insertions(+), 46 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 199ddb123..f1f877c16 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -98,7 +98,7 @@ struct TensorEvaluator, Device> typedef typename PacketType::type PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; static const int NumDims = XprType::NumDims; enum { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 8fecbe657..b4a77b022 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -104,7 +104,7 @@ struct TensorEvaluator, Device> typedef typename TensorEvaluator::Dimensions InputDimensions; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; bool isCopy= false, nByOne = false, oneByN = false; enum { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 085c05f3d..3ab0a0f49 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -138,7 +138,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { @@ -339,7 +339,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { IsAligned = false, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 9f0321880..27c92d8f6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -251,7 +251,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; + const int packetSize = PacketType::size; EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); @@ -354,7 +354,7 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - const int packetSize = internal::unpacket_traits::size; + const int packetSize = PacketType::size; EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 86602c27e..e1649fb47 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -239,7 +239,7 @@ struct TensorContractionEvaluatorBase enum { IsAligned = true, - PacketAccess = (internal::unpacket_traits::size > 1), + PacketAccess = (PacketType::size > 1), BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index e0cbbb315..a7751eee1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -190,7 +190,7 @@ struct TensorEvaluator, Device> typedef typename internal::remove_all::Scalar>::type SrcType; typedef typename PacketType::type PacketReturnType; typedef typename PacketType::type PacketSourceType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { IsAligned = false, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 1ec5819a7..0d3ca966c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -302,7 +302,7 @@ struct TensorEvaluator::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index 0e4db46de..571922073 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -87,11 +87,11 @@ struct TensorEvaluator, Devi typedef typename internal::remove_const::type Scalar; typedef typename internal::remove_const::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { IsAligned = false, - PacketAccess = (internal::packet_traits::size > 1), + PacketAccess = (PacketType::size > 1), BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -249,11 +249,11 @@ struct TensorEvaluator::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { IsAligned = false, - PacketAccess = (internal::packet_traits::size > 1), + PacketAccess = (PacketType::size > 1), BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index af39daa91..256d499f2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -102,7 +102,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef typename internal::remove_const::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { IsAligned = TensorEvaluator::IsAligned, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index f9a1bd68c..8f7a81575 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -33,6 +33,7 @@ struct TensorEvaluator typedef typename PacketType::type PacketReturnType; typedef typename Derived::Dimensions Dimensions; typedef Derived XprType; + static const int PacketSize = PacketType::size; // NumDimensions is -1 for variable dim tensors static const int NumCoords = internal::traits::NumDimensions > 0 ? @@ -40,7 +41,7 @@ struct TensorEvaluator enum { IsAligned = Derived::IsAligned, - PacketAccess = (internal::unpacket_traits::size > 1), + PacketAccess = (PacketType::size > 1), BlockAccess = internal::is_arithmetic::type>::value, Layout = Derived::Layout, CoordAccess = NumCoords > 0, @@ -121,7 +122,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits::size); + PacketType::size); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( @@ -188,10 +189,11 @@ struct TensorEvaluator // NumDimensions is -1 for variable dim tensors static const int NumCoords = internal::traits::NumDimensions > 0 ? internal::traits::NumDimensions : 0; + static const int PacketSize = PacketType::size; enum { IsAligned = Derived::IsAligned, - PacketAccess = (internal::unpacket_traits::size > 1), + PacketAccess = (PacketType::size > 1), BlockAccess = internal::is_arithmetic::type>::value, Layout = Derived::Layout, CoordAccess = NumCoords > 0, @@ -249,7 +251,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits::size); + PacketType::size); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void getResourceRequirements( @@ -300,7 +302,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -322,7 +324,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits::size); + PacketType::size); } EIGEN_DEVICE_FUNC typename Eigen::internal::traits::PointerType data() const { return NULL; } @@ -367,7 +369,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } @@ -445,7 +447,7 @@ struct TensorEvaluator::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; static const int NumDims = internal::array_size< @@ -574,7 +576,7 @@ struct TensorEvaluator::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -644,7 +646,7 @@ struct TensorEvaluator enum { IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & - internal::packet_traits::HasBlend, + PacketType::HasBlend, BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -665,7 +667,7 @@ struct TensorEvaluator typedef typename XprType::Index Index; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; typedef typename TensorEvaluator::Dimensions Dimensions; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index fdb31928f..5a6555cde 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -93,11 +93,11 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { IsAligned = true, - PacketAccess = (PacketSize > 1), + PacketAccess = (PacketType::size > 1), BlockAccess = false, Layout = TensorEvaluator::Layout, RawAccess = true diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index fa269b8c6..97c8d4a02 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -90,7 +90,7 @@ struct TensorEvaluator, Device> typedef typename PacketType::type PacketReturnType; enum { IsAligned = false, - PacketAccess = (internal::unpacket_traits::size > 1), + PacketAccess = (PacketType::size > 1), BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -137,7 +137,7 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; + const int packetSize = PacketType::size; EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < dimensions().TotalSize()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 72cb2d15f..00e1186e5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -181,7 +181,7 @@ struct TensorEvaluator, Device> typedef TensorEvaluator Impl; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { IsAligned = false, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index 6147fbdf1..64f2ad81f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -85,7 +85,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 498488649..9a6431f29 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -423,7 +423,7 @@ struct TensorEvaluator, Devi template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = internal::unpacket_traits::size; + const int packetSize = PacketType::size; EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < internal::array_prod(dimensions())); @@ -584,7 +584,7 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - const int packetSize = internal::unpacket_traits::size; + const int packetSize = PacketType::size; Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; if (static_cast(Layout) == static_cast(ColMajor)) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index ffa22f31e..aa1db3c73 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -91,7 +91,7 @@ struct TensorEvaluator, Device typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { IsAligned = true, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 950ac32af..a0a1ad8f4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -88,7 +88,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 375fc0802..bc09d3699 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -407,7 +407,7 @@ struct TensorEvaluator, static const bool InputPacketAccess = TensorEvaluator::PacketAccess; typedef typename internal::remove_const::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { IsAligned = false, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index bb2768ab1..9193bdd8e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -108,7 +108,7 @@ struct TensorEvaluator, Device typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { IsAligned = false, @@ -266,7 +266,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return this->m_dimensions; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index 39717efaa..b1135f297 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -95,7 +95,7 @@ struct TensorEvaluator, Device> { enum { IsAligned = false, - PacketAccess = (internal::unpacket_traits::size > 1), + PacketAccess = (PacketType::size > 1), BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 6b54f40ad..0fc49255d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -107,11 +107,11 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { IsAligned = false, - PacketAccess = (internal::packet_traits::size > 1), + PacketAccess = (PacketType::size > 1), BlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented @@ -236,11 +236,11 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { IsAligned = false, - PacketAccess = (internal::packet_traits::size > 1), + PacketAccess = (PacketType::size > 1), BlockAccess = false, RawAccess = false }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index c09513c10..4b69072f2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -107,7 +107,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { IsAligned = /*TensorEvaluator::IsAligned*/false, @@ -287,7 +287,7 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index ef199bfb6..3c7d8bbc0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -194,7 +194,7 @@ struct TensorEvaluator, D typedef typename internal::remove_const::type Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; + static const int PacketSize = PacketType::size; enum { IsAligned = false, From af96018b499be64ff0b262cafc7b31f1a907b4c8 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 1 Aug 2018 16:04:44 +0100 Subject: [PATCH 23/24] Using the suggested modification. --- Eigen/src/Core/util/Macros.h | 21 +++++-------------- .../Eigen/CXX11/src/Tensor/TensorMap.h | 4 ++-- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 3255b8351..f59b93608 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -1077,26 +1077,15 @@ namespace Eigen { #endif #ifdef EIGEN_HAS_VARIADIC_TEMPLATES -// Provide a variadic version of assert which can take a parameter pack as its input -// The eigen_assert macro used here might have been redefined to use other macros such as EIGEN_THROW, such as used in Eigen's test suite, therefore this needs to be defined after the other macros. -// Note that this does not provide as nice a string to assert as a straight forward call to eigen_assert, so we add a message to the assert. -#if defined(EIGEN_NO_DEBUG) -#define eigen_variadic_assert(x) -#else +// The all function is used to enable a variadic version of eigen_assert which can take a parameter pack as its input. namespace Eigen { namespace internal { -inline void variadic_assert(const char*) {} -template inline void variadic_assert(const char* message, bool first, Bools ... others) { - eigen_assert(first && message); - variadic_assert(message, others...); - EIGEN_UNUSED_VARIABLE(first); +bool all(){ return true; } +template +bool all(T t, Ts ... ts){ return t && all(ts...); } + } } -} -#define EIGEN_VARIADIC_ASSERT_MESSAGE(x) EIGEN_MAKESTRING(x) " in " __FILE__ ":" EIGEN_MAKESTRING(__LINE__) -#define eigen_variadic_assert(x) \ - do { Eigen::internal::variadic_assert(EIGEN_VARIADIC_ASSERT_MESSAGE(x), x); } while(false); -#endif #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index f69f8f24a..d1cc0593f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -150,7 +150,7 @@ template class MakePoin EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_variadic_assert((Eigen::NumTraits::highest() >= otherIndices)...); + eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); if (PlainObjectType::Options&RowMajor) { const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); return m_data[index]; @@ -238,7 +238,7 @@ template class MakePoin EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - eigen_variadic_assert((Eigen::NumTraits::highest() >= otherIndices)...); + eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); const std::size_t NumDims = sizeof...(otherIndices) + 2; if (PlainObjectType::Options&RowMajor) { const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); From c6a5c70712851cd696d7410579506fc299c04a05 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 1 Aug 2018 16:56:26 +0100 Subject: [PATCH 24/24] Correcting the position of allocate_temp/deallocate_temp in TensorDeviceGpu.h --- .../Eigen/CXX11/src/Tensor/TensorDeviceGpu.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h index 48bbd5e45..b490433db 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h @@ -159,13 +159,6 @@ class GpuStreamDevice : public StreamInterface { err = gpuFree(buffer); gpu_assert(err == gpuSuccess); } - EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { - return stream_->allocate(num_bytes); - } - - EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { - stream_->deallocate(buffer); - } virtual void* scratchpad() const { if (scratch_ == NULL) { @@ -214,6 +207,15 @@ struct GpuDevice { stream_->deallocate(buffer); } + EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { + return stream_->allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { + stream_->deallocate(buffer); + } + + EIGEN_STRONG_INLINE void* scratchpad() const { return stream_->scratchpad(); }
Example:Output: