From a1f1e1e51df316d1b37733770f5e7ab17006113a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 6 Apr 2015 10:41:39 -0700 Subject: [PATCH 1/7] Fixed the order of 2 #includes --- unsupported/Eigen/CXX11/Tensor | 2 +- unsupported/test/cxx11_tensor_index_list.cpp | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 200bcf966..ae6c3fe7e 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -49,8 +49,8 @@ #include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h" -#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h" diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp index c4d4f244f..4ce5add32 100644 --- a/unsupported/test/cxx11_tensor_index_list.cpp +++ b/unsupported/test/cxx11_tensor_index_list.cpp @@ -255,6 +255,17 @@ static void test_mixed_index_list() VERIFY_IS_APPROX(result3(0), expected); } + +static void test_dim_check() +{ + Eigen::IndexList, int> dim1; + dim1.set(1, 2); + Eigen::IndexList, int> dim2; + dim2.set(1, 2); + VERIFY(dimensions_match(dim1, dim2)); +} + + #endif void test_cxx11_tensor_index_list() @@ -264,5 +275,6 @@ void test_cxx11_tensor_index_list() CALL_SUBTEST(test_type2index_list()); CALL_SUBTEST(test_dynamic_index_list()); CALL_SUBTEST(test_mixed_index_list()); + CALL_SUBTEST(test_dim_check()); #endif } From 1de49ef4c2d96acc1c96628fa52e2330cf54dc19 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 7 Apr 2015 10:44:13 -0700 Subject: [PATCH 2/7] Fixed a bug when chipping tensors laid out in row major order. --- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 6 ++- unsupported/test/cxx11_tensor_chipping.cpp | 52 ++++++++++++++----- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index dc9586cbc..3b99ef069 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -157,6 +157,8 @@ struct TensorEvaluator, Device> eigen_assert(NumInputDims > m_dim.actualDim()); const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + eigen_assert(op.offset() < input_dims[m_dim.actualDim()]); + int j = 0; for (int i = 0; i < NumInputDims; ++i) { if (i != m_dim.actualDim()) { @@ -246,7 +248,9 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { Scalar* result = m_impl.data(); - if (m_dim.actualDim() == NumDims && result) { + if (((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumDims) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) && + result) { return result + m_inputOffset; } else { return NULL; diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp index d83417872..bfc2bad18 100644 --- a/unsupported/test/cxx11_tensor_chipping.cpp +++ b/unsupported/test/cxx11_tensor_chipping.cpp @@ -340,11 +340,9 @@ static void test_chip_as_lvalue() } } - -template -static void test_chip_raw_data() +static void test_chip_raw_data_col_major() { - Tensor tensor(2,3,5,7,11); + Tensor tensor(2,3,5,7,11); tensor.setRandom(); typedef TensorEvaluator(3)), DefaultDevice> Evaluator4; @@ -353,12 +351,7 @@ static void test_chip_raw_data() for (int j = 0; j < 3; ++j) { for (int k = 0; k < 5; ++k) { for (int l = 0; l < 7; ++l) { - int chip_index; - if (DataLayout == ColMajor) { - chip_index = i + 2 * (j + 3 * (k + 5 * l)); - } else { - chip_index = 11 * (l + 7 * (k + 5 * (j + 3 * i))); - } + int chip_index = i + 2 * (j + 3 * (k + 5 * l)); VERIFY_IS_EQUAL(chip.data()[chip_index], tensor(i,j,k,l,3)); } } @@ -382,6 +375,41 @@ static void test_chip_raw_data() VERIFY_IS_EQUAL(chip3.data(), static_cast(0)); } +static void test_chip_raw_data_row_major() +{ + Tensor tensor(11,7,5,3,2); + tensor.setRandom(); + + typedef TensorEvaluator(3)), DefaultDevice> Evaluator0; + auto chip = Evaluator0(tensor.template chip<0>(3), DefaultDevice()); + for (int i = 0; i < 7; ++i) { + for (int j = 0; j < 5; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 2; ++l) { + int chip_index = l + 2 * (k + 3 * (j + 5 * i)); + VERIFY_IS_EQUAL(chip.data()[chip_index], tensor(3,i,j,k,l)); + } + } + } + } + + typedef TensorEvaluator(0)), DefaultDevice> Evaluator1; + auto chip1 = Evaluator1(tensor.template chip<1>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip1.data(), static_cast(0)); + + typedef TensorEvaluator(0)), DefaultDevice> Evaluator2; + auto chip2 = Evaluator2(tensor.template chip<2>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip2.data(), static_cast(0)); + + typedef TensorEvaluator(0)), DefaultDevice> Evaluator3; + auto chip3 = Evaluator3(tensor.template chip<3>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip3.data(), static_cast(0)); + + typedef TensorEvaluator(0)), DefaultDevice> Evaluator4; + auto chip4 = Evaluator4(tensor.template chip<4>(0), DefaultDevice()); + VERIFY_IS_EQUAL(chip4.data(), static_cast(0)); +} + void test_cxx11_tensor_chipping() { CALL_SUBTEST(test_simple_chip()); @@ -392,6 +420,6 @@ void test_cxx11_tensor_chipping() CALL_SUBTEST(test_chip_in_expr()); CALL_SUBTEST(test_chip_as_lvalue()); CALL_SUBTEST(test_chip_as_lvalue()); - CALL_SUBTEST(test_chip_raw_data()); - CALL_SUBTEST(test_chip_raw_data()); + CALL_SUBTEST(test_chip_raw_data_col_major()); + CALL_SUBTEST(test_chip_raw_data_row_major()); } From 0e9753c8dfffd8e20a445d3f150b8180c0815419 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 7 Apr 2015 14:03:21 -0400 Subject: [PATCH 3/7] Fix compiler flags on Android/ARM: - generate position-independent code (PIE), a requirement to run binaries on Android 5.0+ devices; - correctly handle EIGEN_TEST_FMA + EIGEN_TEST_NEON to pass -mfpu=neon-vfpv4. --- CMakeLists.txt | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c1ae428e..a28ad07d8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -168,6 +168,11 @@ if(NOT MSVC) else() ei_add_cxx_compiler_flag("-ansi") endif() + + if(ANDROID_NDK) + ei_add_cxx_compiler_flag("-pie") + ei_add_cxx_compiler_flag("-fPIE") + endif() set(CMAKE_REQUIRED_FLAGS "") @@ -208,7 +213,7 @@ if(NOT MSVC) endif() option(EIGEN_TEST_FMA "Enable/Disable FMA in tests/examples" OFF) - if(EIGEN_TEST_FMA) + if(EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma") message(STATUS "Enabling FMA in tests/examples") endif() @@ -227,7 +232,12 @@ if(NOT MSVC) option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF) if(EIGEN_TEST_NEON) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -mfloat-abi=softfp") + if(EIGEN_TEST_FMA) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon-vfpv4") + else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=softfp") message(STATUS "Enabling NEON in tests/examples") endif() From d7f51feb0773cc3843ea8c29c605d4eea4bda4ac Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 7 Apr 2015 15:13:55 -0400 Subject: [PATCH 4/7] bug #992: don't select a 3p GEMM path with non-vectorizable scalar types, this hits unsupported paths in symm/triangular products code --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 428527820..2b4c1242f 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -380,11 +380,12 @@ public: nr = 4, // register block size along the M direction (currently, this one cannot be modified) + default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) // we assume 16 registers - mr = 3*LhsPacketSize, + mr = Vectorizable ? 3*LhsPacketSize : default_mr, #else - mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, + mr = default_mr, #endif LhsProgress = LhsPacketSize, From 0eb220c00d9773c29c7d169ad0e20745b0ef21bb Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 8 Apr 2015 09:25:34 +0200 Subject: [PATCH 5/7] add a note on bug #992 --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 2b4c1242f..24623963b 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -383,6 +383,8 @@ public: default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) // we assume 16 registers + // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined, + // then using 3*LhsPacketSize triggers non-implemented paths in syrk. mr = Vectorizable ? 3*LhsPacketSize : default_mr, #else mr = default_mr, From 5401fbcc50747583b0d47e195f23f988f7dfac5e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 9 Apr 2015 16:44:10 -0700 Subject: [PATCH 6/7] Improved the blocking strategy to speedup multithreaded tensor contractions. --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 24623963b..320f96a39 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -112,14 +112,18 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n nr = Traits::nr, nr_mask = (0xffffffff/nr)*nr }; - Index k_cache = (l1-ksub)/kdiv; + // Increasing k gives us more time to prefetch the content of the "C" + // registers. However once the latency is hidden there is no point in + // increasing the value of k, so we'll cap it at 320 (value determined + // experimentally). + const Index k_cache = (std::min)((l1-ksub)/kdiv, 320); if (k_cache < k) { k = k_cache & k_mask; eigen_internal_assert(k > 0); } - Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k); - Index n_per_thread = numext::div_ceil(n, num_threads); + const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k); + const Index n_per_thread = numext::div_ceil(n, num_threads); if (n_cache <= n_per_thread) { // Don't exceed the capacity of the l2 cache. eigen_internal_assert(n_cache >= static_cast(nr)); @@ -131,8 +135,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n if (l3 > l2) { // l3 is shared between all cores, so we'll give each thread its own chunk of l3. - Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads); - Index m_per_thread = numext::div_ceil(m, num_threads); + const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads); + const Index m_per_thread = numext::div_ceil(m, num_threads); if(m_cache < m_per_thread && m_cache >= static_cast(mr)) { m = m_cache & mr_mask; eigen_internal_assert(m > 0); From 761691f18d59790fde24174503c6bdf3d254831b Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Mon, 13 Apr 2015 17:15:00 +0200 Subject: [PATCH 7/7] Make conversion from 0 to Scalar explicit (issue reported by Brad Bell) --- Eigen/src/SparseCore/TriangularSolver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/SparseCore/TriangularSolver.h b/Eigen/src/SparseCore/TriangularSolver.h index fd1a55bc6..8872012db 100644 --- a/Eigen/src/SparseCore/TriangularSolver.h +++ b/Eigen/src/SparseCore/TriangularSolver.h @@ -75,7 +75,7 @@ struct sparse_solve_triangular_selector for(Index i=lhs.rows()-1 ; i>=0 ; --i) { Scalar tmp = other.coeff(i,col); - Scalar l_ii = 0; + Scalar l_ii(0); LhsIterator it(lhsEval, i); while(it && it.index()