From f8fbb3f9a642371d3798e8d6e9638d878e00d945 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 25 Feb 2015 22:22:37 -0800 Subject: [PATCH 01/18] Fixed several compilation warnings reported by clang --- .../CXX11/src/Tensor/TensorBroadcasting.h | 4 ++-- .../src/Tensor/TensorContractionThreadPool.h | 22 +++++++++---------- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 1 - .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 2 +- 5 files changed, 14 insertions(+), 17 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 5790e19d6..055a7d407 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -249,7 +249,7 @@ struct TensorEvaluator, Device> innermostLoc = index; } else { if (internal::index_statically_eq()(0, 1)) { - eigen_assert(innermostLoc % m_impl.dimensions()[0] == 0); + eigen_assert(index % m_impl.dimensions()[0] == 0); innermostLoc = 0; } else { innermostLoc = index % m_impl.dimensions()[0]; @@ -302,7 +302,7 @@ struct TensorEvaluator, Device> innermostLoc = index; } else { if (internal::index_statically_eq()(NumDims-1, 1)) { - eigen_assert(innermostLoc % m_impl.dimensions()[NumDims-1] == 0); + eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); innermostLoc = 0; } else { innermostLoc = index % m_impl.dimensions()[NumDims-1]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 8b87f1045..9259c864e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -174,8 +174,6 @@ struct TensorEvaluatorm_device.numThreads(); Index mc = m; @@ -190,8 +188,8 @@ struct TensorEvaluator kernel_promises(num_kernel_promises); std::vector kernel_futures(num_kernel_promises); - for (int i = 0; i < kernel_promises.size(); ++i) { + for (std::size_t i = 0; i < kernel_promises.size(); ++i) { kernel_promises[i].set_value(); kernel_futures[i] = kernel_promises[i].get_future(); } @@ -239,16 +237,16 @@ struct TensorEvaluator 0); - int blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; + Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; for (int i = 0; i < n_blocks; ++i) { - int future_id = (blockAId * n_blocks + i); + Index future_id = (blockAId * n_blocks + i); wait_until_ready(&kernel_futures[future_id]); kernel_promises[future_id] = Promise(); kernel_futures[future_id] = kernel_promises[future_id].get_future(); @@ -277,9 +275,9 @@ struct TensorEvaluator { } EIGEN_DEVICE_FUNC DSizes() { - for (int i = 0 ; i < NumDims; ++i) { + for (std::size_t i = 0 ; i < NumDims; ++i) { (*this)[i] = 0; } } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 05ac9bd2f..1eb37bf1c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -131,7 +131,6 @@ class TensorExecutor const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize))); const Index numblocks = size / blocksize; - Index i = 0; std::vector results; results.reserve(numblocks); for (int i = 0; i < numblocks; ++i) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 
a93f48ccb..01ba0a80f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -302,7 +302,7 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) { - for (int i = 0; i < internal::array_size::value; ++i) { + for (std::size_t i = 0; i < internal::array_size::value; ++i) { eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); } From 27f3fb2bcc8ea06f3f471560fc1c87210cd0c62b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 25 Feb 2015 22:54:20 -0800 Subject: [PATCH 02/18] Fixed another clang warning --- .../Eigen/CXX11/src/Tensor/TensorIntDiv.h | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index 2714117ab..4b22c64de 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -28,6 +28,21 @@ namespace Eigen { namespace internal { +namespace { + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int count_leading_zeros(const T val) + { +#ifndef __CUDA_ARCH__ + if (sizeof(T) == 8) { + return __builtin_clzl(val); + } + return __builtin_clz(val); +#else + return __clz(val); +#endif + } +}; + template struct TensorIntDivisor { public: @@ -44,11 +59,7 @@ struct TensorIntDivisor { eigen_assert(divider <= (1<<(N-1)) - 1); // fast ln2 -#ifndef __CUDA_ARCH__ - const int leading_zeros = __builtin_clz(divider); -#else - const int leading_zeros = __clz(divider); -#endif + const int leading_zeros = count_leading_zeros(divider); const int log_div = N - (leading_zeros+1); multiplier = (static_cast(1) << (N+log_div)) / divider - (static_cast(1) << N) + 1; From bffb6bdf455097ac5a6f98eb4bb549166e318989 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 25 Feb 2015 23:54:43 -0800 Subject: [PATCH 03/18] Made TensorIntDiv.h compile with MSVC --- unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index 4b22c64de..535abd6fd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -29,14 +29,16 @@ namespace Eigen { namespace internal { namespace { + // Note: result is undefined if val == 0 template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int count_leading_zeros(const T val) { #ifndef __CUDA_ARCH__ - if (sizeof(T) == 8) { - return __builtin_clzl(val); - } - return __builtin_clz(val); + return __builtin_clz(static_cast(val)); +#elif EIGEN_COMP_MSVC + DWORD leading_zero = 0; + _BitScanReverse( &leading_zero, value); + return 31 - leading_zero; #else return __clz(val); #endif From 400becc591359596b1236ee8d9f2c99c12961a42 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 26 Feb 2015 15:57:06 +0100 Subject: [PATCH 04/18] Fix typos in block-size testing code, and set peeling on k to 8. 
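The idiom being fixed caps each dimension at its test-specific limit and
then rounds it down to a multiple of the matching register-block size; the
typos made m and n clamp against the wrong inputs (n and k respectively).
As a standalone sketch of the intended pattern (names are illustrative,
not Eigen's):

    #include <algorithm>
    #include <cstddef>

    // Cap size at the test-specific limit, then round it down to a
    // multiple of unit (kr/mr/nr), but never below unit itself.
    inline std::ptrdiff_t cap_and_round(std::ptrdiff_t size,
                                        std::ptrdiff_t cap,
                                        std::ptrdiff_t unit)
    {
      size = std::min(size, cap);
      if (size > unit) size -= size % unit;
      return size;
    }

Written this way, k, m and n each go through the same helper, which makes
the copy-paste mistake fixed below much harder to reintroduce.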
--- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 0890bc690..eb095966b 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -88,15 +88,15 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES EIGEN_UNUSED_VARIABLE(num_threads); enum { - kr = 16, + kr = 8, mr = Traits::mr, nr = Traits::nr }; k = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); if (k > kr) k -= k % kr; - m = std::min(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); + m = std::min(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); if (m > mr) m -= m % mr; - n = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); + n = std::min(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); if (n > nr) n -= n % nr; return; #endif From a8ad8887bf80869eb216f6f937a559290984a3f7 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 26 Feb 2015 16:04:35 +0100 Subject: [PATCH 05/18] Implement a more generic blocking-size selection algorithm. See explanations inlines. It performs extremely well on Haswell. The main issue is to reliably and quickly find the actual cache size to be used for our 2nd level of blocking, that is: max(l2,l3/nb_core_sharing_l3) --- .../Core/products/GeneralBlockPanelKernel.h | 88 ++++++++++++++++++- 1 file changed, 85 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index eb095966b..65310e637 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -155,9 +155,91 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads // In unit tests we do not want to use extra large matrices, // so we reduce the block size to check the blocking strategy is not flawed #ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS - k = std::min(k,sizeof(LhsScalar)<=4 ? 360 : 240); - n = std::min(n,3840/sizeof(RhsScalar)); - m = std::min(m,3840/sizeof(RhsScalar)); + // Early return for small problems because the computation below are time consuming for small problems. + // Perhaps it would make more sense to consider k*n*m?? + // Note that for very tiny problem, this function should be bypassed anyway + // because we use the coefficient-based implementation for them. + if(std::max(k,std::max(m,n))<48) + return; + + typedef typename Traits::ResScalar ResScalar; + enum { + k_peeling = 8, + k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)), + k_sub = Traits::mr * Traits::nr * sizeof(ResScalar) + }; + + // ---- 1st level of blocking on L1, yields kc ---- + + // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel + // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache. + // We also include a register-level block of the result (mx x nr). 
+ // (In an ideal world only the lhs panel would stay in L1) + // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of: + const Index max_kc = ((l1-k_sub)/k_div) & (~(k_peeling-1)); + const Index old_k = k; + if(k>max_kc) + { + // We are really blocking on the third dimension: + // -> reduce blocking size to make sure the last block is as large as possible + // while keeping the same number of sweeps over the result. + k = (k%max_kc)==0 ? max_kc + : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1))); + + eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same"); + } + + // ---- 2nd level of blocking on max(L2,L3), yields nc ---- + + // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is: + // actual_l2 = max(l2, l3/nb_core_sharing_l3) + // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it) + // For instance, it corresponds to 6MB of L3 shared among 4 cores. + const Index actual_l2 = 1572864; // == 1.5 MB + + // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2. + // The second half is implicitly reserved to access the result and lhs coefficients. + // When k(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1)); + if(n>nc) + { + // We are really blocking over the columns: + // -> reduce blocking size to make sure the last block is as large as possible + // while keeping the same number of sweeps over the packed lhs. + // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1" + n = (n%nc)==0 ? nc + : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1)))); + } + else if(old_k==k) + { + // So far, no blocking at all, i.e., kc==k, and nc==n. + // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2 + Index problem_size = k*n*sizeof(LhsScalar); + Index actual_lm = actual_l2; + Index max_mc = m; + if(problem_size<=1024) + { + // problem is small enough to keep in L1 + // Let's choose m such that lhs's block fit in 1/3 of L1 + actual_lm = l1; + } + else if(l3!=0 && problem_size<=32768) + { + // we have both L2 and L3, and problem is small enough to be kept in L2 + // Let's choose m such that lhs's block fit in 1/3 of L2 + actual_lm = l2; + max_mc = 576; + } + + Index mc = (std::min)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc); + if (mc > Traits::mr) mc -= mc % Traits::mr; + + m = (m%mc)==0 ? 
mc + : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1)))); + } #else k = std::min(k,24); n = std::min(n,384/sizeof(RhsScalar)); From a46061ab7bf290bd918716e56bf365d6939ff081 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 26 Feb 2015 16:12:58 +0100 Subject: [PATCH 06/18] Make perf_monitoring/gemm script more flexible: - skip existing dataset - add a "-up" option to recompute the dataset (see script header) - allow to specify a filename prefix --- bench/perf_monitoring/gemm/run_gemm.sh | 97 +++++++++++++++++++++----- 1 file changed, 78 insertions(+), 19 deletions(-) diff --git a/bench/perf_monitoring/gemm/run_gemm.sh b/bench/perf_monitoring/gemm/run_gemm.sh index 4d87ff656..d3a9fadc9 100755 --- a/bench/perf_monitoring/gemm/run_gemm.sh +++ b/bench/perf_monitoring/gemm/run_gemm.sh @@ -1,5 +1,25 @@ #!/bin/bash +# Examples of environment variables to be set: +# PREFIX="haswell-fma-" +# CXX_FLAGS="-mfma" + +# Options: +# -up : enforce the recomputation of existing data, and keep best results as a merging strategy + + +if echo "$*" | grep '\-up' > /dev/null; then + update=true +else + update=false +fi + +if [ $update == true ]; then + echo "(Re-)Compute all changesets and keep bests" +else + echo "Skip previously computed changesets" +fi + if [ ! -d "eigen_src" ]; then hg clone https://bitbucket.org/eigen/eigen eigen_src fi @@ -8,9 +28,32 @@ if [ ! -z '$CXX' ]; then CXX=g++ fi -rm sgemm.out -rm dgemm.out -rm cgemm.out +function make_backup +{ + if [ -f "$1.out" ]; then + mv "$1.out" "$1.backup" + fi +} + +function merge +{ + count1=`echo $1 | wc -w` + count2=`echo $2 | wc -w` + + if [ $count1 == $count2 ]; then + a=( $1 ); b=( $2 ) + res="" + for (( i=0 ; i<$count1 ; i++ )); do + ai=${a[$i]}; bi=${b[$i]} + tmp=`echo "if ($ai > $bi) $ai else $bi " | bc -l` + res="$res $tmp" + done + echo $res + + else + echo $1 + fi +} function test_current { @@ -18,16 +61,32 @@ function test_current scalar=$2 name=$3 - if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src gemm.cpp -DSCALAR=$scalar -o $name; then - res=`./$name` - echo $res - echo "$rev $res" >> $name.out + prev=`grep $rev "$name.backup" | cut -c 14-` + res=$prev + count_rev=`echo $prev | wc -w` + count_ref=`cat "settings.txt" | wc -l` + if [ $update == true ] || [ $count_rev != $count_ref ]; then + if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src gemm.cpp -DSCALAR=$scalar -o $name; then + curr=`./$name` + echo merge $prev + echo with $curr + res=`merge "$curr" "$prev"` + echo $res + echo "$rev $res" >> $name.out + else + echo "Compilation failed, skip rev $rev" + fi else - echo "Compilation failed, skip rev $rev" + echo "Skip existing results for $rev / $name" + echo "$rev $res" >> $name.out fi } -while read rev +make_backup $PREFIX"sgemm" +make_backup $PREFIX"dgemm" +make_backup $PREFIX"cgemm" + +cut -f1 -d"#" < changesets.txt | while read rev do if [ ! -z '$rev' ]; then echo "Testing rev $rev" @@ -36,27 +95,27 @@ do actual_rev=`hg identify | cut -f1 -d' '` cd .. 
- test_current $actual_rev float sgemm - test_current $actual_rev double dgemm - test_current $actual_rev "std::complex" cgemm + test_current $actual_rev float $PREFIX"sgemm" + test_current $actual_rev double $PREFIX"dgemm" + test_current $actual_rev "std::complex" $PREFIX"cgemm" fi -done < changesets.txt +done echo "Float:" -cat sgemm.out +cat $PREFIX"sgemm.out" echo "" echo "Double:" -cat dgemm.out +cat $PREFIX"dgemm.out" echo "" echo "Complex:" -cat cgemm.out +cat $PREFIX"cgemm.out" echo "" -./make_plot.sh sgemm -./make_plot.sh dgemm -./make_plot.sh cgemm +./make_plot.sh $PREFIX"sgemm" +./make_plot.sh $PREFIX"dgemm" +./make_plot.sh $PREFIX"cgemm" From 2e9cb06a87213138618487e62d7893ad3a9b4546 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 26 Feb 2015 16:13:33 +0100 Subject: [PATCH 07/18] Update changeset list to be checked by perf_monitoring/gemm. --- bench/perf_monitoring/gemm/changesets.txt | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt index 291fd55b4..f19b4287d 100644 --- a/bench/perf_monitoring/gemm/changesets.txt +++ b/bench/perf_monitoring/gemm/changesets.txt @@ -18,8 +18,22 @@ before-evaluators 6334:f6a45e5b8b7c 6639:c9121c60b5c7 6655:06f163b5221f -6677:700e023044e7 +6677:700e023044e7 # FMA has been wrongly disabled 6681:11d31dafb0e3 -6844:039efd86b75c -6845:7333ed40c6ef -6911:6192dd812d84 \ No newline at end of file +6699:5e6e8e10aad1 # merge default to tensors +6726:ff2d2388e7b9 # merge default to tensors +6742:0cbd6195e829 # merge default to tensors +6747:853d2bafeb8f # Generalized the gebp apis +6765:71584fd55762 # Made the blocking computation aware of the l3 cache; Also optimized the blocking parameters to take into account the number of threads used for a computation +6781:9cc5a931b2c6 # generalized gemv +6792:f6e1daab600a # ensured that contractions that can be reduced to a matrix vector product +6844:039efd86b75c # merge tensor +6845:7333ed40c6ef # change prefetching in gebp +6856:b5be5e10eb7f # merge index conversion +6893:c3a64aba7c70 # clean blocking size computation +6898:6fb31ebe6492 # rotating kernel for ARM +6899:877facace746 # rotating kernel for ARM only +6904:c250623ae9fa # result_of +6921:915f1b1fc158 # fix prefetching change for ARM +6923:9ff25f6dacc6 # prefetching +6933:52572e60b5d3 # blocking size strategy \ No newline at end of file From 4ec3f04b3ab42a595b99bb2902758f7d748ab80c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 26 Feb 2015 17:00:36 +0100 Subject: [PATCH 08/18] Make sure that the block size computation is tested by our unit test. 
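Instead of clamping k/m/n directly (which bypassed most of the selection
logic added in the previous commits), pretend the machine has tiny caches
so that the full code path runs even for the small matrices used in the
unit tests. A rough sketch of the kind of invariant this makes testable
(the helper below and the exact template signature are assumptions for
illustration, not the actual unit test):

    // Build with -DEIGEN_DEBUG_SMALL_PRODUCT_BLOCKS so the reduced
    // cache sizes are picked up by the blocking computation.
    #include <Eigen/Core>
    #include <cassert>
    #include <cstddef>

    template <typename Scalar>
    void check_blocking(std::ptrdiff_t k, std::ptrdiff_t m, std::ptrdiff_t n)
    {
      const std::ptrdiff_t k0 = k, m0 = m, n0 = n;
      Eigen::internal::computeProductBlockingSizes<Scalar, Scalar>(
          k, m, n, std::ptrdiff_t(1));
      // Blocking may shrink each dimension but must never enlarge it,
      // and must always leave something to compute.
      assert(k >= 1 && k <= k0);
      assert(m >= 1 && m <= m0);
      assert(n >= 1 && n <= n0);
    }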
--- .../Core/products/GeneralBlockPanelKernel.h | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 65310e637..06c7f362e 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -153,8 +153,13 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads } else { // In unit tests we do not want to use extra large matrices, - // so we reduce the block size to check the blocking strategy is not flawed -#ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS + // so we reduce the cache size to check the blocking strategy is not flawed +#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS + l1 = 4*1024; + l2 = 32*1024; + l3 = 512*1024; +#endif + // Early return for small problems because the computation below are time consuming for small problems. // Perhaps it would make more sense to consider k*n*m?? // Note that for very tiny problem, this function should be bypassed anyway @@ -195,7 +200,13 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads // actual_l2 = max(l2, l3/nb_core_sharing_l3) // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it) // For instance, it corresponds to 6MB of L3 shared among 4 cores. + #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS + const Index actual_l2 = l3; + #else const Index actual_l2 = 1572864; // == 1.5 MB + #endif + + // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2. // The second half is implicitly reserved to access the result and lhs coefficients. @@ -240,11 +251,6 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads m = (m%mc)==0 ? mc : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1)))); } -#else - k = std::min(k,24); - n = std::min(n,384/sizeof(RhsScalar)); - m = std::min(m,384/sizeof(RhsScalar)); -#endif } } From bcf9bb5c1f1834487f10721abc909be4a62574d1 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 26 Feb 2015 17:01:33 +0100 Subject: [PATCH 09/18] Avoid packing rhs multiple-times when blocking on the lhs only. --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index c38c12c31..c76f48154 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -164,6 +164,8 @@ static void run(Index rows, Index cols, Index depth, ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); + + const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols; // For each horizontal panel of the rhs, and corresponding panel of the lhs... 
for(Index i2=0; i2 Date: Thu, 26 Feb 2015 09:27:53 -0800 Subject: [PATCH 10/18] Added missing copy constructor --- unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index c119b30e2..054ecf7b5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -85,6 +85,15 @@ class TensorLayoutSwapOp : public TensorBase, WriteA const typename internal::remove_all::type& expression() const { return m_xpr; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const TensorLayoutSwapOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other) From 57154fdb32a4853bff458f8014b037d5e41b9858 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 26 Feb 2015 11:13:42 -0800 Subject: [PATCH 11/18] Can now use the tensor 'reverse' operation as a lvalue --- .../Eigen/CXX11/src/Tensor/TensorBase.h | 5 + .../Eigen/CXX11/src/Tensor/TensorReverse.h | 93 ++++++++++++++++--- unsupported/test/cxx11_tensor_reverse.cpp | 35 +++++-- 3 files changed, 110 insertions(+), 23 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index cfcf18e8e..201b0fc9e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -549,6 +549,11 @@ class TensorBase : public TensorBase(derived(), offset, dim); } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReverseOp + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp(derived(), rev); + } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp shuffle(const Shuffle& shuffle) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index ad21e966b..16bef2ad3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -49,12 +49,9 @@ struct nested, 1, } // end namespace internal - - - template class TensorReverseOp : public TensorBase, ReadOnlyAccessors> + XprType>, WriteAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -67,8 +64,8 @@ class TensorReverseOp : public TensorBase::Index Index; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(const XprType& expr, - const ReverseDimensions& reverse_dims) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp( + const XprType& expr, const ReverseDimensions& reverse_dims) : m_xpr(expr), m_reverse_dims(reverse_dims) {} EIGEN_DEVICE_FUNC @@ -78,12 +75,30 @@ class TensorReverseOp : public TensorBase::type& expression() const { return m_xpr; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReverseOp& operator = (const TensorReverseOp& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorReverseOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + protected: typename 
XprType::Nested m_xpr; const ReverseDimensions m_reverse_dims; }; - // Eval as rvalue template struct TensorEvaluator, Device> @@ -134,8 +149,8 @@ struct TensorEvaluator, Device m_impl.cleanup(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index reverseIndex( + Index index) const { eigen_assert(index < dimensions().TotalSize()); Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { @@ -152,7 +167,6 @@ struct TensorEvaluator, Device } else { inputIndex += index; } - return m_impl.coeff(inputIndex); } else { for (int i = 0; i < NumDims - 1; ++i) { Index idx = index / m_strides[i]; @@ -167,8 +181,13 @@ struct TensorEvaluator, Device } else { inputIndex += index; } - return m_impl.coeff(inputIndex); } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff( + Index index) const { + return m_impl.coeff(reverseIndex(index)); } template @@ -199,9 +218,57 @@ struct TensorEvaluator, Device ReverseDimensions m_reverse; }; +// Eval as lvalue + +template +struct TensorEvaluator, Device> + : public TensorEvaluator, + Device> { + typedef TensorEvaluator, + Device> Base; + typedef TensorReverseOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + }; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : Base(op, device) {} + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename XprType::PacketReturnType PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dimensions& dimensions() const { return this->m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + return this->m_impl.coeffRef(this->reverseIndex(index)); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) { + const int packetSize = internal::unpacket_traits::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + // This code is pilfered from TensorMorphing.h + EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize]; + internal::pstore(values, x); + for (int i = 0; i < packetSize; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + +}; - -} // end namespace Eigen +} // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H diff --git a/unsupported/test/cxx11_tensor_reverse.cpp b/unsupported/test/cxx11_tensor_reverse.cpp index 4c0be35da..f96c21fa3 100644 --- a/unsupported/test/cxx11_tensor_reverse.cpp +++ b/unsupported/test/cxx11_tensor_reverse.cpp @@ -94,7 +94,7 @@ static void test_simple_reverse() template -static void test_expr_reverse() +static void test_expr_reverse(bool LValue) { Tensor tensor(2,3,5,7); tensor.setRandom(); @@ -105,9 +105,12 @@ static void test_expr_reverse() dim_rev[2] = false; dim_rev[3] = true; - - Tensor expected; - expected = tensor.reverse(dim_rev); + Tensor expected(2, 3, 5, 7); + if (LValue) { + expected.reverse(dim_rev) = tensor; + } else { + expected = tensor.reverse(dim_rev); + } Tensor result(2,3,5,7); @@ -117,8 +120,13 @@ static void test_expr_reverse() array dst_slice_start{{0,0,0,0}}; for (int i = 0; i < 5; 
++i) { - result.slice(dst_slice_start, dst_slice_dim) = - tensor.slice(src_slice_start, src_slice_dim).reverse(dim_rev); + if (LValue) { + result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev) = + tensor.slice(src_slice_start, src_slice_dim); + } else { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.slice(src_slice_start, src_slice_dim).reverse(dim_rev); + } src_slice_start[2] += 1; dst_slice_start[2] += 1; } @@ -141,8 +149,13 @@ static void test_expr_reverse() dst_slice_start[2] = 0; result.setRandom(); for (int i = 0; i < 5; ++i) { - result.slice(dst_slice_start, dst_slice_dim) = - tensor.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim); + if (LValue) { + result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev) = + tensor.slice(dst_slice_start, dst_slice_dim); + } else { + result.slice(dst_slice_start, dst_slice_dim) = + tensor.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim); + } dst_slice_start[2] += 1; } @@ -162,6 +175,8 @@ void test_cxx11_tensor_reverse() { CALL_SUBTEST(test_simple_reverse()); CALL_SUBTEST(test_simple_reverse()); - CALL_SUBTEST(test_expr_reverse()); - CALL_SUBTEST(test_expr_reverse()); + CALL_SUBTEST(test_expr_reverse(true)); + CALL_SUBTEST(test_expr_reverse(true)); + CALL_SUBTEST(test_expr_reverse(false)); + CALL_SUBTEST(test_expr_reverse(false)); } From f074bb4b5f6ddf0a67348365499a5c5c335a5e17 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 26 Feb 2015 11:14:23 -0800 Subject: [PATCH 12/18] Fixed another compilation problem with TensorIntDiv.h --- unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index 535abd6fd..11c7ce443 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -33,17 +33,17 @@ namespace { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int count_leading_zeros(const T val) { -#ifndef __CUDA_ARCH__ - return __builtin_clz(static_cast(val)); +#ifdef __CUDA_ARCH__ + return __clz(val); #elif EIGEN_COMP_MSVC DWORD leading_zero = 0; _BitScanReverse( &leading_zero, value); return 31 - leading_zero; #else - return __clz(val); + return __builtin_clz(static_cast(val)); #endif } -}; +} template struct TensorIntDivisor { From b7fc8746e07556404672b4da2d53dd323c441634 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 27 Feb 2015 10:01:59 -0500 Subject: [PATCH 13/18] Replace a static assert by a runtime one, fixes the build of unit tests on ARM Also safely assert in the non-implemented path that should never be taken in practice, and would return wrong results. 
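The general failure mode: a static assertion fires for every
instantiation, including instantiations that only exist behind a runtime
guard and are never executed, so otherwise-dead code can break the whole
build. A minimal generic illustration (toy code, not Eigen's types):

    #include <cassert>

    template <int Offset>
    int rotated_lane(int lane)
    {
      // A compile-time assertion on Offset here would reject the program
      // as soon as, e.g., rotated_lane<4> is instantiated, even if every
      // call site guards it with a runtime condition. The runtime assert
      // below only fires if the invalid path is actually executed.
      assert(Offset < 4 && "unimplemented rotation offset");
      return (lane + Offset) % 4;
    }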
--- Eigen/src/Core/GenericPacketMath.h | 7 +++++-- Eigen/src/Core/util/StaticAssert.h | 3 +-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 967a07df5..afdd6f71e 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -290,7 +290,11 @@ template EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& template struct protate_impl { - static Packet run(const Packet& a) { return a; } + static Packet run(const Packet& a) + { + eigen_assert(false && "unimplemented"); + return a; + } }; /** \internal \returns a packet with the coefficients rotated to the right in little-endian convention, @@ -299,7 +303,6 @@ struct protate_impl */ template EIGEN_DEVICE_FUNC inline Packet protate(const Packet& a) { - EIGEN_STATIC_ASSERT(offset < unpacket_traits::size, ROTATION_BY_ILLEGAL_OFFSET); return offset ? protate_impl::run(a) : a; } diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index 5e16b775b..7538a0633 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -93,8 +93,7 @@ THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG, IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY, - STORAGE_LAYOUT_DOES_NOT_MATCH, - ROTATION_BY_ILLEGAL_OFFSET + STORAGE_LAYOUT_DOES_NOT_MATCH }; }; From f5ff4d826fa5728f6319dcc7159b1337a2461880 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 27 Feb 2015 10:56:50 -0500 Subject: [PATCH 14/18] Fix NEON build flags: in the current NDK, at least with the clang-3.5 toolchain, -mfpu=neon is not enough to activate NEON, since it's incompatible with the default float ABI, and I have to pass -mfloat-abi=softfp (which is what everyone does in practice). In fact, it would be a good idea to pass -mfloat-abi=softfp all the time, regardless of NEON. Also removing the -mcpu=cortex-a8, as 1) it's not needed and 2) if we really wanted to pass a specific -mcpu flag, that would presumably to tune performance for benchmarks, and it would then not really make sense to tune for the very old cortex-a8 (it reflects ARM CPUs from 5 years ago). --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 13f9c8f9d..2c1ae428e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -227,7 +227,7 @@ if(NOT MSVC) option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF) if(EIGEN_TEST_NEON) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -mcpu=cortex-a8") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -mfloat-abi=softfp") message(STATUS "Enabling NEON in tests/examples") endif() From 33669348c490372e598f839754257f3710195337 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 27 Feb 2015 11:35:37 -0500 Subject: [PATCH 15/18] Disable Packet2f/2i halfpacket support in NEON. I believe that it was erroneously turned on, since Packet2f/2i intrinsics are unimplemented, and code trying to use halfpackets just fails to compile on NEON, as it tries to use the default implementation of pload/pstore and the types don't match. 
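Schematically, generic code consumes the trait like this (a simplified
model of a call site, not Eigen's actual code), which shows why a half
typedef without matching packet specializations cannot compile once any
half-packet path is instantiated:

    // With half == Packet4f this only instantiates intrinsics that
    // exist. With half == Packet2f, pload/pstore fall back to their
    // unspecialized forms and the pointer types no longer match.
    // (Alignment requirements are ignored for brevity.)
    template <typename Scalar>
    void copy_half_packet(Scalar* dst, const Scalar* src)
    {
      typedef typename Eigen::internal::packet_traits<Scalar>::half HalfPacket;
      HalfPacket p = Eigen::internal::pload<HalfPacket>(src);
      Eigen::internal::pstore(dst, p);
    }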
--- Eigen/src/Core/arch/NEON/PacketMath.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index e9af45f22..12198d79c 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -76,12 +76,12 @@ typedef uint32x4_t Packet4ui; template<> struct packet_traits : default_packet_traits { typedef Packet4f type; - typedef Packet2f half; + typedef Packet4f half; // Packet2f intrinsics not implemented yet enum { Vectorizable = 1, AlignedOnScalar = 1, size = 4, - HasHalfPacket=1, + HasHalfPacket=0, // Packet2f intrinsics not implemented yet HasDiv = 1, // FIXME check the Has* @@ -95,12 +95,12 @@ template<> struct packet_traits : default_packet_traits template<> struct packet_traits : default_packet_traits { typedef Packet4i type; - typedef Packet2i half; + typedef Packet4i half; // Packet2i intrinsics not implemented yet enum { Vectorizable = 1, AlignedOnScalar = 1, size=4, - HasHalfPacket=1 + HasHalfPacket=0, // Packet2i intrinsics not implemented yet // FIXME check the Has* }; }; From 2fc3b484d71486bad6a43488a182fa7945d65f45 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 27 Feb 2015 11:37:45 -0500 Subject: [PATCH 16/18] remove trailing comma --- Eigen/src/Core/arch/NEON/PacketMath.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 12198d79c..8dd1e1370 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -100,7 +100,7 @@ template<> struct packet_traits : default_packet_traits Vectorizable = 1, AlignedOnScalar = 1, size=4, - HasHalfPacket=0, // Packet2i intrinsics not implemented yet + HasHalfPacket=0 // Packet2i intrinsics not implemented yet // FIXME check the Has* }; }; From 90f4e90f1d8ff6efd3ee695cbb25b3d6d6ae4fee Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 27 Feb 2015 09:22:37 -0800 Subject: [PATCH 17/18] Fixed off-by-one error that prevented the evaluation of small tensor expressions from being vectorized --- unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 1eb37bf1c..bb2f8b977 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -97,7 +97,7 @@ struct EvalRange { Index i = first; static const int PacketSize = unpacket_traits::size; - if (last - first > PacketSize) { + if (last - first >= PacketSize) { eigen_assert(first % PacketSize == 0); Index lastPacket = last - (last % PacketSize); for (; i < lastPacket; i += PacketSize) { From 6466fa63be51b32c493401bb7cf6aa4b2e1ef385 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 27 Feb 2015 15:30:10 -0500 Subject: [PATCH 18/18] Reimplement the selection between rotating and non-rotating kernels using templates instead of macros and if()'s. That was needed to fix the build of unit tests on ARM, which I had broken. My bad for not testing earlier. 
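The pattern, distilled: hoist the configuration predicate into a bool
template parameter so that each variant is a distinct type, and rely on
the fact that member functions of a (partially specialized) class
template are only instantiated when used. A generic sketch (toy types,
not the actual helper):

    // Default: plain, portable path.
    template <typename Packet, bool UseRotation>
    struct LoadHelper
    {
      static Packet load(const Packet* p) { return p[0]; }
    };

    // Partial specialization: the rotating path. Because it is still a
    // template, its member body is only type-checked when actually
    // instantiated, so it may use operations that exist only in the
    // configuration that selects it.
    template <typename Packet>
    struct LoadHelper<Packet, true>
    {
      static Packet load(const Packet* p) { return p[1]; } // stand-in for protate<1>
    };

    template <typename Packet, bool UseRotation>
    Packet micro_kernel(const Packet* data)
    {
      return LoadHelper<Packet, UseRotation>::load(data);
    }

This removes the #define / if() mix below without changing which code the
selected configuration executes.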
--- Eigen/src/Core/GenericPacketMath.h | 7 +- .../Core/products/GeneralBlockPanelKernel.h | 155 +++++++++++------- 2 files changed, 95 insertions(+), 67 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index afdd6f71e..e48c064ce 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -290,11 +290,8 @@ template EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& template struct protate_impl { - static Packet run(const Packet& a) - { - eigen_assert(false && "unimplemented"); - return a; - } + // Empty so attempts to use this unimplemented path will fail to compile. + // Only specializations of this template should be used. }; /** \internal \returns a packet with the coefficients rotated to the right in little-endian convention, diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 06c7f362e..9061fd936 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -800,6 +800,80 @@ protected: conj_helper cj; }; +// helper for the rotating kernel below +template +struct PossiblyRotatingKernelHelper +{ + // default implementation, not rotating + + typedef typename GebpKernel::Traits Traits; + typedef typename Traits::RhsScalar RhsScalar; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::AccPacket AccPacket; + + const Traits& traits; + PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {} + + + template + void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const + { + traits.loadRhs(from + (Index+4*K)*Traits::RhsProgress, to); + } + + void unrotateResult(AccPacket&, + AccPacket&, + AccPacket&, + AccPacket&) + { + } +}; + +// rotating implementation +template +struct PossiblyRotatingKernelHelper +{ + typedef typename GebpKernel::Traits Traits; + typedef typename Traits::RhsScalar RhsScalar; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::AccPacket AccPacket; + + const Traits& traits; + PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {} + + template + void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const + { + if (Index == 0) { + to = pload(from + 4*K*Traits::RhsProgress); + } else { + EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); + to = protate<1>(to); + } + } + + void unrotateResult(AccPacket& res0, + AccPacket& res1, + AccPacket& res2, + AccPacket& res3) + { + PacketBlock resblock; + resblock.packet[0] = res0; + resblock.packet[1] = res1; + resblock.packet[2] = res2; + resblock.packet[3] = res3; + ptranspose(resblock); + resblock.packet[3] = protate<1>(resblock.packet[3]); + resblock.packet[2] = protate<2>(resblock.packet[2]); + resblock.packet[1] = protate<3>(resblock.packet[1]); + ptranspose(resblock); + res0 = resblock.packet[0]; + res1 = resblock.packet[1]; + res2 = resblock.packet[2]; + res3 = resblock.packet[3]; + } +}; + /* optimized GEneral packed Block * packed Panel product kernel * * Mixing type logic: C += A * B @@ -833,6 +907,16 @@ struct gebp_kernel ResPacketSize = Traits::ResPacketSize }; + + static const bool UseRotatingKernel = + EIGEN_ARCH_ARM && + internal::is_same::value && + internal::is_same::value && + internal::is_same::value && + Traits::LhsPacketSize == 4 && + Traits::RhsPacketSize == 4 && + Traits::ResPacketSize == 4; + EIGEN_DONT_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index 
depth, Index cols, ResScalar alpha, @@ -866,6 +950,8 @@ void gebp_kernel=3*Traits::LhsProgress) { + PossiblyRotatingKernelHelper possiblyRotatingKernelHelper(traits); + // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth) for(Index i=0; i(&blB[(0+4*K)*RhsProgress]); \ - } else { \ - EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); \ - B_0 = protate<1>(B_0); \ - } \ - } else { \ - EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N); \ - } \ - } while (false) -#else -#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \ - EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N) -#endif - #define EIGEN_GEBP_ONESTEP(K) \ do { \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ @@ -947,19 +1002,19 @@ void gebp_kernel(B_0, blB); \ traits.madd(A0, B_0, C0, T0); \ traits.madd(A1, B_0, C4, T0); \ traits.madd(A2, B_0, C8, B_0); \ - EIGEN_GEBP_ONESTEP_LOADRHS(K, 1); \ + possiblyRotatingKernelHelper.template loadOrRotateRhs(B_0, blB); \ traits.madd(A0, B_0, C1, T0); \ traits.madd(A1, B_0, C5, T0); \ traits.madd(A2, B_0, C9, B_0); \ - EIGEN_GEBP_ONESTEP_LOADRHS(K, 2); \ + possiblyRotatingKernelHelper.template loadOrRotateRhs(B_0, blB); \ traits.madd(A0, B_0, C2, T0); \ traits.madd(A1, B_0, C6, T0); \ traits.madd(A2, B_0, C10, B_0); \ - EIGEN_GEBP_ONESTEP_LOADRHS(K, 3); \ + possiblyRotatingKernelHelper.template loadOrRotateRhs(B_0, blB); \ traits.madd(A0, B_0, C3 , T0); \ traits.madd(A1, B_0, C7, T0); \ traits.madd(A2, B_0, C11, B_0); \ @@ -992,34 +1047,10 @@ void gebp_kernel resblock; \ - resblock.packet[0] = res0; \ - resblock.packet[1] = res1; \ - resblock.packet[2] = res2; \ - resblock.packet[3] = res3; \ - ptranspose(resblock); \ - resblock.packet[3] = protate<1>(resblock.packet[3]); \ - resblock.packet[2] = protate<2>(resblock.packet[2]); \ - resblock.packet[1] = protate<3>(resblock.packet[1]); \ - ptranspose(resblock); \ - res0 = resblock.packet[0]; \ - res1 = resblock.packet[1]; \ - res2 = resblock.packet[2]; \ - res3 = resblock.packet[3]; \ - } while (false) - - EIGEN_GEBP_UNROTATE_RESULT(C0, C1, C2, C3); - EIGEN_GEBP_UNROTATE_RESULT(C4, C5, C6, C7); - EIGEN_GEBP_UNROTATE_RESULT(C8, C9, C10, C11); - } -#endif + possiblyRotatingKernelHelper.unrotateResult(C0, C1, C2, C3); + possiblyRotatingKernelHelper.unrotateResult(C4, C5, C6, C7); + possiblyRotatingKernelHelper.unrotateResult(C8, C9, C10, C11); ResPacket R0, R1, R2; ResPacket alphav = pset1(alpha);
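For intuition about the rotation bookkeeping in the kernel above, here is
a scalar model of a 4-lane packet rotation (the direction is chosen to
match an ARM vext-style rotation and is illustrative, not a statement of
Eigen's exact lane convention):

    #include <cstdio>

    // Scalar stand-in for protate<K> on a 4-lane packet: lane i takes
    // the old lane (i + K) % 4, so rotating {0,1,2,3} by 1 yields
    // {1,2,3,0}.
    template <int K>
    void rotate4(float (&v)[4])
    {
      float tmp[4];
      for (int i = 0; i < 4; ++i) tmp[i] = v[(i + K) % 4];
      for (int i = 0; i < 4; ++i) v[i] = tmp[i];
    }

    int main()
    {
      float v[4] = {0.f, 1.f, 2.f, 3.f};
      rotate4<1>(v); // {1, 2, 3, 0}
      rotate4<3>(v); // a further rotation by 3 is a full turn: identity
      std::printf("%g %g %g %g\n", v[0], v[1], v[2], v[3]); // 0 1 2 3
      return 0;
    }

Rotations by k and 4-k compose to the identity, which is, roughly, the
property unrotateResult exploits when it applies complementary rotations
of 1, 2 and 3 across the transposed accumulator block.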