diff --git a/CMakeLists.txt b/CMakeLists.txt
index 13f9c8f9d..2c1ae428e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -227,7 +227,7 @@ if(NOT MSVC)
   option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
   if(EIGEN_TEST_NEON)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -mcpu=cortex-a8")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -mfloat-abi=softfp")
     message(STATUS "Enabling NEON in tests/examples")
   endif()
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 678938c6b..722881215 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -313,7 +313,8 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet&
 template<typename Packet, int offset>
 struct protate_impl
 {
-  static Packet run(const Packet& a) { return a; }
+  // Empty so attempts to use this unimplemented path will fail to compile.
+  // Only specializations of this template should be used.
 };
 
 /** \internal \returns a packet with the coefficients rotated to the right in little-endian convention,
@@ -322,7 +323,6 @@ struct protate_impl
   */
 template<int offset, typename Packet>
 EIGEN_DEVICE_FUNC inline Packet protate(const Packet& a)
 {
-  EIGEN_STATIC_ASSERT(offset < unpacket_traits<Packet>::size, ROTATION_BY_ILLEGAL_OFFSET);
   return offset ? protate_impl<Packet, offset>::run(a) : a;
 }
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index e9af45f22..8dd1e1370 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -76,12 +76,12 @@ typedef uint32x4_t  Packet4ui;
 template<> struct packet_traits<float>  : default_packet_traits
 {
   typedef Packet4f type;
-  typedef Packet2f half;
+  typedef Packet4f half; // Packet2f intrinsics not implemented yet
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    HasHalfPacket=1,
+    HasHalfPacket=0, // Packet2f intrinsics not implemented yet
     HasDiv  = 1,
     // FIXME check the Has*
@@ -95,12 +95,12 @@ template<> struct packet_traits<float>  : default_packet_traits
 template<> struct packet_traits<int>    : default_packet_traits
 {
   typedef Packet4i type;
-  typedef Packet2i half;
+  typedef Packet4i half; // Packet2i intrinsics not implemented yet
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=4,
-    HasHalfPacket=1
+    HasHalfPacket=0 // Packet2i intrinsics not implemented yet
     // FIXME check the Has*
   };
 };
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 0890bc690..9061fd936 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -88,15 +88,15 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
 #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
   EIGEN_UNUSED_VARIABLE(num_threads);
   enum {
-    kr = 16,
+    kr = 8,
     mr = Traits::mr,
     nr = Traits::nr
   };
   k = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
   if (k > kr) k -= k % kr;
-  m = std::min(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
+  m = std::min(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
   if (m > mr) m -= m % mr;
-  n = std::min(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
+  n = std::min(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
   if (n > nr) n -= n % nr;
   return;
 #endif
@@ -153,16 +153,104 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
   }
   else {
     // In unit tests we do not want to use extra large matrices,
-    // so we reduce the block size to check the blocking strategy is not flawed
-#ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
-    k = std::min(k,sizeof(LhsScalar)<=4 ? 360 : 240);
-    n = std::min(n,3840/sizeof(RhsScalar));
-    m = std::min(m,3840/sizeof(RhsScalar));
-#else
-    k = std::min(k,24);
-    n = std::min(n,384/sizeof(RhsScalar));
-    m = std::min(m,384/sizeof(RhsScalar));
+    // so we reduce the cache size to check the blocking strategy is not flawed
+#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    l1 = 4*1024;
+    l2 = 32*1024;
+    l3 = 512*1024;
 #endif
+
+    // Early return for small problems because the computation below are time consuming for small problems.
+    // Perhaps it would make more sense to consider k*n*m??
+    // Note that for very tiny problem, this function should be bypassed anyway
+    // because we use the coefficient-based implementation for them.
+    if(std::max(k,std::max(m,n))<48)
+      return;
+
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      k_peeling = 8,
+      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
+    };
+
+    // ---- 1st level of blocking on L1, yields kc ----
+
+    // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel
+    // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache.
+    // We also include a register-level block of the result (mx x nr).
+    // (In an ideal world only the lhs panel would stay in L1)
+    // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
+    const Index max_kc = ((l1-k_sub)/k_div) & (~(k_peeling-1));
+    const Index old_k = k;
+    if(k>max_kc)
+    {
+      // We are really blocking on the third dimension:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the result.
+      k = (k%max_kc)==0 ? max_kc
+                        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
+
+      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
+    }
+
+    // ---- 2nd level of blocking on max(L2,L3), yields nc ----
+
+    // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
+    //      actual_l2 = max(l2, l3/nb_core_sharing_l3)
+    // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
+    // For instance, it corresponds to 6MB of L3 shared among 4 cores.
+    #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    const Index actual_l2 = l3;
+    #else
+    const Index actual_l2 = 1572864; // == 1.5 MB
+    #endif
+
+
+
+    // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
+    // The second half is implicitly reserved to access the result and lhs coefficients.
+    // When k(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
+    if(n>nc)
+    {
+      // We are really blocking over the columns:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the packed lhs.
+      //    Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
+      n = (n%nc)==0 ? nc
+                    : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
+    }
+    else if(old_k==k)
+    {
+      // So far, no blocking at all, i.e., kc==k, and nc==n.
+      // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
+      Index problem_size = k*n*sizeof(LhsScalar);
+      Index actual_lm = actual_l2;
+      Index max_mc = m;
+      if(problem_size<=1024)
+      {
+        // problem is small enough to keep in L1
+        // Let's choose m such that lhs's block fit in 1/3 of L1
+        actual_lm = l1;
+      }
+      else if(l3!=0 && problem_size<=32768)
+      {
+        // we have both L2 and L3, and problem is small enough to be kept in L2
+        // Let's choose m such that lhs's block fit in 1/3 of L2
+        actual_lm = l2;
+        max_mc = 576;
+      }
+
+      Index mc = (std::min)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
+      if (mc > Traits::mr) mc -= mc % Traits::mr;
+
+      m = (m%mc)==0 ? mc
+                    : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
+    }
   }
 }
@@ -712,6 +800,80 @@ protected:
   conj_helper cj;
 };
 
+// helper for the rotating kernel below
+template <typename GebpKernel, bool UseRotatingKernel = GebpKernel::UseRotatingKernel>
+struct PossiblyRotatingKernelHelper
+{
+  // default implementation, not rotating
+
+  typedef typename GebpKernel::Traits Traits;
+  typedef typename Traits::RhsScalar RhsScalar;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::AccPacket AccPacket;
+
+  const Traits& traits;
+  PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
+
+
+  template <size_t K, size_t Index>
+  void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
+  {
+    traits.loadRhs(from + (Index+4*K)*Traits::RhsProgress, to);
+  }
+
+  void unrotateResult(AccPacket&,
+                      AccPacket&,
+                      AccPacket&,
+                      AccPacket&)
+  {
+  }
+};
+
+// rotating implementation
+template <typename GebpKernel>
+struct PossiblyRotatingKernelHelper<GebpKernel, true>
+{
+  typedef typename GebpKernel::Traits Traits;
+  typedef typename Traits::RhsScalar RhsScalar;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::AccPacket AccPacket;
+
+  const Traits& traits;
+  PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
+
+  template <size_t K, size_t Index>
+  void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
+  {
+    if (Index == 0) {
+      to = pload<RhsPacket>(from + 4*K*Traits::RhsProgress);
+    } else {
+      EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers");
+      to = protate<1>(to);
+    }
+  }
+
+  void unrotateResult(AccPacket& res0,
+                      AccPacket& res1,
+                      AccPacket& res2,
+                      AccPacket& res3)
+  {
+    PacketBlock<AccPacket> resblock;
+    resblock.packet[0] = res0;
+    resblock.packet[1] = res1;
+    resblock.packet[2] = res2;
+    resblock.packet[3] = res3;
+    ptranspose(resblock);
+    resblock.packet[3] = protate<1>(resblock.packet[3]);
+    resblock.packet[2] = protate<2>(resblock.packet[2]);
+    resblock.packet[1] = protate<3>(resblock.packet[1]);
+    ptranspose(resblock);
+    res0 = resblock.packet[0];
+    res1 = resblock.packet[1];
+    res2 = resblock.packet[2];
+    res3 = resblock.packet[3];
+  }
+};
+
 /* optimized GEneral packed Block * packed Panel product kernel
  *
  * Mixing type logic: C += A * B
@@ -745,6 +907,16 @@ struct gebp_kernel
     ResPacketSize = Traits::ResPacketSize
   };
+
+  static const bool UseRotatingKernel =
+    EIGEN_ARCH_ARM &&
+    internal::is_same<LhsScalar, float>::value &&
+    internal::is_same<RhsScalar, float>::value &&
+    internal::is_same<ResScalar, float>::value &&
+    Traits::LhsPacketSize == 4 &&
+    Traits::RhsPacketSize == 4 &&
+    Traits::ResPacketSize == 4;
+
   EIGEN_DONT_INLINE
   void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
                   Index rows, Index depth, Index cols, ResScalar alpha,
@@ -778,6 +950,8 @@ void gebp_kernel=3*Traits::LhsProgress)
     {
+      PossiblyRotatingKernelHelper<gebp_kernel> possiblyRotatingKernelHelper(traits);
+
       // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth)
       for(Index i=0; i(&blB[(0+4*K)*RhsProgress]); \
-        } else { \
-          EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); \
-          B_0 = protate<1>(B_0); \
-        } \
-      } else { \
-        EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N); \
-      } \
-    } while (false)
-#else
-#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \
-  EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N)
-#endif
-
 #define EIGEN_GEBP_ONESTEP(K) \
   do { \
     EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
@@ -859,19 +1002,19 @@ void gebp_kernel(B_0, blB); \
     traits.madd(A0, B_0, C0, T0); \
     traits.madd(A1, B_0, C4, T0); \
     traits.madd(A2, B_0, C8, B_0); \
-    EIGEN_GEBP_ONESTEP_LOADRHS(K, 1); \
+    possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 1>(B_0, blB); \
     traits.madd(A0, B_0, C1, T0); \
     traits.madd(A1, B_0, C5, T0); \
     traits.madd(A2, B_0, C9, B_0); \
-    EIGEN_GEBP_ONESTEP_LOADRHS(K, 2); \
+    possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 2>(B_0, blB); \
     traits.madd(A0, B_0, C2, T0); \
     traits.madd(A1, B_0, C6, T0); \
    traits.madd(A2, B_0, C10, B_0); \
-    EIGEN_GEBP_ONESTEP_LOADRHS(K, 3); \
+    possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 3>(B_0, blB); \
     traits.madd(A0, B_0, C3 , T0); \
     traits.madd(A1, B_0, C7, T0); \
     traits.madd(A2, B_0, C11, B_0); \
@@ -904,34 +1047,10 @@ void gebp_kernel resblock; \
-          resblock.packet[0] = res0; \
-          resblock.packet[1] = res1; \
-          resblock.packet[2] = res2; \
-          resblock.packet[3] = res3; \
-          ptranspose(resblock); \
-          resblock.packet[3] = protate<1>(resblock.packet[3]); \
-          resblock.packet[2] = protate<2>(resblock.packet[2]); \
-          resblock.packet[1] = protate<3>(resblock.packet[1]); \
-          ptranspose(resblock); \
-          res0 = resblock.packet[0]; \
-          res1 = resblock.packet[1]; \
-          res2 = resblock.packet[2]; \
-          res3 = resblock.packet[3]; \
-        } while (false)
-
-        EIGEN_GEBP_UNROTATE_RESULT(C0, C1, C2, C3);
-        EIGEN_GEBP_UNROTATE_RESULT(C4, C5, C6, C7);
-        EIGEN_GEBP_UNROTATE_RESULT(C8, C9, C10, C11);
-        }
-#endif
+        possiblyRotatingKernelHelper.unrotateResult(C0, C1, C2, C3);
+        possiblyRotatingKernelHelper.unrotateResult(C4, C5, C6, C7);
+        possiblyRotatingKernelHelper.unrotateResult(C8, C9, C10, C11);
 
         ResPacket R0, R1, R2;
         ResPacket alphav = pset1<ResPacket>(alpha);
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index c38c12c31..c76f48154 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -164,6 +164,8 @@ static void run(Index rows, Index cols, Index depth,
     ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
+
+    const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols;
 
     // For each horizontal panel of the rhs, and corresponding panel of the lhs...
     for(Index i2=0; i2 /dev/null; then
+  update=true
+else
+  update=false
+fi
+
+if [ $update == true ]; then
+  echo "(Re-)Compute all changesets and keep bests"
+else
+  echo "Skip previously computed changesets"
+fi
+
 if [ ! -d "eigen_src" ]; then
   hg clone https://bitbucket.org/eigen/eigen eigen_src
 fi
@@ -8,9 +28,32 @@ if [ ! -z '$CXX' ]; then
   CXX=g++
 fi
 
-rm sgemm.out
-rm dgemm.out
-rm cgemm.out
+function make_backup
+{
+  if [ -f "$1.out" ]; then
+    mv "$1.out" "$1.backup"
+  fi
+}
+
+function merge
+{
+  count1=`echo $1 | wc -w`
+  count2=`echo $2 | wc -w`
+
+  if [ $count1 == $count2 ]; then
+    a=( $1 ); b=( $2 )
+    res=""
+    for (( i=0 ; i<$count1 ; i++ )); do
+      ai=${a[$i]}; bi=${b[$i]}
+      tmp=`echo "if ($ai > $bi) $ai else $bi " | bc -l`
+      res="$res $tmp"
+    done
+    echo $res
+
+  else
+    echo $1
+  fi
+}
 
 function test_current
 {
@@ -18,16 +61,32 @@ function test_current
   scalar=$2
   name=$3
 
-  if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src gemm.cpp -DSCALAR=$scalar -o $name; then
-    res=`./$name`
-    echo $res
-    echo "$rev $res" >> $name.out
+  prev=`grep $rev "$name.backup" | cut -c 14-`
+  res=$prev
+  count_rev=`echo $prev | wc -w`
+  count_ref=`cat "settings.txt" | wc -l`
+  if [ $update == true ] || [ $count_rev != $count_ref ]; then
+    if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src gemm.cpp -DSCALAR=$scalar -o $name; then
+      curr=`./$name`
+      echo merge $prev
+      echo with  $curr
+      res=`merge "$curr" "$prev"`
+      echo $res
+      echo "$rev $res" >> $name.out
+    else
+      echo "Compilation failed, skip rev $rev"
+    fi
   else
-    echo "Compilation failed, skip rev $rev"
+    echo "Skip existing results for $rev / $name"
+    echo "$rev $res" >> $name.out
   fi
 }
 
-while read rev
+make_backup $PREFIX"sgemm"
+make_backup $PREFIX"dgemm"
+make_backup $PREFIX"cgemm"
+
+cut -f1 -d"#" < changesets.txt | while read rev
 do
   if [ ! -z '$rev' ]; then
     echo "Testing rev $rev"
@@ -36,27 +95,27 @@ do
     actual_rev=`hg identify | cut -f1 -d' '`
     cd ..
 
-    test_current $actual_rev float sgemm
-    test_current $actual_rev double dgemm
-    test_current $actual_rev "std::complex<float>" cgemm
+    test_current $actual_rev float $PREFIX"sgemm"
+    test_current $actual_rev double $PREFIX"dgemm"
+    test_current $actual_rev "std::complex<float>" $PREFIX"cgemm"
 
   fi
 
-done < changesets.txt
+done
 
 echo "Float:"
-cat sgemm.out
+cat $PREFIX"sgemm.out"
 echo ""
 
 echo "Double:"
-cat dgemm.out
+cat $PREFIX"dgemm.out"
 echo ""
 
 echo "Complex:"
-cat cgemm.out
+cat $PREFIX"cgemm.out"
 echo ""
 
-./make_plot.sh sgemm
-./make_plot.sh dgemm
-./make_plot.sh cgemm
+./make_plot.sh $PREFIX"sgemm"
+./make_plot.sh $PREFIX"dgemm"
+./make_plot.sh $PREFIX"cgemm"
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index e22dd4de0..b2b28826a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -555,6 +555,11 @@ class TensorBase : public TensorBase(derived(), offset, dim);
     }
+    template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorReverseOp<const ReverseDimensions, Derived>
+    reverse(const ReverseDimensions& rev) const {
+      return TensorReverseOp<const ReverseDimensions, Derived>(derived(), rev);
+    }
     template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     TensorShufflingOp<const Shuffle, Derived>
     shuffle(const Shuffle& shuffle) const {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 5790e19d6..055a7d407 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -249,7 +249,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       innermostLoc = index;
     } else {
       if (internal::index_statically_eq<InputDimensions>()(0, 1)) {
-        eigen_assert(innermostLoc % m_impl.dimensions()[0] == 0);
+        eigen_assert(index % m_impl.dimensions()[0] == 0);
         innermostLoc = 0;
       } else {
         innermostLoc = index % m_impl.dimensions()[0];
@@ -302,7 +302,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       innermostLoc = index;
     } else {
       if (internal::index_statically_eq<InputDimensions>()(NumDims-1, 1)) {
-        eigen_assert(innermostLoc % m_impl.dimensions()[NumDims-1] == 0);
+        eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
         innermostLoc = 0;
       } else {
         innermostLoc = index % m_impl.dimensions()[NumDims-1];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 8b87f1045..9259c864e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -174,8 +174,6 @@ struct TensorEvaluatorm_device.numThreads();
     Index mc = m;
@@ -190,8 +188,8 @@ struct TensorEvaluator
     std::vector<Promise> kernel_promises(num_kernel_promises);
     std::vector<Future> kernel_futures(num_kernel_promises);
-    for (int i = 0; i < kernel_promises.size(); ++i) {
+    for (std::size_t i = 0; i < kernel_promises.size(); ++i) {
       kernel_promises[i].set_value();
       kernel_futures[i] = kernel_promises[i].get_future();
     }
@@ -239,16 +237,16 @@ struct TensorEvaluator 0);
-          int blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads;
+          Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads;
           for (int i = 0; i < n_blocks; ++i) {
-            int future_id = (blockAId * n_blocks + i);
+            Index future_id = (blockAId * n_blocks + i);
             wait_until_ready(&kernel_futures[future_id]);
             kernel_promises[future_id] = Promise();
             kernel_futures[future_id] = kernel_promises[future_id].get_future();
@@ -277,9 +275,9 @@ struct TensorEvaluator { }
   EIGEN_DEVICE_FUNC DSizes() {
-    for (int i = 0 ; i < NumDims; ++i) {
+    for (std::size_t i = 0 ; i < NumDims; ++i) {
       (*this)[i] = 0;
     }
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 05ac9bd2f..bb2f8b977 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -97,7 +97,7 @@ struct EvalRange {
     Index i = first;
     static const int PacketSize = unpacket_traits::size;
-    if (last - first > PacketSize) {
+    if (last - first >= PacketSize) {
       eigen_assert(first % PacketSize == 0);
       Index lastPacket = last - (last % PacketSize);
       for (; i < lastPacket; i += PacketSize) {
@@ -131,7 +131,6 @@ class TensorExecutor
     const Index blocksize = std::max(PacketSize, (blocksz - (blocksz % PacketSize)));
     const Index numblocks = size / blocksize;
-    Index i = 0;
     std::vector results;
     results.reserve(numblocks);
     for (int i = 0; i < numblocks; ++i) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index 2714117ab..11c7ce443 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -28,6 +28,23 @@ namespace Eigen {
 
 namespace internal {
 
+namespace {
+  // Note: result is undefined if val == 0
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int count_leading_zeros(const T val)
+  {
+#ifdef __CUDA_ARCH__
+    return __clz(val);
+#elif EIGEN_COMP_MSVC
+    DWORD leading_zero = 0;
+    _BitScanReverse( &leading_zero, value);
+    return 31 - leading_zero;
+#else
+    return __builtin_clz(static_cast(val));
+#endif
+  }
+}
+
 template <typename T>
 struct TensorIntDivisor {
  public:
@@ -44,11 +61,7 @@ struct TensorIntDivisor {
     eigen_assert(divider <= (1<<(N-1)) - 1);
 
     // fast ln2
-#ifndef __CUDA_ARCH__
-    const int leading_zeros = __builtin_clz(divider);
-#else
-    const int leading_zeros = __clz(divider);
-#endif
+    const int leading_zeros = count_leading_zeros(divider);
     const int log_div = N - (leading_zeros+1);
 
     multiplier = (static_cast(1) << (N+log_div)) / divider - (static_cast(1) << N) + 1;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index c119b30e2..054ecf7b5 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -85,6 +85,15 @@ class TensorLayoutSwapOp : public TensorBase, WriteA
     const typename internal::remove_all::type& expression() const { return m_xpr; }
 
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const TensorLayoutSwapOp& other)
+    {
+      typedef TensorAssignOp<TensorLayoutSwapOp, const TensorLayoutSwapOp> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index a93f48ccb..01ba0a80f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -302,7 +302,7 @@ struct TensorEvaluator, Devi
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
   {
-    for (int i = 0; i < internal::array_size::value; ++i) {
+    for (std::size_t i = 0; i < internal::array_size::value; ++i) {
       eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]);
     }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index ad21e966b..16bef2ad3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -49,12 +49,9 @@ struct nested, 1,
 
 }  // end namespace internal
 
-
-
-
 template<typename ReverseDimensions, typename XprType>
 class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions,
-                                          XprType>, ReadOnlyAccessors>
+                                          XprType>, WriteAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar;
@@ -67,8 +64,8 @@ class TensorReverseOp : public TensorBase::Index Index;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(const XprType& expr,
-                                                        const ReverseDimensions& reverse_dims)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(
+      const XprType& expr, const ReverseDimensions& reverse_dims)
       : m_xpr(expr), m_reverse_dims(reverse_dims) {}
 
     EIGEN_DEVICE_FUNC
@@ -78,12 +75,30 @@ class TensorReverseOp : public TensorBase::type& expression() const { return m_xpr; }
 
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorReverseOp& operator = (const TensorReverseOp& other)
+    {
+      typedef TensorAssignOp<TensorReverseOp, const TensorReverseOp> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorReverseOp& operator = (const OtherDerived& other)
+    {
+      typedef TensorAssignOp<TensorReverseOp, const OtherDerived> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return *this;
+    }
+
   protected:
     typename XprType::Nested m_xpr;
     const ReverseDimensions m_reverse_dims;
 };
-
 // Eval as rvalue
 template<typename ReverseDimensions, typename ArgType, typename Device>
 struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device>
@@ -134,8 +149,8 @@ struct TensorEvaluator, Device
     m_impl.cleanup();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
-  {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index reverseIndex(
+      Index index) const {
     eigen_assert(index < dimensions().TotalSize());
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -152,7 +167,6 @@
       } else {
        inputIndex += index;
      }
-      return m_impl.coeff(inputIndex);
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
         Index idx = index / m_strides[i];
@@ -167,8 +181,13 @@
       } else {
         inputIndex += index;
       }
-      return m_impl.coeff(inputIndex);
     }
+    return inputIndex;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(
+      Index index) const {
+    return m_impl.coeff(reverseIndex(index));
   }
 
   template<int LoadMode>
@@ -199,9 +218,57 @@ struct TensorEvaluator, Device
   ReverseDimensions m_reverse;
 };
 
+// Eval as lvalue
+
+template <typename ReverseDimensions, typename ArgType, typename Device>
+struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
+    : public TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>,
+                             Device> {
+  typedef TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>,
+                          Device> Base;
+  typedef TensorReverseOp<ReverseDimensions, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumDims = internal::array_size<ReverseDimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+  };
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
+                                                        const Device& device)
+      : Base(op, device) {}
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketReturnType PacketReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Dimensions& dimensions() const { return this->m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    return this->m_impl.coeffRef(this->reverseIndex(index));
+  }
+
+  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  void writePacket(Index index, const PacketReturnType& x) {
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+    // This code is pilfered from TensorMorphing.h
+    EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
+    internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+    for (int i = 0; i < packetSize; ++i) {
+      this->coeffRef(index+i) = values[i];
+    }
+  }
+
+};
 
-
-} // end namespace Eigen
+} // end namespace Eigen
 
 #endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
diff --git a/unsupported/test/cxx11_tensor_reverse.cpp b/unsupported/test/cxx11_tensor_reverse.cpp
index 4c0be35da..f96c21fa3 100644
--- a/unsupported/test/cxx11_tensor_reverse.cpp
+++ b/unsupported/test/cxx11_tensor_reverse.cpp
@@ -94,7 +94,7 @@ static void test_simple_reverse()
 
 template <int DataLayout>
-static void test_expr_reverse()
+static void test_expr_reverse(bool LValue)
 {
   Tensor<float, 4, DataLayout> tensor(2,3,5,7);
   tensor.setRandom();
@@ -105,9 +105,12 @@ static void test_expr_reverse()
   dim_rev[2] = false;
   dim_rev[3] = true;
 
-
-  Tensor<float, 4, DataLayout> expected;
-  expected = tensor.reverse(dim_rev);
+  Tensor<float, 4, DataLayout> expected(2, 3, 5, 7);
+  if (LValue) {
+    expected.reverse(dim_rev) = tensor;
+  } else {
+    expected = tensor.reverse(dim_rev);
+  }
 
   Tensor<float, 4, DataLayout> result(2,3,5,7);
 
@@ -117,8 +120,13 @@ static void test_expr_reverse()
   array dst_slice_start{{0,0,0,0}};
 
   for (int i = 0; i < 5; ++i) {
-    result.slice(dst_slice_start, dst_slice_dim) =
-        tensor.slice(src_slice_start, src_slice_dim).reverse(dim_rev);
+    if (LValue) {
+      result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev) =
+          tensor.slice(src_slice_start, src_slice_dim);
+    } else {
+      result.slice(dst_slice_start, dst_slice_dim) =
+          tensor.slice(src_slice_start, src_slice_dim).reverse(dim_rev);
+    }
     src_slice_start[2] += 1;
     dst_slice_start[2] += 1;
   }
@@ -141,8 +149,13 @@
   dst_slice_start[2] = 0;
   result.setRandom();
   for (int i = 0; i < 5; ++i) {
-    result.slice(dst_slice_start, dst_slice_dim) =
-        tensor.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
+    if (LValue) {
+      result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev) =
+          tensor.slice(dst_slice_start, dst_slice_dim);
+    } else {
+      result.slice(dst_slice_start, dst_slice_dim) =
+          tensor.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
+    }
     dst_slice_start[2] += 1;
   }
 
@@ -162,6 +175,8 @@ void test_cxx11_tensor_reverse()
 {
   CALL_SUBTEST(test_simple_reverse<ColMajor>());
   CALL_SUBTEST(test_simple_reverse<RowMajor>());
-  CALL_SUBTEST(test_expr_reverse<ColMajor>());
-  CALL_SUBTEST(test_expr_reverse<RowMajor>());
+  CALL_SUBTEST(test_expr_reverse<ColMajor>(true));
+  CALL_SUBTEST(test_expr_reverse<RowMajor>(true));
+  CALL_SUBTEST(test_expr_reverse<ColMajor>(false));
+  CALL_SUBTEST(test_expr_reverse<RowMajor>(false));
 }
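
Usage sketch (not part of the changeset): the TensorBase.h and TensorReverse.h hunks above make reverse() usable on the left-hand side of an assignment, which is what the updated cxx11_tensor_reverse.cpp test exercises. The snippet below is illustrative only; it assumes the unsupported CXX11 Tensor module is available via <unsupported/Eigen/CXX11/Tensor> and uses a plain rank-4 float tensor.

// Illustrative sketch of the new lvalue reverse(); mirrors test_expr_reverse(bool LValue).
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 4> input(2, 3, 5, 7);
  input.setRandom();

  Eigen::array<bool, 4> dim_rev;   // which dimensions to reverse
  dim_rev[0] = true;
  dim_rev[1] = false;
  dim_rev[2] = false;
  dim_rev[3] = true;

  // rvalue use (already available before this patch):
  Eigen::Tensor<float, 4> expected = input.reverse(dim_rev);

  // lvalue use (enabled by the WriteAccessors base and the lvalue evaluator added above):
  Eigen::Tensor<float, 4> output(2, 3, 5, 7);
  output.reverse(dim_rev) = input;   // writes through the reversed view

  return 0;
}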