Added a new version of the TensorIntDivisor class optimized for 32-bit signed integers. It saves 1 register on CPU and 2 on GPU.

2025-08-12 11:49:02 +08:00 · 2015-05-19 13:59:52 -07:00 · 2015-05-19 13:59:52 -07:00 · a81d17b73a
commit a81d17b73a
parent 051d5325cc
2 changed files with 64 additions and 2 deletions
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@ -87,6 +87,68 @@ struct TensorIntDivisor {
 };
 // Optimized version for signed 32 bit integers.
 // Derived from Hacker's Delight.
 //
 // The constructor precomputes a (magic, shift) pair for a fixed divisor;
 // divide() then replaces the integer division by one 32x32 -> 64 bit
 // multiplication followed by a right shift.
 // NOTE(review): divide() converts the numerator to an unsigned type before
 // multiplying, so this looks intended for non-negative numerators only
 // (e.g. tensor indices) -- confirm against callers.
 template <>
 class TensorIntDivisor<int> {
 public:
  // Default-constructs a placeholder with magic == 0 and shift == 0.
  // With these values divide() returns 0 for every input.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
    magic = 0;
    shift = 0;
  }
  // Must have 2 <= divider
  // The precondition is checked with eigen_assert only; the magic number and
  // shift for `divider` are computed once here by calcMagic().
  EIGEN_DEVICE_FUNC TensorIntDivisor(int divider)  {
    eigen_assert(divider >= 2);
    calcMagic(divider);
  }
  // Returns the high 32 bits of the 64 bit product magic * n, shifted right
  // by `shift`.
  // NOTE(review): for negative n the two code paths convert n differently
  // (to 32 bit unsigned for __umulhi, to uint64_t on the host), and neither
  // yields a signed quotient -- presumably callers only pass n >= 0.
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int n) const {
 #ifdef __CUDA_ARCH__
    // Device code: __umulhi is the CUDA intrinsic for the upper 32 bits of
    // an unsigned 32x32 bit multiplication.
    return (__umulhi(magic, n) >> shift);
 #else
  // Host code: form the full 64 bit product and keep its upper half.
  uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
  return (static_cast<unsigned int>(v >> 32) >> shift);
 #endif
  }
 private:
  // Compute the magic numbers. See Hacker's Delight section 10 for an in
  // depth explanation.
  //
  // Starting from p = 31, the loop doubles the quotient/remainder pairs
  // (q1, r1) = 2**p divided by |nc| and (q2, r2) = 2**p divided by |d| until
  // the exit test below holds, then stores magic = q2 + 1 and shift = p - 32.
  // The statements are order dependent and rely on unsigned arithmetic.
  EIGEN_DEVICE_FUNC void calcMagic(int d) {
   const unsigned two31 = 0x80000000;     // 2**31.
   unsigned ad = d;
   // For the asserted range d >= 2, (ad >> 31) is 0 and t equals 2**31.
   unsigned t = two31 + (ad >> 31);
   unsigned anc = t - 1 - t%ad;     // Absolute value of nc.
   int p = 31;                      // Init. p.
   unsigned q1 = two31/anc;         // Init. q1 = 2**p/|nc|.
   unsigned r1 = two31 - q1*anc;    // Init. r1 = rem(2**p, |nc|).
   unsigned q2 = two31/ad;          // Init. q2 = 2**p/|d|.
   unsigned r2 = two31 - q2*ad;     // Init. r2 = rem(2**p, |d|).
   // Placeholder value: delta is assigned inside the loop before it is read.
   unsigned delta = 0;
   do {
      p = p + 1;
      q1 = 2*q1;           // Update q1 = 2**p/|nc|.
      r1 = 2*r1;           // Update r1 = rem(2**p, |nc|).
      if (r1 >= anc) {     // (Must be an unsigned
         q1 = q1 + 1;      // comparison here).
         r1 = r1 - anc;}
      q2 = 2*q2;           // Update q2 = 2**p/|d|.
      r2 = 2*r2;           // Update r2 = rem(2**p, |d|).
      if (r2 >= ad) {      // (Must be an unsigned
         q2 = q2 + 1;      // comparison here).
         r2 = r2 - ad;}
      delta = ad - r2;
   // Keep iterating while q1 < delta, or q1 == delta with a zero remainder r1.
   } while (q1 < delta || (q1 == delta && r1 == 0));
   magic = (unsigned)(q2 + 1);
   // p is at least 32 here (the loop body runs at least once), so shift >= 0.
   shift = p - 32;
  }
  // Multiplier applied to the numerator in divide().
  unsigned int magic;
  // Right shift applied to the high half of the product in divide().
  int shift;
 };
 template <typename T>
 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T>& divisor) {
  return divisor.divide(numerator);
--- a/unsupported/test/cxx11_tensor_intdiv.cpp
+++ b/unsupported/test/cxx11_tensor_intdiv.cpp
@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@ -14,7 +14,7 @@
 void test_signed_32bit()
 {
-  for (int32_t i = 1; i < 25000; ++i) {
+  for (int32_t i = 2; i < 25000; ++i) {
    const Eigen::internal::TensorIntDivisor<int32_t> div(i);
    for (int32_t j = 0; j < 25000; ++j) {