mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-06-04 18:54:00 +08:00
Added support for 128-bit integers on CUDA devices.
This commit is contained in:
parent
7d1cedd0fe
commit
f8df393165
@ -67,6 +67,7 @@
|
||||
#include "src/Tensor/TensorInitializer.h"
|
||||
#include "src/Tensor/TensorTraits.h"
|
||||
#include "src/Tensor/TensorFunctors.h"
|
||||
#include "src/Tensor/TensorUInt128.h"
|
||||
#include "src/Tensor/TensorIntDiv.h"
|
||||
|
||||
#include "src/Tensor/TensorBase.h"
|
||||
|
233
unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
Normal file
233
unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
Normal file
@ -0,0 +1,233 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
|
||||
// Compile-time wrapper around a uint64_t constant.  It behaves like the
// value n in arithmetic expressions (via the implicit conversion below)
// while encoding the value in the type, which lets TensorUInt128
// instantiations with statically known words be partially folded away.
template <uint64_t n>
struct static_val {
  static const uint64_t value = n;
  // Implicit conversion so a static_val can be used wherever a uint64_t is
  // expected.
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; }

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { }
  // Accepts a runtime value for interface compatibility with plain integer
  // types; the provided value must equal the compile-time constant.
  template <typename T>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
    eigen_assert(v == n);
  }
};
|
||||
|
||||
|
||||
// Minimal unsigned 128-bit integer emulated with two 64-bit words.  It is
// needed because CUDA device code has no native __uint128_t.  HIGH and LOW
// default to uint64_t but may be static_val<> instantiations when one of
// the words is a compile-time constant.
template <typename HIGH = uint64_t, typename LOW = uint64_t>
struct TensorUInt128
{
  HIGH high;  // most significant 64 bits
  LOW low;    // least significant 64 bits

  // Widening construction from a signed int; negative inputs are invalid
  // for an unsigned type.
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  TensorUInt128(int x) : high(0), low(x) {
    eigen_assert(x >= 0);
  }
  // Widening construction from a signed 64-bit value; must be non-negative.
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  TensorUInt128(int64_t x) : high(0), low(x) {
    eigen_assert(x >= 0);
  }
  // Widening construction from an unsigned 64-bit value.
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  TensorUInt128(uint64_t x) : high(0), low(x) { }
  // Assembles the value from its two halves: y is the high word, x the low.
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  TensorUInt128(uint64_t y, uint64_t x) : high(y), low(x) { }

  // NOTE(review): implicit narrowing conversion — discards the high word.
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator LOW() const {
    return low;
  }
  // Accessor for the least significant 64 bits.
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LOW lower() const {
    return low;
  }
  // Accessor for the most significant 64 bits.
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HIGH upper() const {
    return high;
  }
};
|
||||
|
||||
|
||||
template <typename HL, typename LL, typename HR, typename LR>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
static bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
|
||||
{
|
||||
return (lhs.high == rhs.high) & (lhs.low == rhs.low);
|
||||
}
|
||||
|
||||
template <typename HL, typename LL, typename HR, typename LR>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
static bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
|
||||
{
|
||||
return (lhs.high != rhs.high) | (lhs.low != rhs.low);
|
||||
}
|
||||
|
||||
template <typename HL, typename LL, typename HR, typename LR>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
static bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
|
||||
{
|
||||
if (lhs.high != rhs.high) {
|
||||
return lhs.high > rhs.high;
|
||||
}
|
||||
return lhs.low >= rhs.low;
|
||||
}
|
||||
|
||||
template <typename HL, typename LL, typename HR, typename LR>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
static bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
|
||||
{
|
||||
if (lhs.high != rhs.high) {
|
||||
return lhs.high < rhs.high;
|
||||
}
|
||||
return lhs.low < rhs.low;
|
||||
}
|
||||
|
||||
template <typename HL, typename LL, typename HR, typename LR>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
static TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
|
||||
{
|
||||
TensorUInt128<uint64_t, uint64_t> result(lhs.high + rhs.high, lhs.low + rhs.low);
|
||||
if (result.low < rhs.low) {
|
||||
result.high += 1;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename HL, typename LL, typename HR, typename LR>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
static TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
|
||||
{
|
||||
TensorUInt128<uint64_t, uint64_t> result(lhs.high - rhs.high, lhs.low - rhs.low);
|
||||
if (result.low > lhs.low) {
|
||||
result.high -= 1;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
// Full 128x128-bit multiply; the result wraps modulo 2^128, matching the
// behavior of the built-in unsigned integer types.
template <typename HL, typename LL, typename HR, typename LR>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
static TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
  // Split each 128-bit integer into 4 32-bit integers, and then do the
  // multiplications by hand as follow:
  //   lhs      a  b  c  d
  //   rhs      e  f  g  h
  //           -----------
  //           ah bh ch dh
  //           bg cg dg
  //           cf df
  //           de
  // The result is stored in 2 64bit integers, high and low.

  static const uint64_t LOW = 0x00000000FFFFFFFFLL;   // mask for the low 32 bits
  static const uint64_t HIGH = 0xFFFFFFFF00000000LL;  // mask for the high 32 bits

  // 32-bit digits of lhs, most significant first: lhs = a:b:c:d.
  uint64_t d = lhs.low & LOW;
  uint64_t c = (lhs.low & HIGH) >> 32LL;
  uint64_t b = lhs.high & LOW;
  uint64_t a = (lhs.high & HIGH) >> 32LL;

  // 32-bit digits of rhs, most significant first: rhs = e:f:g:h.
  uint64_t h = rhs.low & LOW;
  uint64_t g = (rhs.low & HIGH) >> 32LL;
  uint64_t f = rhs.high & LOW;
  uint64_t e = (rhs.high & HIGH) >> 32LL;

  // Compute the low 32 bits of low
  uint64_t acc = d * h;
  uint64_t low = acc & LOW;
  // Compute the high 32 bits of low. Add a carry every time we wrap around
  acc >>= 32LL;
  uint64_t carry = 0;
  uint64_t acc2 = acc + c * h;
  if (acc2 < acc) {
    carry++;
  }
  acc = acc2 + d * g;
  if (acc < acc2) {
    carry++;
  }
  low |= (acc << 32LL);

  // Carry forward the high bits of acc to initiate the computation of the
  // low 32 bits of high
  acc2 = (acc >> 32LL) | (carry << 32LL);
  carry = 0;

  acc = acc2 + b * h;
  if (acc < acc2) {
    carry++;
  }
  acc2 = acc + c * g;
  if (acc2 < acc) {
    carry++;
  }
  acc = acc2 + d * f;
  if (acc < acc2) {
    carry++;
  }
  uint64_t high = acc & LOW;

  // Start to compute the high 32 bits of high.  Carries out of these top
  // 32 bits would fall outside of the 128-bit result, so they no longer
  // need to be tracked.
  acc2 = (acc >> 32LL) | (carry << 32LL);

  acc = acc2 + a * h;
  acc2 = acc + b * g;
  acc = acc2 + c * f;
  acc2 = acc + d * e;
  high |= (acc2 << 32LL);

  return TensorUInt128<uint64_t, uint64_t>(high, low);
}
|
||||
|
||||
template <typename HL, typename LL, typename HR, typename LR>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
static TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
|
||||
{
|
||||
if (rhs == TensorUInt128<static_val<0>, static_val<1> >(1)) {
|
||||
return TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
|
||||
} else if (lhs < rhs) {
|
||||
return TensorUInt128<uint64_t, uint64_t>(0);
|
||||
} else {
|
||||
// calculate the biggest power of 2 times rhs that's less than or equal to lhs
|
||||
TensorUInt128<uint64_t, uint64_t> power2(1);
|
||||
TensorUInt128<uint64_t, uint64_t> d(rhs);
|
||||
TensorUInt128<uint64_t, uint64_t> tmp(lhs - d);
|
||||
while (lhs >= d) {
|
||||
tmp = tmp - d;
|
||||
d = d + d;
|
||||
power2 = power2 + power2;
|
||||
}
|
||||
|
||||
tmp = TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
|
||||
TensorUInt128<uint64_t, uint64_t> result(0);
|
||||
while (power2 != TensorUInt128<static_val<0>, static_val<0> >(0)) {
|
||||
if (tmp >= d) {
|
||||
tmp = tmp - d;
|
||||
result = result + power2;
|
||||
}
|
||||
// Shift right
|
||||
power2 = TensorUInt128<uint64_t, uint64_t>(power2.high >> 1, (power2.low >> 1) | (power2.high << 63));
|
||||
d = TensorUInt128<uint64_t, uint64_t>(d.high >> 1, (d.low >> 1) | (d.high << 63));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace internal
|
||||
} // namespace Eigen
|
||||
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
|
@ -117,6 +117,7 @@ if(EIGEN_TEST_CXX11)
|
||||
ei_add_test(cxx11_tensor_of_const_values "-std=c++0x")
|
||||
ei_add_test(cxx11_tensor_of_complex "-std=c++0x")
|
||||
ei_add_test(cxx11_tensor_of_strings "-std=c++0x")
|
||||
ei_add_test(cxx11_tensor_uint128 "-std=c++0x")
|
||||
ei_add_test(cxx11_tensor_intdiv "-std=c++0x")
|
||||
ei_add_test(cxx11_tensor_lvalue "-std=c++0x")
|
||||
ei_add_test(cxx11_tensor_map "-std=c++0x")
|
||||
|
144
unsupported/test/cxx11_tensor_uint128.cpp
Normal file
144
unsupported/test/cxx11_tensor_uint128.cpp
Normal file
@ -0,0 +1,144 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#include "main.h"
|
||||
|
||||
#include <Eigen/CXX11/Tensor>
|
||||
|
||||
using Eigen::internal::TensorUInt128;
|
||||
using Eigen::internal::static_val;
|
||||
|
||||
// Compares a TensorUInt128 against a reference __uint128_t and aborts the
// test on mismatch.  Fixed to print the actual and expected (high, low)
// word pairs — the previous revision only reported that a failure occurred,
// which made diagnosing a miscomputation impossible.
// NOTE(review): __LINE__ reports this helper's line, not the call site;
// converting to a macro would improve that but is left as-is here.
static void VERIFY_EQUAL(TensorUInt128<uint64_t, uint64_t> actual, __uint128_t expected) {
  bool matchl = actual.lower() == static_cast<uint64_t>(expected);
  bool matchh = actual.upper() == static_cast<uint64_t>(expected >> 64);
  if (!matchl || !matchh) {
    const char* testname = g_test_stack.back().c_str();
    std::cerr << "Test " << testname << " failed in " << __FILE__
              << " (" << __LINE__ << "): got ("
              << actual.upper() << ", " << actual.lower()
              << "), expected ("
              << static_cast<uint64_t>(expected >> 64) << ", "
              << static_cast<uint64_t>(expected) << ")"
              << std::endl;
    abort();
  }
}
|
||||
|
||||
|
||||
static void test_add() {
|
||||
uint64_t incr = internal::random<uint64_t>(1, 9999999999);
|
||||
for (uint64_t i1 = 0; i1 < 100; ++i1) {
|
||||
for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
|
||||
TensorUInt128<uint64_t, uint64_t> i(i1, i2);
|
||||
__uint128_t a = (static_cast<__uint128_t>(i1) << 64) + static_cast<__uint128_t>(i2);
|
||||
for (uint64_t j1 = 0; j1 < 100; ++j1) {
|
||||
for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
|
||||
TensorUInt128<uint64_t, uint64_t> j(j1, j2);
|
||||
__uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2);
|
||||
TensorUInt128<uint64_t, uint64_t> actual = i + j;
|
||||
__uint128_t expected = a + b;
|
||||
VERIFY_EQUAL(actual, expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void test_sub() {
|
||||
uint64_t incr = internal::random<uint64_t>(1, 9999999999);
|
||||
for (uint64_t i1 = 0; i1 < 100; ++i1) {
|
||||
for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
|
||||
TensorUInt128<uint64_t, uint64_t> i(i1, i2);
|
||||
__uint128_t a = (static_cast<__uint128_t>(i1) << 64) + static_cast<__uint128_t>(i2);
|
||||
for (uint64_t j1 = 0; j1 < 100; ++j1) {
|
||||
for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
|
||||
TensorUInt128<uint64_t, uint64_t> j(j1, j2);
|
||||
__uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2);
|
||||
TensorUInt128<uint64_t, uint64_t> actual = i - j;
|
||||
__uint128_t expected = a - b;
|
||||
VERIFY_EQUAL(actual, expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void test_mul() {
|
||||
uint64_t incr = internal::random<uint64_t>(1, 9999999999);
|
||||
for (uint64_t i1 = 0; i1 < 100; ++i1) {
|
||||
for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
|
||||
TensorUInt128<uint64_t, uint64_t> i(i1, i2);
|
||||
__uint128_t a = (static_cast<__uint128_t>(i1) << 64) + static_cast<__uint128_t>(i2);
|
||||
for (uint64_t j1 = 0; j1 < 100; ++j1) {
|
||||
for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
|
||||
TensorUInt128<uint64_t, uint64_t> j(j1, j2);
|
||||
__uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2);
|
||||
TensorUInt128<uint64_t, uint64_t> actual = i * j;
|
||||
__uint128_t expected = a * b;
|
||||
VERIFY_EQUAL(actual, expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void test_div() {
|
||||
uint64_t incr = internal::random<uint64_t>(1, 9999999999);
|
||||
for (uint64_t i1 = 0; i1 < 100; ++i1) {
|
||||
for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
|
||||
TensorUInt128<uint64_t, uint64_t> i(i1, i2);
|
||||
__uint128_t a = (static_cast<__uint128_t>(i1) << 64) + static_cast<__uint128_t>(i2);
|
||||
for (uint64_t j1 = 0; j1 < 100; ++j1) {
|
||||
for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
|
||||
TensorUInt128<uint64_t, uint64_t> j(j1, j2);
|
||||
__uint128_t b = (static_cast<__uint128_t>(j1) << 64) + static_cast<__uint128_t>(j2);
|
||||
TensorUInt128<uint64_t, uint64_t> actual = i / j;
|
||||
__uint128_t expected = a / b;
|
||||
VERIFY_EQUAL(actual, expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void test_misc1() {
|
||||
uint64_t incr = internal::random<uint64_t>(1, 9999999999);
|
||||
for (uint64_t i2 = 1; i2 < 100 * incr; i2 += incr) {
|
||||
TensorUInt128<static_val<0>, uint64_t> i(0, i2);
|
||||
__uint128_t a = static_cast<__uint128_t>(i2);
|
||||
for (uint64_t j2 = 1; j2 < 100 * incr; j2 += incr) {
|
||||
TensorUInt128<static_val<0>, uint64_t> j(0, j2);
|
||||
__uint128_t b = static_cast<__uint128_t>(j2);
|
||||
uint64_t actual = (i * j).upper();
|
||||
uint64_t expected = (a * b) >> 64;
|
||||
VERIFY_IS_EQUAL(actual, expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Checks the mixed static/dynamic TensorUInt128 code paths by computing
// floor(2^(64+log_div) / divider) - 2^64 + 1 both with native __uint128_t
// and with TensorUInt128 operands whose constant words use static_val<>.
// NOTE(review): this looks like the magic-multiplier computation used for
// fast integer division — confirm against TensorIntDiv.h.
static void test_misc2() {
  int64_t incr = internal::random<int64_t>(1, 100);
  for (int64_t log_div = 0; log_div < 63; ++log_div) {
    for (int64_t divider = 1; divider <= 1000000 * incr; divider += incr) {
      // Reference value via the compiler's 128-bit type; the subtraction of
      // 2^64 and the truncation to uint64_t are intentional.
      uint64_t expected = (static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1;
      uint64_t shift = 1ULL << log_div;

      // Same computation with TensorUInt128; static_val<> operands exercise
      // the statically-known-word specializations.
      TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1));
      // Only the low word is compared; relies on the implicit operator LOW().
      uint64_t actual = static_cast<uint64_t>(result);
      VERIFY_EQUAL(actual, expected);
    }
  }
}
|
||||
|
||||
|
||||
// Entry point registered with the Eigen test driver: runs every
// TensorUInt128 subtest against the compiler's native __uint128_t.
void test_cxx11_tensor_uint128()
{
  CALL_SUBTEST(test_add());
  CALL_SUBTEST(test_sub());
  CALL_SUBTEST(test_mul());
  CALL_SUBTEST(test_div());
  CALL_SUBTEST(test_misc1());
  CALL_SUBTEST(test_misc2());
}
|
Loading…
x
Reference in New Issue
Block a user