From 50df8d3d6d7595dc062ee18868978d466c3c43fe Mon Sep 17 00:00:00 2001
From: Ben Barsdell
Date: Mon, 18 Oct 2021 20:58:14 +1100
Subject: [PATCH] Avoid integer overflow in EigenMetaKernel indexing

- The current implementation computes `size + total_threads`, which can
  overflow and cause CUDA_ERROR_ILLEGAL_ADDRESS when size is close to the
  maximum representable value.
- The num_blocks calculation can also overflow due to the implementation
  of divup().
- This patch prevents these overflows and allows the kernel to work
  correctly for the full representable range of tensor sizes.
- Also adds relevant tests.
---
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h  | 65 +++++++++++++++++--
 .../Eigen/CXX11/src/Tensor/TensorMeta.h      |  6 +-
 unsupported/test/cxx11_tensor_gpu.cu         | 45 +++++++++++++
 3 files changed, 109 insertions(+), 7 deletions(-)
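
Note: a minimal standalone sketch (illustration only, not applied by this
patch) of the divup() overflow described above; the size and block_size
values below are arbitrary examples chosen for the sketch.

    #include <climits>
    #include <cstdio>

    int main() {
      const int size = INT_MAX;   // a tensor size near the maximum representable value
      const int block_size = 256;
      // Old divup() form: size + block_size - 1 exceeds INT_MAX (signed overflow),
      // so in practice it wraps negative and yields a nonsense block count:
      //   const int bad = (size + block_size - 1) / block_size;
      // New form: never computes a value larger than size, so it cannot overflow.
      const int good = size == 0 ? 0 : (size - 1) / block_size + 1;
      std::printf("%d\n", good);  // prints 8388608 == ceil(INT_MAX / 256.0)
      return 0;
    }
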
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 279be346e..9a513dc8a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -553,11 +553,59 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
 };
 
 #if defined(EIGEN_GPUCC)
+// Returns 1 if lhs + rhs would overflow, -1 if it would underflow, otherwise 0.
+template <typename Index>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int sum_will_overflow(Index lhs,
+                                                            Index rhs) {
+  const Index highest = NumTraits<Index>::highest();
+  const Index lowest = NumTraits<Index>::lowest();
+  if (lhs > 0 && rhs > 0) {
+    return lhs > highest - rhs ? 1 : 0;
+  } else if (lhs < 0 && rhs < 0) {
+    return lhs < lowest - rhs ? -1 : 0;
+  } else {
+    return 0;
+  }
+}
+
+// Returns lhs + rhs, saturating to the highest/lowest representable value on
+// overflow/underflow respectively.
+template <typename Index>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index saturate_add(Index lhs, Index rhs) {
+  const Index highest = NumTraits<Index>::highest();
+  const Index lowest = NumTraits<Index>::lowest();
+  int overflow = sum_will_overflow(lhs, rhs);
+  return overflow == 1 ? highest : overflow == -1 ? lowest : lhs + rhs;
+}
+
+// A functor that adds step_size to a given index, saturating to avoid
+// overflow/underflow. If overflow/underflow is not possible, regular addition
+// is used (for efficiency).
+template <typename Index>
+struct SafeStep {
+  // lastIdx is one past the end of the possible indexes.
+  // step_size is the value that will be added to the given index when the
+  // functor is called.
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SafeStep(Index lastIdx, Index step_size)
+      : can_overflow_(sum_will_overflow(lastIdx, step_size)),
+        step_size_(step_size) {}
+
+  // Adds step_size to index, saturating on overflow (if overflow is possible).
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index operator()(Index index) const {
+    return can_overflow_ ? saturate_add(index, step_size_) : index + step_size_;
+  }
+
+ private:
+  const bool can_overflow_;
+  const Index step_size_;
+};
+
 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
 struct EigenMetaKernelEval {
   static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
-    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
+    SafeStep<StorageIndex> safe_step(lastIdx, step_size);
+    for (StorageIndex i = firstIdx; i < lastIdx; i = safe_step(i)) {
       eval.evalScalar(i);
     }
   }
@@ -571,12 +619,16 @@ struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
     const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
     const StorageIndex vectorized_step_size = step_size * PacketSize;
 
+    SafeStep<StorageIndex> safe_vectorized_step(vectorized_size,
+                                                vectorized_step_size);
     // Use the vector path
     for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
-         i += vectorized_step_size) {
+         i = safe_vectorized_step(i)) {
       eval.evalPacket(i);
     }
-    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
+    SafeStep<StorageIndex> safe_step(lastIdx, step_size);
+    for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx;
+         i = safe_step(i)) {
       eval.evalScalar(i);
     }
   }
@@ -603,8 +655,11 @@ EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Ti
   if (needs_assign) {
 
     const int block_size = device.maxGpuThreadsPerBlock();
-    const int max_blocks = device.getNumGpuMultiProcessors() *
-                           device.maxGpuThreadsPerMultiProcessor() / block_size;
+    const int max_blocks =
+        numext::mini<int64_t>(device.getNumGpuMultiProcessors() *
                                  device.maxGpuThreadsPerMultiProcessor(),
+                              NumTraits<StorageIndex>::highest()) /
+        block_size;
     const StorageIndex size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index b3f4a1ce9..cf891ebd3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -30,13 +30,15 @@ const T2& choose(Cond<false>, const T1&, const T2& second) {
 template <typename T, typename X, typename Y>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T divup(const X x, const Y y) {
-  return static_cast<T>((x + y - 1) / y);
+  // Note: This form is used because it cannot overflow.
+  return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
 }
 
 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T divup(const T x, const T y) {
-  return static_cast<T>((x + y - 1) / y);
+  // Note: This form is used because it cannot overflow.
+  return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
 }
 
 template <size_t n> struct max_n_1 {
diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu
index 0a37c0293..83b150d9c 100644
--- a/unsupported/test/cxx11_tensor_gpu.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -66,6 +66,47 @@ void test_gpu_nullary() {
   gpuFree(d_in2);
 }
 
+// Tests that there are no indexing overflows when computing tensors with the
+// max representable size.
+template <typename IndexType,
+          IndexType N = (std::numeric_limits<IndexType>::max)()>
+void test_gpu_nullary_max_size()
+{
+  typedef int8_t DataType;
+  typedef Tensor<DataType, 1, 0, IndexType> TensorType;
+  typedef Eigen::array<IndexType, 1> ArrayType;
+
+  const IndexType n = N;
+  TensorType in1((ArrayType(n)));
+  in1.setZero();
+
+  std::size_t in1_bytes = in1.size() * sizeof(DataType);
+
+  DataType* d_in1;
+  gpuMalloc((void**)(&d_in1), in1_bytes);
+
+  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<TensorType> gpu_in1(d_in1, ArrayType(n));
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(123);
+
+  TensorType new1((ArrayType(n)));
+
+  assert(gpuMemcpyAsync(new1.data(), d_in1, in1_bytes, gpuMemcpyDeviceToHost,
+                        gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  for (IndexType i = 0; i < n; ++i) {
+    VERIFY_IS_EQUAL(new1(ArrayType(i)), 123);
+  }
+
+  gpuFree(d_in1);
+}
+
 void test_gpu_elementwise_small() {
   Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
   Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
@@ -1524,6 +1565,10 @@ void test_gpu_gamma_sample_der_alpha()
 EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
 {
   CALL_SUBTEST_1(test_gpu_nullary());
+  CALL_SUBTEST_1(test_gpu_nullary_max_size<int16_t>());
+  CALL_SUBTEST_1(test_gpu_nullary_max_size<int32_t>());
+  CALL_SUBTEST_1((test_gpu_nullary_max_size<
+      int64_t, (std::numeric_limits<int32_t>::max)() + 100ll>()));
   CALL_SUBTEST_1(test_gpu_elementwise_small());
   CALL_SUBTEST_1(test_gpu_elementwise());
   CALL_SUBTEST_1(test_gpu_props());