From bf66137efc6bdeebb1085e9a4eeea0eb2cd286c1 Mon Sep 17 00:00:00 2001
From: Antonio Sanchez <cantonios@google.com>
Date: Thu, 26 Aug 2021 14:48:09 -0700
Subject: [PATCH] New GPU test utilities.

This introduces new functions:
```
// returns kernel(args...) running on the CPU.
Eigen::run_on_cpu(Kernel kernel, Args&&... args);

// returns kernel(args...) running on the GPU.
Eigen::run_on_gpu(Kernel kernel, Args&&... args);
Eigen::run_on_gpu_with_hint(size_t buffer_capacity_hint,
    Kernel kernel, Args&&... args);

// returns kernel(args...) running on the GPU if using
// a GPU compiler, or CPU otherwise.
Eigen::run(Kernel kernel, Args&&... args);
Eigen::run_with_hint(size_t buffer_capacity_hint,
    Kernel kernel, Args&&... args);
```

Running on the GPU is accomplished by:
- Serializing the kernel inputs on the CPU
- Transferring the inputs to the GPU
- Passing the kernel and serialized inputs to a GPU kernel
- Deserializing the inputs on the GPU
- Running `kernel(inputs...)` on the GPU
- Serializing all output parameters and the return value
- Transferring the serialized outputs back to the CPU
- Deserializing the outputs and return value on the CPU
- Returning the deserialized return value

All inputs must be serializable (currently POD types, `Eigen::Matrix`
and `Eigen::Array`).  The kernel must also be POD (though usually
contains no actual data).

Tested on CUDA 9.1, 10.2, 11.3, with g++-6, g++-8, g++-10 respectively.

This MR depends on !622, !623, !624.
---
 test/CMakeLists.txt    |   2 +
 test/gpu_example.cu    | 130 ++++++++++++
 test/gpu_test_helper.h | 464 +++++++++++++++++++++++++++++++++++++++++
 test/main.h            |  31 ++-
 4 files changed, 616 insertions(+), 11 deletions(-)
 create mode 100644 test/gpu_example.cu
 create mode 100644 test/gpu_test_helper.h

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 675742c04..5c415b400 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -412,6 +412,7 @@ if(CUDA_FOUND)
     string(APPEND CUDA_NVCC_FLAGS " ${EIGEN_CUDA_RELAXED_CONSTEXPR}")
 
     set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+    ei_add_test(gpu_example)
     ei_add_test(gpu_basic)
     unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 
@@ -442,6 +443,7 @@ if (EIGEN_TEST_HIP)
       set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
 
       ei_add_test(gpu_basic)
+      ei_add_test(gpu_example)
       unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 
   elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia"))
diff --git a/test/gpu_example.cu b/test/gpu_example.cu
new file mode 100644
index 000000000..554fc9b7c
--- /dev/null
+++ b/test/gpu_example.cu
@@ -0,0 +1,130 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2021 The Eigen Team.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// The following is an example GPU test.
+
+#include "main.h"  // Include the main test utilities.
+
+// Define a kernel functor.
+//
+// The kernel must be a POD type and implement operator().
+struct AddKernel {
+  // Parameters must be POD or serializable Eigen types (e.g. Matrix,
+  // Array). The return value must be a POD or serializable value type.
+  template <typename Type1, typename Type2, typename Type3>
+  EIGEN_DEVICE_FUNC
+  Type3 operator()(const Type1& A, const Type2& B, Type3& C) const {
+    C = A + B;        // Populate output parameter.
+    Type3 D = A + B;  // Populate return value.
+    return D;
+  }
+};
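+
+// For illustration only: since a kernel is just a POD functor, it can be
+// invoked directly on the CPU as an ordinary function object, e.g.
+//
+//   Eigen::Vector3f A = Eigen::Vector3f::Random();
+//   Eigen::Vector3f B = Eigen::Vector3f::Random();
+//   Eigen::Vector3f C;
+//   Eigen::Vector3f D = AddKernel()(A, B, C);  // C and D both equal A + B.
+//
+// The run(...) helpers used below wrap this same call with serialization
+// and host/device transfer so that identical test code can execute on the
+// GPU.
+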
+// Define a sub-test that uses the kernel.
+template <typename T>
+void test_add(const T& type) {
+  const Index rows = type.rows();
+  const Index cols = type.cols();
+
+  // Create random inputs.
+  const T A = T::Random(rows, cols);
+  const T B = T::Random(rows, cols);
+  T C;  // Output parameter.
+
+  // Create the kernel.
+  AddKernel add_kernel;
+
+  // Run add_kernel(A, B, C) via run(...).
+  // This will run on the GPU if using a GPU compiler, or on the CPU
+  // otherwise, facilitating generic tests that can run on either.
+  T D = run(add_kernel, A, B, C);
+
+  // Check that both the output parameter and the return value are
+  // correctly populated.
+  const T expected = A + B;
+  VERIFY_IS_CWISE_EQUAL(C, expected);
+  VERIFY_IS_CWISE_EQUAL(D, expected);
+
+  // In a GPU-only test, we can verify that the CPU and GPU produce the
+  // same results.
+  T C_cpu, C_gpu;
+  T D_cpu = run_on_cpu(add_kernel, A, B, C_cpu);  // Runs on the CPU.
+  T D_gpu = run_on_gpu(add_kernel, A, B, C_gpu);  // Runs on the GPU.
+  VERIFY_IS_CWISE_EQUAL(C_cpu, C_gpu);
+  VERIFY_IS_CWISE_EQUAL(D_cpu, D_gpu);
+}
+
+struct MultiplyKernel {
+  template <typename Type1, typename Type2, typename Type3>
+  EIGEN_DEVICE_FUNC
+  Type3 operator()(const Type1& A, const Type2& B, Type3& C) const {
+    C = A * B;
+    return A * B;
+  }
+};
+
+template <typename T1, typename T2, typename T3>
+void test_multiply(const T1& type1, const T2& type2, const T3& type3) {
+  const T1 A = T1::Random(type1.rows(), type1.cols());
+  const T2 B = T2::Random(type2.rows(), type2.cols());
+  T3 C;
+
+  MultiplyKernel multiply_kernel;
+
+  // The run(...) family of functions uses a memory buffer to transfer data
+  // back and forth between the host and the device. The size of this buffer
+  // is estimated from the sizes of all input parameters. If the estimated
+  // buffer size is not sufficient for transferring the outputs from device
+  // to host, then an explicit buffer size needs to be specified.
+
+  // 2 outputs of size (A * B). For each matrix output, the buffer will
+  // store the number of rows, the number of columns, and the data.
+  size_t buffer_capacity_hint =
+      2 * (                                 // 2 output parameters
+        2 * sizeof(typename T3::Index)      // # Rows, # Cols
+        + A.rows() * B.cols() * sizeof(typename T3::Scalar));  // Output data
+
+  T3 D = run_with_hint(buffer_capacity_hint, multiply_kernel, A, B, C);
+
+  const T3 expected = A * B;
+  VERIFY_IS_CWISE_APPROX(C, expected);
+  VERIFY_IS_CWISE_APPROX(D, expected);
+
+  T3 C_cpu, C_gpu;
+  T3 D_cpu = run_on_cpu(multiply_kernel, A, B, C_cpu);
+  T3 D_gpu = run_on_gpu_with_hint(buffer_capacity_hint,
+                                  multiply_kernel, A, B, C_gpu);
+  VERIFY_IS_CWISE_APPROX(C_cpu, C_gpu);
+  VERIFY_IS_CWISE_APPROX(D_cpu, D_gpu);
+}
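+
+// A worked example of the hint arithmetic above (illustrative only): with
+// T3 = Eigen::Matrix3d on a typical 64-bit platform, Index and double are
+// 8 bytes each, so each serialized output occupies 2 * 8 bytes for the
+// dimensions plus 3 * 3 * 8 = 72 bytes of data, giving
+//   buffer_capacity_hint = 2 * (16 + 72) = 176 bytes.
+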
+// Declare the test fixture.
+EIGEN_DECLARE_TEST(gpu_example)
+{
+  // For the number of repeats, call the desired subtests.
+  for(int i = 0; i < g_repeat; i++) {
+    // Call subtests with inputs of different sizes and types.
+    CALL_SUBTEST( test_add(Eigen::Vector3f()) );
+    CALL_SUBTEST( test_add(Eigen::Matrix3d()) );
+    CALL_SUBTEST( test_add(Eigen::MatrixX<float>(10, 10)) );
+
+    CALL_SUBTEST( test_add(Eigen::Array44f()) );
+    CALL_SUBTEST( test_add(Eigen::ArrayXd(20)) );
+    CALL_SUBTEST( test_add(Eigen::ArrayXXi(13, 17)) );
+
+    CALL_SUBTEST( test_multiply(Eigen::Matrix3d(),
+                                Eigen::Matrix3d(),
+                                Eigen::Matrix3d()) );
+    CALL_SUBTEST( test_multiply(Eigen::MatrixX<float>(10, 10),
+                                Eigen::MatrixX<float>(10, 10),
+                                Eigen::MatrixX<float>()) );
+    CALL_SUBTEST( test_multiply(Eigen::MatrixXf(12, 1),
+                                Eigen::MatrixXf(1, 32),
+                                Eigen::MatrixXf()) );
+  }
+}
diff --git a/test/gpu_test_helper.h b/test/gpu_test_helper.h
new file mode 100644
index 000000000..552166613
--- /dev/null
+++ b/test/gpu_test_helper.h
@@ -0,0 +1,464 @@
+#ifndef GPU_TEST_HELPER_H
+#define GPU_TEST_HELPER_H
+
+#include <Eigen/Core>
+
+#ifdef EIGEN_GPUCC
+#define EIGEN_USE_GPU
+#include "../unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h"
+#endif // EIGEN_GPUCC
+
+// std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that
+// doesn't allow std::tuple to compile for host code either. In these cases,
+// use our custom implementation.
+#if defined(EIGEN_GPU_COMPILE_PHASE) || (defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER < 92000)
+#define EIGEN_USE_CUSTOM_TUPLE 1
+#else
+#define EIGEN_USE_CUSTOM_TUPLE 0
+#endif
+
+#if EIGEN_USE_CUSTOM_TUPLE
+#include "../Eigen/src/Core/arch/GPU/Tuple.h"
+#else
+#include <tuple>
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+namespace tuple_impl {
+
+// Use std::tuple on CPU, otherwise use the GPU-specific versions.
+#if !EIGEN_USE_CUSTOM_TUPLE
+using std::tuple;
+using std::get;
+using std::make_tuple;
+using std::tie;
+#endif
+#undef EIGEN_USE_CUSTOM_TUPLE
+
+} // namespace tuple_impl
+
+template<size_t N, size_t Idx, typename OutputIndexSequence, typename... Ts>
+struct extract_output_indices_helper;
+
+/**
+ * Extracts a set of indices corresponding to non-const l-value reference
+ * output types.
+ *
+ * \internal
+ * \tparam N the number of types {T1, Ts...}.
+ * \tparam Idx the "index" to append if T1 is an output type.
+ * \tparam OutputIndices the current set of output indices.
+ * \tparam T1 the next type to consider, with index Idx.
+ * \tparam Ts the remaining types.
+ */
+template<size_t N, size_t Idx, size_t... OutputIndices, typename T1, typename... Ts>
+struct extract_output_indices_helper<N, Idx, index_sequence<OutputIndices...>, T1, Ts...> {
+  using type = typename
+    extract_output_indices_helper<
+      N - 1, Idx + 1,
+      typename std::conditional<
+        // If T1 is a non-const l-value reference, append its index.
+        std::is_lvalue_reference<T1>::value
+          && !std::is_const<typename std::remove_reference<T1>::type>::value,
+        index_sequence<OutputIndices..., Idx>,
+        index_sequence<OutputIndices...> >::type,
+      Ts...>::type;
+};
+
+// Base case.
+template<size_t Idx, size_t... OutputIndices>
+struct extract_output_indices_helper<0, Idx, index_sequence<OutputIndices...> > {
+  using type = index_sequence<OutputIndices...>;
+};
+
+// Extracts a set of indices into Types... that correspond to non-const
+// l-value references.
+template<typename... Types>
+using extract_output_indices = typename extract_output_indices_helper<
+    sizeof...(Types), 0, index_sequence<>, Types...>::type;
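+
+// For illustration, assuming std-style index_sequence semantics: for a
+// kernel signature taking (const float&, int&, double), only int& is a
+// non-const l-value reference, so
+//   extract_output_indices<const float&, int&, double>
+// is index_sequence<1>.
+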
+// Helper struct for dealing with generic functors that may return void.
+struct void_helper {
+  struct Void {};
+
+  // Converts void -> Void, T otherwise.
+  template<typename T>
+  using ReturnType = typename std::conditional<std::is_same<T, void>::value, Void, T>::type;
+
+  // Non-void return value.
+  template<typename Func, typename... Args>
+  static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC
+  auto call(Func&& func, Args&&... args) ->
+      typename std::enable_if<!std::is_same<decltype(func(args...)), void>::value,
+                              decltype(func(args...))>::type {
+    return func(std::forward<Args>(args)...);
+  }
+
+  // Void return value.
+  template<typename Func, typename... Args>
+  static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC
+  auto call(Func&& func, Args&&... args) ->
+      typename std::enable_if<std::is_same<decltype(func(args...)), void>::value,
+                              Void>::type {
+    func(std::forward<Args>(args)...);
+    return Void{};
+  }
+
+  // Restores the original return type: Void -> void, T otherwise.
+  template<typename T>
+  static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC
+  typename std::enable_if<!std::is_same<typename std::decay<T>::type, Void>::value, T>::type
+  restore(T&& val) {
+    return val;
+  }
+
+  // Void case.
+  template<typename T = void>
+  static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC
+  void restore(const Void&) {}
+};
+
+// Runs a kernel via a serialized buffer. Does this by deserializing the
+// buffer to construct the arguments, calling the kernel, then re-serializing
+// the outputs. On entry the buffer contains
+//     [ input_buffer_size, args ]
+// After the kernel call, it is populated with
+//     [ output_buffer_size, output_parameters, return_value ]
+// If output_buffer_size exceeds the buffer's capacity, then only
+// output_buffer_size is populated.
+template<typename Kernel, typename... Args, size_t... Indices, size_t... OutputIndices>
+EIGEN_DEVICE_FUNC
+void run_serialized(index_sequence<Indices...>, index_sequence<OutputIndices...>,
+                    Kernel kernel, uint8_t* buffer, size_t capacity) {
+  using Eigen::internal::tuple_impl::get;
+  using Eigen::internal::tuple_impl::make_tuple;
+  using Eigen::internal::tuple_impl::tuple;
+  // Deserialize the input size and inputs.
+  size_t input_size;
+  uint8_t* buff_ptr = Eigen::deserialize(buffer, input_size);
+  // Create value-type instances to populate.
+  auto args = make_tuple(typename std::decay<Args>::type{}...);
+  EIGEN_UNUSED_VARIABLE(args)  // Avoid an NVCC compile warning.
+  // NVCC 9.1 requires us to spell out the template parameters explicitly.
+  buff_ptr = Eigen::deserialize(buff_ptr, get<Indices, typename std::decay<Args>::type...>(args)...);
+
+  // Call the kernel, with a void->Void conversion so we are guaranteed a
+  // complete output type.
+  auto result = void_helper::call(kernel, get<Indices, typename std::decay<Args>::type...>(args)...);
+
+  // Determine the required output size.
+  size_t output_size = Eigen::serialize_size(capacity);
+  output_size += Eigen::serialize_size(get<OutputIndices, typename std::decay<Args>::type...>(args)...);
+  output_size += Eigen::serialize_size(result);
+
+  // Always serialize the required buffer size.
+  buff_ptr = Eigen::serialize(buffer, output_size);
+  // Serialize the outputs if they fit in the buffer.
+  if (output_size <= capacity) {
+    // Collect the output parameters and the result.
+    buff_ptr = Eigen::serialize(buff_ptr, get<OutputIndices, typename std::decay<Args>::type...>(args)...);
+    buff_ptr = Eigen::serialize(buff_ptr, result);
+  }
+}
+
+template<typename Kernel, typename... Args>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void run_serialized(Kernel kernel, uint8_t* buffer, size_t capacity) {
+  run_serialized<Kernel, Args...>(make_index_sequence<sizeof...(Args)>{},
+                                  extract_output_indices<Args...>{},
+                                  kernel, buffer, capacity);
+}
+
+#ifdef EIGEN_GPUCC
+
+// Checks for GPU errors and asserts / prints the error message.
+#define GPU_CHECK(expr)                                                 \
+do {                                                                    \
+  gpuError_t err = expr;                                                \
+  if (err != gpuSuccess) {                                              \
+    printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));   \
+    gpu_assert(false);                                                  \
+  }                                                                     \
+} while(0)
+
+// Calls run_serialized on the GPU.
+template<typename Kernel, typename... Args>
+__global__
+EIGEN_HIP_LAUNCH_BOUNDS_1024
+void run_serialized_on_gpu_meta_kernel(const Kernel kernel, uint8_t* buffer, size_t capacity) {
+  run_serialized<Kernel, Args...>(kernel, buffer, capacity);
+}
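+
+// For illustration, the serialization API used above (introduced by the
+// dependent serialization MRs) follows a pointer-advancing convention,
+// roughly:
+//
+//   float x = 1.0f;
+//   Eigen::Matrix3f M = Eigen::Matrix3f::Identity();
+//   std::vector<uint8_t> buf(Eigen::serialize_size(x, M));
+//   uint8_t* ptr = Eigen::serialize(buf.data(), x, M);  // Write x, M.
+//   ptr = Eigen::deserialize(buf.data(), x, M);         // Read them back.
+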
+// Runs kernel(args...) on the GPU via the serialization mechanism.
+//
+// Note: if the outputs do not fit in the transfer buffer, this fails and
+// prints the required capacity, which can then be supplied as a hint.
+template<typename Kernel, typename... Args, size_t... Indices, size_t... OutputIndices>
+auto run_serialized_on_gpu(size_t buffer_capacity_hint,
+                           index_sequence<Indices...>,
+                           index_sequence<OutputIndices...>,
+                           Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+  // Compute the required serialization buffer capacity. Round the input
+  // size up to the next power of two to give a little extra room for the
+  // outputs.
+  size_t input_data_size = sizeof(size_t) + Eigen::serialize_size(args...);
+
+  size_t capacity;
+  if (buffer_capacity_hint == 0) {
+    // Estimate as the power of two larger than the total input size.
+    capacity = sizeof(size_t);
+    while (capacity <= input_data_size) {
+      capacity *= 2;
+    }
+  } else {
+    // Use the larger of the hint and the total input size.
+    // Add sizeof(size_t) to the hint to account for storing the buffer
+    // capacity itself, so the user doesn't need to think about this.
+    capacity = std::max(buffer_capacity_hint + sizeof(size_t),
+                        input_data_size);
+  }
+
+  // Serialize the input size and inputs into the host buffer.
+  std::vector<uint8_t> buffer(capacity);
+  uint8_t* host_data = buffer.data();
+  uint8_t* host_ptr = Eigen::serialize(host_data, input_data_size);
+  host_ptr = Eigen::serialize(host_ptr, args...);
+
+  // Copy the inputs to the device.
+  uint8_t* device_data = nullptr;
+  gpuMalloc((void**)(&device_data), capacity);
+  gpuMemcpy(device_data, host_data, input_data_size, gpuMemcpyHostToDevice);
+  GPU_CHECK(gpuDeviceSynchronize());
+
+  // Run the kernel.
+  #ifdef EIGEN_USE_HIP
+  hipLaunchKernelGGL(
+      HIP_KERNEL_NAME(run_serialized_on_gpu_meta_kernel<Kernel, Args...>),
+      1, 1, 0, 0, kernel, device_data, capacity);
+  #else
+  run_serialized_on_gpu_meta_kernel<Kernel, Args...><<<1,1>>>(
+      kernel, device_data, capacity);
+  #endif
+  // Check for pre-launch and kernel execution errors.
+  GPU_CHECK(gpuGetLastError());
+  GPU_CHECK(gpuDeviceSynchronize());
+  // Copy the outputs back to the host.
+  gpuMemcpy(host_data, device_data, capacity, gpuMemcpyDeviceToHost);
+  gpuFree(device_data);
+  GPU_CHECK(gpuDeviceSynchronize());
+
+  // Determine the output buffer size.
+  size_t output_data_size = 0;
+  host_ptr = Eigen::deserialize(host_data, output_data_size);
+  // If the output doesn't fit in the buffer, print a warning and fail.
+  if (output_data_size > capacity) {
+    std::cerr << "The serialized output does not fit in the output buffer, "
+              << output_data_size << " vs capacity " << capacity << "."
+              << std::endl
+              << "Try specifying a minimum buffer capacity: " << std::endl
+              << "  run_with_hint(" << output_data_size << ", ...)"
+              << std::endl;
+    VERIFY(false);
+  }
+
+  // Deserialize the outputs.
+  auto args_tuple = Eigen::internal::tuple_impl::tie(args...);
+  EIGEN_UNUSED_VARIABLE(args_tuple)  // Avoid an NVCC compile warning.
+  host_ptr = Eigen::deserialize(host_ptr, Eigen::internal::tuple_impl::get<OutputIndices, Args&...>(args_tuple)...);
+
+  // Maybe deserialize the return value, properly handling void.
+  typename void_helper::ReturnType<decltype(kernel(args...))> result;
+  host_ptr = Eigen::deserialize(host_ptr, result);
+  return void_helper::restore(result);
+}
+
+#endif // EIGEN_GPUCC
+
+} // namespace internal
+
+/**
+ * Runs a kernel on the CPU, returning the results.
+ * \param kernel kernel to run.
+ * \param args ... input arguments.
+ * \return kernel(args...).
+ */
+template<typename Kernel, typename... Args>
+auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+  return kernel(std::forward<Args>(args)...);
+}
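+
+// For illustration, a hypothetical minimal kernel and call (SquareKernel
+// is not part of this helper):
+//
+//   struct SquareKernel {
+//     EIGEN_DEVICE_FUNC float operator()(float x) const { return x * x; }
+//   };
+//   float y = Eigen::run_on_cpu(SquareKernel{}, 3.0f);  // y == 9.0f.
+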
+#ifdef EIGEN_GPUCC
+
+/**
+ * Runs a kernel on the GPU, returning the results.
+ *
+ * The kernel must be able to be passed directly as an input to a global
+ * function (i.e. empty or POD). Its inputs must be "Serializable" so we
+ * can transfer them to the device, and the output must be a "Serializable"
+ * value type so it can be transferred back from the device.
+ *
+ * \param kernel kernel to run.
+ * \param args ... input arguments, must be "Serializable".
+ * \return kernel(args...).
+ */
+template<typename Kernel, typename... Args>
+auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+  return internal::run_serialized_on_gpu(
+      /*buffer_capacity_hint=*/ 0,
+      internal::make_index_sequence<sizeof...(Args)>{},
+      internal::extract_output_indices<Args...>{},
+      kernel, std::forward<Args>(args)...);
+}
+
+/**
+ * Runs a kernel on the GPU, returning the results.
+ *
+ * This version allows specifying a minimum buffer capacity required for
+ * serializing the outputs when transferring results from device to host.
+ * Use this when `run_on_gpu(...)` fails to determine an appropriate
+ * capacity by default.
+ *
+ * \param buffer_capacity_hint minimum required buffer size for serializing
+ *     the outputs.
+ * \param kernel kernel to run.
+ * \param args ... input arguments, must be "Serializable".
+ * \return kernel(args...).
+ * \sa run_on_gpu
+ */
+template<typename Kernel, typename... Args>
+auto run_on_gpu_with_hint(size_t buffer_capacity_hint,
+                          Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+  return internal::run_serialized_on_gpu(
+      buffer_capacity_hint,
+      internal::make_index_sequence<sizeof...(Args)>{},
+      internal::extract_output_indices<Args...>{},
+      kernel, std::forward<Args>(args)...);
+}
+
+/**
+ * Kernel for determining basic Eigen compile-time information
+ * (i.e. the CUDA/HIP arch).
+ */
+struct CompileTimeDeviceInfoKernel {
+  struct Info {
+    int cuda;
+    int hip;
+  };
+
+  EIGEN_DEVICE_FUNC
+  Info operator()() const
+  {
+    Info info = {-1, -1};
+    #if defined(__CUDA_ARCH__)
+    info.cuda = static_cast<int>(__CUDA_ARCH__ + 0);
+    #endif
+    #if defined(EIGEN_HIP_DEVICE_COMPILE)
+    info.hip = static_cast<int>(EIGEN_HIP_DEVICE_COMPILE + 0);
+    #endif
+    return info;
+  }
+};
+
+/**
+ * Queries and prints the compile-time and runtime GPU info.
+ */
+void print_gpu_device_info()
+{
+  int device = 0;
+  gpuDeviceProp_t deviceProp;
+  gpuGetDeviceProperties(&deviceProp, device);
+
+  auto info = run_on_gpu(CompileTimeDeviceInfoKernel());
+
+  std::cout << "GPU compile-time info:\n";
+
+  #ifdef EIGEN_CUDACC
+  std::cout << "  EIGEN_CUDACC:             " << int(EIGEN_CUDACC) << std::endl;
+  #endif
+
+  #ifdef EIGEN_CUDA_SDK_VER
+  std::cout << "  EIGEN_CUDA_SDK_VER:       " << int(EIGEN_CUDA_SDK_VER) << std::endl;
+  #endif
+
+  #ifdef EIGEN_COMP_NVCC
+  std::cout << "  EIGEN_COMP_NVCC:          " << int(EIGEN_COMP_NVCC) << std::endl;
+  #endif
+
+  #ifdef EIGEN_HIPCC
+  std::cout << "  EIGEN_HIPCC:              " << int(EIGEN_HIPCC) << std::endl;
+  #endif
+
+  std::cout << "  EIGEN_CUDA_ARCH:          " << info.cuda << std::endl;
+  std::cout << "  EIGEN_HIP_DEVICE_COMPILE: " << info.hip << std::endl;
+
+  std::cout << "GPU device info:\n";
+  std::cout << "  name:                        " << deviceProp.name << std::endl;
+  std::cout << "  capability:                  " << deviceProp.major << "." << deviceProp.minor << std::endl;
+  std::cout << "  multiProcessorCount:         " << deviceProp.multiProcessorCount << std::endl;
+  std::cout << "  maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << std::endl;
+  std::cout << "  warpSize:                    " << deviceProp.warpSize << std::endl;
+  std::cout << "  regsPerBlock:                " << deviceProp.regsPerBlock << std::endl;
+  std::cout << "  concurrentKernels:           " << deviceProp.concurrentKernels << std::endl;
+  std::cout << "  clockRate:                   " << deviceProp.clockRate << std::endl;
+  std::cout << "  canMapHostMemory:            " << deviceProp.canMapHostMemory << std::endl;
+  std::cout << "  computeMode:                 " << deviceProp.computeMode << std::endl;
+}
+
+#endif // EIGEN_GPUCC
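+
+// For illustration, kernels returning void are also supported (handled via
+// internal::void_helper); a hypothetical in-place kernel:
+//
+//   struct ScaleInPlaceKernel {
+//     template <typename T>
+//     EIGEN_DEVICE_FUNC void operator()(T& A) const { A *= 2; }
+//   };
+//   Eigen::MatrixXf M = Eigen::MatrixXf::Random(4, 4);
+//   Eigen::run(ScaleInPlaceKernel{}, M);  // M is updated in place.
+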
+/**
+ * Runs a kernel on the GPU (if EIGEN_GPUCC), or the CPU otherwise.
+ *
+ * This simplifies the creation of generic tests that can run on either
+ * device.
+ *
+ * The kernel must be able to be passed directly as an input to a global
+ * function (i.e. empty or POD). Its inputs must be "Serializable" so we
+ * can transfer them to the device, and the output must be a "Serializable"
+ * value type so it can be transferred back from the device.
+ *
+ * \param kernel kernel to run.
+ * \param args ... input arguments, must be "Serializable".
+ * \return kernel(args...).
+ */
+template<typename Kernel, typename... Args>
+auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+#ifdef EIGEN_GPUCC
+  return run_on_gpu(kernel, std::forward<Args>(args)...);
+#else
+  return run_on_cpu(kernel, std::forward<Args>(args)...);
+#endif
+}
+
+/**
+ * Runs a kernel on the GPU (if EIGEN_GPUCC), or the CPU otherwise.
+ *
+ * This version allows specifying a minimum buffer capacity required for
+ * serializing the outputs when transferring results from device to host.
+ * Use this when `run(...)` fails to determine an appropriate capacity by
+ * default.
+ *
+ * \param buffer_capacity_hint minimum required buffer size for serializing
+ *     the outputs.
+ * \param kernel kernel to run.
+ * \param args ... input arguments, must be "Serializable".
+ * \return kernel(args...).
+ * \sa run
+ */
+template<typename Kernel, typename... Args>
+auto run_with_hint(size_t buffer_capacity_hint,
+                   Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+#ifdef EIGEN_GPUCC
+  return run_on_gpu_with_hint(buffer_capacity_hint, kernel, std::forward<Args>(args)...);
+#else
+  return run_on_cpu(kernel, std::forward<Args>(args)...);
+#endif
+}
+
+} // namespace Eigen
+
+#endif // GPU_TEST_HELPER_H
diff --git a/test/main.h b/test/main.h
index e4977fc33..73e972eea 100644
--- a/test/main.h
+++ b/test/main.h
@@ -55,19 +55,26 @@
 #endif
 #endif
 
-// Same for cuda_fp16.h
-#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
-  // Means the compiler is either nvcc or clang with CUDA enabled
+// Configure GPU.
+#if defined(EIGEN_USE_HIP)
+  #if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
+    #define EIGEN_HIPCC __HIPCC__
+    #include <hip/hip_runtime.h>
+    #include <hip/hip_runtime_api.h>
+  #endif
+#elif defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
   #define EIGEN_CUDACC __CUDACC__
+  #include <cuda.h>
+  #include <cuda_runtime.h>
+  #include <cuda_runtime_api.h>
+  #if CUDA_VERSION >= 7050
+    #include <cuda_fp16.h>
+  #endif
 #endif
-#if defined(EIGEN_CUDACC)
-#include <cuda.h>
-  #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
-#else
-  #define EIGEN_CUDA_SDK_VER 0
-#endif
-#if EIGEN_CUDA_SDK_VER >= 70500
-#include <cuda_fp16.h>
+
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
+  #define EIGEN_TEST_NO_LONGDOUBLE
+  #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #endif
 
 // To test that all calls from Eigen code to std::min() and std::max() are
@@ -1081,3 +1088,5 @@ int main(int argc, char *argv[])
 // 4503 - decorated name length exceeded, name was truncated
 #pragma warning( disable : 4503)
 #endif
+
+#include "gpu_test_helper.h"