From bf66137efc6bdeebb1085e9a4eeea0eb2cd286c1 Mon Sep 17 00:00:00 2001
From: Antonio Sanchez <cantonios@google.com>
Date: Thu, 26 Aug 2021 14:48:09 -0700
Subject: [PATCH] New GPU test utilities.

This introduces new functions:
```
// returns kernel(args...) running on the CPU.
Eigen::run_on_cpu(Kernel kernel, Args&&... args);

// returns kernel(args...) running on the GPU.
Eigen::run_on_gpu(Kernel kernel, Args&&... args);
Eigen::run_on_gpu_with_hint(size_t buffer_capacity_hint,
    Kernel kernel, Args&&... args);

// returns kernel(args...) running on the GPU if using
// a GPU compiler, or CPU otherwise.
Eigen::run(Kernel kernel, Args&&... args);
Eigen::run_with_hint(size_t buffer_capacity_hint,
    Kernel kernel, Args&&... args);
```

Running on the GPU is accomplished by:
- Serializing the kernel inputs on the CPU
- Transferring the inputs to the GPU
- Passing the kernel and serialized inputs to a GPU kernel
- Deserializing the inputs on the GPU
- Running `kernel(inputs...)` on the GPU
- Serializing all output parameters and the return value
- Transferring the serialized outputs back to the CPU
- Deserializing the outputs and return value on the CPU
- Returning the deserialized return value

All inputs must be serializable (currently POD types, `Eigen::Matrix`
and `Eigen::Array`).  The kernel must also be POD (though usually
contains no actual data).

Tested on CUDA 9.1, 10.2, 11.3, with g++-6, g++-8, g++-10 respectively.

This MR depends on !622, !623, !624.
---
 test/CMakeLists.txt    |   2 +
 test/gpu_example.cu    | 130 ++++++++++++
 test/gpu_test_helper.h | 464 +++++++++++++++++++++++++++++++++++++++++
 test/main.h            |  31 ++-
 4 files changed, 616 insertions(+), 11 deletions(-)
 create mode 100644 test/gpu_example.cu
 create mode 100644 test/gpu_test_helper.h

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 675742c04..5c415b400 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -412,6 +412,7 @@ if(CUDA_FOUND)
     string(APPEND CUDA_NVCC_FLAGS " ${EIGEN_CUDA_RELAXED_CONSTEXPR}")
 
     set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+    ei_add_test(gpu_example)
     ei_add_test(gpu_basic)
     unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 
@@ -442,6 +443,7 @@ if (EIGEN_TEST_HIP)
       set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
 
       ei_add_test(gpu_basic)
+      ei_add_test(gpu_example)
       unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 
   elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia"))
diff --git a/test/gpu_example.cu b/test/gpu_example.cu
new file mode 100644
index 000000000..554fc9b7c
--- /dev/null
+++ b/test/gpu_example.cu
@@ -0,0 +1,130 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2021 The Eigen Team.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// The following is an example GPU test.
+
+#include "main.h"  // Include the main test utilities.
+
+// Define a kernel functor.
+//
+// The kernel must be a POD type and implement operator().
+struct AddKernel {
+  // Parameters must be POD or serializable Eigen types (e.g. Matrix,
+  // Array). The return value must be a POD or serializable value type.
+  template <typename Type1, typename Type2, typename Type3>
+  EIGEN_DEVICE_FUNC
+  Type3 operator()(const Type1& A, const Type2& B, Type3& C) const {
+    C = A + B;        // Populate output parameter.
+    Type3 D = A + B;  // Populate return value.
+    return D;
+  }
+};
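+
+// For illustration only: since a kernel is just a POD functor, it can be
+// invoked directly on the CPU as an ordinary function object, e.g.
+//
+//   Eigen::Vector3f A = Eigen::Vector3f::Random();
+//   Eigen::Vector3f B = Eigen::Vector3f::Random();
+//   Eigen::Vector3f C;
+//   Eigen::Vector3f D = AddKernel()(A, B, C);  // C and D both equal A + B.
+//
+// The run(...) helpers used below wrap this same call with serialization
+// and host/device transfer so that identical test code can execute on the
+// GPU.
+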
+// Define a sub-test that uses the kernel.
+template <typename T>
+void test_add(const T& type) {
+  const Index rows = type.rows();
+  const Index cols = type.cols();
+
+  // Create random inputs.
+  const T A = T::Random(rows, cols);
+  const T B = T::Random(rows, cols);
+  T C;  // Output parameter.
+
+  // Create the kernel.
+  AddKernel add_kernel;
+
+  // Run add_kernel(A, B, C) via run(...).
+  // This will run on the GPU if using a GPU compiler, or on the CPU
+  // otherwise, facilitating generic tests that can run on either.
+  T D = run(add_kernel, A, B, C);
+
+  // Check that both the output parameter and the return value are
+  // correctly populated.
+  const T expected = A + B;
+  VERIFY_IS_CWISE_EQUAL(C, expected);
+  VERIFY_IS_CWISE_EQUAL(D, expected);
+
+  // In a GPU-only test, we can verify that the CPU and GPU produce the
+  // same results.
+  T C_cpu, C_gpu;
+  T D_cpu = run_on_cpu(add_kernel, A, B, C_cpu);  // Runs on the CPU.
+  T D_gpu = run_on_gpu(add_kernel, A, B, C_gpu);  // Runs on the GPU.
+  VERIFY_IS_CWISE_EQUAL(C_cpu, C_gpu);
+  VERIFY_IS_CWISE_EQUAL(D_cpu, D_gpu);
+}
+
+struct MultiplyKernel {
+  template <typename Type1, typename Type2, typename Type3>
+  EIGEN_DEVICE_FUNC
+  Type3 operator()(const Type1& A, const Type2& B, Type3& C) const {
+    C = A * B;
+    return A * B;
+  }
+};
+
+template <typename T1, typename T2, typename T3>
+void test_multiply(const T1& type1, const T2& type2, const T3& type3) {
+  const T1 A = T1::Random(type1.rows(), type1.cols());
+  const T2 B = T2::Random(type2.rows(), type2.cols());
+  T3 C;
+
+  MultiplyKernel multiply_kernel;
+
+  // The run(...) family of functions uses a memory buffer to transfer data
+  // back and forth between the host and the device. The size of this buffer
+  // is estimated from the sizes of all input parameters. If the estimated
+  // buffer size is not sufficient for transferring the outputs from device
+  // to host, then an explicit buffer size needs to be specified.
+
+  // 2 outputs of size (A * B). For each matrix output, the buffer will
+  // store the number of rows, the number of columns, and the data.
+  size_t buffer_capacity_hint =
+      2 * (                                 // 2 output parameters
+        2 * sizeof(typename T3::Index)      // # Rows, # Cols
+        + A.rows() * B.cols() * sizeof(typename T3::Scalar));  // Output data
+
+  T3 D = run_with_hint(buffer_capacity_hint, multiply_kernel, A, B, C);
+
+  const T3 expected = A * B;
+  VERIFY_IS_CWISE_APPROX(C, expected);
+  VERIFY_IS_CWISE_APPROX(D, expected);
+
+  T3 C_cpu, C_gpu;
+  T3 D_cpu = run_on_cpu(multiply_kernel, A, B, C_cpu);
+  T3 D_gpu = run_on_gpu_with_hint(buffer_capacity_hint,
+                                  multiply_kernel, A, B, C_gpu);
+  VERIFY_IS_CWISE_APPROX(C_cpu, C_gpu);
+  VERIFY_IS_CWISE_APPROX(D_cpu, D_gpu);
+}
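+
+// A worked example of the hint arithmetic above (illustrative only): with
+// T3 = Eigen::Matrix3d on a typical 64-bit platform, Index and double are
+// 8 bytes each, so each serialized output occupies 2 * 8 bytes for the
+// dimensions plus 3 * 3 * 8 = 72 bytes of data, giving
+//   buffer_capacity_hint = 2 * (16 + 72) = 176 bytes.
+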
+// Declare the test fixture.
+EIGEN_DECLARE_TEST(gpu_example)
+{
+  // For the number of repeats, call the desired subtests.
+  for(int i = 0; i < g_repeat; i++) {
+    // Call subtests with inputs of different sizes and types.
+    CALL_SUBTEST( test_add(Eigen::Vector3f()) );
+    CALL_SUBTEST( test_add(Eigen::Matrix3d()) );
+    CALL_SUBTEST( test_add(Eigen::MatrixX<float>(10, 10)) );
+
+    CALL_SUBTEST( test_add(Eigen::Array44f()) );
+    CALL_SUBTEST( test_add(Eigen::ArrayXd(20)) );
+    CALL_SUBTEST( test_add(Eigen::ArrayXXi(13, 17)) );
+
+    CALL_SUBTEST( test_multiply(Eigen::Matrix3d(),
+                                Eigen::Matrix3d(),
+                                Eigen::Matrix3d()) );
+    CALL_SUBTEST( test_multiply(Eigen::MatrixX<float>(10, 10),
+                                Eigen::MatrixX<float>(10, 10),
+                                Eigen::MatrixX<float>()) );
+    CALL_SUBTEST( test_multiply(Eigen::MatrixXf(12, 1),
+                                Eigen::MatrixXf(1, 32),
+                                Eigen::MatrixXf()) );
+  }
+}
diff --git a/test/gpu_test_helper.h b/test/gpu_test_helper.h
new file mode 100644
index 000000000..552166613
--- /dev/null
+++ b/test/gpu_test_helper.h
@@ -0,0 +1,464 @@
+#ifndef GPU_TEST_HELPER_H
+#define GPU_TEST_HELPER_H
+
+#include <Eigen/Core>
+
+#ifdef EIGEN_GPUCC
+#define EIGEN_USE_GPU
+#include "../unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h"
+#endif // EIGEN_GPUCC
+
+// std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that
+// doesn't allow std::tuple to compile for host code either. In these cases,
+// use our custom implementation.
+#if defined(EIGEN_GPU_COMPILE_PHASE) || (defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER < 92000)
+#define EIGEN_USE_CUSTOM_TUPLE 1
+#else
+#define EIGEN_USE_CUSTOM_TUPLE 0
+#endif
+
+#if EIGEN_USE_CUSTOM_TUPLE
+#include "../Eigen/src/Core/arch/GPU/Tuple.h"
+#else
+#include <tuple>
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+namespace tuple_impl {
+
+// Use std::tuple on CPU, otherwise use the GPU-specific versions.
+#if !EIGEN_USE_CUSTOM_TUPLE
+using std::tuple;
+using std::get;
+using std::make_tuple;
+using std::tie;
+#endif
+#undef EIGEN_USE_CUSTOM_TUPLE
+
+} // namespace tuple_impl
+
+template<size_t N, size_t Idx, typename OutputIndexSequence, typename... Ts>
+struct extract_output_indices_helper;
+
+/**
+ * Extracts a set of indices corresponding to non-const l-value reference
+ * output types.
+ *
+ * \internal
+ * \tparam N the number of types {T1, Ts...}.
+ * \tparam Idx the "index" to append if T1 is an output type.
+ * \tparam OutputIndices the current set of output indices.
+ * \tparam T1 the next type to consider, with index Idx.
+ * \tparam Ts the remaining types.
+ */
+template<size_t N, size_t Idx, size_t... OutputIndices, typename T1, typename... Ts>
+struct extract_output_indices_helper<N, Idx, index_sequence<OutputIndices...>, T1, Ts...> {
+  using type = typename
+    extract_output_indices_helper<
+      N - 1, Idx + 1,
+      typename std::conditional<
+        // If T1 is a non-const l-value reference, append its index.
+        std::is_lvalue_reference<T1>::value
+          && !std::is_const<typename std::remove_reference<T1>::type>::value,
+        index_sequence<OutputIndices..., Idx>,
+        index_sequence<OutputIndices...> >::type,
+      Ts...>::type;
+};
+
+// Base case.
+template<size_t Idx, size_t... OutputIndices>
+struct extract_output_indices_helper<0, Idx, index_sequence<OutputIndices...> > {
+  using type = index_sequence<OutputIndices...>;
+};
+
+// Extracts a set of indices into Types... that correspond to non-const
+// l-value references.
+template<typename... Types>
+using extract_output_indices = typename extract_output_indices_helper<
+    sizeof...(Types), 0, index_sequence<>, Types...>::type;
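+
+// For illustration, assuming std-style index_sequence semantics: for a
+// kernel signature taking (const float&, int&, double), only int& is a
+// non-const l-value reference, so
+//   extract_output_indices<const float&, int&, double>
+// is index_sequence<1>.
+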
+// Helper struct for dealing with generic functors that may return void.
+struct void_helper {
+  struct Void {};
+
+  // Converts void -> Void, T otherwise.
+  template<typename T>
+  using ReturnType = typename std::conditional<std::is_same<T, void>::value, Void, T>::type;
+
+  // Non-void return value.
+  template<typename Func, typename... Args>
+  static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC
+  auto call(Func&& func, Args&&... args) ->
+      typename std::enable_if<!std::is_same<decltype(func(args...)), void>::value,
+                              decltype(func(args...))>::type {
+    return func(std::forward<Args>(args)...);
+  }
+
+  // Void return value.
+  template<typename Func, typename... Args>
+  static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC
+  auto call(Func&& func, Args&&... args) ->
+      typename std::enable_if<std::is_same<decltype(func(args...)), void>::value,
+                              Void>::type {
+    func(std::forward<Args>(args)...);
+    return Void{};
+  }
+
+  // Restores the original return type: Void -> void, T otherwise.
+  template<typename T>
+  static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC
+  typename std::enable_if<!std::is_same<typename std::decay<T>::type, Void>::value, T>::type
+  restore(T&& val) {
+    return val;
+  }
+
+  // Void case.
+  template<typename T = void>
+  static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC
+  void restore(const Void&) {}
+};
+
+// Runs a kernel via a serialized buffer. Does this by deserializing the
+// buffer to construct the arguments, calling the kernel, then re-serializing
+// the outputs. On entry the buffer contains
+//     [ input_buffer_size, args ]
+// After the kernel call, it is populated with
+//     [ output_buffer_size, output_parameters, return_value ]
+// If output_buffer_size exceeds the buffer's capacity, then only
+// output_buffer_size is populated.
+template<typename Kernel, typename... Args, size_t... Indices, size_t... OutputIndices>
+EIGEN_DEVICE_FUNC
+void run_serialized(index_sequence<Indices...>, index_sequence<OutputIndices...>,
+                    Kernel kernel, uint8_t* buffer, size_t capacity) {
+  using Eigen::internal::tuple_impl::get;
+  using Eigen::internal::tuple_impl::make_tuple;
+  using Eigen::internal::tuple_impl::tuple;
+  // Deserialize the input size and inputs.
+  size_t input_size;
+  uint8_t* buff_ptr = Eigen::deserialize(buffer, input_size);
+  // Create value-type instances to populate.
+  auto args = make_tuple(typename std::decay<Args>::type{}...);
+  EIGEN_UNUSED_VARIABLE(args)  // Avoid an NVCC compile warning.
+  // NVCC 9.1 requires us to spell out the template parameters explicitly.
+  buff_ptr = Eigen::deserialize(buff_ptr, get<Indices, typename std::decay<Args>::type...>(args)...);
+
+  // Call the kernel, with a void->Void conversion so we are guaranteed a
+  // complete output type.
+  auto result = void_helper::call(kernel, get<Indices, typename std::decay<Args>::type...>(args)...);
+
+  // Determine the required output size.
+  size_t output_size = Eigen::serialize_size(capacity);
+  output_size += Eigen::serialize_size(get<OutputIndices, typename std::decay<Args>::type...>(args)...);
+  output_size += Eigen::serialize_size(result);
+
+  // Always serialize the required buffer size.
+  buff_ptr = Eigen::serialize(buffer, output_size);
+  // Serialize the outputs if they fit in the buffer.
+  if (output_size <= capacity) {
+    // Collect the output parameters and the result.
+    buff_ptr = Eigen::serialize(buff_ptr, get<OutputIndices, typename std::decay<Args>::type...>(args)...);
+    buff_ptr = Eigen::serialize(buff_ptr, result);
+  }
+}
+
+template<typename Kernel, typename... Args>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+void run_serialized(Kernel kernel, uint8_t* buffer, size_t capacity) {
+  run_serialized<Kernel, Args...>(make_index_sequence<sizeof...(Args)>{},
+                                  extract_output_indices<Args...>{},
+                                  kernel, buffer, capacity);
+}
+
+#ifdef EIGEN_GPUCC
+
+// Checks for GPU errors and asserts / prints the error message.
+#define GPU_CHECK(expr)                                                 \
+do {                                                                    \
+  gpuError_t err = expr;                                                \
+  if (err != gpuSuccess) {                                              \
+    printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));   \
+    gpu_assert(false);                                                  \
+  }                                                                     \
+} while(0)
+
+// Calls run_serialized on the GPU.
+template<typename Kernel, typename... Args>
+__global__
+EIGEN_HIP_LAUNCH_BOUNDS_1024
+void run_serialized_on_gpu_meta_kernel(const Kernel kernel, uint8_t* buffer, size_t capacity) {
+  run_serialized<Kernel, Args...>(kernel, buffer, capacity);
+}
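+
+// For illustration, the serialization API used above (introduced by the
+// dependent serialization MRs) follows a pointer-advancing convention,
+// roughly:
+//
+//   float x = 1.0f;
+//   Eigen::Matrix3f M = Eigen::Matrix3f::Identity();
+//   std::vector<uint8_t> buf(Eigen::serialize_size(x, M));
+//   uint8_t* ptr = Eigen::serialize(buf.data(), x, M);  // Write x, M.
+//   ptr = Eigen::deserialize(buf.data(), x, M);         // Read them back.
+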
+// Runs kernel(args...) on the GPU via the serialization mechanism.
+//
+// Note: if the outputs do not fit in the transfer buffer, this fails and
+// prints the required capacity, which can then be supplied as a hint.
+template<typename Kernel, typename... Args, size_t... Indices, size_t... OutputIndices>
+auto run_serialized_on_gpu(size_t buffer_capacity_hint,
+                           index_sequence<Indices...>,
+                           index_sequence<OutputIndices...>,
+                           Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+  // Compute the required serialization buffer capacity. Round the input
+  // size up to the next power of two to give a little extra room for the
+  // outputs.
+  size_t input_data_size = sizeof(size_t) + Eigen::serialize_size(args...);
+
+  size_t capacity;
+  if (buffer_capacity_hint == 0) {
+    // Estimate as the power of two larger than the total input size.
+    capacity = sizeof(size_t);
+    while (capacity <= input_data_size) {
+      capacity *= 2;
+    }
+  } else {
+    // Use the larger of the hint and the total input size.
+    // Add sizeof(size_t) to the hint to account for storing the buffer
+    // capacity itself, so the user doesn't need to think about this.
+    capacity = std::max(buffer_capacity_hint + sizeof(size_t),
+                        input_data_size);
+  }
+
+  // Serialize the input size and inputs into the host buffer.
+  std::vector<uint8_t> buffer(capacity);
+  uint8_t* host_data = buffer.data();
+  uint8_t* host_ptr = Eigen::serialize(host_data, input_data_size);
+  host_ptr = Eigen::serialize(host_ptr, args...);
+
+  // Copy the inputs to the device.
+  uint8_t* device_data = nullptr;
+  gpuMalloc((void**)(&device_data), capacity);
+  gpuMemcpy(device_data, host_data, input_data_size, gpuMemcpyHostToDevice);
+  GPU_CHECK(gpuDeviceSynchronize());
+
+  // Run the kernel.
+  #ifdef EIGEN_USE_HIP
+  hipLaunchKernelGGL(
+      HIP_KERNEL_NAME(run_serialized_on_gpu_meta_kernel<Kernel, Args...>),
+      1, 1, 0, 0, kernel, device_data, capacity);
+  #else
+  run_serialized_on_gpu_meta_kernel<Kernel, Args...><<<1,1>>>(
+      kernel, device_data, capacity);
+  #endif
+  // Check for pre-launch and kernel execution errors.
+  GPU_CHECK(gpuGetLastError());
+  GPU_CHECK(gpuDeviceSynchronize());
+  // Copy the outputs back to the host.
+  gpuMemcpy(host_data, device_data, capacity, gpuMemcpyDeviceToHost);
+  gpuFree(device_data);
+  GPU_CHECK(gpuDeviceSynchronize());
+
+  // Determine the output buffer size.
+  size_t output_data_size = 0;
+  host_ptr = Eigen::deserialize(host_data, output_data_size);
+  // If the output doesn't fit in the buffer, print a warning and fail.
+  if (output_data_size > capacity) {
+    std::cerr << "The serialized output does not fit in the output buffer, "
+              << output_data_size << " vs capacity " << capacity << "."
+              << std::endl
+              << "Try specifying a minimum buffer capacity: " << std::endl
+              << "  run_with_hint(" << output_data_size << ", ...)"
+              << std::endl;
+    VERIFY(false);
+  }
+
+  // Deserialize the outputs.
+  auto args_tuple = Eigen::internal::tuple_impl::tie(args...);
+  EIGEN_UNUSED_VARIABLE(args_tuple)  // Avoid an NVCC compile warning.
+  host_ptr = Eigen::deserialize(host_ptr, Eigen::internal::tuple_impl::get<OutputIndices, Args&...>(args_tuple)...);
+
+  // Maybe deserialize the return value, properly handling void.
+  typename void_helper::ReturnType<decltype(kernel(args...))> result;
+  host_ptr = Eigen::deserialize(host_ptr, result);
+  return void_helper::restore(result);
+}
+
+#endif // EIGEN_GPUCC
+
+} // namespace internal
+
+/**
+ * Runs a kernel on the CPU, returning the results.
+ * \param kernel kernel to run.
+ * \param args ... input arguments.
+ * \return kernel(args...).
+ */
+template<typename Kernel, typename... Args>
+auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+  return kernel(std::forward<Args>(args)...);
+}
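+
+// For illustration, a hypothetical minimal kernel and call (SquareKernel
+// is not part of this helper):
+//
+//   struct SquareKernel {
+//     EIGEN_DEVICE_FUNC float operator()(float x) const { return x * x; }
+//   };
+//   float y = Eigen::run_on_cpu(SquareKernel{}, 3.0f);  // y == 9.0f.
+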
+#ifdef EIGEN_GPUCC
+
+/**
+ * Runs a kernel on the GPU, returning the results.
+ *
+ * The kernel must be able to be passed directly as an input to a global
+ * function (i.e. empty or POD). Its inputs must be "Serializable" so we
+ * can transfer them to the device, and the output must be a "Serializable"
+ * value type so it can be transferred back from the device.
+ *
+ * \param kernel kernel to run.
+ * \param args ... input arguments, must be "Serializable".
+ * \return kernel(args...).
+ */
+template<typename Kernel, typename... Args>
+auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+  return internal::run_serialized_on_gpu(
+      /*buffer_capacity_hint=*/ 0,
+      internal::make_index_sequence<sizeof...(Args)>{},
+      internal::extract_output_indices<Args...>{},
+      kernel, std::forward<Args>(args)...);
+}
+
+/**
+ * Runs a kernel on the GPU, returning the results.
+ *
+ * This version allows specifying a minimum buffer capacity required for
+ * serializing the outputs when transferring results from device to host.
+ * Use this when `run_on_gpu(...)` fails to determine an appropriate
+ * capacity by default.
+ *
+ * \param buffer_capacity_hint minimum required buffer size for serializing
+ *     the outputs.
+ * \param kernel kernel to run.
+ * \param args ... input arguments, must be "Serializable".
+ * \return kernel(args...).
+ * \sa run_on_gpu
+ */
+template<typename Kernel, typename... Args>
+auto run_on_gpu_with_hint(size_t buffer_capacity_hint,
+                          Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+  return internal::run_serialized_on_gpu(
+      buffer_capacity_hint,
+      internal::make_index_sequence<sizeof...(Args)>{},
+      internal::extract_output_indices<Args...>{},
+      kernel, std::forward<Args>(args)...);
+}
+
+/**
+ * Kernel for determining basic Eigen compile-time information
+ * (i.e. the CUDA/HIP arch).
+ */
+struct CompileTimeDeviceInfoKernel {
+  struct Info {
+    int cuda;
+    int hip;
+  };
+
+  EIGEN_DEVICE_FUNC
+  Info operator()() const
+  {
+    Info info = {-1, -1};
+    #if defined(__CUDA_ARCH__)
+    info.cuda = static_cast<int>(__CUDA_ARCH__ + 0);
+    #endif
+    #if defined(EIGEN_HIP_DEVICE_COMPILE)
+    info.hip = static_cast<int>(EIGEN_HIP_DEVICE_COMPILE + 0);
+    #endif
+    return info;
+  }
+};
+
+/**
+ * Queries and prints the compile-time and runtime GPU info.
+ */
+void print_gpu_device_info()
+{
+  int device = 0;
+  gpuDeviceProp_t deviceProp;
+  gpuGetDeviceProperties(&deviceProp, device);
+
+  auto info = run_on_gpu(CompileTimeDeviceInfoKernel());
+
+  std::cout << "GPU compile-time info:\n";
+
+  #ifdef EIGEN_CUDACC
+  std::cout << "  EIGEN_CUDACC:             " << int(EIGEN_CUDACC) << std::endl;
+  #endif
+
+  #ifdef EIGEN_CUDA_SDK_VER
+  std::cout << "  EIGEN_CUDA_SDK_VER:       " << int(EIGEN_CUDA_SDK_VER) << std::endl;
+  #endif
+
+  #ifdef EIGEN_COMP_NVCC
+  std::cout << "  EIGEN_COMP_NVCC:          " << int(EIGEN_COMP_NVCC) << std::endl;
+  #endif
+
+  #ifdef EIGEN_HIPCC
+  std::cout << "  EIGEN_HIPCC:              " << int(EIGEN_HIPCC) << std::endl;
+  #endif
+
+  std::cout << "  EIGEN_CUDA_ARCH:          " << info.cuda << std::endl;
+  std::cout << "  EIGEN_HIP_DEVICE_COMPILE: " << info.hip << std::endl;
+
+  std::cout << "GPU device info:\n";
+  std::cout << "  name:                        " << deviceProp.name << std::endl;
+  std::cout << "  capability:                  " << deviceProp.major << "." << deviceProp.minor << std::endl;
+  std::cout << "  multiProcessorCount:         " << deviceProp.multiProcessorCount << std::endl;
+  std::cout << "  maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << std::endl;
+  std::cout << "  warpSize:                    " << deviceProp.warpSize << std::endl;
+  std::cout << "  regsPerBlock:                " << deviceProp.regsPerBlock << std::endl;
+  std::cout << "  concurrentKernels:           " << deviceProp.concurrentKernels << std::endl;
+  std::cout << "  clockRate:                   " << deviceProp.clockRate << std::endl;
+  std::cout << "  canMapHostMemory:            " << deviceProp.canMapHostMemory << std::endl;
+  std::cout << "  computeMode:                 " << deviceProp.computeMode << std::endl;
+}
+
+#endif // EIGEN_GPUCC
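+
+// For illustration, kernels returning void are also supported (handled via
+// internal::void_helper); a hypothetical in-place kernel:
+//
+//   struct ScaleInPlaceKernel {
+//     template <typename T>
+//     EIGEN_DEVICE_FUNC void operator()(T& A) const { A *= 2; }
+//   };
+//   Eigen::MatrixXf M = Eigen::MatrixXf::Random(4, 4);
+//   Eigen::run(ScaleInPlaceKernel{}, M);  // M is updated in place.
+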
+/**
+ * Runs a kernel on the GPU (if EIGEN_GPUCC), or the CPU otherwise.
+ *
+ * This simplifies the creation of generic tests that can run on either
+ * device.
+ *
+ * The kernel must be able to be passed directly as an input to a global
+ * function (i.e. empty or POD). Its inputs must be "Serializable" so we
+ * can transfer them to the device, and the output must be a "Serializable"
+ * value type so it can be transferred back from the device.
+ *
+ * \param kernel kernel to run.
+ * \param args ... input arguments, must be "Serializable".
+ * \return kernel(args...).
+ */
+template<typename Kernel, typename... Args>
+auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+#ifdef EIGEN_GPUCC
+  return run_on_gpu(kernel, std::forward<Args>(args)...);
+#else
+  return run_on_cpu(kernel, std::forward<Args>(args)...);
+#endif
+}
+
+/**
+ * Runs a kernel on the GPU (if EIGEN_GPUCC), or the CPU otherwise.
+ *
+ * This version allows specifying a minimum buffer capacity required for
+ * serializing the outputs when transferring results from device to host.
+ * Use this when `run(...)` fails to determine an appropriate capacity by
+ * default.
+ *
+ * \param buffer_capacity_hint minimum required buffer size for serializing
+ *     the outputs.
+ * \param kernel kernel to run.
+ * \param args ... input arguments, must be "Serializable".
+ * \return kernel(args...).
+ * \sa run
+ */
+template<typename Kernel, typename... Args>
+auto run_with_hint(size_t buffer_capacity_hint,
+                   Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+#ifdef EIGEN_GPUCC
+  return run_on_gpu_with_hint(buffer_capacity_hint, kernel, std::forward<Args>(args)...);
+#else
+  return run_on_cpu(kernel, std::forward<Args>(args)...);
+#endif
+}
+
+} // namespace Eigen
+
+#endif // GPU_TEST_HELPER_H
diff --git a/test/main.h b/test/main.h
index e4977fc33..73e972eea 100644
--- a/test/main.h
+++ b/test/main.h
@@ -55,19 +55,26 @@
 #endif
 #endif
 
-// Same for cuda_fp16.h
-#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
-  // Means the compiler is either nvcc or clang with CUDA enabled
+// Configure GPU.
+#if defined(EIGEN_USE_HIP)
+  #if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
+    #define EIGEN_HIPCC __HIPCC__
+    #include <hip/hip_runtime.h>
+    #include <hip/hip_runtime_api.h>
+  #endif
+#elif defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
   #define EIGEN_CUDACC __CUDACC__
+  #include <cuda.h>
+  #include <cuda_runtime.h>
+  #include <cuda_runtime_api.h>
+  #if CUDA_VERSION >= 7050
+    #include <cuda_fp16.h>
+  #endif
 #endif
-#if defined(EIGEN_CUDACC)
-#include <cuda.h>
-  #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
-#else
-  #define EIGEN_CUDA_SDK_VER 0
-#endif
-#if EIGEN_CUDA_SDK_VER >= 70500
-#include <cuda_fp16.h>
+
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
+  #define EIGEN_TEST_NO_LONGDOUBLE
+  #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #endif
 
 // To test that all calls from Eigen code to std::min() and std::max() are
@@ -1081,3 +1088,5 @@ int main(int argc, char *argv[])
 // 4503 - decorated name length exceeded, name was truncated
 #pragma warning( disable : 4503)
 #endif
+
+#include "gpu_test_helper.h"