mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-08-12 03:39:01 +08:00
New GPU test utilities.
This introduces new functions: ``` // returns kernel(args...) running on the CPU. Eigen::run_on_cpu(Kernel kernel, Args&&... args); // returns kernel(args...) running on the GPU. Eigen::run_on_gpu(Kernel kernel, Args&&... args); Eigen::run_on_gpu_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args); // returns kernel(args...) running on the GPU if using // a GPU compiler, or CPU otherwise. Eigen::run(Kernel kernel, Args&&... args); Eigen::run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args); ``` Running on the GPU is accomplished by: - Serializing the kernel inputs on the CPU - Transferring the inputs to the GPU - Passing the kernel and serialized inputs to a GPU kernel - Deserializing the inputs on the GPU - Running `kernel(inputs...)` on the GPU - Serializing all output parameters and the return value - Transferring the serialized outputs back to the CPU - Deserializing the outputs and return value on the CPU - Returning the deserialized return value All inputs must be serializable (currently POD types, `Eigen::Matrix` and `Eigen::Array`). The kernel must also be POD (though usually contains no actual data). Tested on CUDA 9.1, 10.2, 11.3, with g++-6, g++-8, g++-10 respectively. This MR depends on !622, !623, !624.
This commit is contained in:
parent
d7d0bf832d
commit
bf66137efc
@ -412,6 +412,7 @@ if(CUDA_FOUND)
|
|||||||
string(APPEND CUDA_NVCC_FLAGS " ${EIGEN_CUDA_RELAXED_CONSTEXPR}")
|
string(APPEND CUDA_NVCC_FLAGS " ${EIGEN_CUDA_RELAXED_CONSTEXPR}")
|
||||||
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
||||||
|
|
||||||
|
ei_add_test(gpu_example)
|
||||||
ei_add_test(gpu_basic)
|
ei_add_test(gpu_basic)
|
||||||
|
|
||||||
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||||
@ -442,6 +443,7 @@ if (EIGEN_TEST_HIP)
|
|||||||
|
|
||||||
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
||||||
ei_add_test(gpu_basic)
|
ei_add_test(gpu_basic)
|
||||||
|
ei_add_test(gpu_example)
|
||||||
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||||
|
|
||||||
elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia"))
|
elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia"))
|
||||||
|
130
test/gpu_example.cu
Normal file
130
test/gpu_example.cu
Normal file
@ -0,0 +1,130 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2021 The Eigen Team.
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// The following is an example GPU test.
|
||||||
|
|
||||||
|
#include "main.h" // Include the main test utilities.
|
||||||
|
|
||||||
|
// Define a kernel functor.
|
||||||
|
//
|
||||||
|
// The kernel must be a POD type and implement operator().
|
||||||
|
struct AddKernel {
|
||||||
|
// Parameters must be POD or serializable Eigen types (e.g. Matrix,
|
||||||
|
// Array). The return value must be a POD or serializable value type.
|
||||||
|
template<typename Type1, typename Type2, typename Type3>
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
Type3 operator()(const Type1& A, const Type2& B, Type3& C) const {
|
||||||
|
C = A + B; // Populate output parameter.
|
||||||
|
Type3 D = A + B; // Populate return value.
|
||||||
|
return D;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Define a sub-test that uses the kernel.
|
||||||
|
template <typename T>
|
||||||
|
void test_add(const T& type) {
|
||||||
|
const Index rows = type.rows();
|
||||||
|
const Index cols = type.cols();
|
||||||
|
|
||||||
|
// Create random inputs.
|
||||||
|
const T A = T::Random(rows, cols);
|
||||||
|
const T B = T::Random(rows, cols);
|
||||||
|
T C; // Output parameter.
|
||||||
|
|
||||||
|
// Create kernel.
|
||||||
|
AddKernel add_kernel;
|
||||||
|
|
||||||
|
// Run add_kernel(A, B, C) via run(...).
|
||||||
|
// This will run on the GPU if using a GPU compiler, or CPU otherwise,
|
||||||
|
// facilitating generic tests that can run on either.
|
||||||
|
T D = run(add_kernel, A, B, C);
|
||||||
|
|
||||||
|
// Check that both output parameter and return value are correctly populated.
|
||||||
|
const T expected = A + B;
|
||||||
|
VERIFY_IS_CWISE_EQUAL(C, expected);
|
||||||
|
VERIFY_IS_CWISE_EQUAL(D, expected);
|
||||||
|
|
||||||
|
// In a GPU-only test, we can verify that the CPU and GPU produce the
|
||||||
|
// same results.
|
||||||
|
T C_cpu, C_gpu;
|
||||||
|
T D_cpu = run_on_cpu(add_kernel, A, B, C_cpu); // Runs on CPU.
|
||||||
|
T D_gpu = run_on_gpu(add_kernel, A, B, C_gpu); // Runs on GPU.
|
||||||
|
VERIFY_IS_CWISE_EQUAL(C_cpu, C_gpu);
|
||||||
|
VERIFY_IS_CWISE_EQUAL(D_cpu, D_gpu);
|
||||||
|
};
|
||||||
|
|
||||||
|
struct MultiplyKernel {
|
||||||
|
template<typename Type1, typename Type2, typename Type3>
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
Type3 operator()(const Type1& A, const Type2& B, Type3& C) const {
|
||||||
|
C = A * B;
|
||||||
|
return A * B;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T1, typename T2, typename T3>
|
||||||
|
void test_multiply(const T1& type1, const T2& type2, const T3& type3) {
|
||||||
|
|
||||||
|
const T1 A = T1::Random(type1.rows(), type1.cols());
|
||||||
|
const T2 B = T2::Random(type2.rows(), type2.cols());
|
||||||
|
T3 C;
|
||||||
|
|
||||||
|
MultiplyKernel multiply_kernel;
|
||||||
|
|
||||||
|
// The run(...) family of functions uses a memory buffer to transfer data back
|
||||||
|
// and forth to and from the device. The size of this buffer is estimated
|
||||||
|
// from the size of all input parameters. If the estimated buffer size is
|
||||||
|
// not sufficient for transferring outputs from device-to-host, then an
|
||||||
|
// explicit buffer size needs to be specified.
|
||||||
|
|
||||||
|
// 2 outputs of size (A * B). For each matrix output, the buffer will store
|
||||||
|
// the number of rows, columns, and the data.
|
||||||
|
size_t buffer_capacity_hint = 2 * ( // 2 output parameters
|
||||||
|
2 * sizeof(typename T3::Index) // # Rows, # Cols
|
||||||
|
+ A.rows() * B.cols() * sizeof(typename T3::Scalar)); // Output data
|
||||||
|
|
||||||
|
T3 D = run_with_hint(buffer_capacity_hint, multiply_kernel, A, B, C);
|
||||||
|
|
||||||
|
const T3 expected = A * B;
|
||||||
|
VERIFY_IS_CWISE_APPROX(C, expected);
|
||||||
|
VERIFY_IS_CWISE_APPROX(D, expected);
|
||||||
|
|
||||||
|
T3 C_cpu, C_gpu;
|
||||||
|
T3 D_cpu = run_on_cpu(multiply_kernel, A, B, C_cpu);
|
||||||
|
T3 D_gpu = run_on_gpu_with_hint(buffer_capacity_hint,
|
||||||
|
multiply_kernel, A, B, C_gpu);
|
||||||
|
VERIFY_IS_CWISE_APPROX(C_cpu, C_gpu);
|
||||||
|
VERIFY_IS_CWISE_APPROX(D_cpu, D_gpu);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Declare the test fixture.
|
||||||
|
EIGEN_DECLARE_TEST(gpu_example)
|
||||||
|
{
|
||||||
|
// For the number of repeats, call the desired subtests.
|
||||||
|
for(int i = 0; i < g_repeat; i++) {
|
||||||
|
// Call subtests with different sized/typed inputs.
|
||||||
|
CALL_SUBTEST( test_add(Eigen::Vector3f()) );
|
||||||
|
CALL_SUBTEST( test_add(Eigen::Matrix3d()) );
|
||||||
|
CALL_SUBTEST( test_add(Eigen::MatrixX<int>(10, 10)) );
|
||||||
|
|
||||||
|
CALL_SUBTEST( test_add(Eigen::Array44f()) );
|
||||||
|
CALL_SUBTEST( test_add(Eigen::ArrayXd(20)) );
|
||||||
|
CALL_SUBTEST( test_add(Eigen::ArrayXXi(13, 17)) );
|
||||||
|
|
||||||
|
CALL_SUBTEST( test_multiply(Eigen::Matrix3d(),
|
||||||
|
Eigen::Matrix3d(),
|
||||||
|
Eigen::Matrix3d()) );
|
||||||
|
CALL_SUBTEST( test_multiply(Eigen::MatrixX<int>(10, 10),
|
||||||
|
Eigen::MatrixX<int>(10, 10),
|
||||||
|
Eigen::MatrixX<int>()) );
|
||||||
|
CALL_SUBTEST( test_multiply(Eigen::MatrixXf(12, 1),
|
||||||
|
Eigen::MatrixXf(1, 32),
|
||||||
|
Eigen::MatrixXf()) );
|
||||||
|
}
|
||||||
|
}
|
464
test/gpu_test_helper.h
Normal file
464
test/gpu_test_helper.h
Normal file
@ -0,0 +1,464 @@
|
|||||||
|
#ifndef GPU_TEST_HELPER_H
|
||||||
|
#define GPU_TEST_HELPER_H
|
||||||
|
|
||||||
|
#include <Eigen/Core>
|
||||||
|
|
||||||
|
#ifdef EIGEN_GPUCC
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "../unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h"
|
||||||
|
#endif // EIGEN_GPUCC
|
||||||
|
|
||||||
|
// std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that
|
||||||
|
// doesn't allow std::tuple to compile for host code either. In these cases,
|
||||||
|
// use our custom implementation.
|
||||||
|
#if defined(EIGEN_GPU_COMPILE_PHASE) || (defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER < 92000)
|
||||||
|
#define EIGEN_USE_CUSTOM_TUPLE 1
|
||||||
|
#else
|
||||||
|
#define EIGEN_USE_CUSTOM_TUPLE 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if EIGEN_USE_CUSTOM_TUPLE
|
||||||
|
#include "../Eigen/src/Core/arch/GPU/Tuple.h"
|
||||||
|
#else
|
||||||
|
#include <tuple>
|
||||||
|
#endif
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
namespace tuple_impl {
|
||||||
|
|
||||||
|
// Use std::tuple on CPU, otherwise use the GPU-specific versions.
|
||||||
|
#if !EIGEN_USE_CUSTOM_TUPLE
|
||||||
|
using std::tuple;
|
||||||
|
using std::get;
|
||||||
|
using std::make_tuple;
|
||||||
|
using std::tie;
|
||||||
|
#endif
|
||||||
|
#undef EIGEN_USE_CUSTOM_TUPLE
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template<size_t N, size_t Idx, typename OutputIndexSequence, typename... Ts>
|
||||||
|
struct extract_output_indices_helper;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extracts a set of indices corresponding to non-const l-value reference
|
||||||
|
* output types.
|
||||||
|
*
|
||||||
|
* \internal
|
||||||
|
* \tparam N the number of types {T1, Ts...}.
|
||||||
|
* \tparam Idx the "index" to append if T1 is an output type.
|
||||||
|
* \tparam OutputIndices the current set of output indices.
|
||||||
|
* \tparam T1 the next type to consider, with index Idx.
|
||||||
|
* \tparam Ts the remaining types.
|
||||||
|
*/
|
||||||
|
template<size_t N, size_t Idx, size_t... OutputIndices, typename T1, typename... Ts>
|
||||||
|
struct extract_output_indices_helper<N, Idx, index_sequence<OutputIndices...>, T1, Ts...> {
|
||||||
|
using type = typename
|
||||||
|
extract_output_indices_helper<
|
||||||
|
N - 1, Idx + 1,
|
||||||
|
typename std::conditional<
|
||||||
|
// If is a non-const l-value reference, append index.
|
||||||
|
std::is_lvalue_reference<T1>::value
|
||||||
|
&& !std::is_const<typename std::remove_reference<T1>::type>::value,
|
||||||
|
index_sequence<OutputIndices..., Idx>,
|
||||||
|
index_sequence<OutputIndices...> >::type,
|
||||||
|
Ts...>::type;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Base case.
|
||||||
|
template<size_t Idx, size_t... OutputIndices>
|
||||||
|
struct extract_output_indices_helper<0, Idx, index_sequence<OutputIndices...> > {
|
||||||
|
using type = index_sequence<OutputIndices...>;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Extracts a set of indices into Types... that correspond to non-const
|
||||||
|
// l-value references.
|
||||||
|
template<typename... Types>
|
||||||
|
using extract_output_indices = typename extract_output_indices_helper<sizeof...(Types), 0, index_sequence<>, Types...>::type;
|
||||||
|
|
||||||
|
// Helper struct for dealing with Generic functors that may return void.
|
||||||
|
struct void_helper {
|
||||||
|
struct Void {};
|
||||||
|
|
||||||
|
// Converts void -> Void, T otherwise.
|
||||||
|
template<typename T>
|
||||||
|
using ReturnType = typename std::conditional<std::is_same<T, void>::value, Void, T>::type;
|
||||||
|
|
||||||
|
// Non-void return value.
|
||||||
|
template<typename Func, typename... Args>
|
||||||
|
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC
|
||||||
|
auto call(Func&& func, Args&&... args) ->
|
||||||
|
typename std::enable_if<!std::is_same<decltype(func(args...)), void>::value,
|
||||||
|
decltype(func(args...))>::type {
|
||||||
|
return func(std::forward<Args>(args)...);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Void return value.
|
||||||
|
template<typename Func, typename... Args>
|
||||||
|
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC
|
||||||
|
auto call(Func&& func, Args&&... args) ->
|
||||||
|
typename std::enable_if<std::is_same<decltype(func(args...)), void>::value,
|
||||||
|
Void>::type {
|
||||||
|
func(std::forward<Args>(args)...);
|
||||||
|
return Void{};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Restores the original return type, Void -> void, T otherwise.
|
||||||
|
template<typename T>
|
||||||
|
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC
|
||||||
|
typename std::enable_if<!std::is_same<typename std::decay<T>::type, Void>::value, T>::type
|
||||||
|
restore(T&& val) {
|
||||||
|
return val;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Void case.
|
||||||
|
template<typename T = void>
|
||||||
|
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC
|
||||||
|
void restore(const Void&) {}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Runs a kernel via serialized buffer. Does this by deserializing the buffer
|
||||||
|
// to construct the arguments, calling the kernel, then re-serialing the outputs.
|
||||||
|
// The buffer contains
|
||||||
|
// [ input_buffer_size, args ]
|
||||||
|
// After the kernel call, it is then populated with
|
||||||
|
// [ output_buffer_size, output_parameters, return_value ]
|
||||||
|
// If the output_buffer_size exceeds the buffer's capacity, then only the
|
||||||
|
// output_buffer_size is populated.
|
||||||
|
template<typename Kernel, typename... Args, size_t... Indices, size_t... OutputIndices>
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
void run_serialized(index_sequence<Indices...>, index_sequence<OutputIndices...>,
|
||||||
|
Kernel kernel, uint8_t* buffer, size_t capacity) {
|
||||||
|
using Eigen::internal::tuple_impl::get;
|
||||||
|
using Eigen::internal::tuple_impl::make_tuple;
|
||||||
|
using Eigen::internal::tuple_impl::tuple;
|
||||||
|
// Deserialize input size and inputs.
|
||||||
|
size_t input_size;
|
||||||
|
uint8_t* buff_ptr = Eigen::deserialize(buffer, input_size);
|
||||||
|
// Create value-type instances to populate.
|
||||||
|
auto args = make_tuple(typename std::decay<Args>::type{}...);
|
||||||
|
EIGEN_UNUSED_VARIABLE(args) // Avoid NVCC compile warning.
|
||||||
|
// NVCC 9.1 requires us to spell out the template parameters explicitly.
|
||||||
|
buff_ptr = Eigen::deserialize(buff_ptr, get<Indices, typename std::decay<Args>::type...>(args)...);
|
||||||
|
|
||||||
|
// Call function, with void->Void conversion so we are guaranteed a complete
|
||||||
|
// output type.
|
||||||
|
auto result = void_helper::call(kernel, get<Indices, typename std::decay<Args>::type...>(args)...);
|
||||||
|
|
||||||
|
// Determine required output size.
|
||||||
|
size_t output_size = Eigen::serialize_size(capacity);
|
||||||
|
output_size += Eigen::serialize_size(get<OutputIndices, typename std::decay<Args>::type...>(args)...);
|
||||||
|
output_size += Eigen::serialize_size(result);
|
||||||
|
|
||||||
|
// Always serialize required buffer size.
|
||||||
|
buff_ptr = Eigen::serialize(buffer, output_size);
|
||||||
|
// Serialize outputs if they fit in the buffer.
|
||||||
|
if (output_size <= capacity) {
|
||||||
|
// Collect outputs and result.
|
||||||
|
buff_ptr = Eigen::serialize(buff_ptr, get<OutputIndices, typename std::decay<Args>::type...>(args)...);
|
||||||
|
buff_ptr = Eigen::serialize(buff_ptr, result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename Kernel, typename... Args>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
|
void run_serialized(Kernel kernel, uint8_t* buffer, size_t capacity) {
|
||||||
|
run_serialized<Kernel, Args...> (make_index_sequence<sizeof...(Args)>{},
|
||||||
|
extract_output_indices<Args...>{},
|
||||||
|
kernel, buffer, capacity);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef EIGEN_GPUCC
|
||||||
|
|
||||||
|
// Checks for GPU errors and asserts / prints the error message.
|
||||||
|
#define GPU_CHECK(expr) \
|
||||||
|
do { \
|
||||||
|
gpuError_t err = expr; \
|
||||||
|
if (err != gpuSuccess) { \
|
||||||
|
printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err)); \
|
||||||
|
gpu_assert(false); \
|
||||||
|
} \
|
||||||
|
} while(0)
|
||||||
|
|
||||||
|
// Calls run_serialized on the GPU.
|
||||||
|
template<typename Kernel, typename... Args>
|
||||||
|
__global__
|
||||||
|
EIGEN_HIP_LAUNCH_BOUNDS_1024
|
||||||
|
void run_serialized_on_gpu_meta_kernel(const Kernel kernel, uint8_t* buffer, size_t capacity) {
|
||||||
|
run_serialized<Kernel, Args...>(kernel, buffer, capacity);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Runs kernel(args...) on the GPU via the serialization mechanism.
|
||||||
|
//
|
||||||
|
// Note: this may end up calling the kernel multiple times if the initial output
|
||||||
|
// buffer is not large enough to hold the outputs.
|
||||||
|
template<typename Kernel, typename... Args, size_t... Indices, size_t... OutputIndices>
|
||||||
|
auto run_serialized_on_gpu(size_t buffer_capacity_hint,
|
||||||
|
index_sequence<Indices...>,
|
||||||
|
index_sequence<OutputIndices...>,
|
||||||
|
Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
|
||||||
|
// Compute the required serialization buffer capacity.
|
||||||
|
// Round up input size to next power of two to give a little extra room
|
||||||
|
// for outputs.
|
||||||
|
size_t input_data_size = sizeof(size_t) + Eigen::serialize_size(args...);
|
||||||
|
|
||||||
|
size_t capacity;
|
||||||
|
if (buffer_capacity_hint == 0) {
|
||||||
|
// Estimate as the power of two larger than the total input size.
|
||||||
|
capacity = sizeof(size_t);
|
||||||
|
while (capacity <= input_data_size) {
|
||||||
|
capacity *= 2;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Use the larger of the hint and the total input size.
|
||||||
|
// Add sizeof(size_t) to the hint to account for storing the buffer capacity
|
||||||
|
// itself so the user doesn't need to think about this.
|
||||||
|
capacity = std::max<size_t>(buffer_capacity_hint + sizeof(size_t),
|
||||||
|
input_data_size);
|
||||||
|
}
|
||||||
|
std::vector<uint8_t> buffer(capacity);
|
||||||
|
|
||||||
|
uint8_t* host_data = nullptr;
|
||||||
|
uint8_t* host_ptr = nullptr;
|
||||||
|
uint8_t* device_data = nullptr;
|
||||||
|
size_t output_data_size = 0;
|
||||||
|
|
||||||
|
// Allocate buffers and copy input data.
|
||||||
|
capacity = std::max<size_t>(capacity, output_data_size);
|
||||||
|
buffer.resize(capacity);
|
||||||
|
host_data = buffer.data();
|
||||||
|
host_ptr = Eigen::serialize(host_data, input_data_size);
|
||||||
|
host_ptr = Eigen::serialize(host_ptr, args...);
|
||||||
|
|
||||||
|
// Copy inputs to host.
|
||||||
|
gpuFree(device_data);
|
||||||
|
gpuMalloc((void**)(&device_data), capacity);
|
||||||
|
gpuMemcpy(device_data, buffer.data(), input_data_size, gpuMemcpyHostToDevice);
|
||||||
|
GPU_CHECK(gpuDeviceSynchronize());
|
||||||
|
|
||||||
|
// Run kernel.
|
||||||
|
#ifdef EIGEN_USE_HIP
|
||||||
|
hipLaunchKernelGGL(
|
||||||
|
HIP_KERNEL_NAME(run_serialized_on_gpu_meta_kernel<Kernel, Args...>),
|
||||||
|
1, 1, 0, 0, kernel, device_data, capacity);
|
||||||
|
#else
|
||||||
|
run_serialized_on_gpu_meta_kernel<Kernel, Args...><<<1,1>>>(
|
||||||
|
kernel, device_data, capacity);
|
||||||
|
#endif
|
||||||
|
// Check pre-launch and kernel execution errors.
|
||||||
|
GPU_CHECK(gpuGetLastError());
|
||||||
|
GPU_CHECK(gpuDeviceSynchronize());
|
||||||
|
// Copy back new output to host.
|
||||||
|
gpuMemcpy(host_data, device_data, capacity, gpuMemcpyDeviceToHost);
|
||||||
|
gpuFree(device_data);
|
||||||
|
GPU_CHECK(gpuDeviceSynchronize());
|
||||||
|
|
||||||
|
// Determine output buffer size.
|
||||||
|
host_ptr = Eigen::deserialize(host_data, output_data_size);
|
||||||
|
// If the output doesn't fit in the buffer, spit out warning and fail.
|
||||||
|
if (output_data_size > capacity) {
|
||||||
|
std::cerr << "The serialized output does not fit in the output buffer, "
|
||||||
|
<< output_data_size << " vs capacity " << capacity << "."
|
||||||
|
<< std::endl
|
||||||
|
<< "Try specifying a minimum buffer capacity: " << std::endl
|
||||||
|
<< " run_with_hint(" << output_data_size << ", ...)"
|
||||||
|
<< std::endl;
|
||||||
|
VERIFY(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Deserialize outputs.
|
||||||
|
auto args_tuple = Eigen::internal::tuple_impl::tie(args...);
|
||||||
|
EIGEN_UNUSED_VARIABLE(args_tuple) // Avoid NVCC compile warning.
|
||||||
|
host_ptr = Eigen::deserialize(host_ptr, Eigen::internal::tuple_impl::get<OutputIndices, Args&...>(args_tuple)...);
|
||||||
|
|
||||||
|
// Maybe deserialize return value, properly handling void.
|
||||||
|
typename void_helper::ReturnType<decltype(kernel(args...))> result;
|
||||||
|
host_ptr = Eigen::deserialize(host_ptr, result);
|
||||||
|
return void_helper::restore(result);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // EIGEN_GPUCC
|
||||||
|
|
||||||
|
} // namespace internal
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Runs a kernel on the CPU, returning the results.
|
||||||
|
* \param kernel kernel to run.
|
||||||
|
* \param args ... input arguments.
|
||||||
|
* \return kernel(args...).
|
||||||
|
*/
|
||||||
|
template<typename Kernel, typename... Args>
|
||||||
|
auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)){
|
||||||
|
return kernel(std::forward<Args>(args)...);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef EIGEN_GPUCC
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Runs a kernel on the GPU, returning the results.
|
||||||
|
*
|
||||||
|
* The kernel must be able to be passed directly as an input to a global
|
||||||
|
* function (i.e. empty or POD). Its inputs must be "Serializable" so we
|
||||||
|
* can transfer them to the device, and the output must be a Serializable value
|
||||||
|
* type so it can be transfered back from the device.
|
||||||
|
*
|
||||||
|
* \param kernel kernel to run.
|
||||||
|
* \param args ... input arguments, must be "Serializable".
|
||||||
|
* \return kernel(args...).
|
||||||
|
*/
|
||||||
|
template<typename Kernel, typename... Args>
|
||||||
|
auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)){
|
||||||
|
return internal::run_serialized_on_gpu<Kernel, Args...>(
|
||||||
|
/*buffer_capacity_hint=*/ 0,
|
||||||
|
internal::make_index_sequence<sizeof...(Args)>{},
|
||||||
|
internal::extract_output_indices<Args...>{},
|
||||||
|
kernel, std::forward<Args>(args)...);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Runs a kernel on the GPU, returning the results.
|
||||||
|
*
|
||||||
|
* This version allows specifying a minimum buffer capacity size required for
|
||||||
|
* serializing the puts to transfer results from device to host. Use this when
|
||||||
|
* `run_on_gpu(...)` fails to determine an appropriate capacity by default.
|
||||||
|
*
|
||||||
|
* \param buffer_capacity_hint minimum required buffer size for serializing
|
||||||
|
* outputs.
|
||||||
|
* \param kernel kernel to run.
|
||||||
|
* \param args ... input arguments, must be "Serializable".
|
||||||
|
* \return kernel(args...).
|
||||||
|
* \sa run_on_gpu
|
||||||
|
*/
|
||||||
|
template<typename Kernel, typename... Args>
|
||||||
|
auto run_on_gpu_with_hint(size_t buffer_capacity_hint,
|
||||||
|
Kernel kernel, Args&&... args) -> decltype(kernel(args...)){
|
||||||
|
return internal::run_serialized_on_gpu<Kernel, Args...>(
|
||||||
|
buffer_capacity_hint,
|
||||||
|
internal::make_index_sequence<sizeof...(Args)>{},
|
||||||
|
internal::extract_output_indices<Args...>{},
|
||||||
|
kernel, std::forward<Args>(args)...);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Kernel for determining basic Eigen compile-time information
|
||||||
|
* (i.e. the cuda/hip arch)
|
||||||
|
*/
|
||||||
|
struct CompileTimeDeviceInfoKernel {
|
||||||
|
struct Info {
|
||||||
|
int cuda;
|
||||||
|
int hip;
|
||||||
|
};
|
||||||
|
|
||||||
|
EIGEN_DEVICE_FUNC
|
||||||
|
Info operator()() const
|
||||||
|
{
|
||||||
|
Info info = {-1, -1};
|
||||||
|
#if defined(__CUDA_ARCH__)
|
||||||
|
info.cuda = static_cast<int>(__CUDA_ARCH__ +0);
|
||||||
|
#endif
|
||||||
|
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||||
|
info.hip = static_cast<int>(EIGEN_HIP_DEVICE_COMPILE +0);
|
||||||
|
#endif
|
||||||
|
return info;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Queries and prints the compile-time and runtime GPU info.
|
||||||
|
*/
|
||||||
|
void print_gpu_device_info()
|
||||||
|
{
|
||||||
|
int device = 0;
|
||||||
|
gpuDeviceProp_t deviceProp;
|
||||||
|
gpuGetDeviceProperties(&deviceProp, device);
|
||||||
|
|
||||||
|
auto info = run_on_gpu(CompileTimeDeviceInfoKernel());
|
||||||
|
|
||||||
|
std::cout << "GPU compile-time info:\n";
|
||||||
|
|
||||||
|
#ifdef EIGEN_CUDACC
|
||||||
|
std::cout << " EIGEN_CUDACC: " << int(EIGEN_CUDACC) << std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef EIGEN_CUDA_SDK_VER
|
||||||
|
std::cout << " EIGEN_CUDA_SDK_VER: " << int(EIGEN_CUDA_SDK_VER) << std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef EIGEN_COMP_NVCC
|
||||||
|
std::cout << " EIGEN_COMP_NVCC: " << int(EIGEN_COMP_NVCC) << std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef EIGEN_HIPCC
|
||||||
|
std::cout << " EIGEN_HIPCC: " << int(EIGEN_HIPCC) << std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
std::cout << " EIGEN_CUDA_ARCH: " << info.cuda << std::endl;
|
||||||
|
std::cout << " EIGEN_HIP_DEVICE_COMPILE: " << info.hip << std::endl;
|
||||||
|
|
||||||
|
std::cout << "GPU device info:\n";
|
||||||
|
std::cout << " name: " << deviceProp.name << std::endl;
|
||||||
|
std::cout << " capability: " << deviceProp.major << "." << deviceProp.minor << std::endl;
|
||||||
|
std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << std::endl;
|
||||||
|
std::cout << " maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << std::endl;
|
||||||
|
std::cout << " warpSize: " << deviceProp.warpSize << std::endl;
|
||||||
|
std::cout << " regsPerBlock: " << deviceProp.regsPerBlock << std::endl;
|
||||||
|
std::cout << " concurrentKernels: " << deviceProp.concurrentKernels << std::endl;
|
||||||
|
std::cout << " clockRate: " << deviceProp.clockRate << std::endl;
|
||||||
|
std::cout << " canMapHostMemory: " << deviceProp.canMapHostMemory << std::endl;
|
||||||
|
std::cout << " computeMode: " << deviceProp.computeMode << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // EIGEN_GPUCC
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Runs a kernel on the GPU (if EIGEN_GPUCC), or CPU otherwise.
|
||||||
|
*
|
||||||
|
* This is to better support creating generic tests.
|
||||||
|
*
|
||||||
|
* The kernel must be able to be passed directly as an input to a global
|
||||||
|
* function (i.e. empty or POD). Its inputs must be "Serializable" so we
|
||||||
|
* can transfer them to the device, and the output must be a Serializable value
|
||||||
|
* type so it can be transfered back from the device.
|
||||||
|
*
|
||||||
|
* \param kernel kernel to run.
|
||||||
|
* \param args ... input arguments, must be "Serializable".
|
||||||
|
* \return kernel(args...).
|
||||||
|
*/
|
||||||
|
template<typename Kernel, typename... Args>
|
||||||
|
auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)){
|
||||||
|
#ifdef EIGEN_GPUCC
|
||||||
|
return run_on_gpu(kernel, std::forward<Args>(args)...);
|
||||||
|
#else
|
||||||
|
return run_on_cpu(kernel, std::forward<Args>(args)...);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Runs a kernel on the GPU (if EIGEN_GPUCC), or CPU otherwise.
|
||||||
|
*
|
||||||
|
* This version allows specifying a minimum buffer capacity size required for
|
||||||
|
* serializing the puts to transfer results from device to host. Use this when
|
||||||
|
* `run(...)` fails to determine an appropriate capacity by default.
|
||||||
|
*
|
||||||
|
* \param buffer_capacity_hint minimum required buffer size for serializing
|
||||||
|
* outputs.
|
||||||
|
* \param kernel kernel to run.
|
||||||
|
* \param args ... input arguments, must be "Serializable".
|
||||||
|
* \return kernel(args...).
|
||||||
|
* \sa run
|
||||||
|
*/
|
||||||
|
template<typename Kernel, typename... Args>
|
||||||
|
auto run_with_hint(size_t buffer_capacity_hint,
|
||||||
|
Kernel kernel, Args&&... args) -> decltype(kernel(args...)){
|
||||||
|
#ifdef EIGEN_GPUCC
|
||||||
|
return run_on_gpu_with_hint(buffer_capacity_hint, kernel, std::forward<Args>(args)...);
|
||||||
|
#else
|
||||||
|
return run_on_cpu(kernel, std::forward<Args>(args)...);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // GPU_TEST_HELPER_H
|
29
test/main.h
29
test/main.h
@ -55,20 +55,27 @@
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Same for cuda_fp16.h
|
// Configure GPU.
|
||||||
#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
|
#if defined(EIGEN_USE_HIP)
|
||||||
// Means the compiler is either nvcc or clang with CUDA enabled
|
#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
|
||||||
|
#define EIGEN_HIPCC __HIPCC__
|
||||||
|
#include <hip/hip_runtime.h>
|
||||||
|
#include <hip/hip_runtime_api.h>
|
||||||
|
#endif
|
||||||
|
#elif defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
|
||||||
#define EIGEN_CUDACC __CUDACC__
|
#define EIGEN_CUDACC __CUDACC__
|
||||||
#endif
|
|
||||||
#if defined(EIGEN_CUDACC)
|
|
||||||
#include <cuda.h>
|
#include <cuda.h>
|
||||||
#define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
|
#include <cuda_runtime.h>
|
||||||
#else
|
#include <cuda_runtime_api.h>
|
||||||
#define EIGEN_CUDA_SDK_VER 0
|
#if CUDA_VERSION >= 7050
|
||||||
#endif
|
|
||||||
#if EIGEN_CUDA_SDK_VER >= 70500
|
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
|
||||||
|
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||||
|
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||||
|
#endif
|
||||||
|
|
||||||
// To test that all calls from Eigen code to std::min() and std::max() are
|
// To test that all calls from Eigen code to std::min() and std::max() are
|
||||||
// protected by parenthesis against macro expansion, the min()/max() macros
|
// protected by parenthesis against macro expansion, the min()/max() macros
|
||||||
@ -1081,3 +1088,5 @@ int main(int argc, char *argv[])
|
|||||||
// 4503 - decorated name length exceeded, name was truncated
|
// 4503 - decorated name length exceeded, name was truncated
|
||||||
#pragma warning( disable : 4503)
|
#pragma warning( disable : 4503)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include "gpu_test_helper.h"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user