eigen/unsupported/test/cxx11_tensor_hip.cu
Deven Desai 8fbd47052b Adding support for using Eigen in HIP kernels.
This commit enables the use of Eigen on HIP kernels / AMD GPUs. Support has been added along the same lines as what already exists for using Eigen in CUDA kernels / NVidia GPUs.

Application code needs to explicitly define EIGEN_USE_HIP when using Eigen in HIP kernels. This is because some of the CUDA headers get picked up by default during Eigen compile (irrespective of whether or not the underlying compiler is CUDACC/NVCC, for e.g. Eigen/src/Core/arch/CUDA/Half.h). In order to maintain this behavior, the EIGEN_USE_HIP macro is used to switch to using the HIP version of those header files (see Eigen/Core and unsupported/Eigen/CXX11/Tensor)


Use the "-DEIGEN_TEST_HIP" cmake option to enable the HIP specific unit tests.
2018-06-06 10:12:58 -04:00

1297 lines
46 KiB
Plaintext

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_hip
#define EIGEN_USE_GPU
#ifdef __NVCC__
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#include <cuda_fp16.h>
#endif
#endif
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
void test_hip_nullary() {
Tensor<float, 1, 0, int> in1(2);
Tensor<float, 1, 0, int> in2(2);
in1.setRandom();
in2.setRandom();
std::size_t tensor_bytes = in1.size() * sizeof(float);
float* d_in1;
float* d_in2;
hipMalloc((void**)(&d_in1), tensor_bytes);
hipMalloc((void**)(&d_in2), tensor_bytes);
hipMemcpy(d_in1, in1.data(), tensor_bytes, hipMemcpyHostToDevice);
hipMemcpy(d_in2, in2.data(), tensor_bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
d_in1, 2);
Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
d_in2, 2);
gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
gpu_in2.device(gpu_device) = gpu_in2.random();
Tensor<float, 1, 0, int> new1(2);
Tensor<float, 1, 0, int> new2(2);
assert(hipMemcpyAsync(new1.data(), d_in1, tensor_bytes, hipMemcpyDeviceToHost,
gpu_device.stream()) == hipSuccess);
assert(hipMemcpyAsync(new2.data(), d_in2, tensor_bytes, hipMemcpyDeviceToHost,
gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (int i = 0; i < 2; ++i) {
VERIFY_IS_APPROX(new1(i), 3.14f);
VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
}
hipFree(d_in1);
hipFree(d_in2);
}
void test_hip_elementwise_small() {
Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>{2});
Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>{2});
Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>{2});
in1.setRandom();
in2.setRandom();
std::size_t in1_bytes = in1.size() * sizeof(float);
std::size_t in2_bytes = in2.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
float* d_in1;
float* d_in2;
float* d_out;
hipMalloc((void**)(&d_in1), in1_bytes);
hipMalloc((void**)(&d_in2), in2_bytes);
hipMalloc((void**)(&d_out), out_bytes);
hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
d_in1, Eigen::array<Eigen::DenseIndex, 1>{2});
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
d_in2, Eigen::array<Eigen::DenseIndex, 1>{2});
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
d_out, Eigen::array<Eigen::DenseIndex, 1>{2});
gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost,
gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (int i = 0; i < 2; ++i) {
VERIFY_IS_APPROX(
out(Eigen::array<Eigen::DenseIndex, 1>{i}),
in1(Eigen::array<Eigen::DenseIndex, 1>{i}) + in2(Eigen::array<Eigen::DenseIndex, 1>{i}));
}
hipFree(d_in1);
hipFree(d_in2);
hipFree(d_out);
}
void test_hip_elementwise()
{
Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
in1.setRandom();
in2.setRandom();
in3.setRandom();
std::size_t in1_bytes = in1.size() * sizeof(float);
std::size_t in2_bytes = in2.size() * sizeof(float);
std::size_t in3_bytes = in3.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
float* d_in1;
float* d_in2;
float* d_in3;
float* d_out;
hipMalloc((void**)(&d_in1), in1_bytes);
hipMalloc((void**)(&d_in2), in2_bytes);
hipMalloc((void**)(&d_in3), in3_bytes);
hipMalloc((void**)(&d_out), out_bytes);
hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
hipMemcpy(d_in3, in3.data(), in3_bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>{72,53,97});
gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 53; ++j) {
for (int k = 0; k < 97; ++k) {
VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}), in1(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}) + in2(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}) * in3(Eigen::array<Eigen::DenseIndex, 3>{i,j,k}));
}
}
}
hipFree(d_in1);
hipFree(d_in2);
hipFree(d_in3);
hipFree(d_out);
}
void test_hip_props() {
Tensor<float, 1> in1(200);
Tensor<bool, 1> out(200);
in1.setRandom();
std::size_t in1_bytes = in1.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(bool);
float* d_in1;
bool* d_out;
hipMalloc((void**)(&d_in1), in1_bytes);
hipMalloc((void**)(&d_out), out_bytes);
hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
d_in1, 200);
Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out(
d_out, 200);
gpu_out.device(gpu_device) = (gpu_in1.isnan)();
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost,
gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (int i = 0; i < 200; ++i) {
VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
}
hipFree(d_in1);
hipFree(d_out);
}
void test_hip_reduction()
{
Tensor<float, 4> in1(72,53,97,113);
Tensor<float, 2> out(72,97);
in1.setRandom();
std::size_t in1_bytes = in1.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
float* d_in1;
float* d_out;
hipMalloc((void**)(&d_in1), in1_bytes);
hipMalloc((void**)(&d_out), out_bytes);
hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
array<Eigen::DenseIndex, 2> reduction_axis;
reduction_axis[0] = 1;
reduction_axis[1] = 3;
gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 97; ++j) {
float expected = 0;
for (int k = 0; k < 53; ++k) {
for (int l = 0; l < 113; ++l) {
expected =
std::max<float>(expected, in1(i, k, j, l));
}
}
VERIFY_IS_APPROX(out(i,j), expected);
}
}
hipFree(d_in1);
hipFree(d_out);
}
template<int DataLayout>
void test_hip_contraction()
{
// with these dimensions, the output has 300 * 140 elements, which is
// more than 30 * 1024, which is the number of threads in blocks on
// a 15 SM GK110 GPU
Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>{3, 31, 7, 20, 1});
Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>{6, 50, 7, 20, 1});
t_left.setRandom();
t_right.setRandom();
std::size_t t_left_bytes = t_left.size() * sizeof(float);
std::size_t t_right_bytes = t_right.size() * sizeof(float);
std::size_t t_result_bytes = t_result.size() * sizeof(float);
float* d_t_left;
float* d_t_right;
float* d_t_result;
hipMalloc((void**)(&d_t_left), t_left_bytes);
hipMalloc((void**)(&d_t_right), t_right_bytes);
hipMalloc((void**)(&d_t_result), t_result_bytes);
hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);
typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
MapXf m_left(t_left.data(), 300, 93);
MapXf m_right(t_right.data(), 93, 140);
Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);
typedef Tensor<float, 1>::DimensionPair DimPair;
Eigen::array<DimPair, 2> dims;
dims[0] = DimPair(2, 0);
dims[1] = DimPair(3, 1);
m_result = m_left * m_right;
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
hipMemcpy(t_result.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
for (DenseIndex i = 0; i < t_result.size(); i++) {
if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
assert(false);
}
}
hipFree(d_t_left);
hipFree(d_t_right);
hipFree(d_t_result);
}
template<int DataLayout>
void test_hip_convolution_1d()
{
Tensor<float, 4, DataLayout> input(74,37,11,137);
Tensor<float, 1, DataLayout> kernel(4);
Tensor<float, 4, DataLayout> out(74,34,11,137);
input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random();
std::size_t input_bytes = input.size() * sizeof(float);
std::size_t kernel_bytes = kernel.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
float* d_input;
float* d_kernel;
float* d_out;
hipMalloc((void**)(&d_input), input_bytes);
hipMalloc((void**)(&d_kernel), kernel_bytes);
hipMalloc((void**)(&d_out), out_bytes);
hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
Eigen::array<Eigen::DenseIndex, 1> dims{1};
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (int i = 0; i < 74; ++i) {
for (int j = 0; j < 34; ++j) {
for (int k = 0; k < 11; ++k) {
for (int l = 0; l < 137; ++l) {
const float result = out(i,j,k,l);
const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
VERIFY_IS_APPROX(result, expected);
}
}
}
}
hipFree(d_input);
hipFree(d_kernel);
hipFree(d_out);
}
void test_hip_convolution_inner_dim_col_major_1d()
{
Tensor<float, 4, ColMajor> input(74,9,11,7);
Tensor<float, 1, ColMajor> kernel(4);
Tensor<float, 4, ColMajor> out(71,9,11,7);
input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random();
std::size_t input_bytes = input.size() * sizeof(float);
std::size_t kernel_bytes = kernel.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
float* d_input;
float* d_kernel;
float* d_out;
hipMalloc((void**)(&d_input), input_bytes);
hipMalloc((void**)(&d_kernel), kernel_bytes);
hipMalloc((void**)(&d_out), out_bytes);
hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
Eigen::array<Eigen::DenseIndex, 1> dims{0};
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (int i = 0; i < 71; ++i) {
for (int j = 0; j < 9; ++j) {
for (int k = 0; k < 11; ++k) {
for (int l = 0; l < 7; ++l) {
const float result = out(i,j,k,l);
const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
VERIFY_IS_APPROX(result, expected);
}
}
}
}
hipFree(d_input);
hipFree(d_kernel);
hipFree(d_out);
}
void test_hip_convolution_inner_dim_row_major_1d()
{
Tensor<float, 4, RowMajor> input(7,9,11,74);
Tensor<float, 1, RowMajor> kernel(4);
Tensor<float, 4, RowMajor> out(7,9,11,71);
input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random();
std::size_t input_bytes = input.size() * sizeof(float);
std::size_t kernel_bytes = kernel.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
float* d_input;
float* d_kernel;
float* d_out;
hipMalloc((void**)(&d_input), input_bytes);
hipMalloc((void**)(&d_kernel), kernel_bytes);
hipMalloc((void**)(&d_out), out_bytes);
hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
Eigen::array<Eigen::DenseIndex, 1> dims{3};
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (int i = 0; i < 7; ++i) {
for (int j = 0; j < 9; ++j) {
for (int k = 0; k < 11; ++k) {
for (int l = 0; l < 71; ++l) {
const float result = out(i,j,k,l);
const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
VERIFY_IS_APPROX(result, expected);
}
}
}
}
hipFree(d_input);
hipFree(d_kernel);
hipFree(d_out);
}
template<int DataLayout>
void test_hip_convolution_2d()
{
Tensor<float, 4, DataLayout> input(74,37,11,137);
Tensor<float, 2, DataLayout> kernel(3,4);
Tensor<float, 4, DataLayout> out(74,35,8,137);
input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random();
std::size_t input_bytes = input.size() * sizeof(float);
std::size_t kernel_bytes = kernel.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
float* d_input;
float* d_kernel;
float* d_out;
hipMalloc((void**)(&d_input), input_bytes);
hipMalloc((void**)(&d_kernel), kernel_bytes);
hipMalloc((void**)(&d_out), out_bytes);
hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
Eigen::array<Eigen::DenseIndex, 2> dims{1,2};
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (int i = 0; i < 74; ++i) {
for (int j = 0; j < 35; ++j) {
for (int k = 0; k < 8; ++k) {
for (int l = 0; l < 137; ++l) {
const float result = out(i,j,k,l);
const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
input(i,j+1,k+0,l) * kernel(1,0) +
input(i,j+2,k+0,l) * kernel(2,0) +
input(i,j+0,k+1,l) * kernel(0,1) +
input(i,j+1,k+1,l) * kernel(1,1) +
input(i,j+2,k+1,l) * kernel(2,1) +
input(i,j+0,k+2,l) * kernel(0,2) +
input(i,j+1,k+2,l) * kernel(1,2) +
input(i,j+2,k+2,l) * kernel(2,2) +
input(i,j+0,k+3,l) * kernel(0,3) +
input(i,j+1,k+3,l) * kernel(1,3) +
input(i,j+2,k+3,l) * kernel(2,3);
VERIFY_IS_APPROX(result, expected);
}
}
}
}
hipFree(d_input);
hipFree(d_kernel);
hipFree(d_out);
}
template<int DataLayout>
void test_hip_convolution_3d()
{
Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>{74,37,11,137,17});
Tensor<float, 3, DataLayout> kernel(3,4,2);
Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>{74,35,8,136,17});
input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random();
std::size_t input_bytes = input.size() * sizeof(float);
std::size_t kernel_bytes = kernel.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
float* d_input;
float* d_kernel;
float* d_out;
hipMalloc((void**)(&d_input), input_bytes);
hipMalloc((void**)(&d_kernel), kernel_bytes);
hipMalloc((void**)(&d_out), out_bytes);
hipMemcpy(d_input, input.data(), input_bytes, hipMemcpyHostToDevice);
hipMemcpy(d_kernel, kernel.data(), kernel_bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
Eigen::array<Eigen::DenseIndex, 3> dims{1,2,3};
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (int i = 0; i < 74; ++i) {
for (int j = 0; j < 35; ++j) {
for (int k = 0; k < 8; ++k) {
for (int l = 0; l < 136; ++l) {
for (int m = 0; m < 17; ++m) {
const float result = out(i,j,k,l,m);
const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
VERIFY_IS_APPROX(result, expected);
}
}
}
}
}
hipFree(d_input);
hipFree(d_kernel);
hipFree(d_out);
}
template <typename Scalar>
void test_hip_lgamma(const Scalar stddev)
{
Tensor<Scalar, 2> in(72,97);
in.setRandom();
in *= in.constant(stddev);
Tensor<Scalar, 2> out(72,97);
out.setZero();
std::size_t bytes = in.size() * sizeof(Scalar);
Scalar* d_in;
Scalar* d_out;
hipMalloc((void**)(&d_in), bytes);
hipMalloc((void**)(&d_out), bytes);
hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
gpu_out.device(gpu_device) = gpu_in.lgamma();
assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 97; ++j) {
VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
}
}
hipFree(d_in);
hipFree(d_out);
}
template <typename Scalar>
void test_hip_digamma()
{
Tensor<Scalar, 1> in(7);
Tensor<Scalar, 1> out(7);
Tensor<Scalar, 1> expected_out(7);
out.setZero();
in(0) = Scalar(1);
in(1) = Scalar(1.5);
in(2) = Scalar(4);
in(3) = Scalar(-10.5);
in(4) = Scalar(10000.5);
in(5) = Scalar(0);
in(6) = Scalar(-1);
expected_out(0) = Scalar(-0.5772156649015329);
expected_out(1) = Scalar(0.03648997397857645);
expected_out(2) = Scalar(1.2561176684318);
expected_out(3) = Scalar(2.398239129535781);
expected_out(4) = Scalar(9.210340372392849);
expected_out(5) = std::numeric_limits<Scalar>::infinity();
expected_out(6) = std::numeric_limits<Scalar>::infinity();
std::size_t bytes = in.size() * sizeof(Scalar);
Scalar* d_in;
Scalar* d_out;
hipMalloc((void**)(&d_in), bytes);
hipMalloc((void**)(&d_out), bytes);
hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
gpu_out.device(gpu_device) = gpu_in.digamma();
assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (int i = 0; i < 5; ++i) {
VERIFY_IS_APPROX(out(i), expected_out(i));
}
for (int i = 5; i < 7; ++i) {
VERIFY_IS_EQUAL(out(i), expected_out(i));
}
hipFree(d_in);
hipFree(d_out);
}
template <typename Scalar>
void test_hip_zeta()
{
Tensor<Scalar, 1> in_x(6);
Tensor<Scalar, 1> in_q(6);
Tensor<Scalar, 1> out(6);
Tensor<Scalar, 1> expected_out(6);
out.setZero();
in_x(0) = Scalar(1);
in_x(1) = Scalar(1.5);
in_x(2) = Scalar(4);
in_x(3) = Scalar(-10.5);
in_x(4) = Scalar(10000.5);
in_x(5) = Scalar(3);
in_q(0) = Scalar(1.2345);
in_q(1) = Scalar(2);
in_q(2) = Scalar(1.5);
in_q(3) = Scalar(3);
in_q(4) = Scalar(1.0001);
in_q(5) = Scalar(-2.5);
expected_out(0) = std::numeric_limits<Scalar>::infinity();
expected_out(1) = Scalar(1.61237534869);
expected_out(2) = Scalar(0.234848505667);
expected_out(3) = Scalar(1.03086757337e-5);
expected_out(4) = Scalar(0.367879440865);
expected_out(5) = Scalar(0.054102025820864097);
std::size_t bytes = in_x.size() * sizeof(Scalar);
Scalar* d_in_x;
Scalar* d_in_q;
Scalar* d_out;
hipMalloc((void**)(&d_in_x), bytes);
hipMalloc((void**)(&d_in_q), bytes);
hipMalloc((void**)(&d_out), bytes);
hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice);
hipMemcpy(d_in_q, in_q.data(), bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);
assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
VERIFY_IS_EQUAL(out(0), expected_out(0));
VERIFY((std::isnan)(out(3)));
for (int i = 1; i < 6; ++i) {
if (i != 3) {
VERIFY_IS_APPROX(out(i), expected_out(i));
}
}
hipFree(d_in_x);
hipFree(d_in_q);
hipFree(d_out);
}
template <typename Scalar>
void test_hip_polygamma()
{
Tensor<Scalar, 1> in_x(7);
Tensor<Scalar, 1> in_n(7);
Tensor<Scalar, 1> out(7);
Tensor<Scalar, 1> expected_out(7);
out.setZero();
in_n(0) = Scalar(1);
in_n(1) = Scalar(1);
in_n(2) = Scalar(1);
in_n(3) = Scalar(17);
in_n(4) = Scalar(31);
in_n(5) = Scalar(28);
in_n(6) = Scalar(8);
in_x(0) = Scalar(2);
in_x(1) = Scalar(3);
in_x(2) = Scalar(25.5);
in_x(3) = Scalar(4.7);
in_x(4) = Scalar(11.8);
in_x(5) = Scalar(17.7);
in_x(6) = Scalar(30.2);
expected_out(0) = Scalar(0.644934066848);
expected_out(1) = Scalar(0.394934066848);
expected_out(2) = Scalar(0.0399946696496);
expected_out(3) = Scalar(293.334565435);
expected_out(4) = Scalar(0.445487887616);
expected_out(5) = Scalar(-2.47810300902e-07);
expected_out(6) = Scalar(-8.29668781082e-09);
std::size_t bytes = in_x.size() * sizeof(Scalar);
Scalar* d_in_x;
Scalar* d_in_n;
Scalar* d_out;
hipMalloc((void**)(&d_in_x), bytes);
hipMalloc((void**)(&d_in_n), bytes);
hipMalloc((void**)(&d_out), bytes);
hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice);
hipMemcpy(d_in_n, in_n.data(), bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);
assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (int i = 0; i < 7; ++i) {
VERIFY_IS_APPROX(out(i), expected_out(i));
}
hipFree(d_in_x);
hipFree(d_in_n);
hipFree(d_out);
}
template <typename Scalar>
void test_hip_igamma()
{
Tensor<Scalar, 2> a(6, 6);
Tensor<Scalar, 2> x(6, 6);
Tensor<Scalar, 2> out(6, 6);
out.setZero();
Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
a(i, j) = a_s[i];
x(i, j) = x_s[j];
}
}
Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
{0.0, 0.6321205588285578, 0.7768698398515702,
0.9816843611112658, 9.999500016666262e-05, 1.0},
{0.0, 0.4275932955291202, 0.608374823728911,
0.9539882943107686, 7.522076445089201e-07, 1.0},
{0.0, 0.01898815687615381, 0.06564245437845008,
0.5665298796332909, 4.166333347221828e-18, 1.0},
{0.0, 0.9999780593618628, 0.9999899967080838,
0.9999996219837988, 0.9991370418689945, 1.0},
{0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
std::size_t bytes = a.size() * sizeof(Scalar);
Scalar* d_a;
Scalar* d_x;
Scalar* d_out;
assert(hipMalloc((void**)(&d_a), bytes) == hipSuccess);
assert(hipMalloc((void**)(&d_x), bytes) == hipSuccess);
assert(hipMalloc((void**)(&d_out), bytes) == hipSuccess);
hipMemcpy(d_a, a.data(), bytes, hipMemcpyHostToDevice);
hipMemcpy(d_x, x.data(), bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);
assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
if ((std::isnan)(igamma_s[i][j])) {
VERIFY((std::isnan)(out(i, j)));
} else {
VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
}
}
}
hipFree(d_a);
hipFree(d_x);
hipFree(d_out);
}
template <typename Scalar>
void test_hip_igammac()
{
Tensor<Scalar, 2> a(6, 6);
Tensor<Scalar, 2> x(6, 6);
Tensor<Scalar, 2> out(6, 6);
out.setZero();
Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
a(i, j) = a_s[i];
x(i, j) = x_s[j];
}
}
Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
{1.0, 0.36787944117144233, 0.22313016014842982,
0.018315638888734182, 0.9999000049998333, 0.0},
{1.0, 0.5724067044708798, 0.3916251762710878,
0.04601170568923136, 0.9999992477923555, 0.0},
{1.0, 0.9810118431238462, 0.9343575456215499,
0.4334701203667089, 1.0, 0.0},
{1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
3.7801620118431334e-07, 0.0008629581310054535,
0.0},
{1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
std::size_t bytes = a.size() * sizeof(Scalar);
Scalar* d_a;
Scalar* d_x;
Scalar* d_out;
hipMalloc((void**)(&d_a), bytes);
hipMalloc((void**)(&d_x), bytes);
hipMalloc((void**)(&d_out), bytes);
hipMemcpy(d_a, a.data(), bytes, hipMemcpyHostToDevice);
hipMemcpy(d_x, x.data(), bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);
assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
if ((std::isnan)(igammac_s[i][j])) {
VERIFY((std::isnan)(out(i, j)));
} else {
VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
}
}
}
hipFree(d_a);
hipFree(d_x);
hipFree(d_out);
}
template <typename Scalar>
void test_hip_erf(const Scalar stddev)
{
Tensor<Scalar, 2> in(72,97);
in.setRandom();
in *= in.constant(stddev);
Tensor<Scalar, 2> out(72,97);
out.setZero();
std::size_t bytes = in.size() * sizeof(Scalar);
Scalar* d_in;
Scalar* d_out;
assert(hipMalloc((void**)(&d_in), bytes) == hipSuccess);
assert(hipMalloc((void**)(&d_out), bytes) == hipSuccess);
hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
gpu_out.device(gpu_device) = gpu_in.erf();
assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 97; ++j) {
VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
}
}
hipFree(d_in);
hipFree(d_out);
}
template <typename Scalar>
void test_hip_erfc(const Scalar stddev)
{
Tensor<Scalar, 2> in(72,97);
in.setRandom();
in *= in.constant(stddev);
Tensor<Scalar, 2> out(72,97);
out.setZero();
std::size_t bytes = in.size() * sizeof(Scalar);
Scalar* d_in;
Scalar* d_out;
hipMalloc((void**)(&d_in), bytes);
hipMalloc((void**)(&d_out), bytes);
hipMemcpy(d_in, in.data(), bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
gpu_out.device(gpu_device) = gpu_in.erfc();
assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 97; ++j) {
VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
}
}
hipFree(d_in);
hipFree(d_out);
}
template <typename Scalar>
void test_hip_betainc()
{
Tensor<Scalar, 1> in_x(125);
Tensor<Scalar, 1> in_a(125);
Tensor<Scalar, 1> in_b(125);
Tensor<Scalar, 1> out(125);
Tensor<Scalar, 1> expected_out(125);
out.setZero();
Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
Array<Scalar, 1, Dynamic> x(125);
Array<Scalar, 1, Dynamic> a(125);
Array<Scalar, 1, Dynamic> b(125);
Array<Scalar, 1, Dynamic> v(125);
a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
31.62177660168379, 31.62177660168379, 31.62177660168379,
31.62177660168379, 31.62177660168379, 31.62177660168379,
31.62177660168379, 31.62177660168379, 31.62177660168379,
31.62177660168379, 31.62177660168379, 31.62177660168379,
31.62177660168379, 31.62177660168379, 31.62177660168379,
31.62177660168379, 31.62177660168379, 31.62177660168379,
31.62177660168379, 31.62177660168379, 31.62177660168379,
31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;
b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
31.62177660168379, 31.62177660168379, 31.62177660168379,
31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
31.62177660168379, 31.62177660168379, 31.62177660168379,
31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
999.999, 999.999, 999.999;
x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
-0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;
v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
0.5000000000004947, 0.9999999999999999, nan;
for (int i = 0; i < 125; ++i) {
in_x(i) = x(i);
in_a(i) = a(i);
in_b(i) = b(i);
expected_out(i) = v(i);
}
std::size_t bytes = in_x.size() * sizeof(Scalar);
Scalar* d_in_x;
Scalar* d_in_a;
Scalar* d_in_b;
Scalar* d_out;
hipMalloc((void**)(&d_in_x), bytes);
hipMalloc((void**)(&d_in_a), bytes);
hipMalloc((void**)(&d_in_b), bytes);
hipMalloc((void**)(&d_out), bytes);
hipMemcpy(d_in_x, in_x.data(), bytes, hipMemcpyHostToDevice);
hipMemcpy(d_in_a, in_a.data(), bytes, hipMemcpyHostToDevice);
hipMemcpy(d_in_b, in_b.data(), bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);
gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);
assert(hipMemcpyAsync(out.data(), d_out, bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (int i = 1; i < 125; ++i) {
if ((std::isnan)(expected_out(i))) {
VERIFY((std::isnan)(out(i)));
} else {
VERIFY_IS_APPROX(out(i), expected_out(i));
}
}
hipFree(d_in_x);
hipFree(d_in_a);
hipFree(d_in_b);
hipFree(d_out);
}
void test_cxx11_tensor_hip()
{
CALL_SUBTEST(test_hip_nullary());
CALL_SUBTEST(test_hip_elementwise_small());
CALL_SUBTEST(test_hip_elementwise());
CALL_SUBTEST(test_hip_props());
CALL_SUBTEST(test_hip_reduction());
// FIXME : uncommenting following tests results in compile failure
// CALL_SUBTEST(test_hip_contraction<ColMajor>());
// CALL_SUBTEST(test_hip_contraction<RowMajor>());
CALL_SUBTEST(test_hip_convolution_1d<ColMajor>());
CALL_SUBTEST(test_hip_convolution_1d<RowMajor>());
CALL_SUBTEST(test_hip_convolution_inner_dim_col_major_1d());
CALL_SUBTEST(test_hip_convolution_inner_dim_row_major_1d());
CALL_SUBTEST(test_hip_convolution_2d<ColMajor>());
CALL_SUBTEST(test_hip_convolution_2d<RowMajor>());
// The following two tests commented out due to long runtime
// CALL_SUBTEST(test_hip_convolution_3d<ColMajor>());
// CALL_SUBTEST(test_hip_convolution_3d<RowMajor>());
#if __cplusplus > 199711L
// std::erf, std::erfc, and so on where only added in c++11. We use them
// as a golden reference to validate the results produced by Eigen. Therefore
// we can only run these tests if we use a c++11 compiler.
CALL_SUBTEST(test_hip_lgamma<float>(1.0f));
CALL_SUBTEST(test_hip_lgamma<float>(100.0f));
CALL_SUBTEST(test_hip_lgamma<float>(0.01f));
CALL_SUBTEST(test_hip_lgamma<float>(0.001f));
CALL_SUBTEST(test_hip_lgamma<double>(1.0));
CALL_SUBTEST(test_hip_lgamma<double>(100.0));
CALL_SUBTEST(test_hip_lgamma<double>(0.01));
CALL_SUBTEST(test_hip_lgamma<double>(0.001));
CALL_SUBTEST(test_hip_erf<float>(1.0f));
CALL_SUBTEST(test_hip_erf<float>(100.0f));
CALL_SUBTEST(test_hip_erf<float>(0.01f));
CALL_SUBTEST(test_hip_erf<float>(0.001f));
CALL_SUBTEST(test_hip_erfc<float>(1.0f));
// CALL_SUBTEST(test_hip_erfc<float>(100.0f)); // HIP erfc lacks precision for large inputs
CALL_SUBTEST(test_hip_erfc<float>(5.0f));
CALL_SUBTEST(test_hip_erfc<float>(0.01f));
CALL_SUBTEST(test_hip_erfc<float>(0.001f));
CALL_SUBTEST(test_hip_erf<double>(1.0));
CALL_SUBTEST(test_hip_erf<double>(100.0));
CALL_SUBTEST(test_hip_erf<double>(0.01));
CALL_SUBTEST(test_hip_erf<double>(0.001));
CALL_SUBTEST(test_hip_erfc<double>(1.0));
// CALL_SUBTEST(test_hip_erfc<double>(100.0)); // HIP erfc lacks precision for large inputs
CALL_SUBTEST(test_hip_erfc<double>(5.0));
CALL_SUBTEST(test_hip_erfc<double>(0.01));
CALL_SUBTEST(test_hip_erfc<double>(0.001));
// Following tests have functional failures on some seeds
// CALL_SUBTEST(test_hip_digamma<float>());
// CALL_SUBTEST(test_hip_digamma<double>());
// Following tests have functional failures on some seeds
// CALL_SUBTEST(test_hip_polygamma<float>());
// CALL_SUBTEST(test_hip_polygamma<double>());
// Following tests have functional failures on some seeds
// CALL_SUBTEST(test_hip_zeta<float>());
// CALL_SUBTEST(test_hip_zeta<double>());
CALL_SUBTEST(test_hip_igamma<float>());
CALL_SUBTEST(test_hip_igammac<float>());
CALL_SUBTEST(test_hip_igamma<double>());
CALL_SUBTEST(test_hip_igammac<double>());
CALL_SUBTEST(test_hip_betainc<float>());
CALL_SUBTEST(test_hip_betainc<double>());
#endif
}