// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2014 Benoit Steiner // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU #include "main.h" #include "OffByOneScalar.h" #include #include using Eigen::RowMajor; using Eigen::Tensor; // Context for evaluation on cpu struct CPUContext { CPUContext(const Eigen::Tensor& in1, Eigen::Tensor& in2, Eigen::Tensor& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2, 2), kernel_3d_(2, 2, 2) { kernel_1d_(0) = 3.14f; kernel_1d_(1) = 2.7f; kernel_2d_(0, 0) = 3.14f; kernel_2d_(1, 0) = 2.7f; kernel_2d_(0, 1) = 0.2f; kernel_2d_(1, 1) = 7.0f; kernel_3d_(0, 0, 0) = 3.14f; kernel_3d_(0, 1, 0) = 2.7f; kernel_3d_(0, 0, 1) = 0.2f; kernel_3d_(0, 1, 1) = 7.0f; kernel_3d_(1, 0, 0) = -1.0f; kernel_3d_(1, 1, 0) = -0.3f; kernel_3d_(1, 0, 1) = -0.7f; kernel_3d_(1, 1, 1) = -0.5f; } const Eigen::DefaultDevice& device() const { return cpu_device_; } const Eigen::Tensor& in1() const { return in1_; } const Eigen::Tensor& in2() const { return in2_; } Eigen::Tensor& out() { return out_; } const Eigen::Tensor& kernel1d() const { return kernel_1d_; } const Eigen::Tensor& kernel2d() const { return kernel_2d_; } const Eigen::Tensor& kernel3d() const { return kernel_3d_; } private: const Eigen::Tensor& in1_; const Eigen::Tensor& in2_; Eigen::Tensor& out_; Eigen::Tensor kernel_1d_; Eigen::Tensor kernel_2d_; Eigen::Tensor kernel_3d_; Eigen::DefaultDevice cpu_device_; }; // Context for evaluation on GPU struct GPUContext { GPUContext(const Eigen::TensorMap>& in1, Eigen::TensorMap>& in2, Eigen::TensorMap>& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) { assert(gpuMalloc((void**)(&kernel_1d_), 2 * sizeof(float)) == gpuSuccess); float kernel_1d_val[] = {3.14f, 2.7f}; assert(gpuMemcpy(kernel_1d_, kernel_1d_val, 2 * sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess); assert(gpuMalloc((void**)(&kernel_2d_), 4 * sizeof(float)) == gpuSuccess); float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f}; assert(gpuMemcpy(kernel_2d_, kernel_2d_val, 4 * sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess); assert(gpuMalloc((void**)(&kernel_3d_), 8 * sizeof(float)) == gpuSuccess); float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f}; assert(gpuMemcpy(kernel_3d_, kernel_3d_val, 8 * sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess); } ~GPUContext() { assert(gpuFree(kernel_1d_) == gpuSuccess); assert(gpuFree(kernel_2d_) == gpuSuccess); assert(gpuFree(kernel_3d_) == gpuSuccess); } const Eigen::GpuDevice& device() const { return gpu_device_; } const Eigen::TensorMap>& in1() const { return in1_; } const Eigen::TensorMap>& in2() const { return in2_; } Eigen::TensorMap>& out() { return out_; } Eigen::TensorMap> kernel1d() const { return Eigen::TensorMap>(kernel_1d_, 2); } Eigen::TensorMap> kernel2d() const { return Eigen::TensorMap>(kernel_2d_, 2, 2); } Eigen::TensorMap> kernel3d() const { return Eigen::TensorMap>(kernel_3d_, 2, 2, 2); } private: const Eigen::TensorMap>& in1_; const Eigen::TensorMap>& in2_; Eigen::TensorMap>& out_; float* kernel_1d_; float* kernel_2d_; float* kernel_3d_; Eigen::GpuStreamDevice stream_; Eigen::GpuDevice gpu_device_; }; // The actual expression to evaluate template void test_contextual_eval(Context* context) { context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f); } template void test_forced_contextual_eval(Context* context) { context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f); } template void test_compound_assignment(Context* context) { context->out().device(context->device()) = context->in1().constant(2.718f); context->out().device(context->device()) += context->in1() + context->in2() * 3.14f; } template void test_contraction(Context* context) { Eigen::array, 2> dims; dims[0] = std::make_pair(1, 1); dims[1] = std::make_pair(2, 2); Eigen::array shape{40, 50 * 70}; Eigen::DSizes indices(0, 0); Eigen::DSizes sizes(40, 40); context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims); } template void test_1d_convolution(Context* context) { Eigen::DSizes indices(0, 0, 0); Eigen::DSizes sizes(40, 49, 70); Eigen::array dims{1}; context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims); } template void test_2d_convolution(Context* context) { Eigen::DSizes indices(0, 0, 0); Eigen::DSizes sizes(40, 49, 69); Eigen::array dims{1, 2}; context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims); } template void test_3d_convolution(Context* context) { Eigen::DSizes indices(0, 0, 0); Eigen::DSizes sizes(39, 49, 69); Eigen::array dims{0, 1, 2}; context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims); } // Helper method to synchronize device. template void synchronize(Device& device) { /*nothing*/ } template <> void synchronize(Eigen::GpuDevice& device) { device.synchronize(); } template void test_device_memory(const TensorDevice& device) { int count = 100; Eigen::array tensorRange{count}; Eigen::Tensor host(tensorRange); Eigen::Tensor expected(tensorRange); DataType* device_data = static_cast(device.allocate(count * sizeof(DataType))); // memset const char byte_value = static_cast(0xAB); device.memset(device_data, byte_value, count * sizeof(DataType)); device.memcpyDeviceToHost(host.data(), device_data, count * sizeof(DataType)); synchronize(device); memset(expected.data(), byte_value, count * sizeof(DataType)); for (size_t i = 0; i < count; i++) { VERIFY_IS_EQUAL(host(i), expected(i)); } // fill DataType fill_value = DataType(7); std::fill_n(expected.data(), count, fill_value); device.fill(device_data, device_data + count, fill_value); device.memcpyDeviceToHost(host.data(), device_data, count * sizeof(DataType)); synchronize(device); for (int i = 0; i < count; i++) { VERIFY_IS_EQUAL(host(i), expected(i)); } device.deallocate(device_data); } void test_cpu() { Eigen::Tensor in1(40, 50, 70); Eigen::Tensor in2(40, 50, 70); Eigen::Tensor out(40, 50, 70); in1 = in1.random() + in1.constant(10.0f); in2 = in2.random() + in2.constant(10.0f); CPUContext context(in1, in2, out); test_contextual_eval(&context); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { VERIFY_IS_APPROX(out(i, j, k), in1(i, j, k) + in2(i, j, k) * 3.14f + 2.718f); } } } test_forced_contextual_eval(&context); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) + in2(i, j, k)) * 3.14f + 2.718f); } } } test_compound_assignment(&context); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { VERIFY_IS_APPROX(out(i, j, k), in1(i, j, k) + in2(i, j, k) * 3.14f + 2.718f); } } } test_contraction(&context); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 40; ++j) { const float result = out(i, j, 0); float expected = 0; for (int k = 0; k < 50; ++k) { for (int l = 0; l < 70; ++l) { expected += in1(i, k, l) * in2(j, k, l); } } VERIFY_IS_APPROX(expected, result); } } test_1d_convolution(&context); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 70; ++k) { VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) * 3.14f + in1(i, j + 1, k) * 2.7f)); } } } test_2d_convolution(&context); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 69; ++k) { const float result = out(i, j, k); const float expected = (in1(i, j, k) * 3.14f + in1(i, j + 1, k) * 2.7f) + (in1(i, j, k + 1) * 0.2f + in1(i, j + 1, k + 1) * 7.0f); if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) { continue; } VERIFY_IS_APPROX(expected, result); } } } test_3d_convolution(&context); for (int i = 0; i < 39; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 69; ++k) { const float result = out(i, j, k); const float expected = (in1(i, j, k) * 3.14f + in1(i, j + 1, k) * 2.7f + in1(i, j, k + 1) * 0.2f + in1(i, j + 1, k + 1) * 7.0f) + (in1(i + 1, j, k) * -1.0f + in1(i + 1, j + 1, k) * -0.3f + in1(i + 1, j, k + 1) * -0.7f + in1(i + 1, j + 1, k + 1) * -0.5f); if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) { continue; } VERIFY_IS_APPROX(expected, result); } } } test_device_memory(context.device()); test_device_memory>(context.device()); } void test_gpu() { Eigen::Tensor in1(40, 50, 70); Eigen::Tensor in2(40, 50, 70); Eigen::Tensor out(40, 50, 70); in1 = in1.random() + in1.constant(10.0f); in2 = in2.random() + in2.constant(10.0f); std::size_t in1_bytes = in1.size() * sizeof(float); std::size_t in2_bytes = in2.size() * sizeof(float); std::size_t out_bytes = out.size() * sizeof(float); float* d_in1; float* d_in2; float* d_out; gpuMalloc((void**)(&d_in1), in1_bytes); gpuMalloc((void**)(&d_in2), in2_bytes); gpuMalloc((void**)(&d_out), out_bytes); gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice); Eigen::TensorMap> gpu_in1(d_in1, 40, 50, 70); Eigen::TensorMap> gpu_in2(d_in2, 40, 50, 70); Eigen::TensorMap> gpu_out(d_out, 40, 50, 70); GPUContext context(gpu_in1, gpu_in2, gpu_out); test_contextual_eval(&context); assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { VERIFY_IS_APPROX(out(i, j, k), in1(i, j, k) + in2(i, j, k) * 3.14f + 2.718f); } } } test_forced_contextual_eval(&context); assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) + in2(i, j, k)) * 3.14f + 2.718f); } } } test_compound_assignment(&context); assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { VERIFY_IS_APPROX(out(i, j, k), in1(i, j, k) + in2(i, j, k) * 3.14f + 2.718f); } } } test_contraction(&context); assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 40; ++j) { const float result = out(i, j, 0); float expected = 0; for (int k = 0; k < 50; ++k) { for (int l = 0; l < 70; ++l) { expected += in1(i, k, l) * in2(j, k, l); } } VERIFY_IS_APPROX(expected, result); } } test_1d_convolution(&context); assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess); assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 70; ++k) { VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) * 3.14f + in1(i, j + 1, k) * 2.7f)); } } } test_2d_convolution(&context); assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess); assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 69; ++k) { const float result = out(i, j, k); const float expected = (in1(i, j, k) * 3.14f + in1(i, j + 1, k) * 2.7f + in1(i, j, k + 1) * 0.2f + in1(i, j + 1, k + 1) * 7.0f); VERIFY_IS_APPROX(expected, result); } } } #if !defined(EIGEN_USE_HIP) // disable this test on the HIP platform // 3D tensor convolutions seem to hang on the HIP platform test_3d_convolution(&context); assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess); assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess); for (int i = 0; i < 39; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 69; ++k) { const float result = out(i, j, k); const float expected = (in1(i, j, k) * 3.14f + in1(i, j + 1, k) * 2.7f + in1(i, j, k + 1) * 0.2f + in1(i, j + 1, k + 1) * 7.0f + in1(i + 1, j, k) * -1.0f + in1(i + 1, j + 1, k) * -0.3f + in1(i + 1, j, k + 1) * -0.7f + in1(i + 1, j + 1, k + 1) * -0.5f); VERIFY_IS_APPROX(expected, result); } } } #endif test_device_memory(context.device()); test_device_memory>(context.device()); } EIGEN_DECLARE_TEST(cxx11_tensor_device) { CALL_SUBTEST_1(test_cpu()); CALL_SUBTEST_2(test_gpu()); }