From 595e8903915832278e79577633cfb9e698bdb60a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 11 May 2016 21:27:15 -0700 Subject: [PATCH] Added more tests for half floats --- .../test/cxx11_tensor_of_float16_cuda.cu | 58 ++++++++++++++----- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 0d73318a9..992bd7bd0 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -299,6 +299,18 @@ void test_cuda_reductions() { VERIFY_IS_APPROX(full_prec(i), half_prec(i)); } + gpu_res_float.device(gpu_device) = gpu_float1.maximum(redux_dim).cast<Eigen::half>(); + gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum(redux_dim); + + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, size*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, size*sizeof(Eigen::half)); + gpu_device.synchronize(); + + for (int i = 0; i < size; ++i) { + std::cout << "Checking redux " << i << std::endl; + VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + } + gpu_device.deallocate(d_float1); gpu_device.deallocate(d_float2); gpu_device.deallocate(d_res_half); @@ -341,6 +353,14 @@ void test_cuda_full_reductions() { VERIFY_IS_APPROX(full_prec(), half_prec()); + gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>(); + gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum(); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half)); + gpu_device.synchronize(); + + VERIFY_IS_APPROX(full_prec(), half_prec()); + gpu_device.deallocate(d_float1); gpu_device.deallocate(d_float2); gpu_device.deallocate(d_res_half); @@ -355,42 +375,49 @@ void test_cuda_forced_evals() { int num_elem = 101; float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* 
d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( d_float, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half( - d_res_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1( + d_res_half1, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2( + d_res_half2, num_elem); Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float( d_res_float, num_elem); - gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); - gpu_res_float.device(gpu_device) = gpu_float.abs(); - gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>(); + Eigen::array<int, 1> no_bcast; + no_bcast[0] = 1; - Tensor<float, 1> half_prec(num_elem); + gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); + gpu_res_float.device(gpu_device) = gpu_float.abs(); + gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>(); + gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>(); + + Tensor<float, 1> half_prec1(num_elem); + Tensor<float, 1> half_prec2(num_elem); Tensor<float, 1> full_prec(num_elem); - gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); gpu_device.synchronize(); for (int i = 0; i < num_elem; ++i) { - std::cout << "Checking unary " << i << std::endl; - VERIFY_IS_APPROX(full_prec(i), half_prec(i)); + std::cout << "Checking forced eval " << i << std::endl; + VERIFY_IS_APPROX(full_prec(i), half_prec1(i)); + 
VERIFY_IS_APPROX(full_prec(i), half_prec2(i)); } gpu_device.deallocate(d_float); - gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_half1); + gpu_device.deallocate(d_res_half2); gpu_device.deallocate(d_res_float); } - #endif - - - void test_cxx11_tensor_of_float16_cuda() { #ifdef EIGEN_HAS_CUDA_FP16 @@ -402,7 +429,6 @@ void test_cxx11_tensor_of_float16_cuda() CALL_SUBTEST_3(test_cuda_reductions()); CALL_SUBTEST_3(test_cuda_full_reductions()); CALL_SUBTEST_4(test_cuda_forced_evals()); - #else std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl; #endif