From 4a164d2c465e494406ccfa663eb35e2e114e5292 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Mon, 25 Apr 2016 10:43:03 -0700
Subject: [PATCH] Fixed the partial evaluation of non vectorizable tensor subexpressions

---
 .../Eigen/CXX11/src/Tensor/TensorEvalTo.h     |  2 +-
 .../Eigen/CXX11/src/Tensor/TensorForcedEval.h |  2 +-
 .../test/cxx11_tensor_of_float16_cuda.cu      | 37 +++++++++++++++++++
 3 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 893542d29..5c6748a43 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -95,7 +95,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
 
   enum {
     IsAligned = true,
-    PacketAccess = true,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = true
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index d2b0b30a1..1ce53ad69 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -90,7 +90,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
 
   enum {
     IsAligned = true,
-    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    PacketAccess = (PacketSize > 1),
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     RawAccess = true
   };
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index cb917bb37..154a72d5c 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -228,6 +228,42 @@ void test_cuda_reductions() {
   gpu_device.deallocate(d_res_float);
 }
 
+void test_cuda_forced_evals() {
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+
+  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(gpu_device) = gpu_float.abs();
+  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
+
+  Tensor<float, 1> half_prec(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+  gpu_device.synchronize();
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking unary " << i << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
 
 #endif
 
@@ -246,6 +282,7 @@ void test_cxx11_tensor_of_float16_cuda()
     CALL_SUBTEST_1(test_cuda_elementwise());
     CALL_SUBTEST_2(test_cuda_contractions());
     CALL_SUBTEST_3(test_cuda_reductions());
+    CALL_SUBTEST_4(test_cuda_forced_evals());
   }
   else {
     std::cout << "Half floats require compute capability of at least 5.3. This device only supports " << device.majorDeviceVersion() << "." << device.minorDeviceVersion() << ". Skipping the test" << std::endl;