From f363e533aac5aac0d67fd5728b2e5b509c756bc8 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 5 May 2016 09:05:45 -0700 Subject: [PATCH] Added tests for full contractions using thread pools and gpu devices. Fixed a couple of issues in the corresponding code. --- .../CXX11/src/Tensor/TensorContraction.h | 4 +- .../CXX11/src/Tensor/TensorContractionCuda.h | 6 +- .../src/Tensor/TensorContractionThreadPool.h | 6 +- .../test/cxx11_tensor_contract_cuda.cu | 62 +++++++++++++++++++ unsupported/test/cxx11_tensor_thread_pool.cpp | 39 ++++++++++++ 5 files changed, 109 insertions(+), 8 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 9d0d432ee..f8ec0614f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -143,8 +143,8 @@ struct TensorContractionEvaluatorBase static const int NumDims = LDims + RDims - 2 * ContractDims; typedef array contract_t; - typedef array::size> left_nocontract_t; - typedef array::size> right_nocontract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; typedef DSizes Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index 6a3ef14ef..886474986 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1240,10 +1240,10 @@ struct TensorEvaluator right_dim_mapper_t; typedef array contract_t; - typedef array::size> left_nocontract_t; - typedef array::size> right_nocontract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; - static const int NumDims = max_n_1::size; + static const int NumDims = LDims + RDims - 2 * ContractDims; typedef DSizes Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 9044454fd..73c48828c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -92,10 +92,10 @@ struct TensorEvaluator right_dim_mapper_t; typedef array contract_t; - typedef array::size> left_nocontract_t; - typedef array::size> right_nocontract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; - static const int NumDims = max_n_1::size; + static const int NumDims = LDims + RDims - 2 * ContractDims; typedef DSizes Dimensions; diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu index 6d1ef07f9..98ac180ef 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cu +++ b/unsupported/test/cxx11_tensor_contract_cuda.cu @@ -84,6 +84,65 @@ void test_cuda_contraction(int m_size, int k_size, int n_size) cudaFree((void*)d_t_result); } + +template +void test_scalar(int m_size, int k_size, int n_size) +{ + std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; + // with these dimensions, the output has 300 * 140 elements, which is + // more than 30 * 1024, which is the number of threads in blocks on + // a 15 SM GK110 GPU + Tensor t_left(m_size, k_size); + Tensor t_right(k_size, n_size); + Tensor t_result; + Tensor t_result_gpu; + Eigen::array dims(DimPair(0, 0), DimPair(1, 1)); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(float); + std::size_t t_right_bytes = t_right.size() * sizeof(float); + std::size_t t_result_bytes = sizeof(float); + + float* d_t_left; + float* d_t_right; + float* d_t_result; + + cudaMalloc((void**)(&d_t_left), t_left_bytes); + cudaMalloc((void**)(&d_t_right), t_right_bytes); + cudaMalloc((void**)(&d_t_result), t_result_bytes); + + cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap > + gpu_t_left(d_t_left, m_size, k_size); + Eigen::TensorMap > + gpu_t_right(d_t_right, k_size, n_size); + Eigen::TensorMap > + gpu_t_result(d_t_result); + + gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); + t_result = t_left.contract(t_right, dims); + + cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + if (fabs(t_result() - t_result_gpu()) > 1e-4f && + !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) { + std::cout << "mismatch detected: " << t_result() + << " vs " << t_result_gpu() << std::endl; + assert(false); + } + + cudaFree((void*)d_t_left); + cudaFree((void*)d_t_right); + cudaFree((void*)d_t_result); +} + + template void test_cuda_contraction_m() { for (int k = 32; k < 256; k++) { @@ -138,6 +197,9 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST_1(test_cuda_contraction(128, 128, 128)); CALL_SUBTEST_1(test_cuda_contraction(128, 128, 128)); + CALL_SUBTEST_1(test_scalar(128, 128, 128)); + CALL_SUBTEST_1(test_scalar(128, 128, 128)); + CALL_SUBTEST_2(test_cuda_contraction_m()); CALL_SUBTEST_3(test_cuda_contraction_m()); diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index e46197464..5fd3f0bf1 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -233,6 +233,42 @@ void test_multithread_contraction_agrees_with_singlethread() { } +template +void test_full_contraction() { + int contract_size1 = internal::random(1, 500); + int contract_size2 = internal::random(1, 500); + + Tensor left(contract_size1, + contract_size2); + Tensor right(contract_size1, + contract_size2); + left.setRandom(); + right.setRandom(); + + // add constants to shift values away from 0 for more precision + left += left.constant(1.5f); + right += right.constant(1.5f); + + typedef Tensor::DimensionPair DimPair; + Eigen::array dims({{DimPair(0, 0), DimPair(1, 1)}}); + + Eigen::ThreadPool tp(internal::random(2, 11)); + Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random(2, 11)); + + Tensor st_result; + st_result = left.contract(right, dims); + + Tensor tp_result; + tp_result.device(thread_pool_device) = left.contract(right, dims); + + VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions())); + // if both of the values are very small, then do nothing (because the test will fail + // due to numerical precision issues when values are small) + if (fabs(st_result() - tp_result()) >= 1e-4) { + VERIFY_IS_APPROX(st_result(), tp_result()); + } +} + template void test_multithreaded_reductions() { const int num_threads = internal::random(3, 11); @@ -324,6 +360,9 @@ void test_cxx11_tensor_thread_pool() CALL_SUBTEST_4(test_contraction_corner_cases()); CALL_SUBTEST_4(test_contraction_corner_cases()); + CALL_SUBTEST_4(test_full_contraction()); + CALL_SUBTEST_4(test_full_contraction()); + CALL_SUBTEST_5(test_multithreaded_reductions()); CALL_SUBTEST_5(test_multithreaded_reductions());