// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_USE_THREADS


#include "main.h"
#include <iostream>
#include <Eigen/CXX11/Tensor>

using Eigen::Tensor;

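// Allocator that counts allocate/deallocate calls so the tests below can
// verify that ThreadPoolDevice routes memory requests through a
// user-supplied allocator.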
class TestAllocator : public Allocator {
 public:
  ~TestAllocator() EIGEN_OVERRIDE {}
  EIGEN_DEVICE_FUNC void* allocate(size_t num_bytes) const EIGEN_OVERRIDE {
    const_cast<TestAllocator*>(this)->alloc_count_++;
    return internal::aligned_malloc(num_bytes);
  }
  EIGEN_DEVICE_FUNC void deallocate(void* buffer) const EIGEN_OVERRIDE {
    const_cast<TestAllocator*>(this)->dealloc_count_++;
    internal::aligned_free(buffer);
  }

  int alloc_count() const { return alloc_count_; }
  int dealloc_count() const { return dealloc_count_; }

 private:
  int alloc_count_ = 0;
  int dealloc_count_ = 0;
};

void test_multithread_elementwise()
{
  Tensor<float, 3> in1(200, 30, 70);
  Tensor<float, 3> in2(200, 30, 70);
  Tensor<double, 3> out(200, 30, 70);

  in1.setRandom();
  in2.setRandom();

  Eigen::ThreadPool tp(internal::random<int>(3, 11));
  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
  out.device(thread_pool_device) = (in1 + in2 * 3.14f).cast<double>();

  for (int i = 0; i < 200; ++i) {
    for (int j = 0; j < 30; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
      }
    }
  }
}

void test_async_multithread_elementwise()
{
  Tensor<float, 3> in1(200, 30, 70);
  Tensor<float, 3> in2(200, 30, 70);
  Tensor<double, 3> out(200, 30, 70);

  in1.setRandom();
  in2.setRandom();

  Eigen::ThreadPool tp(internal::random<int>(3, 11));
  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));

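  // The two-argument device() overload evaluates the expression asynchronously
  // and calls the supplied callback once evaluation finishes; the Barrier
  // blocks until that happens.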
  Eigen::Barrier b(1);
  out.device(thread_pool_device, [&b]() { b.Notify(); }) = (in1 + in2 * 3.14f).cast<double>();
  b.Wait();

  for (int i = 0; i < 200; ++i) {
    for (int j = 0; j < 30; ++j) {
      for (int k = 0; k < 70; ++k) {
        VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
      }
    }
  }
}

void test_multithread_compound_assignment()
{
  Tensor<float, 3> in1(2,3,7);
  Tensor<float, 3> in2(2,3,7);
  Tensor<float, 3> out(2,3,7);

  in1.setRandom();
  in2.setRandom();

  Eigen::ThreadPool tp(internal::random<int>(3, 11));
  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
  out.device(thread_pool_device) = in1;
  out.device(thread_pool_device) += in2 * 3.14f;

  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 3; ++j) {
      for (int k = 0; k < 7; ++k) {
        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f);
      }
    }
  }
}

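// Contract a 4-D and a 5-D tensor on the thread pool and check the result
// against the equivalent single matrix product computed through Map views of
// the same data.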
template<int DataLayout>
void test_multithread_contraction()
{
  Tensor<float, 4, DataLayout> t_left(30, 50, 37, 31);
  Tensor<float, 5, DataLayout> t_right(37, 31, 70, 2, 10);
  Tensor<float, 5, DataLayout> t_result(30, 50, 70, 2, 10);

  t_left.setRandom();
  t_right.setRandom();

  // this contraction should be equivalent to a single matrix multiplication
  typedef Tensor<float, 1>::DimensionPair DimPair;
  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});

  typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
  MapXf m_left(t_left.data(), 1500, 1147);
  MapXf m_right(t_right.data(), 1147, 1400);
  Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);

  Eigen::ThreadPool tp(4);
  Eigen::ThreadPoolDevice thread_pool_device(&tp, 4);

  // compute results by separate methods
  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
  m_result = m_left * m_right;

  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
    VERIFY(&t_result.data()[i] != &m_result.data()[i]);
    if (fabsf(t_result(i) - m_result(i)) < 1e-4f) {
      continue;
    }
    if (Eigen::internal::isApprox(t_result(i), m_result(i), 1e-4f)) {
      continue;
    }
    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
              << " vs " <<  m_result(i) << std::endl;
    assert(false);
  }
}

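// Contract over the first dimension for several small and degenerate output
// shapes. The output is poisoned with NaN beforehand, so any element the
// contraction fails to write is detected.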
template<int DataLayout>
void test_contraction_corner_cases()
{
  Tensor<float, 2, DataLayout> t_left(32, 500);
  Tensor<float, 2, DataLayout> t_right(32, 28*28);
  Tensor<float, 2, DataLayout> t_result(500, 28*28);

  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
  t_result = t_result.constant(NAN);

  // this contraction should be equivalent to a single matrix multiplication
  typedef Tensor<float, 1>::DimensionPair DimPair;
  Eigen::array<DimPair, 1> dims{{DimPair(0, 0)}};

  typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
  MapXf m_left(t_left.data(), 32, 500);
  MapXf m_right(t_right.data(), 32, 28*28);
  Matrix<float, Dynamic, Dynamic, DataLayout> m_result(500, 28*28);

  Eigen::ThreadPool tp(12);
  Eigen::ThreadPoolDevice thread_pool_device(&tp, 12);

  // compute results by separate methods
  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
  m_result = m_left.transpose() * m_right;

  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
    assert(!(numext::isnan)(t_result.data()[i]));
    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
      std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
      assert(false);
    }
  }

  t_left.resize(32, 1);
  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
  t_result.resize(1, 28*28);
  t_result = t_result.constant(NAN);
  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
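  // Re-seat the Map on the resized tensor's data; placement new is the
  // documented way to change the array an existing Map refers to.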
  new(&m_left) MapXf(t_left.data(), 32, 1);
  m_result = m_left.transpose() * m_right;
  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
    assert(!(numext::isnan)(t_result.data()[i]));
    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
      assert(false);
    }
  }

  t_left.resize(32, 500);
  t_right.resize(32, 4);
  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
  t_result.resize(500, 4);
  t_result = t_result.constant(NAN);
  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
  new(&m_left) MapXf(t_left.data(), 32, 500);
  new(&m_right) MapXf(t_right.data(), 32, 4);
  m_result = m_left.transpose() * m_right;
  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
    assert(!(numext::isnan)(t_result.data()[i]));
    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
      assert(false);
    }
  }

  t_left.resize(32, 1);
  t_right.resize(32, 4);
  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
  t_result.resize(1, 4);
  t_result = t_result.constant(NAN);
  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
  new(&m_left) MapXf(t_left.data(), 32, 1);
  new(&m_right) MapXf(t_right.data(), 32, 4);
  m_result = m_left.transpose() * m_right;
  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
    assert(!(numext::isnan)(t_result.data()[i]));
    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
      assert(false);
    }
  }
}

template<int DataLayout>
void test_multithread_contraction_agrees_with_singlethread() {
  int contract_size = internal::random<int>(1, 5000);

  Tensor<float, 3, DataLayout> left(internal::random<int>(1, 80),
                                    contract_size,
                                    internal::random<int>(1, 100));

  Tensor<float, 4, DataLayout> right(internal::random<int>(1, 25),
                                     internal::random<int>(1, 37),
                                     contract_size,
                                     internal::random<int>(1, 51));

  left.setRandom();
  right.setRandom();

  // add constants to shift values away from 0 for more precision
  left += left.constant(1.5f);
  right += right.constant(1.5f);

  typedef Tensor<float, 1>::DimensionPair DimPair;
  Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});

  Eigen::ThreadPool tp(internal::random<int>(2, 11));
  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));

  Tensor<float, 5, DataLayout> st_result;
  st_result = left.contract(right, dims);

  Tensor<float, 5, DataLayout> tp_result(st_result.dimensions());
  tp_result.device(thread_pool_device) = left.contract(right, dims);

  VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
  for (ptrdiff_t i = 0; i < st_result.size(); i++) {
    // Skip the comparison when the absolute difference is tiny: VERIFY_IS_APPROX
    // uses a relative tolerance, which spuriously fails for values close to zero.
    if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
      VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
    }
  }
}

// Apply Sqrt to all output elements.
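// The contraction evaluator calls the kernel on each finished block of the
// output, passing a mapper over that block together with the block's position
// and size.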
struct SqrtOutputKernel {
  template <typename Index, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
      const TensorContractionParams&, Index, Index, Index num_rows,
      Index num_cols) const {
    for (int i = 0; i < num_rows; ++i) {
      for (int j = 0; j < num_cols; ++j) {
        output_mapper(i, j) = std::sqrt(output_mapper(i, j));
      }
    }
  }
};

template <int DataLayout>
static void test_multithread_contraction_with_output_kernel() {
  typedef Tensor<float, 1>::DimensionPair DimPair;

  const int num_threads = internal::random<int>(2, 11);
  ThreadPool threads(num_threads);
  Eigen::ThreadPoolDevice device(&threads, num_threads);

  Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
  Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
  Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);

  t_left.setRandom();
  t_right.setRandom();
  // Put trash in t_result to verify contraction clears output memory.
  t_result.setRandom();

  // Add a little offset so that the results won't be close to zero.
  t_left += t_left.constant(1.0f);
  t_right += t_right.constant(1.0f);

  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
  MapXf m_left(t_left.data(), 1500, 248);
  MapXf m_right(t_right.data(), 248, 1400);
  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);

  // this contraction should be equivalent to a single matrix multiplication
  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});

  // compute results by separate methods
  t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());

  m_result = m_left * m_right;

  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
    VERIFY(&t_result.data()[i] != &m_result.data()[i]);
    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
  }
}

template<int DataLayout>
void test_async_multithread_contraction_agrees_with_singlethread()
{
  int contract_size = internal::random<int>(100, 500);

  Tensor<float, 3, DataLayout> left(internal::random<int>(10, 40),
                                    contract_size,
                                    internal::random<int>(10, 40));

  Tensor<float, 4, DataLayout> right(
      internal::random<int>(1, 20), internal::random<int>(1, 20), contract_size,
      internal::random<int>(1, 20));

  left.setRandom();
  right.setRandom();

  // add constants to shift values away from 0 for more precision
  left += left.constant(1.5f);
  right += right.constant(1.5f);

  typedef Tensor<float, 1>::DimensionPair DimPair;
  Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});

  Eigen::ThreadPool tp(internal::random<int>(2, 11));
  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(8, 32));

  Tensor<float, 5, DataLayout> st_result;
  st_result = left.contract(right, dims);

  Tensor<float, 5, DataLayout> tp_result(st_result.dimensions());

  Eigen::Barrier barrier(1);
  tp_result.device(thread_pool_device, [&barrier]() { barrier.Notify(); }) =
      left.contract(right, dims);
  barrier.Wait();

  VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
  for (ptrdiff_t i = 0; i < st_result.size(); i++) {
    // Skip the comparison when the absolute difference is tiny: VERIFY_IS_APPROX
    // uses a relative tolerance, which spuriously fails for values close to zero.
    if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
      VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
    }
  }
}

// The tiny output (2x10) and large contraction dimension (10000) are meant to
// trigger the 'evalShardedByInnerDim' optimization.
template <int DataLayout>
static void test_sharded_by_inner_dim_contraction()
{
  typedef Tensor<float, 1>::DimensionPair DimPair;

  const int num_threads = internal::random<int>(4, 16);
  ThreadPool threads(num_threads);
  Eigen::ThreadPoolDevice device(&threads, num_threads);

  Tensor<float, 2, DataLayout> t_left(2, 10000);
  Tensor<float, 2, DataLayout> t_right(10000, 10);
  Tensor<float, 2, DataLayout> t_result(2, 10);

  t_left.setRandom();
  t_right.setRandom();
  // Put trash in t_result to verify contraction clears output memory.
  t_result.setRandom();

  // Add a little offset so that the results won't be close to zero.
  t_left += t_left.constant(1.0f);
  t_right += t_right.constant(1.0f);

  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
  MapXf m_left(t_left.data(), 2, 10000);
  MapXf m_right(t_right.data(), 10000, 10);
  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);

  // this contraction should be equivalent to a single matrix multiplication
  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});

  // compute results by separate methods
  t_result.device(device) = t_left.contract(t_right, dims);
  m_result = m_left * m_right;

  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
  }
}

// Same shapes as above, meant to trigger the 'evalShardedByInnerDim'
// optimization combined with an output kernel.
template <int DataLayout>
static void test_sharded_by_inner_dim_contraction_with_output_kernel()
{
  typedef Tensor<float, 1>::DimensionPair DimPair;

  const int num_threads = internal::random<int>(4, 16);
  ThreadPool threads(num_threads);
  Eigen::ThreadPoolDevice device(&threads, num_threads);

  Tensor<float, 2, DataLayout> t_left(2, 10000);
  Tensor<float, 2, DataLayout> t_right(10000, 10);
  Tensor<float, 2, DataLayout> t_result(2, 10);

  t_left.setRandom();
  t_right.setRandom();
  // Put trash in t_result to verify contraction clears output memory.
  t_result.setRandom();

  // Add a little offset so that the results won't be close to zero.
  t_left += t_left.constant(1.0f);
  t_right += t_right.constant(1.0f);

  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
  MapXf m_left(t_left.data(), 2, 10000);
  MapXf m_right(t_right.data(), 10000, 10);
  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);

  // this contraction should be equivalent to a single matrix multiplication
  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});

  // compute results by separate methods
  t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
  m_result = m_left * m_right;

  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
  }
}

// Same shapes again, meant to trigger the 'evalShardedByInnerDim' optimization
// in asynchronous mode.
template <int DataLayout>
static void test_async_sharded_by_inner_dim_contraction()
{
  typedef Tensor<float, 1>::DimensionPair DimPair;

  const int num_threads = internal::random<int>(4, 16);
  ThreadPool threads(num_threads);
  Eigen::ThreadPoolDevice device(&threads, num_threads);

  Tensor<float, 2, DataLayout> t_left(2, 10000);
  Tensor<float, 2, DataLayout> t_right(10000, 10);
  Tensor<float, 2, DataLayout> t_result(2, 10);

  t_left.setRandom();
  t_right.setRandom();
  // Put trash in t_result to verify contraction clears output memory.
  t_result.setRandom();

  // Add a little offset so that the results won't be close to zero.
  t_left += t_left.constant(1.0f);
  t_right += t_right.constant(1.0f);

  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
  MapXf m_left(t_left.data(), 2, 10000);
  MapXf m_right(t_right.data(), 10000, 10);
  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);

  // this contraction should be equivalent to a single matrix multiplication
  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});

  // compute results by separate methods
  Eigen::Barrier barrier(1);
  t_result.device(device, [&barrier]() { barrier.Notify(); }) =
      t_left.contract(t_right, dims);
  barrier.Wait();

  m_result = m_left * m_right;

  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
  }
}

// Same shapes again, meant to trigger the 'evalShardedByInnerDim' optimization
// in asynchronous mode, combined with an output kernel.
template <int DataLayout>
static void test_async_sharded_by_inner_dim_contraction_with_output_kernel()
{
  typedef Tensor<float, 1>::DimensionPair DimPair;

  const int num_threads = internal::random<int>(4, 16);
  ThreadPool threads(num_threads);
  Eigen::ThreadPoolDevice device(&threads, num_threads);

  Tensor<float, 2, DataLayout> t_left(2, 10000);
  Tensor<float, 2, DataLayout> t_right(10000, 10);
  Tensor<float, 2, DataLayout> t_result(2, 10);

  t_left.setRandom();
  t_right.setRandom();
  // Put trash in t_result to verify contraction clears output memory.
  t_result.setRandom();

  // Add a little offset so that the results won't be close to zero.
  t_left += t_left.constant(1.0f);
  t_right += t_right.constant(1.0f);

  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
  MapXf m_left(t_left.data(), 2, 10000);
  MapXf m_right(t_right.data(), 10000, 10);
  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);

  // this contraction should be equivalent to a single matrix multiplication
  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});

  // compute results by separate methods
  Eigen::Barrier barrier(1);
  t_result.device(device, [&barrier]() { barrier.Notify(); }) =
      t_left.contract(t_right, dims, SqrtOutputKernel());
  barrier.Wait();
  m_result = m_left * m_right;

  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
  }
}

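// Contract over both dimensions so the result is a rank-0 (scalar) tensor,
// and compare the threaded evaluation against the single-threaded one.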
template<int DataLayout>
void test_full_contraction() {
  int contract_size1 = internal::random<int>(1, 500);
  int contract_size2 = internal::random<int>(1, 500);

  Tensor<float, 2, DataLayout> left(contract_size1,
                                    contract_size2);
  Tensor<float, 2, DataLayout> right(contract_size1,
                                    contract_size2);
  left.setRandom();
  right.setRandom();

  // add constants to shift values away from 0 for more precision
  left += left.constant(1.5f);
  right += right.constant(1.5f);

  typedef Tensor<float, 2>::DimensionPair DimPair;
  Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}});

  Eigen::ThreadPool tp(internal::random<int>(2, 11));
  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));

  Tensor<float, 0, DataLayout> st_result;
  st_result = left.contract(right, dims);

  Tensor<float, 0, DataLayout> tp_result;
  tp_result.device(thread_pool_device) = left.contract(right, dims);

  VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
  // Skip the comparison when the absolute difference is tiny: VERIFY_IS_APPROX
  // uses a relative tolerance, which spuriously fails for values close to zero.
  if (numext::abs(st_result() - tp_result()) >= 1e-4f) {
    VERIFY_IS_APPROX(st_result(), tp_result());
  }
}

template<int DataLayout>
void test_multithreaded_reductions() {
  const int num_threads = internal::random<int>(3, 11);
  ThreadPool thread_pool(num_threads);
  Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, num_threads);

  const int num_rows = internal::random<int>(13, 732);
  const int num_cols = internal::random<int>(13, 732);
  Tensor<float, 2, DataLayout> t1(num_rows, num_cols);
  t1.setRandom();

  Tensor<float, 0, DataLayout> full_redux;
  full_redux = t1.sum();

  Tensor<float, 0, DataLayout> full_redux_tp;
  full_redux_tp.device(thread_pool_device) = t1.sum();

  // Check that the single threaded and the multi threaded reductions return
  // the same result.
  VERIFY_IS_APPROX(full_redux(), full_redux_tp());
}


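// Check that ThreadPoolDevice::memcpy copies a tensor's data verbatim.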
void test_memcpy() {
  for (int i = 0; i < 5; ++i) {
    const int num_threads = internal::random<int>(3, 11);
    Eigen::ThreadPool tp(num_threads);
    Eigen::ThreadPoolDevice thread_pool_device(&tp, num_threads);

    const int size = internal::random<int>(13, 7632);
    Tensor<float, 1> t1(size);
    t1.setRandom();
    std::vector<float> result(size);
    thread_pool_device.memcpy(&result[0], t1.data(), size*sizeof(float));
    for (int j = 0; j < size; j++) {
      VERIFY_IS_EQUAL(t1(j), result[j]);
    }
  }
}


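// Smoke test: generate normally distributed random values on the thread pool
// device. Only checks that the expression evaluates without crashing.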
void test_multithread_random()
{
  Eigen::ThreadPool tp(2);
  Eigen::ThreadPoolDevice device(&tp, 2);
  Tensor<float, 1> t(1 << 20);
  t.device(device) = t.random<Eigen::internal::NormalRandomGenerator<float>>();
}

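// Shuffle a 4-D tensor on the thread pool device (optionally through a custom
// allocator) and check every element against the original layout.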
template<int DataLayout>
void test_multithread_shuffle(Allocator* allocator)
{
  Tensor<float, 4, DataLayout> tensor(17,5,7,11);
  tensor.setRandom();

  const int num_threads = internal::random<int>(2, 11);
  ThreadPool threads(num_threads);
  Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);

  Tensor<float, 4, DataLayout> shuffle(7,5,11,17);
  array<ptrdiff_t, 4> shuffles = {{2,1,3,0}};
  shuffle.device(device) = tensor.shuffle(shuffles);

  for (int i = 0; i < 17; ++i) {
    for (int j = 0; j < 5; ++j) {
      for (int k = 0; k < 7; ++k) {
        for (int l = 0; l < 11; ++l) {
          VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,j,l,i));
        }
      }
    }
  }
}

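// Allocate and free buffers through the device and verify that every request
// reached the TestAllocator.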
void test_threadpool_allocate(TestAllocator* allocator)
{
  const int num_threads = internal::random<int>(2, 11);
  const int num_allocs = internal::random<int>(2, 11);
  ThreadPool threads(num_threads);
  Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);

  for (int a = 0; a < num_allocs; ++a) {
    void* ptr = device.allocate(512);
    device.deallocate(ptr);
  }
  VERIFY(allocator != NULL);
  VERIFY_IS_EQUAL(allocator->alloc_count(), num_allocs);
  VERIFY_IS_EQUAL(allocator->dealloc_count(), num_allocs);
}

EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool)
{
  CALL_SUBTEST_1(test_multithread_elementwise());
  CALL_SUBTEST_1(test_async_multithread_elementwise());
  CALL_SUBTEST_1(test_multithread_compound_assignment());

  CALL_SUBTEST_2(test_multithread_contraction<ColMajor>());
  CALL_SUBTEST_2(test_multithread_contraction<RowMajor>());

  CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
  CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
  CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<ColMajor>());
  CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<RowMajor>());

  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<ColMajor>());
  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<RowMajor>());

  // Test EvalShardedByInnerDimContext parallelization strategy.
  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<ColMajor>());
  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<RowMajor>());
  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());

  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<ColMajor>());
  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<RowMajor>());
  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());

  // Exercise various cases that have been problematic in the past.
  CALL_SUBTEST_7(test_contraction_corner_cases<ColMajor>());
  CALL_SUBTEST_7(test_contraction_corner_cases<RowMajor>());

  CALL_SUBTEST_8(test_full_contraction<ColMajor>());
  CALL_SUBTEST_8(test_full_contraction<RowMajor>());

  CALL_SUBTEST_9(test_multithreaded_reductions<ColMajor>());
  CALL_SUBTEST_9(test_multithreaded_reductions<RowMajor>());

  CALL_SUBTEST_10(test_memcpy());
  CALL_SUBTEST_10(test_multithread_random());

  TestAllocator test_allocator;
  CALL_SUBTEST_11(test_multithread_shuffle<ColMajor>(NULL));
  CALL_SUBTEST_11(test_multithread_shuffle<RowMajor>(&test_allocator));
  CALL_SUBTEST_11(test_threadpool_allocate(&test_allocator));

  // Force CMake to split this test.
  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11
}