// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2018 Eugene Zhulenev // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #define EIGEN_USE_THREADS #include "main.h" #include using Eigen::ColMajor; using Eigen::RowMajor; using Eigen::Tensor; using Eigen::internal::TiledEvaluation; // A set of tests to verify that different TensorExecutor strategies yields the // same results for all the ops, supporting tiled evaluation. // Default assignment that does no use block evaluation or vectorization. // We assume that default coefficient evaluation is well tested and correct. template void DefaultAssign(Dst& dst, Expr expr) { using Assign = Eigen::TensorAssignOp; using Executor = Eigen::internal::TensorExecutor; Executor::run(Assign(dst, expr), DefaultDevice()); } // Assignment with specified device and tiling strategy. template void DeviceAssign(Device& d, Dst& dst, Expr expr) { using Assign = Eigen::TensorAssignOp; using Executor = Eigen::internal::TensorExecutor; Executor::run(Assign(dst, expr), d); } template static array RandomDims(int min_dim = 1, int max_dim = 20) { array dims; for (int i = 0; i < NumDims; ++i) { dims[i] = internal::random(min_dim, max_dim); } return dims; } template void test_execute_unary_expr(Device d) { static constexpr int Options = 0 | Layout; // Pick a large enough tensor size to bypass small tensor block evaluation // optimization. auto dims = RandomDims(50 / NumDims, 100 / NumDims); Tensor src(dims); Tensor dst(dims); src.setRandom(); const auto expr = src.square(); using Assign = TensorAssignOp; using Executor = internal::TensorExecutor; Executor::run(Assign(dst, expr), d); for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { T square = src.coeff(i) * src.coeff(i); VERIFY_IS_EQUAL(square, dst.coeff(i)); } } template void test_execute_binary_expr(Device d) { static constexpr int Options = 0 | Layout; // Pick a large enough tensor size to bypass small tensor block evaluation // optimization. auto dims = RandomDims(50 / NumDims, 100 / NumDims); Tensor lhs(dims); Tensor rhs(dims); Tensor dst(dims); lhs.setRandom(); rhs.setRandom(); const auto expr = lhs + rhs; using Assign = TensorAssignOp; using Executor = internal::TensorExecutor; Executor::run(Assign(dst, expr), d); for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { T sum = lhs.coeff(i) + rhs.coeff(i); VERIFY_IS_EQUAL(sum, dst.coeff(i)); } } template void test_execute_broadcasting(Device d) { static constexpr int Options = 0 | Layout; auto dims = RandomDims(1, 10); Tensor src(dims); src.setRandom(); const auto broadcasts = RandomDims(1, 7); const auto expr = src.broadcast(broadcasts); // We assume that broadcasting on a default device is tested and correct, so // we can rely on it to verify correctness of tensor executor and tiling. Tensor golden; golden = expr; // Now do the broadcasting using configured tensor executor. Tensor dst(golden.dimensions()); using Assign = TensorAssignOp; using Executor = internal::TensorExecutor; Executor::run(Assign(dst, expr), d); for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); } } template struct test_execute_chipping_rvalue_runner { template static std::enable_if_t<0 <= ChipDim, void> run_dim(Device& d, const array& dims, const Tensor& src) { const auto offset = internal::random(0, dims[(ChipDim)] - 1); const auto expr = src.template chip(offset); Tensor golden; golden = expr; Tensor dst(golden.dimensions()); using Assign = TensorAssignOp; using Executor = internal::TensorExecutor; Executor::run(Assign(dst, expr), d); for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); } // Recursively reduce chip dimension. run_dim(d, dims, src); } template static std::enable_if_t < ChipDim<0, void> run_dim(Device&, const array&, const Tensor&) {} static void run(Device d) { auto dims = RandomDims(1, 10); Tensor src(dims); src.setRandom(); run_dim(d, dims, src); } }; template void test_execute_chipping_rvalue(Device d) { test_execute_chipping_rvalue_runner::run(d); } template struct test_execute_chipping_lvalue_runner { template static std::enable_if_t<0 <= ChipDim> run_dim(Device& d, const array& dims) { /* Generate random data that we'll assign to the chipped tensor dim. */ array src_dims; for (int i = 0; i < NumDims - 1; ++i) { int dim = i < (ChipDim) ? i : i + 1; src_dims[i] = dims[dim]; } Tensor src(src_dims); src.setRandom(); const auto offset = internal::random(0, dims[(ChipDim)] - 1); Tensor random(dims); random.setZero(); Tensor golden(dims); golden = random; golden.template chip<(ChipDim)>(offset) = src; Tensor dst(dims); dst = random; auto expr = dst.template chip<(ChipDim)>(offset); using Assign = TensorAssignOp; using Executor = internal::TensorExecutor; Executor::run(Assign(expr, src), d); for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); } run_dim(d, dims); } template static std::enable_if_t < ChipDim<0, void> run_dim(Device&, const array&) {} static void run(Device d) { auto dims = RandomDims(1, 10); run_dim(d, dims); } }; template void test_execute_chipping_lvalue(Device d) { test_execute_chipping_lvalue_runner::run(d); } template void test_execute_shuffle_rvalue(Device d) { static constexpr int Options = 0 | Layout; auto dims = RandomDims(1, 10); Tensor src(dims); src.setRandom(); DSizes shuffle; for (int i = 0; i < NumDims; ++i) shuffle[i] = i; // Test all possible shuffle permutations. do { DSizes shuffled_dims; for (int i = 0; i < NumDims; ++i) { shuffled_dims[i] = dims[shuffle[i]]; } const auto expr = src.shuffle(shuffle); // We assume that shuffling on a default device is tested and correct, so // we can rely on it to verify correctness of tensor executor and tiling. Tensor golden(shuffled_dims); DefaultAssign(golden, expr); // Now do the shuffling using configured tensor executor. Tensor dst(shuffled_dims); DeviceAssign(d, dst, expr); for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); } } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims)); } template void test_execute_shuffle_lvalue(Device d) { static constexpr int Options = 0 | Layout; auto dims = RandomDims(5, 10); Tensor src(dims); src.setRandom(); DSizes shuffle; for (int i = 0; i < NumDims; ++i) shuffle[i] = i; // Test all possible shuffle permutations. do { DSizes shuffled_dims; for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i]; // We assume that shuffling on a default device is tested and correct, so // we can rely on it to verify correctness of tensor executor and tiling. Tensor golden(shuffled_dims); auto golden_shuffle = golden.shuffle(shuffle); DefaultAssign(golden_shuffle, src); // Now do the shuffling using configured tensor executor. Tensor dst(shuffled_dims); auto dst_shuffle = dst.shuffle(shuffle); DeviceAssign(d, dst_shuffle, src); for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); } } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims)); } template void test_execute_reshape(Device d) { static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); static constexpr int ReshapedDims = NumDims - 1; static constexpr int Options = 0 | Layout; auto dims = RandomDims(5, 10); Tensor src(dims); src.setRandom(); // Multiple 0th dimension and then shuffle. std::vector shuffle; for (int i = 0; i < ReshapedDims; ++i) shuffle.push_back(i); std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937()); DSizes reshaped_dims; reshaped_dims[shuffle[0]] = dims[0] * dims[1]; for (int i = 1; i < ReshapedDims; ++i) reshaped_dims[shuffle[i]] = dims[i + 1]; Tensor golden = src.reshape(reshaped_dims); // Now reshape using configured tensor executor. Tensor dst(golden.dimensions()); auto expr = src.reshape(reshaped_dims); using Assign = TensorAssignOp; using Executor = internal::TensorExecutor; Executor::run(Assign(dst, expr), d); for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); } } template void test_execute_slice_rvalue(Device d) { static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); static constexpr int Options = 0 | Layout; auto dims = RandomDims(5, 10); Tensor src(dims); src.setRandom(); // Pick a random slice of src tensor. auto slice_start = DSizes(RandomDims()); auto slice_size = DSizes(RandomDims()); // Make sure that slice start + size do not overflow tensor dims. for (int i = 0; i < NumDims; ++i) { slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]); slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]); } Tensor golden = src.slice(slice_start, slice_size); // Now reshape using configured tensor executor. Tensor dst(golden.dimensions()); auto expr = src.slice(slice_start, slice_size); using Assign = TensorAssignOp; using Executor = internal::TensorExecutor; Executor::run(Assign(dst, expr), d); for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); } } template void test_execute_slice_lvalue(Device d) { static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); static constexpr int Options = 0 | Layout; auto dims = RandomDims(5, 10); Tensor src(dims); src.setRandom(); // Pick a random slice of src tensor. auto slice_start = DSizes(RandomDims(1, 10)); auto slice_size = DSizes(RandomDims(1, 10)); // Make sure that slice start + size do not overflow tensor dims. for (int i = 0; i < NumDims; ++i) { slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]); slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]); } Tensor slice(slice_size); slice.setRandom(); // Assign a slice using default executor. Tensor golden = src; golden.slice(slice_start, slice_size) = slice; // And using configured execution strategy. Tensor dst = src; auto expr = dst.slice(slice_start, slice_size); using Assign = TensorAssignOp; using Executor = internal::TensorExecutor; Executor::run(Assign(expr, slice), d); for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); } } template void test_execute_broadcasting_of_forced_eval(Device d) { static constexpr int Options = 0 | Layout; auto dims = RandomDims(1, 10); Tensor src(dims); src.setRandom(); const auto broadcasts = RandomDims(1, 7); const auto expr = src.square().eval().broadcast(broadcasts); // We assume that broadcasting on a default device is tested and correct, so // we can rely on it to verify correctness of tensor executor and tiling. Tensor golden; golden = expr; // Now do the broadcasting using configured tensor executor. Tensor dst(golden.dimensions()); using Assign = TensorAssignOp; using Executor = internal::TensorExecutor; Executor::run(Assign(dst, expr), d); for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); } } template struct DummyGenerator { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T operator()(const array& dims) const { T result = static_cast(0); for (int i = 0; i < NumDims; ++i) { result += static_cast((i + 1) * dims[i]); } return result; } }; template void test_execute_generator_op(Device d) { static constexpr int Options = 0 | Layout; auto dims = RandomDims(20, 30); Tensor src(dims); src.setRandom(); const auto expr = src.generate(DummyGenerator()); // We assume that generator on a default device is tested and correct, so // we can rely on it to verify correctness of tensor executor and tiling. Tensor golden; golden = expr; // Now do the broadcasting using configured tensor executor. Tensor dst(golden.dimensions()); using Assign = TensorAssignOp; using Executor = internal::TensorExecutor; Executor::run(Assign(dst, expr), d); for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); } } template void test_execute_reverse_rvalue(Device d) { static constexpr int Options = 0 | Layout; auto dims = RandomDims(1, numext::pow(1000000.0, 1.0 / NumDims)); Tensor src(dims); src.setRandom(); // Reverse half of the dimensions. Eigen::array reverse; for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random(); const auto expr = src.reverse(reverse); // We assume that reversing on a default device is tested and correct, so // we can rely on it to verify correctness of tensor executor and tiling. Tensor golden; golden = expr; // Now do the reversing using configured tensor executor. Tensor dst(golden.dimensions()); using Assign = TensorAssignOp; using Executor = internal::TensorExecutor; Executor::run(Assign(dst, expr), d); for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); } } template void test_async_execute_unary_expr(Device d) { static constexpr int Options = 0 | Layout; // Pick a large enough tensor size to bypass small tensor block evaluation // optimization. auto dims = RandomDims(50 / NumDims, 100 / NumDims); Tensor src(dims); Tensor dst(dims); src.setRandom(); const auto expr = src.square(); Eigen::Barrier done(1); auto on_done = [&done]() { done.Notify(); }; using Assign = TensorAssignOp; using DoneCallback = decltype(on_done); using Executor = internal::TensorAsyncExecutor; Executor::runAsync(Assign(dst, expr), d, on_done); done.Wait(); for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { T square = src.coeff(i) * src.coeff(i); VERIFY_IS_EQUAL(square, dst.coeff(i)); } } template void test_async_execute_binary_expr(Device d) { static constexpr int Options = 0 | Layout; // Pick a large enough tensor size to bypass small tensor block evaluation // optimization. auto dims = RandomDims(50 / NumDims, 100 / NumDims); Tensor lhs(dims); Tensor rhs(dims); Tensor dst(dims); lhs.setRandom(); rhs.setRandom(); const auto expr = lhs + rhs; Eigen::Barrier done(1); auto on_done = [&done]() { done.Notify(); }; using Assign = TensorAssignOp; using DoneCallback = decltype(on_done); using Executor = internal::TensorAsyncExecutor; Executor::runAsync(Assign(dst, expr), d, on_done); done.Wait(); for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { T sum = lhs.coeff(i) + rhs.coeff(i); VERIFY_IS_EQUAL(sum, dst.coeff(i)); } } #ifndef EIGEN_DONT_VECTORIZE #define EIGEN_DONT_VECTORIZE 0 #endif #define VECTORIZABLE(T, VAL) !EIGEN_DONT_VECTORIZE&& Eigen::internal::packet_traits::Vectorizable&& VAL #define CALL_SUBTEST_PART(PART) CALL_SUBTEST_##PART #define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART) \ ((NAME(default_device))); \ CALL_SUBTEST_PART(PART) \ ((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(default_device))); \ CALL_SUBTEST_PART(PART) \ ((NAME(default_device))); \ CALL_SUBTEST_PART(PART) \ ((NAME(default_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART) \ ((NAME(tp_device))); \ CALL_SUBTEST_PART(PART) \ ((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART) \ ((NAME(tp_device))); \ CALL_SUBTEST_PART(PART) \ ((NAME(tp_device))) // NOTE: Currently only ThreadPoolDevice supports async expression evaluation. #define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART) \ ((NAME(tp_device))); \ CALL_SUBTEST_PART(PART) \ ((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART)((NAME(tp_device))); \ CALL_SUBTEST_PART(PART) \ ((NAME(tp_device))); \ CALL_SUBTEST_PART(PART) \ ((NAME(tp_device))) EIGEN_DECLARE_TEST(cxx11_tensor_executor) { Eigen::DefaultDevice default_device; // Default device is unused in ASYNC tests. EIGEN_UNUSED_VARIABLE(default_device); const auto num_threads = internal::random(20, 24); Eigen::ThreadPool tp(num_threads); Eigen::ThreadPoolDevice tp_device(&tp, num_threads); CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3); CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4); CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5); CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3); CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4); CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5); CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3); CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4); CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5); CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3); CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4); CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5); CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 3); CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4); CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5); CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3); CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4); CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5); CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3); CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4); CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5); CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2); CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3); CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4); CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5); CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2); CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3); CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4); CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5); CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2); CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3); CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4); CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5); CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2); CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3); CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4); CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5); CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2); CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3); CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4); CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5); CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1); CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2); CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3); CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4); CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5); CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3); CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4); CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 5); CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 3); CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 4); CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 5); // Force CMake to split this test. // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16 }