From 8b1c2108ba210e39a181ebe9548760bd17474a0a Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Thu, 22 Dec 2016 16:45:38 +0000 Subject: [PATCH 01/54] Reverting asynchronous exec to Synchronous exec regarding random race condition. --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 16bbbf894..268d9d148 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -307,9 +307,12 @@ struct SyclDevice { sycl_queue().wait_and_throw(); //pass } - EIGEN_STRONG_INLINE void asynchronousExec() const { - sycl_queue().throw_asynchronous();//pass - } + EIGEN_STRONG_INLINE void asynchronousExec() const { + ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. + //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled + sycl_queue().wait_and_throw(); //pass + + } // This function checks if the runtime recorded an error for the // underlying stream device. EIGEN_STRONG_INLINE bool ok() const { From 3eda02d78d7ab367730fb50f5a669725966a53b9 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 22 Dec 2016 10:37:05 -0800 Subject: [PATCH 02/54] Fixed the sycl benchmarking code --- bench/tensors/tensor_benchmarks_sycl.cc | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/bench/tensors/tensor_benchmarks_sycl.cc b/bench/tensors/tensor_benchmarks_sycl.cc index 7eca4d966..6df190869 100644 --- a/bench/tensors/tensor_benchmarks_sycl.cc +++ b/bench/tensors/tensor_benchmarks_sycl.cc @@ -5,29 +5,12 @@ #include "tensor_benchmarks.h" -using Eigen::array; -using Eigen::SyclDevice; -using Eigen::Tensor; -using Eigen::TensorMap; -// Simple functions -template -cl::sycl::queue sycl_queue() { - return cl::sycl::queue(device_selector(), [=](cl::sycl::exception_list l) { - for (const auto& e : l) { - try { - std::rethrow_exception(e); - } catch (cl::sycl::exception e) { - std::cout << e.what() << std::endl; - } - } - }); -} - #define BM_FuncGPU(FUNC) \ static void BM_##FUNC(int iters, int N) { \ StopBenchmarkTiming(); \ - cl::sycl::queue q = sycl_queue(); \ - Eigen::SyclDevice device(q); \ + cl::sycl::gpu_selector selector; \ + Eigen::QueueInterface queue(selector); \ + Eigen::SyclDevice device(&queue); \ BenchmarkSuite suite(device, N); \ suite.FUNC(iters); \ } \ From 90c5bc8d64146ff704b9d61f612919b351e15071 Mon Sep 17 00:00:00 2001 From: Luke Iwanski Date: Wed, 4 Jan 2017 22:18:44 +0000 Subject: [PATCH 03/54] Fixes auto appearance in functor template argument for reduction. 
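Mechanically, the reduction kernels previously captured the extracted functor list in an auto-typed local and then derived FunctorExpr from it with decltype; this patch names the type first, via decltype on the extractFunctors(...) call expression, and only then declares the variable with that named type, so no auto-typed local feeds into the template argument used for the kernel. Below is a minimal, self-contained sketch of that pattern change only; the extract() and launch_kernel() helpers are hypothetical stand-ins, not Eigen code.

    // Hypothetical stand-in for TensorSycl::internal::extractFunctors().
    template <typename Expr>
    struct Extracted { Expr expr; };

    template <typename Expr>
    Extracted<Expr> extract(const Expr& e) { return Extracted<Expr>{e}; }

    // Hypothetical stand-in for a kernel launch whose name depends on FunctorExpr.
    template <typename FunctorExpr>
    void launch_kernel(const FunctorExpr&) {}

    int main() {
      int expr = 42;
      // Before (problematic): auto functors = extract(expr);
      //                       typedef decltype(functors) FunctorExpr;
      // After (as in this patch): name the type from the call expression first,
      // then construct the value with that named type.
      typedef decltype(extract(expr)) FunctorExpr;
      FunctorExpr functors = extract(expr);
      launch_kernel<FunctorExpr>(functors);
      return 0;
    }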
--- unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index c9912d9d4..319417687 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -74,8 +74,8 @@ struct FullReducer { static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) { typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - auto functors = TensorSycl::internal::extractFunctors(self.impl()); - typedef decltype(functors) FunctorExpr; + typedef decltype(TensorSycl::internal::extractFunctors(self.impl())) FunctorExpr; + FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl()); int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread. size_t inputSize =self.impl().dimensions().TotalSize(); size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input @@ -136,8 +136,8 @@ struct InnerReducer { static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) { typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - auto functors = TensorSycl::internal::extractFunctors(self.impl()); - typedef decltype(functors) FunctorExpr; + typedef decltype(TensorSycl::internal::extractFunctors(self.impl())) FunctorExpr; + FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl()); typename Self::Index range, GRange, tileSize; typedef typename Eigen::internal::remove_all::type Dims; From 8245d3c7adc973e0ba2461dc1a09a6b66aff55ef Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Thu, 12 Jan 2017 12:13:18 +0000 Subject: [PATCH 04/54] Fix case-sensitivity of file include --- Eigen/Geometry | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/Geometry b/Eigen/Geometry index 716d52952..9520d9a63 100644 --- a/Eigen/Geometry +++ b/Eigen/Geometry @@ -44,7 +44,7 @@ #include "src/Geometry/Transform.h" #include "src/Geometry/Translation.h" #include "src/Geometry/Scaling.h" -#include "src/Geometry/Hyperplane.h" +#include "src/Geometry/HyperPlane.h" #include "src/Geometry/ParametrizedLine.h" #include "src/Geometry/AlignedBox.h" #include "src/Geometry/Umeyama.h" From 23778a15d8570b4287820f540b719203e07cfb44 Mon Sep 17 00:00:00 2001 From: Luke Iwanski Date: Mon, 16 Jan 2017 11:05:56 +0000 Subject: [PATCH 05/54] Reverting unintentional change to Eigen/Geometry --- Eigen/Geometry | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/Geometry b/Eigen/Geometry index 9520d9a63..716d52952 100644 --- a/Eigen/Geometry +++ b/Eigen/Geometry @@ -44,7 +44,7 @@ #include "src/Geometry/Transform.h" #include "src/Geometry/Translation.h" #include "src/Geometry/Scaling.h" -#include "src/Geometry/HyperPlane.h" +#include "src/Geometry/Hyperplane.h" #include "src/Geometry/ParametrizedLine.h" #include "src/Geometry/AlignedBox.h" #include "src/Geometry/Umeyama.h" From e46e7223817cfd982edec6d8e25c77e8e2493d78 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Mon, 16 Jan 2017 13:58:49 +0000 Subject: [PATCH 06/54] Adding Tensor ReverseOp; TensorStriding; TensorConversionOp; Modifying Tensor Contractsycl to be located in any place in the expression tree. 
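For illustration, the sketch below shows the kind of host-side usage this patch enables (a reverse expression evaluated on a SyclDevice); it simply mirrors the new cxx11_tensor_reverse_sycl test added further down and the QueueInterface/SyclDevice construction used elsewhere in this series, and it assumes a working SYCL toolchain rather than defining any new API.

    #define EIGEN_USE_SYCL
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      // Device setup as in the benchmark patch earlier in this series.
      cl::sycl::default_selector selector;
      Eigen::QueueInterface queue_interface(selector);
      Eigen::SyclDevice sycl_device(&queue_interface);

      Eigen::Tensor<float, 4> in(2, 3, 5, 7), out(2, 3, 5, 7);
      in.setRandom();

      float* d_in  = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
      float* d_out = static_cast<float*>(sycl_device.allocate(out.size() * sizeof(float)));
      Eigen::TensorMap<Eigen::Tensor<float, 4>> gpu_in(d_in, 2, 3, 5, 7);
      Eigen::TensorMap<Eigen::Tensor<float, 4>> gpu_out(d_out, 2, 3, 5, 7);

      sycl_device.memcpyHostToDevice(d_in, in.data(), in.size() * sizeof(float));

      // ReverseOp evaluated on the SYCL device, as exercised by the new reverse test.
      Eigen::array<bool, 4> rev = {{false, true, true, false}};
      gpu_out.device(sycl_device) = gpu_in.reverse(rev);

      sycl_device.memcpyDeviceToHost(out.data(), d_out, out.size() * sizeof(float));
      sycl_device.deallocate(d_in);
      sycl_device.deallocate(d_out);
      return 0;
    }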
--- Eigen/Geometry | 1 - .../CXX11/src/Tensor/TensorContraction.h | 5 +- .../CXX11/src/Tensor/TensorContractionSycl.h | 131 ++++++----- .../Eigen/CXX11/src/Tensor/TensorConversion.h | 3 + .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 37 ++- .../src/Tensor/TensorForwardDeclarations.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorIntDiv.h | 2 + .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 25 +- .../CXX11/src/Tensor/TensorReductionSycl.h | 14 +- .../Eigen/CXX11/src/Tensor/TensorReverse.h | 5 + .../Eigen/CXX11/src/Tensor/TensorStriding.h | 20 +- .../TensorSyclConvertToDeviceExpression.h | 14 +- .../src/Tensor/TensorSyclExprConstructor.h | 52 ++++- .../src/Tensor/TensorSyclExtractAccessor.h | 17 ++ .../src/Tensor/TensorSyclExtractFunctors.h | 35 ++- .../CXX11/src/Tensor/TensorSyclLeafCount.h | 15 ++ .../src/Tensor/TensorSyclPlaceHolderExpr.h | 14 ++ .../Eigen/CXX11/src/Tensor/TensorSyclRun.h | 28 ++- unsupported/test/CMakeLists.txt | 2 + .../test/cxx11_tensor_contract_sycl.cpp | 73 +++++- .../test/cxx11_tensor_reverse_sycl.cpp | 221 ++++++++++++++++++ .../test/cxx11_tensor_striding_sycl.cpp | 203 ++++++++++++++++ unsupported/test/cxx11_tensor_sycl.cpp | 32 +++ 23 files changed, 827 insertions(+), 124 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_reverse_sycl.cpp create mode 100644 unsupported/test/cxx11_tensor_striding_sycl.cpp diff --git a/Eigen/Geometry b/Eigen/Geometry index 716d52952..131a4edfc 100644 --- a/Eigen/Geometry +++ b/Eigen/Geometry @@ -59,4 +59,3 @@ #endif // EIGEN_GEOMETRY_MODULE_H /* vim: set filetype=cpp et sw=2 ts=2 ai: */ - diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 2ac6abf69..1b8017349 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -156,7 +156,7 @@ struct TensorContractionEvaluatorBase m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.rhsExpression(), op.lhsExpression()), device), m_device(device), - m_result(NULL), m_expr_indices(op.indices()) { + m_result(NULL) { EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -564,9 +564,6 @@ struct TensorContractionEvaluatorBase TensorEvaluator m_rightImpl; const Device& m_device; Scalar* m_result; - /// required for sycl - const Indices m_expr_indices; - }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index b170a1a5c..dc16f89e0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -146,9 +146,9 @@ struct TensorEvaluatorm_device.memset(buffer, 0, m * n * sizeof(Scalar)); - LaunchSyclKernels::Run(*this, buffer, m, n, k, - this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides, - this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides); + LaunchSyclKernels::Run(*this, buffer, m, n, k, + this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides, + this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides); } // required by sycl to construct the expr on the device. 
Returns original left_impl const TensorEvaluator& left_impl() const { @@ -158,47 +158,18 @@ struct TensorEvaluator& right_impl() const { return choose(Cond(Layout) == static_cast(ColMajor)>(), this->m_rightImpl, this->m_leftImpl); } - // required by sycl to construct the expr on the device - const Indices& indices() const {return this->m_expr_indices;} }; -/// Dummy container on the device. This is used to avoid calling the constructor of TensorEvaluator for TensorContractionOp. This makes the code much faster. -template struct TensorEvaluatorContainer; -template -struct TensorEvaluatorContainer>{ - typedef Eigen::DefaultDevice Device; - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - enum { - Layout = TensorEvaluator::Layout, - }; - - typedef typename internal::conditional(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - - TensorEvaluatorContainer(const XprType& op, const Eigen::DefaultDevice& device) - : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), - op.lhsExpression(), op.rhsExpression()), device), - m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), - op.rhsExpression(), op.lhsExpression()), device){} -LeftEvaluator m_leftImpl; -RightEvaluator m_rightImpl; -}; - - -template struct KernelConstructor{ - - typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; - - FunctorExpr functors; +int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{ + typedef typename Eigen::internal::traits::_LhsNested LHSHostExpr; + typedef typename Eigen::internal::traits::_RhsNested RHSHostExpr; + typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression::Type LHSPlaceHolderExpr; + typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression::Type RHSPlaceHolderExpr; + LHSFunctorExpr lhs_functors; + RHSFunctorExpr rhs_functors; LhsLocalAcc localLhs; RhsLocalAcc localRhs; OutAccessor out_res; @@ -206,38 +177,50 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides; LeftNocontractT m_i_strides, m_left_nocontract_strides; RightNocontractT m_j_strides, m_right_nocontract_strides; - TupleType tuple_of_accessors; + LHSTupleType left_tuple_of_accessors; + RHSTupleType right_tuple_of_accessors; + Device dev; - KernelConstructor(FunctorExpr functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, + + KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_, ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_, - LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, TupleType tuple_of_accessors_) - :functors(functors_), localLhs(localLhs_), localRhs(localRhs_), 
out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), + LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_) + :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_), m_right_contracting_strides(m_right_contracting_strides_), m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_), m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_), - tuple_of_accessors(tuple_of_accessors_){} + left_tuple_of_accessors(left_tuple_of_accessors_), right_tuple_of_accessors(right_tuple_of_accessors_), dev(dev_){} void operator()(cl::sycl::nd_item<1> itemID) { - typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; - auto device_expr =Eigen::TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = TensorEvaluatorContainer(device_expr.expr, Eigen::DefaultDevice()); - typedef TensorEvaluatorContainer DevEvaluator; + typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; + typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression::Type LHSDevExpr; + typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression::Type RHSDevExpr; + auto lhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression(lhs_functors, left_tuple_of_accessors); + auto rhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression(rhs_functors, right_tuple_of_accessors); + typedef decltype(lhs_dev_expr.expr) LeftArgType; + typedef decltype(rhs_dev_expr.expr) RightArgType; + typedef typename internal::conditional(Eigen::internal::traits::Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional(Eigen::internal::traits::Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; typedef internal::TensorContractionInputMapper LhsMapper; typedef internal::TensorContractionInputMapper RhsMapper; // initialize data mappers must happen inside the kernel for device eval - LhsMapper lhs(device_evaluator.m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides); - RhsMapper rhs(device_evaluator.m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides); + LhsMapper lhs(LeftEvaluator(choose(Cond(Eigen::internal::traits::Layout) == static_cast(ColMajor)>(), + lhs_dev_expr.expr, rhs_dev_expr.expr), dev), m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides); + RhsMapper rhs(RightEvaluator(choose(Cond(Eigen::internal::traits::Layout) == static_cast(ColMajor)>(), + rhs_dev_expr.expr, lhs_dev_expr.expr),dev), m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides); auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res); // Matmul Kernel // Thread identifiers @@ -327,7 +310,6 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr firstHalf++; } while (firstHalf::Type PlaceHolderExpr; - // typedef KernelNameConstructor KernelName; - auto functors = Eigen::TensorSycl::internal::extractFunctors(self); - typedef 
decltype(functors) FunctorExpr; + typedef typename Eigen::internal::traits::_LhsNested LHSHostExpr; + typedef typename Eigen::internal::traits::_RhsNested RHSHostExpr; + typedef TensorEvaluator OrigLHSExpr; + typedef TensorEvaluator OrigRHSExpr; + typedef Eigen::TensorSycl::internal::FunctorExtractor LHSFunctorExpr; + typedef Eigen::TensorSycl::internal::FunctorExtractor RHSFunctorExpr; + // extract lhs functor list + LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl()); + // extract rhs functor list + RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl()); + Index roundUpK = RoundUp(K, TileSizeDimK); Index roundUpM = RoundUp(M, TileSizeDimM); Index roundUpN = RoundUp(N, TileSizeDimN); + self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, self); - typedef decltype(tuple_of_accessors) TupleType; + /// work-around for gcc bug + typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, self.left_impl())) LHSTupleType; + /// work-around for gcc bug + typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, self.right_impl())) RHSTupleType; + // create lhs tuple of accessors + LHSTupleType left_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, self.left_impl()); + // create rhs tuple of accessors + RHSTupleType right_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, self.right_impl()); + // Local memory for elements of Lhs typedef cl::sycl::accessor LhsLocalAcc; LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh); // Local memory for elements of Rhs typedef cl::sycl::accessor RhsLocalAcc; RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh); + + typedef cl::sycl::accessor OutAccessor; //OutScalar memory - auto out_res= self.device(). template get_sycl_accessor(cgh, buffer); - typedef decltype(out_res) OutAccessor; + OutAccessor out_res= self.device(). 
template get_sycl_accessor(cgh, buffer); + // sycl parallel for cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), - KernelConstructor(functors, + WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors, localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, - m_left_nocontract_strides,m_right_nocontract_strides, tuple_of_accessors)); + m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice())); }); self.device().asynchronousExec(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 860a6949a..b29968b63 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -246,6 +246,9 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + /// required by sycl in order to extract the sycl accessor + const TensorEvaluator& impl() const { return m_impl; } + protected: template struct PacketConv { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 930837021..822e22c2d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -26,8 +26,8 @@ namespace Eigen { /// Therefore, by adding the default value, we managed to convert the type and it does not break any /// existing code as its default value is T*. namespace internal { -template class MakePointer_> -struct traits > +template +struct traits > { // Type promotion to handle the case where the types of the lhs and the rhs are different. typedef typename XprType::Scalar Scalar; @@ -42,33 +42,26 @@ struct traits > enum { Flags = 0 }; - template struct MakePointer { - // Intermediate typedef to workaround MSVC issue. 
- typedef MakePointer_ MakePointerT; - typedef typename MakePointerT::Type Type; - typedef typename MakePointerT::RefType RefType; - - }; }; -template class MakePointer_> -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const TensorForcedEvalOp& type; + typedef const TensorForcedEvalOp& type; }; -template class MakePointer_> -struct nested, 1, typename eval >::type> +template +struct nested, 1, typename eval >::type> { - typedef TensorForcedEvalOp type; + typedef TensorForcedEvalOp type; }; } // end namespace internal -template class MakePointer_> -class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> +template +class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -90,10 +83,10 @@ class TensorForcedEvalOp : public TensorBase class MakePointer_> -struct TensorEvaluator, Device> +template +struct TensorEvaluator, Device> { - typedef TensorForcedEvalOp XprType; + typedef TensorForcedEvalOp XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator::Dimensions Dimensions; typedef typename XprType::Index Index; @@ -150,7 +143,7 @@ struct TensorEvaluator, Device> return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC typename MakePointer::Type data() const { return m_buffer; } + CoeffReturnType* data() const { return m_buffer; } /// required by sycl in order to extract the sycl accessor const TensorEvaluator& impl() { return m_impl; } @@ -160,7 +153,7 @@ struct TensorEvaluator, Device> TensorEvaluator m_impl; const ArgType m_op; const Device& m_device; - typename MakePointer::Type m_buffer; + CoeffReturnType* m_buffer; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 9a012c176..2e638992a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -75,7 +75,7 @@ template class TensorCustomUnaryOp; template class TensorCustomBinaryOp; template class MakePointer_ = MakePointer> class TensorEvalToOp; -template class MakePointer_ = MakePointer> class TensorForcedEvalOp; +template class TensorForcedEvalOp; template class TensorDevice; template struct TensorEvaluator; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index 485a082e2..ef1c9c42c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -205,6 +205,8 @@ class TensorIntDivisor { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { #ifdef __CUDA_ARCH__ return (__umulhi(magic, n) >> shift); +#elif defined(__SYCL_DEVICE_ONLY__) + return (cl::sycl::mul_hi(static_cast(magic), static_cast(n)) >> shift); #else uint64_t v = static_cast(magic) * static_cast(n); return (static_cast(v >> 32) >> shift); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index d582ccbe1..dbe11c7af 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -711,6 +711,12 @@ struct TensorEvaluator XprType; static const int NumDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type ScalarNonConst; + typedef 
typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Strides Dimensions; enum { // Alignment can't be guaranteed at compile time since it depends on the @@ -730,12 +736,22 @@ struct TensorEvaluator::value; ++i) { eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); if(m_strides[i]>0){ + #ifndef __SYCL_DEVICE_ONLY__ startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); + #else + startIndicesClamped[i] = cl::sycl::clamp(static_cast(op.startIndices()[i]), static_cast(0), static_cast(m_impl.dimensions()[i])); + stopIndicesClamped[i] = cl::sycl::clamp(static_cast(op.stopIndices()[i]), static_cast(0), static_cast(m_impl.dimensions()[i])); + #endif }else{ - /* implies m_strides[i]<0 by assert */ + /* implies m_strides[i]<0 by assert */ + #ifndef __SYCL_DEVICE_ONLY__ startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); + #else + startIndicesClamped[i] = cl::sycl::clamp(static_cast(op.startIndices()[i]), static_cast(-1), static_cast(m_impl.dimensions()[i] - 1)); + stopIndicesClamped[i] = cl::sycl::clamp(static_cast(op.stopIndices()[i]), static_cast(-1), static_cast(m_impl.dimensions()[i] - 1)); + #endif } m_startIndices[i] = startIndicesClamped[i]; } @@ -796,13 +812,6 @@ struct TensorEvaluator::type ScalarNonConst; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef Strides Dimensions; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index 319417687..82ca71215 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -74,7 +74,7 @@ struct FullReducer { static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) { typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - typedef decltype(TensorSycl::internal::extractFunctors(self.impl())) FunctorExpr; + typedef Eigen::TensorSycl::internal::FunctorExtractor > FunctorExpr; FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl()); int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread. size_t inputSize =self.impl().dimensions().TotalSize(); @@ -108,9 +108,10 @@ struct FullReducer { // Dims dims= self.xprDims(); //Op functor = reducer; dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { + // this is a work around for gcc bug + typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType; // create a tuple of accessors from Evaluator - auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - typedef decltype(tuple_of_accessors) TupleType; + TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); auto tmp_global_accessor = temp_global_buffer. 
template get_access(cgh); typedef decltype(tmp_global_accessor) OutAccessor; cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), @@ -136,7 +137,7 @@ struct InnerReducer { static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) { typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - typedef decltype(TensorSycl::internal::extractFunctors(self.impl())) FunctorExpr; + typedef Eigen::TensorSycl::internal::FunctorExtractor > FunctorExpr; FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl()); typename Self::Index range, GRange, tileSize; typedef typename Eigen::internal::remove_all::type Dims; @@ -147,9 +148,10 @@ struct InnerReducer { /// recursively apply reduction on it in order to reduce the whole. dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange); dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { + // this is work around for gcc bug. + typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc; // create a tuple of accessors from Evaluator - auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - typedef typename Eigen::internal::remove_all::type Tuple_of_Acc; + Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); auto output_accessor = dev.template get_sycl_accessor(cgh, output); cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 14e392e36..e430b0826 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -224,6 +224,11 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + /// required by sycl in order to extract the accessor + const TensorEvaluator & impl() const { return m_impl; } + /// added for sycl in order to construct the buffer from sycl device + ReverseDimensions functor() const { return m_reverse; } + protected: Dimensions m_dimensions; array m_strides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 6c35bfdb6..93615e5c2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -117,11 +117,15 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) + : m_impl(op.expression(), device), m_strides(op.strides()) { m_dimensions = m_impl.dimensions(); for (int i = 0; i < NumDims; ++i) { +#ifndef __SYCL_DEVICE_ONLY__ m_dimensions[i] = ceilf(static_cast(m_dimensions[i]) / op.strides()[i]); +#else + m_dimensions[i] = cl::sycl::ceil(static_cast(m_dimensions[i]) / op.strides()[i]); +#endif } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); @@ -224,6 +228,13 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& impl() const { return m_impl; } + /// required by sycl in order to extract the accessor + Strides functor() const { return m_strides; } + + + protected: 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { @@ -250,6 +261,7 @@ struct TensorEvaluator, Device> array m_outputStrides; array m_inputStrides; TensorEvaluator m_impl; + const Strides m_strides; }; @@ -286,6 +298,12 @@ struct TensorEvaluator, Device> return this->m_impl.coeffRef(this->srcCoeff(index)); } + /// required by sycl in order to extract the accessor + const TensorEvaluator& impl() const { return this->m_impl; } + /// required by sycl in order to extract the accessor + Strides functor() const { return this->m_strides; } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index 113dd2557..29f362ade 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -97,8 +97,18 @@ template \ struct ConvertToDeviceExpression > \ : DeviceConvertor{}; -KERNELBROKERCONVERT(const, true, TensorForcedEvalOp) -KERNELBROKERCONVERT(, false, TensorForcedEvalOp) +/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp +#define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\ +template \ +struct ConvertToDeviceExpression > {\ + typedef CVQual TensorForcedEvalOp< typename ConvertToDeviceExpression::Type> Type;\ +}; +KERNELBROKERCONVERTFORCEDEVAL(const) +KERNELBROKERCONVERTFORCEDEVAL() +#undef KERNELBROKERCONVERTFORCEDEVAL + + + KERNELBROKERCONVERT(const, true, TensorEvalToOp) KERNELBROKERCONVERT(, false, TensorEvalToOp) #undef KERNELBROKERCONVERT diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index df1a732e7..56ba82805 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -188,6 +188,28 @@ struct ExprConstructor, CVQual ASSIGN(const) ASSIGN() #undef ASSIGN + + + + + /// specialisation of the \ref ExprConstructor struct when the node type is + /// const TensorAssignOp + #define CONVERSIONEXPRCONST(CVQual)\ + template \ + struct ExprConstructor, CVQual TensorConversionOp, Params...> {\ + typedef ExprConstructor my_nested_type;\ + typedef CVQual TensorConversionOp Type;\ + my_nested_type nestedExpr;\ + Type expr;\ + template \ + ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ + : nestedExpr(funcD.subExpr, t), expr(nestedExpr.expr) {}\ + }; + + CONVERSIONEXPRCONST(const) + CONVERSIONEXPRCONST() + #undef CONVERSIONEXPRCONST + /// specialisation of the \ref ExprConstructor struct when the node type is /// TensorEvalToOp /// 0 here is the output number in the buffer #define EVALTO(CVQual)\ @@ -212,10 +234,10 @@ EVALTO() /// TensorForcedEvalOp #define FORCEDEVAL(CVQual)\ template \ -struct ExprConstructor,\ +struct ExprConstructor,\ CVQual PlaceHolder, N>, Params...> {\ - typedef CVQual TensorMap::Scalar,\ - TensorForcedEvalOp::NumDimensions, Eigen::internal::traits>::Layout, typename TensorForcedEvalOp::Index>, Eigen::internal::traits>::Layout, MakeGlobalPointer> Type;\ + typedef CVQual TensorMap::Scalar,\ + TensorForcedEvalOp::NumDimensions, Eigen::internal::traits>::Layout, typename TensorForcedEvalOp::Index>, Eigen::internal::traits>::Layout, MakeGlobalPointer> Type;\ Type expr;\ template \ 
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple &t)\ @@ -252,6 +274,30 @@ SYCLREDUCTIONEXPR() #undef SYCLREDUCTIONEXPR +/// specialisation of the \ref ExprConstructor struct when the node type is +/// TensorContractionOp +#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\ +template \ +struct ExprConstructor,\ +CVQual PlaceHolder, N>, Params...> {\ + static const size_t NumIndices= Eigen::internal::traits >::NumDimensions;\ + typedef CVQual TensorMap::Scalar,\ + NumIndices, Eigen::internal::traits >::Layout,\ + typename ExprNode::Index>,\ + Eigen::internal::traits>::Layout, MakeGlobalPointer> Type;\ + Type expr;\ + template \ + ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple &t)\ + :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get(t)), fd.dimensions())) {}\ +}; + +SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp) +SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp) +SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp) +SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp) +#undef SYCLCONTRACTIONCONVOLUTION + + #define SYCLSLICEOPEXPR(CVQual)\ template\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index 876fcd45e..e4658eda5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -194,6 +194,23 @@ SYCLREDUCTIONEXTACC(const) SYCLREDUCTIONEXTACC() #undef SYCLREDUCTIONEXTACC +/// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp +#define SYCLCONTRACTIONCONVOLUTIONEXTACC(CVQual, ExprNode)\ +template\ + struct ExtractAccessor, Dev> > {\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ + -> decltype(AccessorConstructor::template getAccessor(cgh, eval)){\ + return AccessorConstructor::template getAccessor(cgh, eval);\ + }\ +}; + +SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp) +SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp) +SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp) +SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp) +#undef SYCLCONTRACTIONCONVOLUTIONEXTACC + + /// specialisation of the \ref ExtractAccessor struct when the node type is /// const TensorSlicingOp. 
This is a special case where there is no OP #define SYCLSLICEOPEXTACC(CVQual)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index 6f9ab57af..e26cbdf6d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -42,6 +42,20 @@ template struct FunctorExtractor{ }; +/// specialisation of the \ref FunctorExtractor struct when the node type does not require anything +///TensorConversionOp +#define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\ +template \ +struct FunctorExtractor, Dev> > {\ + FunctorExtractor > subExpr;\ + FunctorExtractor(const TensorEvaluator, Dev>& expr)\ + : subExpr(expr.impl()) {}\ +}; + +SYCLEXTRFUNCCONVERSION(TensorConversionOp, const) +SYCLEXTRFUNCCONVERSION(TensorConversionOp, ) +#undef SYCLEXTRFUNCCONVERSION + #define SYCLEXTRTENSORMAPFIXEDSIZE(CVQual)\ template class MakePointer_, typename Dev>\ struct FunctorExtractor< TensorEvaluator , Options_, MakePointer_> , Dev> >{\ @@ -169,6 +183,24 @@ SYCLEXTRFUNCREDUCTIONOP(const) SYCLEXTRFUNCREDUCTIONOP() #undef SYCLEXTRFUNCREDUCTIONOP +#define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\ +template\ +struct FunctorExtractor, Device>>{\ + typedef TensorEvaluator, Device> Evaluator;\ + typedef typename Evaluator::Dimensions Dimensions;\ + const Dimensions m_dimensions;\ + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\ + FunctorExtractor(const TensorEvaluator, Device>& expr)\ + : m_dimensions(expr.dimensions()) {}\ +}; + + +SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp) +SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp) +SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp) +SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp) +#undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP + /// specialisation of the \ref FunctorExtractor struct when the node type is /// const TensorSlicingOp. This is an specialisation without OP so it has to be separated. 
#define SYCLEXTRFUNCTSLICEOP(CVQual)\ @@ -253,9 +285,6 @@ struct FunctorExtractor, : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.FUNCCALL) {}\ }; -// TensorContractionOp -SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(), const) -SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(),) // TensorConcatenationOp SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const) SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h index 37fe196ea..0ac51e7bf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h @@ -115,6 +115,21 @@ REDUCTIONLEAFCOUNT(const) REDUCTIONLEAFCOUNT() #undef REDUCTIONLEAFCOUNT +/// specialisation of the \ref LeafCount struct when the node type is const TensorContractionOp +#define CONTRACTIONCONVOLUTIONLEAFCOUNT(CVQual, ExprNode)\ +template \ +struct LeafCount > {\ + static const size_t Count =1;\ +}; + +CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorContractionOp) +CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorContractionOp) +CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp) +CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp) +#undef CONTRACTIONCONVOLUTIONLEAFCOUNT + + + /// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp #define SLICEOPLEAFCOUNT(CVQual)\ template \ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h index 4419a1780..f6e3b4766 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h @@ -168,6 +168,20 @@ SYCLREDUCTION() #undef SYCLREDUCTION +/// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorReductionOp +#define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\ +template \ +struct PlaceHolderExpression, N>{\ + typedef CVQual PlaceHolder, N> Type;\ +}; +SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorContractionOp) +SYCLCONTRACTIONCONVOLUTIONPLH(,TensorContractionOp) +SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorConvolutionOp) +SYCLCONTRACTIONCONVOLUTIONPLH(,TensorConvolutionOp) +#undef SYCLCONTRACTIONCONVOLUTIONPLH + + /// specialisation of the \ref PlaceHolderExpression when the node is /// TensorCwiseSelectOp #define SLICEOPEXPR(CVQual)\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h index 32930be26..6ce41b0ab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h @@ -49,19 +49,39 @@ template struct ExecEx /// based expression tree; /// creates the expression tree for the device with accessor to buffers; /// construct the kernel and submit it to the sycl queue. +/// std::array does not have TotalSize. So I have to get the size throgh template specialisation. +template struct DimensionSize{ + static Index getDimSize(const Dimensions& dim){ + return dim.TotalSize(); + + } +}; +#define DIMSIZEMACRO(CVQual)\ +template struct DimensionSize>{\ + static inline Index getDimSize(const std::array& dim){\ + return (NumDims == 0) ? 
1 : ::Eigen::internal::array_prod(dim);\ + }\ +}; + +DIMSIZEMACRO(const) +DIMSIZEMACRO() +#undef DIMSIZEMACRO + + template void run(Expr &expr, Dev &dev) { Eigen::TensorEvaluator evaluator(expr, dev); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { - typedef decltype(internal::extractFunctors(evaluator)) FunctorExpr; + typedef Eigen::TensorSycl::internal::FunctorExtractor > FunctorExpr; FunctorExpr functors = internal::extractFunctors(evaluator); dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { // create a tuple of accessors from Evaluator - typedef decltype(internal::createTupleOfAccessors(cgh, evaluator)) TupleType; - TupleType tuple_of_accessors = internal::createTupleOfAccessors(cgh, evaluator); + typedef decltype(internal::createTupleOfAccessors >(cgh, evaluator)) TupleType; + TupleType tuple_of_accessors = internal::createTupleOfAccessors >(cgh, evaluator); typename Expr::Index range, GRange, tileSize; - dev.parallel_for_setup(static_cast(evaluator.dimensions().TotalSize()), tileSize, range, GRange); + typename Expr::Index total_size = static_cast(DimensionSize::Dimensions>::getDimSize(evaluator.dimensions())); + dev.parallel_for_setup(total_size, tileSize, range, GRange); cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), ExecExprFunctorKernel(range diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index daedb671c..cbbd3efb4 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -152,6 +152,8 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_builtins_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_contract_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_concatenation_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_reverse_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. 
diff --git a/unsupported/test/cxx11_tensor_contract_sycl.cpp b/unsupported/test/cxx11_tensor_contract_sycl.cpp index 0221da110..5dacc87f2 100644 --- a/unsupported/test/cxx11_tensor_contract_sycl.cpp +++ b/unsupported/test/cxx11_tensor_contract_sycl.cpp @@ -65,10 +65,9 @@ void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, in sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); - t_result = t_left.contract(t_right, dims); - sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); + t_result = t_left.contract(t_right, dims); for (DenseIndex i = 0; i < t_result.size(); i++) { if (static_cast(fabs(t_result(i) - t_result_gpu(i))) < 1e-4f) { @@ -86,6 +85,69 @@ void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, in sycl_device.deallocate(d_t_result); } +template +void test_TF(const Device& sycl_device) +{ + Eigen::array left_dims = {{2, 3}}; + Eigen::array right_dims = {{3, 1}}; + Eigen::array res_dims = {{2, 1}}; + Eigen::array dims = {{DimPair(1, 0)}}; + + + Tensor t_left(left_dims); + Tensor t_right(right_dims); + Tensor t_result_gpu(res_dims); + Tensor t_result(res_dims); + + t_left.data()[0] = 1.0f; + t_left.data()[1] = 2.0f; + t_left.data()[2] = 3.0f; + t_left.data()[3] = 4.0f; + t_left.data()[4] = 5.0f; + t_left.data()[5] = 6.0f; + + t_right.data()[0] = -1.0f; + t_right.data()[1] = 0.5f; + t_right.data()[2] = 2.0f; + + std::size_t t_left_bytes = t_left.size() * sizeof(float); + std::size_t t_right_bytes = t_right.size() * sizeof(float); + std::size_t t_result_bytes = t_result.size()*sizeof(float); + + + float * d_t_left = static_cast(sycl_device.allocate(t_left_bytes)); + float * d_t_right = static_cast(sycl_device.allocate(t_right_bytes)); + float * d_t_result = static_cast(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap > gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap > gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap > gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (DenseIndex i = 0; i < t_result.size(); i++) { + if (static_cast(fabs(t_result(i) - t_result_gpu(i))) < 1e-4f) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) { + continue; + } + std::cout << "mismatch detected at index " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + assert(false); + } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); + + +} template void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size) @@ -121,9 +183,10 @@ void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size) sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); + t_result = t_left.contract(t_right, dims); - sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); if (static_cast(fabs(t_result() - t_result_gpu())) > 1e-4f && 
!Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) { std::cout << "mismatch detected: " << t_result() @@ -204,6 +267,9 @@ template void tensorContractionPerDevice(Dev_selector& s test_sycl_contraction_k(sycl_device); test_sycl_contraction_sizes(sycl_device); test_sycl_contraction_sizes(sycl_device); + test_TF(sycl_device); + test_TF(sycl_device); + end = std::chrono::system_clock::now(); std::chrono::duration elapsed_seconds = end-start; std::time_t end_time = std::chrono::system_clock::to_time_t(end); @@ -211,6 +277,7 @@ template void tensorContractionPerDevice(Dev_selector& s << "elapsed time: " << elapsed_seconds.count() << "s\n"; } + void test_cxx11_tensor_contract_sycl() { for (const auto& device :Eigen::get_sycl_supported_devices()) { CALL_SUBTEST(tensorContractionPerDevice(device)); diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp new file mode 100644 index 000000000..73b394c18 --- /dev/null +++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp @@ -0,0 +1,221 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_reverse_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_SYCL + +#include "main.h" +#include + + +template +static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { + + int dim1 = 2; + int dim2 = 3; + int dim3 = 5; + int dim4 = 7; + + array tensorRange = {{dim1, dim2, dim3, dim4}}; + Tensor tensor(tensorRange); + Tensor reversed_tensor(tensorRange); + tensor.setRandom(); + + array dim_rev; + dim_rev[0] = false; + dim_rev[1] = true; + dim_rev[2] = true; + dim_rev[3] = false; + + DataType* gpu_in_data = static_cast(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType))); + DataType* gpu_out_data =static_cast(sycl_device.allocate(reversed_tensor.dimensions().TotalSize()*sizeof(DataType))); + + TensorMap > in_gpu(gpu_in_data, tensorRange); + TensorMap > out_gpu(gpu_out_data, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); + sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. 
+ for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l)); + } + } + } + } + dim_rev[0] = true; + dim_rev[1] = false; + dim_rev[2] = false; + dim_rev[3] = false; + + out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); + sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l)); + } + } + } + } + + dim_rev[0] = true; + dim_rev[1] = false; + dim_rev[2] = false; + dim_rev[3] = true; + out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); + sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l)); + } + } + } + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + + + +template +static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue) +{ + int dim1 = 2; + int dim2 = 3; + int dim3 = 5; + int dim4 = 7; + + array tensorRange = {{dim1, dim2, dim3, dim4}}; + Tensor tensor(tensorRange); + Tensor expected(tensorRange); + Tensor result(tensorRange); + tensor.setRandom(); + + array dim_rev; + dim_rev[0] = false; + dim_rev[1] = true; + dim_rev[2] = false; + dim_rev[3] = true; + + DataType* gpu_in_data = static_cast(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType))); + DataType* gpu_out_data_expected =static_cast(sycl_device.allocate(expected.dimensions().TotalSize()*sizeof(DataType))); + DataType* gpu_out_data_result =static_cast(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType))); + + TensorMap > in_gpu(gpu_in_data, tensorRange); + TensorMap > out_gpu_expected(gpu_out_data_expected, tensorRange); + TensorMap > out_gpu_result(gpu_out_data_result, tensorRange); + + + sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType)); + + if (LValue) { + out_gpu_expected.reverse(dim_rev).device(sycl_device) = in_gpu; + } else { + out_gpu_expected.device(sycl_device) = in_gpu.reverse(dim_rev); + } + sycl_device.memcpyDeviceToHost(expected.data(), gpu_out_data_expected, expected.dimensions().TotalSize()*sizeof(DataType)); + + + array src_slice_dim; + src_slice_dim[0] = 2; + src_slice_dim[1] = 3; + src_slice_dim[2] = 1; + src_slice_dim[3] = 7; + array src_slice_start; + src_slice_start[0] = 0; + src_slice_start[1] = 0; + src_slice_start[2] = 0; + src_slice_start[3] = 0; + array dst_slice_dim = src_slice_dim; + array dst_slice_start = src_slice_start; + + for (int i = 0; i < 5; ++i) { + if (LValue) { + out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) = + in_gpu.slice(src_slice_start, src_slice_dim); + } else { + out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) = + in_gpu.slice(src_slice_start, src_slice_dim).reverse(dim_rev); + } + src_slice_start[2] += 1; + dst_slice_start[2] += 1; + } + sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType)); + + for (int i = 0; i < 
expected.dimension(0); ++i) { + for (int j = 0; j < expected.dimension(1); ++j) { + for (int k = 0; k < expected.dimension(2); ++k) { + for (int l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } + + dst_slice_start[2] = 0; + result.setRandom(); + sycl_device.memcpyHostToDevice(gpu_out_data_result, result.data(),(result.dimensions().TotalSize())*sizeof(DataType)); + for (int i = 0; i < 5; ++i) { + if (LValue) { + out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) = + in_gpu.slice(dst_slice_start, dst_slice_dim); + } else { + out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) = + in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim); + } + dst_slice_start[2] += 1; + } + sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType)); + + for (int i = 0; i < expected.dimension(0); ++i) { + for (int j = 0; j < expected.dimension(1); ++j) { + for (int k = 0; k < expected.dimension(2); ++k) { + for (int l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + } + } + } + } +} + + + +template void sycl_reverse_test_per_device(const cl::sycl::device& d){ + std::cout << "Running on " << d.template get_info() << std::endl; + QueueInterface queueInterface(d); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_reverse(sycl_device); + test_simple_reverse(sycl_device); + test_expr_reverse(sycl_device, false); + test_expr_reverse(sycl_device, false); + test_expr_reverse(sycl_device, true); + test_expr_reverse(sycl_device, true); +} +void test_cxx11_tensor_reverse_sycl() { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_reverse_test_per_device(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_striding_sycl.cpp b/unsupported/test/cxx11_tensor_striding_sycl.cpp new file mode 100644 index 000000000..2cbb18f1c --- /dev/null +++ b/unsupported/test/cxx11_tensor_striding_sycl.cpp @@ -0,0 +1,203 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_striding_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_SYCL + +#include +#include +#include + +#include "main.h" +#include + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + + +template +static void test_simple_striding(const Eigen::SyclDevice& sycl_device) +{ + + Eigen::array tensor_dims = {{2,3,5,7}}; + Eigen::array stride_dims = {{1,1,3,3}}; + + + Tensor tensor(tensor_dims); + Tensor no_stride(tensor_dims); + Tensor stride(stride_dims); + + + std::size_t tensor_bytes = tensor.size() * sizeof(DataType); + std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType); + std::size_t stride_bytes = stride.size() * sizeof(DataType); + DataType * d_tensor = static_cast(sycl_device.allocate(tensor_bytes)); + DataType * d_no_stride = static_cast(sycl_device.allocate(no_stride_bytes)); + DataType * d_stride = static_cast(sycl_device.allocate(stride_bytes)); + + Eigen::TensorMap > gpu_tensor(d_tensor, tensor_dims); + Eigen::TensorMap > gpu_no_stride(d_no_stride, tensor_dims); + Eigen::TensorMap > gpu_stride(d_stride, stride_dims); + + + tensor.setRandom(); + array strides; + strides[0] = 1; + strides[1] = 1; + strides[2] = 1; + strides[3] = 1; + sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes); + gpu_no_stride.device(sycl_device)=gpu_tensor.stride(strides); + sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes); + + //no_stride = tensor.stride(strides); + + VERIFY_IS_EQUAL(no_stride.dimension(0), 2); + VERIFY_IS_EQUAL(no_stride.dimension(1), 3); + VERIFY_IS_EQUAL(no_stride.dimension(2), 5); + VERIFY_IS_EQUAL(no_stride.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l)); + } + } + } + } + + strides[0] = 2; + strides[1] = 4; + strides[2] = 2; + strides[3] = 3; +//Tensor stride; +// stride = tensor.stride(strides); + + gpu_stride.device(sycl_device)=gpu_tensor.stride(strides); + sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes); + + VERIFY_IS_EQUAL(stride.dimension(0), 1); + VERIFY_IS_EQUAL(stride.dimension(1), 1); + VERIFY_IS_EQUAL(stride.dimension(2), 3); + VERIFY_IS_EQUAL(stride.dimension(3), 3); + + for (int i = 0; i < 1; ++i) { + for (int j = 0; j < 1; ++j) { + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 3; ++l) { + VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l)); + } + } + } + } + + sycl_device.deallocate(d_tensor); + sycl_device.deallocate(d_no_stride); + sycl_device.deallocate(d_stride); +} + +template +static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device) +{ + + Eigen::array tensor_dims = {{2,3,5,7}}; + Eigen::array stride_dims = {{3,12,10,21}}; + + + Tensor tensor(tensor_dims); + Tensor no_stride(stride_dims); + Tensor stride(stride_dims); + + + std::size_t tensor_bytes = tensor.size() * sizeof(DataType); + std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType); + std::size_t stride_bytes = stride.size() * sizeof(DataType); + + DataType * d_tensor = static_cast(sycl_device.allocate(tensor_bytes)); + DataType * d_no_stride = static_cast(sycl_device.allocate(no_stride_bytes)); + DataType * d_stride = static_cast(sycl_device.allocate(stride_bytes)); + + Eigen::TensorMap > gpu_tensor(d_tensor, tensor_dims); + Eigen::TensorMap > gpu_no_stride(d_no_stride, 
stride_dims); + Eigen::TensorMap > gpu_stride(d_stride, stride_dims); + + //Tensor tensor(2,3,5,7); + tensor.setRandom(); + array strides; + strides[0] = 2; + strides[1] = 4; + strides[2] = 2; + strides[3] = 3; + +// Tensor result(3, 12, 10, 21); +// result.stride(strides) = tensor; + sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes); + gpu_stride.stride(strides).device(sycl_device)=gpu_tensor; + sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), stride(2*i,4*j,2*k,3*l)); + } + } + } + } + + array no_strides; + no_strides[0] = 1; + no_strides[1] = 1; + no_strides[2] = 1; + no_strides[3] = 1; +// Tensor result2(3, 12, 10, 21); +// result2.stride(strides) = tensor.stride(no_strides); + + gpu_no_stride.stride(strides).device(sycl_device)=gpu_tensor.stride(no_strides); + sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(2*i,4*j,2*k,3*l)); + } + } + } + } + sycl_device.deallocate(d_tensor); + sycl_device.deallocate(d_no_stride); + sycl_device.deallocate(d_stride); +} + + +template void tensorStridingPerDevice(Dev_selector& s){ + QueueInterface queueInterface(s); + auto sycl_device=Eigen::SyclDevice(&queueInterface); + test_simple_striding(sycl_device); + test_simple_striding(sycl_device); + test_striding_as_lvalue(sycl_device); + test_striding_as_lvalue(sycl_device); +} + +void test_cxx11_tensor_striding_sycl() { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(tensorStridingPerDevice(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp index d5c0cbaad..5992a306d 100644 --- a/unsupported/test/cxx11_tensor_sycl.cpp +++ b/unsupported/test/cxx11_tensor_sycl.cpp @@ -229,6 +229,36 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { sycl_device.deallocate(gpu_in3_data); sycl_device.deallocate(gpu_out_data); } +template +static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){ + int size = 20; + array tensorRange = {{size}}; + Tensor in(tensorRange); + Tensor out(tensorRange); + Tensor out_host(tensorRange); + + in = in.random(); + + Scalar1* gpu_in_data = static_cast(sycl_device.allocate(in.size()*sizeof(Scalar1))); + Scalar2 * gpu_out_data = static_cast(sycl_device.allocate(out.size()*sizeof(Scalar2))); + + + + + TensorMap> gpu_in(gpu_in_data, tensorRange); + TensorMap> gpu_out(gpu_out_data, tensorRange); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.size())*sizeof(Scalar1)); + gpu_out.device(sycl_device) = gpu_in. template cast(); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size()*sizeof(Scalar2)); + out_host = in. 
template cast(); + for(int i=0; i< size; i++) + { + VERIFY_IS_APPROX(out(i), out_host(i)); + } + printf("cast Test Passed\n"); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} template void sycl_computing_test_per_device(dev_Selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); @@ -238,6 +268,8 @@ template void sycl_computing_test_per_ test_sycl_mem_transfers(sycl_device); test_sycl_computations(sycl_device); test_sycl_mem_sync(sycl_device); + test_sycl_cast(sycl_device); + test_sycl_cast(sycl_device); } void test_cxx11_tensor_sycl() { From c6f7b338343ead9617558857c91fd3e03e347c3f Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 18 Jan 2017 10:45:28 +0000 Subject: [PATCH 07/54] Applying Benoit's comment. Embedding synchronisation inside device memcpy so there is no need to externally call synchronise() for device memcopy. --- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 6 +++--- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 6 +++--- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 14 ++++---------- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 14 +++----------- 4 files changed, 13 insertions(+), 27 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 268d9d148..8f8d1caad 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -234,7 +234,7 @@ struct SyclDevice { auto dst_acc =it2->second.template get_access(cgh); cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, i, offset)); }); - asynchronousExec(); + synchronize(); } /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device @@ -265,7 +265,7 @@ struct SyclDevice { auto dst_acc =dest_buf.template get_access(cgh); cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, 0, offset)); }); - asynchronousExec(); + synchronize(); } /// returning the sycl queue EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;} @@ -308,7 +308,7 @@ struct SyclDevice { } EIGEN_STRONG_INLINE void asynchronousExec() const { - ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. + ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. //sycl_queue().throw_asynchronous();// does not pass. 
Temporarily disabled sycl_queue().wait_and_throw(); //pass diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 822e22c2d..abe85c860 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -143,12 +143,12 @@ struct TensorEvaluator, Device> return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - CoeffReturnType* data() const { return m_buffer; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buffer; } /// required by sycl in order to extract the sycl accessor - const TensorEvaluator& impl() { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() { return m_impl; } /// used by sycl in order to build the sycl buffer - const Device& device() const{return m_device;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} private: TensorEvaluator m_impl; const ArgType m_op; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index dbe11c7af..6ddd2ca18 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -736,22 +736,12 @@ struct TensorEvaluator::value; ++i) { eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); if(m_strides[i]>0){ - #ifndef __SYCL_DEVICE_ONLY__ startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); - #else - startIndicesClamped[i] = cl::sycl::clamp(static_cast(op.startIndices()[i]), static_cast(0), static_cast(m_impl.dimensions()[i])); - stopIndicesClamped[i] = cl::sycl::clamp(static_cast(op.stopIndices()[i]), static_cast(0), static_cast(m_impl.dimensions()[i])); - #endif }else{ /* implies m_strides[i]<0 by assert */ - #ifndef __SYCL_DEVICE_ONLY__ startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); - #else - startIndicesClamped[i] = cl::sycl::clamp(static_cast(op.startIndices()[i]), static_cast(-1), static_cast(m_impl.dimensions()[i] - 1)); - stopIndicesClamped[i] = cl::sycl::clamp(static_cast(op.stopIndices()[i]), static_cast(-1), static_cast(m_impl.dimensions()[i] - 1)); - #endif } m_startIndices[i] = startIndicesClamped[i]; } @@ -867,7 +857,11 @@ struct TensorEvaluator m_outputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 93615e5c2..2237140e7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -121,11 +121,7 @@ struct TensorEvaluator, Device> { m_dimensions = m_impl.dimensions(); for (int i = 0; i < NumDims; ++i) { -#ifndef __SYCL_DEVICE_ONLY__ - m_dimensions[i] = ceilf(static_cast(m_dimensions[i]) / op.strides()[i]); -#else - m_dimensions[i] = cl::sycl::ceil(static_cast(m_dimensions[i]) / op.strides()[i]); -#endif + m_dimensions[i] =Eigen::numext::ceil(static_cast(m_dimensions[i]) / op.strides()[i]); } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); @@ -233,8 +229,6 @@ struct TensorEvaluator, Device> /// required by sycl in order to extract the accessor Strides functor() const { return m_strides; } - - protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index 
srcCoeff(Index index) const { @@ -264,7 +258,6 @@ struct TensorEvaluator, Device> const Strides m_strides; }; - // Eval as lvalue template struct TensorEvaluator, Device> @@ -299,10 +292,9 @@ struct TensorEvaluator, Device> } /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return this->m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return this->m_impl; } /// required by sycl in order to extract the accessor - Strides functor() const { return this->m_strides; } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Strides functor() const { return this->m_strides; } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) From 6bdd15f572c0b8cd21f5acba3671d536f50a9b53 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Thu, 19 Jan 2017 11:30:59 +0000 Subject: [PATCH 08/54] Adding non-deferrenciable pointer track for ComputeCpp backend; Adding TensorConvolutionOp for ComputeCpp; fixing typos. modifying TensorDeviceSycl to use the LegacyPointer class. --- .../CXX11/src/Tensor/TensorConvolution.h | 2 +- .../CXX11/src/Tensor/TensorConvolutionSycl.h | 476 ++++++++++++++++++ .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 212 +++++--- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 6 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 14 +- .../CXX11/src/Tensor/TensorReductionSycl.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 13 +- .../Eigen/CXX11/src/Tensor/TensorSycl.h | 3 + .../TensorSyclConvertToDeviceExpression.h | 2 +- .../src/Tensor/TensorSyclExtractAccessor.h | 75 +-- .../src/Tensor/TensorSyclLegacyPointer.h | 244 +++++++++ .../Eigen/CXX11/src/Tensor/TensorSyclRun.h | 2 +- unsupported/test/CMakeLists.txt | 1 + .../test/cxx11_tensor_contract_sycl.cpp | 13 +- .../test/cxx11_tensor_convolution_sycl.cpp | 469 +++++++++++++++++ unsupported/test/cxx11_tensor_sycl.cpp | 3 - 16 files changed, 1390 insertions(+), 149 deletions(-) create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h create mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclLegacyPointer.h create mode 100644 unsupported/test/cxx11_tensor_convolution_sycl.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index abdf742c6..378f5cccb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -100,7 +100,7 @@ class IndexMapper { } } else { for (int i = NumDims - 1; i >= 0; --i) { - if (i + 1 < offset) { + if (static_cast(i + 1) < offset) { m_cudaInputStrides[i] = m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1]; m_cudaOutputStrides[i] = diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h new file mode 100644 index 000000000..7774342d8 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -0,0 +1,476 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Copyright (C) 2016 Benoit Steiner + +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
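As a side note on the Eigen::numext::ceil change above: TensorStriding computes each output extent as ceil(input_extent / stride), which is why the striding test expects dimensions {1,1,3,3} for input {2,3,5,7} with strides {2,4,2,3}. A small host-only sketch of that arithmetic (plain C++, no Eigen types):

#include <cmath>
#include <cstdio>

// out_dim[i] = ceil(in_dim[i] / stride[i]); with the striding test's input
// {2,3,5,7} and strides {2,4,2,3} this prints 1, 1, 3, 3.
int main() {
  const int in_dims[4] = {2, 3, 5, 7};
  const int strides[4] = {2, 4, 2, 3};
  for (int i = 0; i < 4; ++i) {
    const int out = static_cast<int>(std::ceil(static_cast<float>(in_dims[i]) / strides[i]));
    std::printf("dim %d -> %d\n", i, out);
  }
  return 0;
}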
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. + * + * + */ +template +struct EigenConvolutionKernel1D{ +typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; +internal::IndexMapper::Layout> indexMapper; +Kernel_accessor kernel_filter; +const size_t kernelSize, range_x, range_y; +Buffer_accessor buffer_acc; +Local_accessor local_acc; +FunctorExpr functors; +TupleType tuple_of_accessors; +EigenConvolutionKernel1D(internal::IndexMapper::Layout> indexMapper_, + Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_, + Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_), + buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + + void operator()(cl::sycl::nd_item<2> itemID) { + typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; + auto device_expr =TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); + auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::DefaultDevice()); + + auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); + auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); + + const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize -1); //the required row to be calculated for the for each plane in shered memory + const size_t plane_kernel_offset = itemID.get_local(1) * num_x_input; + const size_t first_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; + const size_t plane_tensor_offset =indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(1)); + /// fill the shared memory + for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) { + const size_t local_index = i + plane_kernel_offset ; + const size_t tensor_index = plane_tensor_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_input_start); + if(((i + first_input_start) < (range_x +kernelSize-1)) && itemID.get_global(1)< range_y){ + local_acc[local_index] = device_evaluator.coeff(tensor_index); + } + else local_acc[local_index]=0.0f; + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // calculate the convolution + const int first_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x + if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y){ + CoeffReturnType result = static_cast(0); + const size_t index = plane_kernel_offset+ itemID.get_local(0); + for (size_t k = 0; k < kernelSize; ++k) { + result += (local_acc[k + index] * kernel_ptr[k]); + } + const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1)) + +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start); + buffer_ptr[tensor_index] = result; + } + } +}; + + +template +struct EigenConvolutionKernel2D{ +typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; +internal::IndexMapper::Layout> indexMapper; +Kernel_accessor kernel_filter; +const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z; 
+Buffer_accessor buffer_acc; +Local_accessor local_acc; +FunctorExpr functors; +TupleType tuple_of_accessors; +EigenConvolutionKernel2D(internal::IndexMapper::Layout> indexMapper_, + Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_, + Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_), + buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + + void operator()(cl::sycl::nd_item<3> itemID) { + typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; + auto device_expr =TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); + auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::DefaultDevice()); + + auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); + auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); + const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); //the required row to be calculated for the for each plane in shered memory + const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); //the required row to be calculated for the for each plane in shered memory + const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(2)); + const size_t plane_kernel_offset = itemID.get_local(2) * num_y_input; + + /// fill the shared memory + const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; + const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1]; + for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) { + const size_t local_input_offset = num_x_input * (j + plane_kernel_offset); + for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) { + const size_t local_index = i + local_input_offset; + const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start ); + if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) &&((j + first_y_input_start) < (range_y +kernelSize_y-1)) && itemID.get_global(2)< range_z){ + local_acc[local_index] = device_evaluator.coeff(tensor_index); + } + else local_acc[local_index]=0.0f; + } + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // calculate the convolution + const size_t fitst_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x + const size_t fitst_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // output start y + if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){ + CoeffReturnType result = static_cast(0); + for (size_t j = 0; j < kernelSize_y; j++) { + size_t kernel_offset =kernelSize_x * j; + const size_t index = (num_x_input*(plane_kernel_offset + j+ itemID.get_local(1))) + itemID.get_local(0); + for (size_t i = 0; i < kernelSize_x; i++) { + result += (local_acc[i + index] * kernel_ptr[i+kernel_offset]); + } + } + const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2)) + 
+indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start); + buffer_ptr[tensor_index] = result; + } + } +}; + + + +template +struct EigenConvolutionKernel3D{ +typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; +internal::IndexMapper::Layout> indexMapper; +Kernel_accessor kernel_filter; +const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP; +Buffer_accessor buffer_acc; +Local_accessor local_acc; +FunctorExpr functors; +TupleType tuple_of_accessors; +EigenConvolutionKernel3D(internal::IndexMapper::Layout> indexMapper_, + Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ , + const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_, + Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) + :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), + kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_), + buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} + + void operator()(cl::sycl::nd_item<3> itemID) { + typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; + auto device_expr =TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); + auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::DefaultDevice()); + + auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); + auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); + const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); //the required row to be calculated for the for each plane in shered memory + const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); //the required row to be calculated for the for each plane in shered memory + const size_t num_z_input = (itemID.get_local_range()[2] +kernelSize_z -1); //the required row to be calculated for the for each plane in shered memory + const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; + const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1]; + const size_t first_z_input_start = itemID.get_group(2)*itemID.get_local_range()[2]; + for(size_t p=0; p(0); + for (size_t k = 0; k < kernelSize_z; k++) { + for (size_t j = 0; j < kernelSize_y; j++) { + for (size_t i = 0; i < kernelSize_x; i++) { + const size_t kernel_index =i + kernelSize_x * (j + kernelSize_y * k); + const size_t local_index = ((i+ itemID.get_local(0))+ num_x_input*((j+ itemID.get_local(1)) + num_y_input * (k+ itemID.get_local(2)))); + result += (local_acc[local_index] * kernel_ptr[kernel_index]); + } + } + } + const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p) + +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start, itemID.get_local(2) + fitst_z_output_start ); + buffer_ptr[tensor_index] = result; + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + } +}; + + +template +struct TensorEvaluator, const Eigen::SyclDevice> +{ + typedef TensorConvolutionOp XprType; + + static const int NumDims = internal::array_size::Dimensions>::value; + static 
const int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename TensorEvaluator::Dimensions KernelDimensions; + typedef const Eigen::SyclDevice Device; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Eigen::SyclDevice& device) + : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static const int PacketSize = internal::unpacket_traits::size; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + executeEval(m_buf); + return true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + /// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} + /// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buf; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. 
it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + m_kernel = local; + m_local_kernel = true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(Scalar* data) const { + typedef TensorEvaluator InputEvaluator; + typedef typename InputEvaluator::Dimensions InputDims; + + typedef Eigen::TensorSycl::internal::FunctorExtractor InputFunctorExpr; + // extract input functor list + InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl); + + const unsigned long maxSharedMem = m_device.sharedMemPerBlock(); // sycl localmemory size + m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) { + + typedef cl::sycl::accessor InputLocalAcc; + /// work-around for gcc 4.8 auto bug + typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, m_inputImpl)) InputTupleType; + // create input tuple of accessors + InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, m_inputImpl); + + typedef cl::sycl::accessor OutputAccessorType; + OutputAccessorType out_res= m_device. template get_sycl_accessor(cgh, data); + typedef cl::sycl::accessor KernelAccessorType; + KernelAccessorType kernel_acc= m_device. template get_sycl_accessor(cgh, m_kernel); + + switch (NumKernelDims) { + case 1: { + const size_t numX = dimensions()[m_indices[0]]; + const size_t numP = dimensions().TotalSize() / numX; + const size_t kernel_size = m_kernelImpl.dimensions().TotalSize(); + size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y; + m_device.parallel_for_setup(numX, numP, tileSize_x,tileSize_y,range_x,range_y, GRange_x, GRange_y ); + const size_t shared_mem =(tileSize_x +kernel_size -1)*(tileSize_y); + assert(static_cast(shared_mem) <= maxSharedMem); + auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range + auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range + InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); + const array indices{m_indices[0]}; + const array kernel_dims{{m_kernelImpl.dimensions()[0]}}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range), + EigenConvolutionKernel1D( + indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + break; + } + + case 2: { + const size_t idxX =static_cast(Layout) == static_cast(ColMajor) ? 0 : 1; + const size_t idxY =static_cast(Layout) == static_cast(ColMajor) ? 
1 : 0; + const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY]; + const size_t numX = dimensions()[m_indices[idxX]]; + const size_t numY = dimensions()[m_indices[idxY]]; + const size_t numP = dimensions().TotalSize() / (numX*numY); + size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z; + m_device.parallel_for_setup(numX, numY, numP, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z ); + const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * tileSize_z; + assert(static_cast(shared_mem) <= maxSharedMem); + auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range + auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range + InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); + const array indices {{m_indices[idxX], m_indices[idxY]}}; + const array kernel_dims{{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]}}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range), + EigenConvolutionKernel2D( + indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + break; + } + + case 3: { + const size_t idxX =static_cast(Layout) == static_cast(ColMajor) ? 0 : 2; + const size_t idxY =static_cast(Layout) == static_cast(ColMajor) ? 1 : 1; + const size_t idxZ =static_cast(Layout) == static_cast(ColMajor) ? 2 : 0; + const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY]; + const size_t kernel_size_z = m_kernelImpl.dimensions()[idxZ]; + const size_t numX = dimensions()[m_indices[idxX]]; + const size_t numY = dimensions()[m_indices[idxY]]; + const size_t numZ = dimensions()[m_indices[idxZ]]; + const size_t numP = dimensions().TotalSize() / (numX*numY*numZ); + const array indices{{m_indices[idxX], m_indices[idxY], m_indices[idxZ]}}; + const array kernel_dims{{m_kernelImpl.dimensions()[idxX],m_kernelImpl.dimensions()[idxY], m_kernelImpl.dimensions()[idxZ]}}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z; + m_device.parallel_for_setup(numX, numY, numZ, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z ); + const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * (tileSize_z +kernel_size_y -1); + assert(static_cast(shared_mem) <= maxSharedMem); + auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range + auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range + InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); + cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range), + EigenConvolutionKernel3D( + indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY, + numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors)); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + }); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(m_buf); 
+ eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt(m_buf+index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = + TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + + private: + // No assignment (copies are needed by the kernels) + TensorEvaluator& operator = (const TensorEvaluator&); + TensorEvaluator m_inputImpl; + KernelArgType m_kernelArg; + TensorEvaluator m_kernelImpl; + Indices m_indices; + Dimensions m_dimensions; + Scalar* m_buf; + const Scalar* m_kernel; + bool m_local_kernel; + const Eigen::SyclDevice& m_device; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 268d9d148..ae8a9f667 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -15,16 +15,16 @@ #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H +#include "TensorSyclLegacyPointer.h" + namespace Eigen { #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast::pointer_t>((&(*buf_acc.get_pointer()))) - template class MemCopyFunctor { + template class MemCopyFunctor { public: - typedef cl::sycl::accessor read_accessor; - typedef cl::sycl::accessor write_accessor; - - MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset): m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {} + MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset) + : m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {} void operator()(cl::sycl::nd_item<1> itemID) { auto src_ptr = ConvertToActualTypeSycl(Scalar, m_src_acc); @@ -55,6 +55,7 @@ namespace Eigen { }; + EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){ auto devices = cl::sycl::device::get_devices(); std::vector::iterator it =devices.begin(); @@ -77,11 +78,10 @@ struct QueueInterface { bool exception_caught_ = false; mutable std::mutex mutex_; - /// std::map is the container used to make sure that we create only one buffer /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. 
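The buffer_map member removed just below is what PATCH 08 replaces with the ComputeCpp legacy pointer mapper: allocate() now returns an opaque, non-dereferenceable handle that only serves as a key for recovering the backing cl::sycl::buffer and a byte offset. The idea, in a rough host-only sketch (the SimplePointerMapper class and its methods are hypothetical, not the codeplay::legacy API):

#include <cstddef>
#include <cstdint>
#include <map>
#include <utility>

// Hypothetical stand-in for pointer-to-buffer tracking: hand out fake
// addresses, remember their sizes, and map any interior pointer back to
// its base allocation plus offset (as get_buffer/get_buffer_offset do below).
class SimplePointerMapper {
  std::map<std::uintptr_t, std::size_t> allocations_;  // fake base address -> size in bytes
  std::uintptr_t next_ = 0x1000;                       // never dereferenced
 public:
  void* add(std::size_t bytes) {
    void* key = reinterpret_cast<void*>(next_);
    allocations_[next_] = bytes;
    next_ += bytes;
    return key;  // opaque handle standing in for a device allocation
  }
  // Assumes p lies inside a previous allocation; no error handling.
  std::pair<void*, std::size_t> find(const void* p) const {
    const std::uintptr_t v = reinterpret_cast<std::uintptr_t>(p);
    auto it = allocations_.upper_bound(v);
    --it;  // last allocation whose base is <= v
    return std::make_pair(reinterpret_cast<void*>(it->first), v - it->first);
  }
};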
- mutable std::map> buffer_map; + //mutable std::map> buffer_map; /// sycl queue mutable cl::sycl::queue m_queue; /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename @@ -119,49 +119,42 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer. /// The device pointer would be deleted by calling deallocate function. EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - auto buf = cl::sycl::buffer(cl::sycl::range<1>(num_bytes)); - auto ptr =buf.get_access().get_pointer(); - buf.set_final_data(nullptr); std::lock_guard lock(mutex_); - buffer_map.insert(std::pair>(static_cast(ptr),buf)); - return static_cast(ptr); + return codeplay::legacy::malloc(num_bytes); } /// This is used to deallocate the device pointer. p is used as a key inside /// the map to find the device buffer and delete it. EIGEN_STRONG_INLINE void deallocate(void *p) const { std::lock_guard lock(mutex_); - auto it = buffer_map.find(static_cast(p)); - if (it != buffer_map.end()) { - auto num_bytes =it->second.get_size(); - buffer_map.erase(it); - // Temporary solution for memory leak in computecpp. It will be fixed in the next computecpp version - std::allocator a1; // Default allocator for buffer - a1.deallocate(static_cast(p), num_bytes); - } + return codeplay::legacy::free(p); } EIGEN_STRONG_INLINE void deallocate_all() const { std::lock_guard lock(mutex_); - buffer_map.clear(); + codeplay::legacy::clear(); } - EIGEN_STRONG_INLINE std::map>::iterator find_buffer(const void* ptr) const { + EIGEN_STRONG_INLINE codeplay::legacy::PointerMapper& pointerMapper() const { std::lock_guard lock(mutex_); - auto it1 = buffer_map.find(static_cast(ptr)); - if (it1 != buffer_map.end()){ - return it1; - } - else{ - for(std::map>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ - auto size = it->second.get_size(); - if((it->first < (static_cast(ptr))) && ((static_cast(ptr)) < (it->first + size)) ) return it; - } - } - std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling allocate function in SyclDevice"<< std::endl; - abort(); + return codeplay::legacy::getPointerMapper(); } + EIGEN_STRONG_INLINE cl::sycl::buffer get_buffer(void* ptr) const { + std::lock_guard lock(mutex_); + return pointerMapper().get_buffer(pointerMapper().get_buffer_id(ptr)); + } + + EIGEN_STRONG_INLINE size_t get_buffer_offset(void* ptr) const { + std::lock_guard lock(mutex_); + return pointerMapper().get_offset(ptr); + } + + /*EIGEN_STRONG_INLINE void* get_buffer_id(void* ptr) const { + std::lock_guard lock(mutex_); + return static_cast(pointerMapper().get_buffer_id(ptr)); + }*/ + // This function checks if the runtime recorded an error for the // underlying stream device. 
EIGEN_STRONG_INLINE bool ok() const { @@ -172,7 +165,7 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { } // destructor - ~QueueInterface() { buffer_map.clear(); } + ~QueueInterface() { codeplay::legacy::clear(); } }; struct SyclDevice { @@ -190,14 +183,20 @@ struct SyclDevice { } /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer& get_sycl_buffer(const void * ptr) const { - return m_queue_stream->find_buffer(ptr)->second; + EIGEN_STRONG_INLINE cl::sycl::buffer get_sycl_buffer(const void * ptr) const { + return m_queue_stream->get_buffer(const_cast(ptr)); } + /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels template EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { - tileSize =static_cast(sycl_queue().get_device(). template get_info()/2); + tileSize =static_cast(sycl_queue().get_device(). template get_info()); + auto s= sycl_queue().get_device().template get_info(); + std::transform(s.begin(), s.end(), s.begin(), ::tolower); + if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + tileSize=std::min(static_cast(256), static_cast(tileSize)); + } rng = n; if (rng==0) rng=static_cast(1); GRange=rng; @@ -207,6 +206,76 @@ struct SyclDevice { if (xMode != 0) GRange += static_cast(tileSize - xMode); } } + + /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { + Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); + if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); + } + size_t pow_of_2 = static_cast(std::log2(max_workgroup_Size)); + tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); + rng1=dim1; + if (rng1==0 ) rng1=static_cast(1); + GRange1=rng1; + if (tileSize1>GRange1) tileSize1=GRange1; + else if(GRange1>tileSize1){ + Index xMode = static_cast(GRange1 % tileSize1); + if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); + } + tileSize0 = static_cast(max_workgroup_Size/tileSize1); + rng0 = dim0; + if (rng0==0 ) rng0=static_cast(1); + GRange0=rng0; + if (tileSize0>GRange0) tileSize0=GRange0; + else if(GRange0>tileSize0){ + Index xMode = static_cast(GRange0 % tileSize0); + if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); + } + } + + + + /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { + Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); + if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size + max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); + } + size_t pow_of_2 = static_cast(std::log2(max_workgroup_Size)); + tileSize2 =static_cast(std::pow(2, static_cast(pow_of_2/3))); + rng2=dim2; + if (rng2==0 ) rng1=static_cast(1); + GRange2=rng2; + if (tileSize2>GRange2) tileSize2=GRange2; + else if(GRange2>tileSize2){ + Index 
xMode = static_cast(GRange2 % tileSize2); + if (xMode != 0) GRange2 += static_cast(tileSize2 - xMode); + } + pow_of_2 = static_cast(std::log2(static_cast(max_workgroup_Size/tileSize2))); + tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); + rng1=dim1; + if (rng1==0 ) rng1=static_cast(1); + GRange1=rng1; + if (tileSize1>GRange1) tileSize1=GRange1; + else if(GRange1>tileSize1){ + Index xMode = static_cast(GRange1 % tileSize1); + if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); + } + tileSize0 = static_cast(max_workgroup_Size/(tileSize1*tileSize2)); + rng0 = dim0; + if (rng0==0 ) rng0=static_cast(1); + GRange0=rng0; + if (tileSize0>GRange0) tileSize0=GRange0; + else if(GRange0>tileSize0){ + Index xMode = static_cast(GRange0 % tileSize0); + if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); + } + } + + /// allocate device memory EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { return m_queue_stream->allocate(num_bytes); @@ -220,21 +289,21 @@ struct SyclDevice { EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } /// the memcpy function - template EIGEN_STRONG_INLINE void memcpy(void *dst, const T *src, size_t n) const { - auto it1 = m_queue_stream->find_buffer((void*)src); - auto it2 = m_queue_stream->find_buffer(dst); - auto offset= (static_cast(static_cast(src))) - it1->first; - auto i= (static_cast(dst)) - it2->first; - offset/=sizeof(T); - i/=sizeof(T); + template EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { + auto offset= m_queue_stream->get_buffer_offset((void*)src); + auto i= m_queue_stream->get_buffer_offset(dst); + offset/=sizeof(Index); + i/=sizeof(Index); size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(T), tileSize, rng, GRange); + parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc =it1->second.template get_access(cgh); - auto dst_acc =it2->second.template get_access(cgh); - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, i, offset)); + auto src_acc =get_sycl_accessor(cgh, src); + auto dst_acc =get_sycl_accessor(cgh, dst); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, i, offset)); }); - asynchronousExec(); + synchronize(); } /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device @@ -246,26 +315,28 @@ struct SyclDevice { auto host_acc= get_sycl_buffer(dst). template get_access(); ::memcpy(host_acc.get_pointer(), src, n); } + /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back /// to the cpu only once per function call. 
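The parallel_for_setup overloads added above all follow the same recipe per dimension: clamp the tile size to the range, then round the global range up to the next multiple of the tile so the nd_range covers every index (out-of-range work-items are masked inside the kernels). A standalone sketch of that rounding for one dimension (hypothetical setup_1d helper, host-only):

#include <cstddef>
#include <cstdio>

// Per-dimension recipe used by parallel_for_setup: tile <= range, and the
// global range is padded up to a multiple of the tile size.
void setup_1d(std::size_t n, std::size_t max_wg,
              std::size_t& tile, std::size_t& range, std::size_t& grange) {
  tile = max_wg;
  range = (n == 0) ? 1 : n;
  grange = range;
  if (tile > grange) {
    tile = grange;
  } else {
    const std::size_t rem = grange % tile;
    if (rem != 0) grange += tile - rem;  // round up to a tile multiple
  }
}

int main() {
  std::size_t tile, range, grange;
  setup_1d(1000, 256, tile, range, grange);
  std::printf("tile=%zu range=%zu grange=%zu\n", tile, range, grange);  // 256 1000 1024
  return 0;
}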
- template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const T *src, size_t n) const { - auto it = m_queue_stream->find_buffer(src); - auto offset =static_cast(static_cast(src))- it->first; - offset/=sizeof(T); + template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { + auto offset =m_queue_stream->get_buffer_offset((void *)src); + offset/=sizeof(Index); size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(T), tileSize, rng, GRange); + parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); // Assuming that the dst is the start of the destination pointer auto dest_buf = cl::sycl::buffer >(static_cast(dst), cl::sycl::range<1>(n)); sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc= it->second.template get_access(cgh); + auto src_acc= get_sycl_accessor(cgh, src); auto dst_acc =dest_buf.template get_access(cgh); - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, 0, offset)); + typedef decltype(src_acc) read_accessor; + typedef decltype(dst_acc) write_accessor; + cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, 0, offset)); }); - asynchronousExec(); + synchronize(); } /// returning the sycl queue EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;} @@ -273,8 +344,9 @@ struct SyclDevice { EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { size_t rng, GRange, tileSize; parallel_for_setup(n, tileSize, rng, GRange); - sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast(static_cast(data))),rng, GRange, tileSize, c )); - asynchronousExec(); + auto buf =get_sycl_buffer(static_cast(static_cast(data))); + sycl_queue().submit(memsetCghFunctor(buf,rng, GRange, tileSize, c )); + synchronize(); } struct memsetCghFunctor{ @@ -300,6 +372,24 @@ struct SyclDevice { // there is no l3 cache on cuda devices. return firstLevelCacheSize(); } + EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { + return sycl_queue().get_device(). template get_info(); + // return stream_->deviceProperties().multiProcessorCount; + } + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { + return sycl_queue().get_device(). template get_info(); + + // return stream_->deviceProperties().maxThreadsPerBlock; + } + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { + // OpenCL doesnot have such concept + return 2;//sycl_queue().get_device(). template get_info(); + // return stream_->deviceProperties().maxThreadsPerMultiProcessor; + } + EIGEN_STRONG_INLINE int sharedMemPerBlock() const { + return sycl_queue().get_device(). template get_info(); + // return stream_->deviceProperties().sharedMemPerBlock; + } /// No need for sycl it should act the same as CPU version EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } @@ -308,7 +398,7 @@ struct SyclDevice { } EIGEN_STRONG_INLINE void asynchronousExec() const { - ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. + ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. //sycl_queue().throw_asynchronous();// does not pass. 
Temporarily disabled sycl_queue().wait_and_throw(); //pass diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 822e22c2d..abe85c860 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -143,12 +143,12 @@ struct TensorEvaluator, Device> return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - CoeffReturnType* data() const { return m_buffer; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buffer; } /// required by sycl in order to extract the sycl accessor - const TensorEvaluator& impl() { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() { return m_impl; } /// used by sycl in order to build the sycl buffer - const Device& device() const{return m_device;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} private: TensorEvaluator m_impl; const ArgType m_op; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index dbe11c7af..6ddd2ca18 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -736,22 +736,12 @@ struct TensorEvaluator::value; ++i) { eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); if(m_strides[i]>0){ - #ifndef __SYCL_DEVICE_ONLY__ startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); - #else - startIndicesClamped[i] = cl::sycl::clamp(static_cast(op.startIndices()[i]), static_cast(0), static_cast(m_impl.dimensions()[i])); - stopIndicesClamped[i] = cl::sycl::clamp(static_cast(op.stopIndices()[i]), static_cast(0), static_cast(m_impl.dimensions()[i])); - #endif }else{ /* implies m_strides[i]<0 by assert */ - #ifndef __SYCL_DEVICE_ONLY__ startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); - #else - startIndicesClamped[i] = cl::sycl::clamp(static_cast(op.startIndices()[i]), static_cast(-1), static_cast(m_impl.dimensions()[i] - 1)); - stopIndicesClamped[i] = cl::sycl::clamp(static_cast(op.stopIndices()[i]), static_cast(-1), static_cast(m_impl.dimensions()[i] - 1)); - #endif } m_startIndices[i] = startIndicesClamped[i]; } @@ -867,7 +857,11 @@ struct TensorEvaluator m_outputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index 82ca71215..8ecef59a8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -108,7 +108,7 @@ struct FullReducer { // Dims dims= self.xprDims(); //Op functor = reducer; dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { - // this is a work around for gcc bug + // this is a workaround for gcc 4.8 bug typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType; // create a tuple of accessors from Evaluator TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); @@ -148,7 +148,7 @@ struct InnerReducer { /// recursively apply reduction on it in order to reduce the whole. 
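The "workaround for gcc 4.8 bug" comments above follow the same pattern as PATCH 03: instead of naming a kernel template argument through an intermediate auto variable, the type is spelled with decltype on the call expression and the variable is then declared with that named type. A minimal illustration of the two spellings (the extract() and Kernel names are hypothetical):

// Hypothetical names; only the spelling of the typedef differs.
struct Functors { int v; };
inline Functors extract() { return Functors{42}; }

template <typename FunctorExpr> struct Kernel { FunctorExpr f; };

void build_kernel() {
  // Old spelling that the patch replaces:
  //   auto functors = extract();
  //   typedef decltype(functors) FunctorExpr;

  // Spelling used in the patch:
  typedef decltype(extract()) FunctorExpr;
  FunctorExpr functors = extract();
  Kernel<FunctorExpr> k{functors};
  (void)k;
}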
dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange); dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { - // this is work around for gcc bug. + // this is workaround for gcc 4.8 bug. typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc; // create a tuple of accessors from Evaluator Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 93615e5c2..e846257a9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -121,11 +121,7 @@ struct TensorEvaluator, Device> { m_dimensions = m_impl.dimensions(); for (int i = 0; i < NumDims; ++i) { -#ifndef __SYCL_DEVICE_ONLY__ - m_dimensions[i] = ceilf(static_cast(m_dimensions[i]) / op.strides()[i]); -#else - m_dimensions[i] = cl::sycl::ceil(static_cast(m_dimensions[i]) / op.strides()[i]); -#endif + m_dimensions[i] =Eigen::numext::ceil(static_cast(m_dimensions[i]) / op.strides()[i]); } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); @@ -233,8 +229,6 @@ struct TensorEvaluator, Device> /// required by sycl in order to extract the accessor Strides functor() const { return m_strides; } - - protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { @@ -299,10 +293,9 @@ struct TensorEvaluator, Device> } /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return this->m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return this->m_impl; } /// required by sycl in order to extract the accessor - Strides functor() const { return this->m_strides; } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Strides functor() const { return this->m_strides; } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h index 2e61ee049..84f660597 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h @@ -80,6 +80,9 @@ template struct GetType{ /// this is used for extracting tensor reduction #include "TensorReductionSycl.h" +/// this is used for extracting tensor convolution +#include "TensorConvolutionSycl.h" + // kernel execution using fusion #include "TensorSyclRun.h" //sycl functors diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index 29f362ade..c0bcf26cd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -97,7 +97,7 @@ template \ struct ConvertToDeviceExpression > \ : DeviceConvertor{}; -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp +/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorForcedEvalOp #define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\ template \ struct ConvertToDeviceExpression > {\ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index e4658eda5..3fd607941 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -35,6 +35,8 @@ namespace Eigen { namespace TensorSycl { namespace internal { +#define RETURN_CPP11(expr) ->decltype(expr) {return expr;} + /// \struct ExtractAccessor: Extract Accessor Class is used to extract the /// accessor from a buffer. /// Depending on the type of the leaf node we can get a read accessor or a @@ -44,22 +46,16 @@ struct ExtractAccessor; struct AccessorConstructor{ template static inline auto getTuple(cl::sycl::handler& cgh, const Arg& eval) - -> decltype(ExtractAccessor::getTuple(cgh, eval)) { - return ExtractAccessor::getTuple(cgh, eval); - } + RETURN_CPP11(ExtractAccessor::getTuple(cgh, eval)) template static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1, const Arg2& eval2) - -> decltype(utility::tuple::append(ExtractAccessor::getTuple(cgh, eval1), ExtractAccessor::getTuple(cgh, eval2))) { - return utility::tuple::append(ExtractAccessor::getTuple(cgh, eval1), ExtractAccessor::getTuple(cgh, eval2)); - } + RETURN_CPP11(utility::tuple::append(ExtractAccessor::getTuple(cgh, eval1), ExtractAccessor::getTuple(cgh, eval2))) + template static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1 , const Arg2& eval2 , const Arg3& eval3) - -> decltype(utility::tuple::append(ExtractAccessor::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor::getTuple(cgh, eval2), ExtractAccessor::getTuple(cgh, eval3)))) { - return utility::tuple::append(ExtractAccessor::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor::getTuple(cgh, eval2), ExtractAccessor::getTuple(cgh, eval3))); - } + RETURN_CPP11(utility::tuple::append(ExtractAccessor::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor::getTuple(cgh, eval2), ExtractAccessor::getTuple(cgh, eval3)))) + template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, const Arg& eval) - -> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor(cgh,eval.data()))){ - return utility::tuple::make_tuple(eval.device().template get_sycl_accessor(cgh,eval.data())); - } + RETURN_CPP11(utility::tuple::make_tuple(eval.device().template get_sycl_accessor(cgh,eval.data()))) }; /// specialisation of the \ref ExtractAccessor struct when the node type is @@ -68,9 +64,7 @@ struct AccessorConstructor{ template class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\ struct ExtractAccessor, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ - -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\ - return AccessorConstructor::getTuple(cgh, eval.impl());\ - }\ +RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ }; SYCLUNARYCATEGORYEXTACC(const) @@ -83,9 +77,7 @@ SYCLUNARYCATEGORYEXTACC() template class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\ struct ExtractAccessor, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ - -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){\ - return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());\ - }\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\ }; SYCLBINARYCATEGORYEXTACC(const) @@ -98,9 +90,7 @@ SYCLBINARYCATEGORYEXTACC() template class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename 
Arg3Expr, typename Dev>\ struct ExtractAccessor, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ - -> decltype(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl())){\ - return AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl());\ - }\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl()))\ }; SYCLTERNARYCATEGORYEXTACC(const) @@ -114,9 +104,7 @@ SYCLTERNARYCATEGORYEXTACC() template \ struct ExtractAccessor, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ - -> decltype(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl())){\ - return AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl());\ - }\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl()))\ }; SYCLSELECTOPEXTACC(const) @@ -128,9 +116,7 @@ SYCLSELECTOPEXTACC() template \ struct ExtractAccessor, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ - -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){\ - return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());\ - }\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\ }; SYCLTENSORASSIGNOPEXTACC(const) @@ -142,9 +128,7 @@ struct ExtractAccessor, template \ struct ExtractAccessor, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator, Dev>& eval)\ - -> decltype(AccessorConstructor::template getAccessor(cgh, eval)){\ - return AccessorConstructor::template getAccessor(cgh, eval);\ - }\ + RETURN_CPP11(AccessorConstructor::template getAccessor(cgh, eval))\ }; TENSORMAPEXPR(const, cl::sycl::access::mode::read) @@ -156,9 +140,7 @@ TENSORMAPEXPR(, cl::sycl::access::mode::read_write) template \ struct ExtractAccessor, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ - -> decltype(AccessorConstructor::template getAccessor(cgh, eval)){\ - return AccessorConstructor::template getAccessor(cgh, eval);\ - }\ + RETURN_CPP11(AccessorConstructor::template getAccessor(cgh, eval))\ }; SYCLFORCEDEVALEXTACC(const) @@ -171,9 +153,7 @@ SYCLFORCEDEVALEXTACC() template \ struct ExtractAccessor, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator, Dev>& eval)\ - -> decltype(utility::tuple::append(AccessorConstructor::template getAccessor(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()))){\ - return utility::tuple::append(AccessorConstructor::template getAccessor(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()));\ - }\ + RETURN_CPP11(utility::tuple::append(AccessorConstructor::template getAccessor(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl())))\ }; SYCLEVALTOEXTACC(const) @@ -185,23 +165,19 @@ SYCLEVALTOEXTACC() template \ struct ExtractAccessor, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ - -> decltype(AccessorConstructor::template getAccessor(cgh, eval)){\ - return AccessorConstructor::template getAccessor(cgh, eval);\ - }\ + RETURN_CPP11(AccessorConstructor::template getAccessor(cgh, eval))\ }; SYCLREDUCTIONEXTACC(const) SYCLREDUCTIONEXTACC() #undef SYCLREDUCTIONEXTACC -/// specialisation of the \ref ExtractAccessor struct when the node type is 
TensorReductionOp +/// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp #define SYCLCONTRACTIONCONVOLUTIONEXTACC(CVQual, ExprNode)\ template\ struct ExtractAccessor, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ - -> decltype(AccessorConstructor::template getAccessor(cgh, eval)){\ - return AccessorConstructor::template getAccessor(cgh, eval);\ - }\ + RETURN_CPP11(AccessorConstructor::template getAccessor(cgh, eval))\ }; SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp) @@ -212,27 +188,24 @@ SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp) /// specialisation of the \ref ExtractAccessor struct when the node type is -/// const TensorSlicingOp. This is a special case where there is no OP +/// const TensorSlicingOp. #define SYCLSLICEOPEXTACC(CVQual)\ template \ struct ExtractAccessor, Dev> > {\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ - -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\ - return AccessorConstructor::getTuple(cgh, eval.impl());\ - }\ + RETURN_CPP11( AccessorConstructor::getTuple(cgh, eval.impl()))\ }; SYCLSLICEOPEXTACC(const) SYCLSLICEOPEXTACC() #undef SYCLSLICEOPEXTACC - +// specialisation of the \ref ExtractAccessor struct when the node type is +/// const TensorStridingSlicingOp. #define SYCLSLICESTRIDEOPEXTACC(CVQual)\ template\ struct ExtractAccessor, Dev> >{\ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ - -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\ - return AccessorConstructor::getTuple(cgh, eval.impl());\ - }\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ }; SYCLSLICESTRIDEOPEXTACC(const) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLegacyPointer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLegacyPointer.h new file mode 100644 index 000000000..b723592cd --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLegacyPointer.h @@ -0,0 +1,244 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Ruyman Reyes Codeplay Software Ltd +// Mehdi Goli Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TensorSyclLegacyPointer.h + * + * \brief: + * Interface for SYCL buffers to behave as a non-deferrenciable pointer + * This can be found in Codeplay's ComputeCpp SDK : legacy_pointer.h + * + **************************************************************************/ + +namespace codeplay { +namespace legacy { + +/** + * PointerMapper + * Associates fake pointers with buffers. 
+ * + */ +class PointerMapper { + public: + /* pointer information definitions + */ + static const unsigned long ADDRESS_BITS = sizeof(void *) * 8; + static const unsigned long BUFFER_ID_BITSIZE = 16u; + static const unsigned long MAX_NUMBER_BUFFERS = (1UL << BUFFER_ID_BITSIZE)-1; + static const unsigned long MAX_OFFSET = (1UL << (ADDRESS_BITS - BUFFER_ID_BITSIZE))-1; + + using base_ptr_t = uintptr_t; + + /* Fake Pointers are constructed using an integer indexing plus + * the offset: + * + * |== MAX_BUFFERS ==|======== MAX_OFFSET ========| + * | Buffer Id | Offset in buffer | + * |=================|============================| + */ + struct legacy_pointer_t { + /* Type for the pointers + */ + base_ptr_t _contents; + + /** Conversions from legacy_pointer_t to + * the void * should just reinterpret_cast the integer + * number + */ + operator void *() const { return reinterpret_cast(_contents); } + + /** + * Convert back to the integer number. + */ + operator base_ptr_t() const { return _contents; } + + /** + * Converts a void * into a legacy pointer structure. + * Note that this will only work if the void * was + * already a legacy_pointer_t, but we have no way of + * checking + */ + legacy_pointer_t(void *ptr) + : _contents(reinterpret_cast(ptr)){}; + + /** + * Creates a legacy_pointer_t from the given integer + * number + */ + legacy_pointer_t(base_ptr_t u) : _contents(u){}; + }; + + /* Whether if a pointer is null or not. + * + * A pointer is nullptr if the buffer id is 0, + * i.e the first BUFFER_ID_BITSIZE are zero + */ + static inline bool is_nullptr(legacy_pointer_t ptr) { + return ((MAX_OFFSET & ptr) == ptr); + } + + /* Base nullptr + */ + const legacy_pointer_t null_legacy_ptr = nullptr; + + /* Data type to create buffer of byte-size elements + */ + using buffer_data_type = uint8_t; + + /* basic type for all buffers + */ + using buffer_t = cl::sycl::buffer; + + /* id of a buffer in the map + */ + typedef short buffer_id; + + /* get_buffer_id + */ + inline buffer_id get_buffer_id(legacy_pointer_t ptr) const { + return ptr >> (ADDRESS_BITS - BUFFER_ID_BITSIZE); + } + + /* + * get_buffer_offset + */ + inline off_t get_offset(legacy_pointer_t ptr) const { + return ptr & MAX_OFFSET;; + } + + /** + * Constructs the PointerMapper structure. + */ + PointerMapper() + : __pointer_list{}, rng_(std::random_device()()), uni_(1, 256){}; + + /** + * PointerMapper cannot be copied or moved + */ + PointerMapper(const PointerMapper &) = delete; + + /** + * empty the pointer list + */ + inline void clear() { + __pointer_list.clear(); + } + + /* generate_id + * Generates a unique id for a buffer. + */ + buffer_id generate_id() { + // Limit the number of attempts to half the combinations + // just to avoid an infinite loop + int numberOfAttempts = 1ul << (BUFFER_ID_BITSIZE / 2); + buffer_id bId; + do { + bId = uni_(rng_); + } while (__pointer_list.find(bId) != __pointer_list.end() && + numberOfAttempts--); + return bId; + } + + /* add_pointer. + * Adds a pointer to the map and returns the fake pointer id. + * This will be the bufferId on the most significant bytes and 0 elsewhere. + */ + legacy_pointer_t add_pointer(buffer_t &&b) { + auto nextNumber = __pointer_list.size(); + buffer_id bId = generate_id(); + __pointer_list.emplace(bId, b); + if (nextNumber > MAX_NUMBER_BUFFERS) { + return null_legacy_ptr; + } + base_ptr_t retVal = bId; + retVal <<= (ADDRESS_BITS - BUFFER_ID_BITSIZE); + return retVal; + } + + /* get_buffer. 
+ * Returns a buffer from the map using the buffer id + */ + buffer_t get_buffer(buffer_id bId) const { + auto it = __pointer_list.find(bId); + if (it != __pointer_list.end()) + return it->second; + std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl; + abort(); + } + + /* remove_pointer. + * Removes the given pointer from the map. + */ + void remove_pointer(void *ptr) { + buffer_id bId = this->get_buffer_id(ptr); + __pointer_list.erase(bId); + } + + /* count. + * Return the number of active pointers (i.e, pointers that + * have been malloc but not freed). + */ + size_t count() const { return __pointer_list.size(); } + + private: + /* Maps the buffer id numbers to the actual buffer + * instances. + */ + std::map __pointer_list; + + /* Random number generator for the buffer ids + */ + std::mt19937 rng_; + + /* Random-number engine + */ + std::uniform_int_distribution uni_; +}; + +/** + * Singleton interface to the pointer mapper to implement + * the generic malloc/free C interface without extra + * parameters. + */ +inline PointerMapper &getPointerMapper() { + static PointerMapper thePointerMapper; + return thePointerMapper; +} + +/** + * Malloc-like interface to the pointer-mapper. + * Given a size, creates a byte-typed buffer and returns a + * fake pointer to keep track of it. + */ +inline void *malloc(size_t size) { + // Create a generic buffer of the given size + auto thePointer = getPointerMapper().add_pointer( + PointerMapper::buffer_t(cl::sycl::range<1>{size})); + // Store the buffer on the global list + return static_cast(thePointer); +} + +/** + * Free-like interface to the pointer mapper. + * Given a fake-pointer created with the legacy-pointer malloc, + * destroys the buffer and remove it from the list. + */ +inline void free(void *ptr) { getPointerMapper().remove_pointer(ptr); } + +/** + *clear the pointer list + */ +inline void clear() { + getPointerMapper().clear(); +} + +} // legacy +} // codeplay diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h index 6ce41b0ab..94692be56 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h @@ -49,7 +49,7 @@ template struct ExecEx /// based expression tree; /// creates the expression tree for the device with accessor to buffers; /// construct the kernel and submit it to the sycl queue. -/// std::array does not have TotalSize. So I have to get the size throgh template specialisation. +/// std::array does not have TotalSize. So I have to get the size through template specialisation. 
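The comment just above explains why TensorSyclRun.h carries a DimensionSize helper: the usual Dimensions types expose TotalSize(), but std::array does not, so the total size is obtained through a template specialisation. A simplified, self-contained sketch of that trait; Dims2 is a made-up stand-in for a Dimensions type with TotalSize(), and the std::array specialisation here simply multiplies the extents:

#include <array>
#include <cstddef>
#include <iostream>

// Generic case: any Dimensions type that provides TotalSize().
template <typename Index, typename Dimensions>
struct DimensionSize {
  static Index getDimSize(const Dimensions& dim) { return dim.TotalSize(); }
};

// Specialisation for std::array, which has no TotalSize(): multiply the extents.
template <typename Index, typename DimType, std::size_t NumDims>
struct DimensionSize<Index, std::array<DimType, NumDims> > {
  static Index getDimSize(const std::array<DimType, NumDims>& dim) {
    Index s = 1;
    for (std::size_t i = 0; i < NumDims; ++i) s *= static_cast<Index>(dim[i]);
    return s;
  }
};

// Hypothetical fixed-size Dimensions type with TotalSize(), for illustration only.
struct Dims2 {
  int d0, d1;
  int TotalSize() const { return d0 * d1; }
};

int main() {
  Dims2 d = {3, 4};
  std::array<int, 3> a = {{2, 3, 5}};
  std::cout << DimensionSize<int, Dims2>::getDimSize(d) << "\n";                // 12
  std::cout << DimensionSize<int, std::array<int, 3> >::getDimSize(a) << "\n";  // 30
  return 0;
}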
template struct DimensionSize{ static Index getDimSize(const Dimensions& dim){ return dim.TotalSize(); diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index cbbd3efb4..cf07b033d 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -153,6 +153,7 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_contract_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_concatenation_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_reverse_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_convolution_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for diff --git a/unsupported/test/cxx11_tensor_contract_sycl.cpp b/unsupported/test/cxx11_tensor_contract_sycl.cpp index 5dacc87f2..cb8fcb74c 100644 --- a/unsupported/test/cxx11_tensor_contract_sycl.cpp +++ b/unsupported/test/cxx11_tensor_contract_sycl.cpp @@ -28,6 +28,7 @@ using Eigen::array; using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; +static const float error_threshold =1e-4f; typedef Tensor::DimensionPair DimPair; template void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, int n_size) @@ -70,10 +71,10 @@ void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, in t_result = t_left.contract(t_right, dims); for (DenseIndex i = 0; i < t_result.size(); i++) { - if (static_cast(fabs(t_result(i) - t_result_gpu(i))) < 1e-4f) { + if (static_cast(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) { continue; } - if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) { + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) { continue; } std::cout << "mismatch detected at index " << i << ": " << t_result(i) @@ -132,10 +133,10 @@ void test_TF(const Device& sycl_device) t_result = t_left.contract(t_right, dims); for (DenseIndex i = 0; i < t_result.size(); i++) { - if (static_cast(fabs(t_result(i) - t_result_gpu(i))) < 1e-4f) { + if (static_cast(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) { continue; } - if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) { + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) { continue; } std::cout << "mismatch detected at index " << i << ": " << t_result(i) @@ -187,8 +188,8 @@ void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size) t_result = t_left.contract(t_right, dims); - if (static_cast(fabs(t_result() - t_result_gpu())) > 1e-4f && - !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) { + if (static_cast(fabs(t_result() - t_result_gpu())) > error_threshold && + !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) { std::cout << "mismatch detected: " << t_result() << " vs " << t_result_gpu() << std::endl; assert(false); diff --git a/unsupported/test/cxx11_tensor_convolution_sycl.cpp b/unsupported/test/cxx11_tensor_convolution_sycl.cpp new file mode 100644 index 000000000..f7e0a2742 --- /dev/null +++ b/unsupported/test/cxx11_tensor_convolution_sycl.cpp @@ -0,0 +1,469 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_convolution_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_SYCL + +#include +#include +#include + +#include "main.h" +#include +#include + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; +static const float error_threshold =1e-4f; + + +template +static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device) +{ + int indim0 =53; + int indim1= 55; + int indim2= 51; + int outdim0=50; + int outdim1=55; + int outdim2=51; + Eigen::array input_dims = {{indim0, indim1, indim2}}; + Eigen::array kernel_dims = {{4}}; + Eigen::array result_dims = {{outdim0, outdim1, outdim2}}; + + Tensor input(input_dims); + Tensor kernel(kernel_dims); + Tensor result(result_dims); + Tensor result_host(result_dims); + + Eigen::array dims3{{0}}; + + input.setRandom(); + kernel.setRandom(); + result.setZero(); + result_host.setZero(); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap > gpu_input(d_input, input_dims); + Eigen::TensorMap > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap > gpu_result(d_result, result_dims); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + result_host=input.convolve(kernel, dims3); + +for(int i=0; i< outdim0; i++ ){ + for(int j=0; j< outdim1; j++ ){ + for(int k=0; k< outdim2; k++ ){ + if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { + std::cout < +static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device) +{ + int indim0 =53; + int indim1= 55; + int indim2= 51; + int outdim0=50; + int outdim1=51; + int outdim2=51; + Eigen::array input_dims = {{indim0, indim1, indim2}}; + Eigen::array kernel_dims = {{4,5}}; + Eigen::array result_dims = {{outdim0, outdim1, outdim2}}; + + Tensor input(input_dims); + Tensor kernel(kernel_dims); + Tensor result(result_dims); + Tensor result_host(result_dims); + + Eigen::array dims3{{0,1}}; + + input.setRandom(); + kernel.setRandom(); + result.setZero(); + result_host.setZero(); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap > gpu_input(d_input, input_dims); + Eigen::TensorMap > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap > gpu_result(d_result, result_dims); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + 
gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + result_host=input.convolve(kernel, dims3); + +for(int i=0; i< outdim0; i++ ){ + for(int j=0; j< outdim1; j++ ){ + for(int k=0; k< outdim2; k++ ){ + if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { + std::cout < +static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device) +{ + int indim0 =53; + int indim1= 55; + int indim2= 51; + int outdim0=50; + int outdim1=51; + int outdim2=49; + Eigen::array input_dims = {{indim0, indim1, indim2}}; + Eigen::array kernel_dims = {{4,5,3}}; + Eigen::array result_dims = {{outdim0, outdim1, outdim2}}; + + Tensor input(input_dims); + Tensor kernel(kernel_dims); + Tensor result(result_dims); + Tensor result_host(result_dims); + + Eigen::array dims3{{0,1,2}}; + + input.setRandom(); + kernel.setRandom(); + result.setZero(); + result_host.setZero(); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap > gpu_input(d_input, input_dims); + Eigen::TensorMap > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap > gpu_result(d_result, result_dims); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + result_host=input.convolve(kernel, dims3); + +for(int i=0; i< outdim0; i++ ){ + for(int j=0; j< outdim1; j++ ){ + for(int k=0; k< outdim2; k++ ){ + if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { + std::cout < +static void test_evals(const Eigen::SyclDevice& sycl_device) +{ + Eigen::array input_dims = {{3, 3}}; + Eigen::array kernel_dims = {{2}}; + Eigen::array result_dims = {{2, 3}}; + + Tensor input(input_dims); + Tensor kernel(kernel_dims); + Tensor result(result_dims); + + Eigen::array dims3{{0}}; + + input.setRandom(); + kernel.setRandom(); + result.setZero(); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap > gpu_input(d_input, input_dims); + Eigen::TensorMap > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap > gpu_result(d_result, result_dims); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1)); // index 0 + VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1)); // index 2 + VERIFY_IS_APPROX(result(0,2), 
input(0,2)*kernel(0) + input(1,2)*kernel(1)); // index 4 + VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1)); // index 1 + VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1)); // index 3 + VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5 + + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_result); +} + +template +static void test_expr(const Eigen::SyclDevice& sycl_device) +{ + Eigen::array input_dims = {{3, 3}}; + Eigen::array kernel_dims = {{2, 2}}; + Eigen::array result_dims = {{2, 2}}; + + Tensor input(input_dims); + Tensor kernel(kernel_dims); + Tensor result(result_dims); + + input.setRandom(); + kernel.setRandom(); + Eigen::array dims; + dims[0] = 0; + dims[1] = 1; + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap > gpu_input(d_input, input_dims); + Eigen::TensorMap > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap > gpu_result(d_result, result_dims); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) + + input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1)); + VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) + + input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1)); + VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) + + input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1)); + VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) + + input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1)); + + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_result); +} + + +template +static void test_modes(const Eigen::SyclDevice& sycl_device){ + +Eigen::array input_dims = {{3}}; +Eigen::array kernel_dims = {{3}}; + +Tensor input(input_dims); +Tensor kernel(kernel_dims); + +input.setRandom(); +kernel.setRandom(); +Eigen::array dims; +dims[0] = 0; + + input(0) = 1.0f; + input(1) = 2.0f; + input(2) = 3.0f; + kernel(0) = 0.5f; + kernel(1) = 1.0f; + kernel(2) = 0.0f; + + Eigen::array, 1> padding; + + // Emulate VALID mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). 
+ padding[0] = std::make_pair(0, 0); + Tensor valid(1); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t valid_bytes = valid.size() * sizeof(DataType); + + DataType * d_input = static_cast(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast(sycl_device.allocate(kernel_bytes)); + DataType * d_valid = static_cast(sycl_device.allocate(valid_bytes)); + + Eigen::TensorMap > gpu_input(d_input, input_dims); + Eigen::TensorMap > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap > gpu_valid(d_valid, valid.dimensions()); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_valid.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims); + sycl_device.memcpyDeviceToHost(valid.data(), d_valid, valid_bytes); + + VERIFY_IS_EQUAL(valid.dimension(0), 1); + VERIFY_IS_APPROX(valid(0), 2.5f); + + // Emulate SAME mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). + padding[0] = std::make_pair(1, 1); + Tensor same(3); + std::size_t same_bytes = same.size() * sizeof(DataType); + DataType * d_same = static_cast(sycl_device.allocate(same_bytes)); + Eigen::TensorMap > gpu_same(d_same, same.dimensions()); + gpu_same.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims); + sycl_device.memcpyDeviceToHost(same.data(), d_same, same_bytes); + + VERIFY_IS_EQUAL(same.dimension(0), 3); + VERIFY_IS_APPROX(same(0), 1.0f); + VERIFY_IS_APPROX(same(1), 2.5f); + VERIFY_IS_APPROX(same(2), 4.0f); + + // Emulate FULL mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). 
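For reference, the VALID/SAME/FULL emulation in this test can be reproduced on the host with the plain (non-SYCL) tensor evaluator. This is only a host-side sketch using the same input {1, 2, 3} and kernel {0.5, 1, 0}; the printed values match the VERIFY_IS_APPROX checks in the test:

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>
#include <utility>

int main() {
  Eigen::Tensor<float, 1> input(3), kernel(3);
  input.setValues({1.0f, 2.0f, 3.0f});
  kernel.setValues({0.5f, 1.0f, 0.0f});

  Eigen::array<ptrdiff_t, 1> dims;
  dims[0] = 0;
  Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 1> padding;

  padding[0] = std::make_pair(0, 0);  // VALID: no padding, 1 output
  Eigen::Tensor<float, 1> valid = input.pad(padding).convolve(kernel, dims);

  padding[0] = std::make_pair(1, 1);  // SAME: 3 outputs
  Eigen::Tensor<float, 1> same = input.pad(padding).convolve(kernel, dims);

  padding[0] = std::make_pair(2, 2);  // FULL: 5 outputs
  Eigen::Tensor<float, 1> full = input.pad(padding).convolve(kernel, dims);

  std::cout << valid << "\n\n" << same << "\n\n" << full << std::endl;
  // Expected: valid = {2.5}; same = {1, 2.5, 4}; full = {0, 1, 2.5, 4, 1.5}.
  return 0;
}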
+ padding[0] = std::make_pair(2, 2); + + Tensor full(5); + std::size_t full_bytes = full.size() * sizeof(DataType); + DataType * d_full = static_cast(sycl_device.allocate(full_bytes)); + Eigen::TensorMap > gpu_full(d_full, full.dimensions()); + gpu_full.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims); + sycl_device.memcpyDeviceToHost(full.data(), d_full, full_bytes); + + VERIFY_IS_EQUAL(full.dimension(0), 5); + VERIFY_IS_APPROX(full(0), 0.0f); + VERIFY_IS_APPROX(full(1), 1.0f); + VERIFY_IS_APPROX(full(2), 2.5f); + VERIFY_IS_APPROX(full(3), 4.0f); + VERIFY_IS_APPROX(full(4), 1.5f); + + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_valid); + sycl_device.deallocate(d_same); + sycl_device.deallocate(d_full); + +} + +template +static void test_strides(const Eigen::SyclDevice& sycl_device){ + + Eigen::array input_dims = {{13}}; + Eigen::array kernel_dims = {{3}}; + + Tensor input(input_dims); + Tensor kernel(kernel_dims); + Tensor result(2); + + input.setRandom(); + kernel.setRandom(); + Eigen::array dims; + dims[0] = 0; + + Eigen::array stride_of_3; + stride_of_3[0] = 3; + Eigen::array stride_of_2; + stride_of_2[0] = 2; + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap > gpu_input(d_input, input_dims); + Eigen::TensorMap > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap > gpu_result(d_result, result.dimensions()); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.stride(stride_of_3).convolve(gpu_kernel, dims).stride(stride_of_2); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) + + input(6)*kernel(2))); + VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) + + input(12)*kernel(2))); +} + +template void tensorConvolutionPerDevice(Dev_selector& s){ + QueueInterface queueInterface(s); + auto sycl_device=Eigen::SyclDevice(&queueInterface); + test_larg_expr1D(sycl_device); + test_larg_expr1D(sycl_device); + test_larg_expr2D(sycl_device); + test_larg_expr2D(sycl_device); + test_larg_expr3D(sycl_device); + test_larg_expr3D(sycl_device); + test_evals(sycl_device); + test_evals(sycl_device); + test_expr(sycl_device); + test_expr(sycl_device); + test_modes(sycl_device); + test_modes(sycl_device); + test_strides(sycl_device); + test_strides(sycl_device); +} + +void test_cxx11_tensor_convolution_sycl() { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(tensorConvolutionPerDevice(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp index 5992a306d..6f7e29890 100644 --- a/unsupported/test/cxx11_tensor_sycl.cpp +++ b/unsupported/test/cxx11_tensor_sycl.cpp @@ -242,9 +242,6 @@ static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){ Scalar1* gpu_in_data = static_cast(sycl_device.allocate(in.size()*sizeof(Scalar1))); Scalar2 * gpu_out_data = 
static_cast(sycl_device.allocate(out.size()*sizeof(Scalar2))); - - - TensorMap> gpu_in(gpu_in_data, tensorRange); TensorMap> gpu_out(gpu_out_data, tensorRange); sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.size())*sizeof(Scalar1)); From 77cc4d06c746e7be2966bd0d09b55c2393e289d8 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Thu, 19 Jan 2017 17:06:21 +0000 Subject: [PATCH 09/54] Removing unused variables --- .../Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 8 ++++---- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 7774342d8..e2569e1bf 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -326,7 +326,7 @@ struct TensorEvaluator InputLocalAcc; @@ -348,7 +348,7 @@ struct TensorEvaluator(shared_mem) <= maxSharedMem); + assert(static_cast(shared_mem) <= m_device.sharedMemPerBlock()); auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); @@ -373,7 +373,7 @@ struct TensorEvaluator(shared_mem) <= maxSharedMem); + assert(static_cast(shared_mem) <= m_device.sharedMemPerBlock()); auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); @@ -404,7 +404,7 @@ struct TensorEvaluator(shared_mem) <= maxSharedMem); + assert(static_cast(shared_mem) <= m_device.sharedMemPerBlock()); auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index ae8a9f667..a30090714 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -386,7 +386,7 @@ struct SyclDevice { return 2;//sycl_queue().get_device(). template get_info(); // return stream_->deviceProperties().maxThreadsPerMultiProcessor; } - EIGEN_STRONG_INLINE int sharedMemPerBlock() const { + EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { return sycl_queue().get_device(). template get_info(); // return stream_->deviceProperties().sharedMemPerBlock; } From 602f8c27f5307f1da966df2fc26745ecd0e78fc9 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Fri, 20 Jan 2017 18:23:20 +0000 Subject: [PATCH 10/54] Reverting back to the previous TensorDeviceSycl.h as the total number of buffer is not enough for tensorflow. 
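The revert message above is terse; presumably the limiting factor is the 16-bit buffer id of the legacy pointer mapper (see the constants in the file deleted below), which caps the number of simultaneously tracked allocations. A back-of-the-envelope check of that cap, using the constants from the removed file:

#include <cstdio>

int main() {
  // Constants from the legacy PointerMapper being removed in this patch.
  const unsigned BUFFER_ID_BITSIZE = 16u;
  const unsigned long MAX_NUMBER_BUFFERS = (1UL << BUFFER_ID_BITSIZE) - 1;
  // At most 65535 live buffers can be distinguished by the fake-pointer id.
  std::printf("MAX_NUMBER_BUFFERS = %lu\n", MAX_NUMBER_BUFFERS);
  return 0;
}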
--- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 83 +++--- .../src/Tensor/TensorSyclLegacyPointer.h | 244 ------------------ 2 files changed, 41 insertions(+), 286 deletions(-) delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorSyclLegacyPointer.h diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index a30090714..722a5d894 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -15,16 +15,13 @@ #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H -#include "TensorSyclLegacyPointer.h" - namespace Eigen { #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast::pointer_t>((&(*buf_acc.get_pointer()))) template class MemCopyFunctor { public: - MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset) - : m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {} + MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset) : m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {} void operator()(cl::sycl::nd_item<1> itemID) { auto src_ptr = ConvertToActualTypeSycl(Scalar, m_src_acc); @@ -55,7 +52,6 @@ namespace Eigen { }; - EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){ auto devices = cl::sycl::device::get_devices(); std::vector::iterator it =devices.begin(); @@ -78,10 +74,11 @@ struct QueueInterface { bool exception_caught_ = false; mutable std::mutex mutex_; + /// std::map is the container used to make sure that we create only one buffer /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. - //mutable std::map> buffer_map; + mutable std::map> buffer_map; /// sycl queue mutable cl::sycl::queue m_queue; /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename @@ -119,42 +116,45 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer. /// The device pointer would be deleted by calling deallocate function. EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + auto buf = cl::sycl::buffer(cl::sycl::range<1>(num_bytes)); + auto ptr =buf.get_access().get_pointer(); + buf.set_final_data(nullptr); std::lock_guard lock(mutex_); - return codeplay::legacy::malloc(num_bytes); + buffer_map.insert(std::pair>(static_cast(ptr),buf)); + return static_cast(ptr); } /// This is used to deallocate the device pointer. p is used as a key inside /// the map to find the device buffer and delete it. 
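The buffer map restored above keys each SYCL buffer by the host pointer returned at allocation time; deallocate and find_buffer then look that pointer, or any interior pointer into the allocation, up again. A standalone sketch of the bookkeeping, using a plain size_t payload in place of the cl::sycl::buffer so it compiles without SYCL (names are illustrative only):

#include <cstddef>
#include <cstdint>
#include <map>
#include <mutex>
#include <stdexcept>

// Miniature of the pointer-keyed buffer map: key = base pointer of an allocation,
// value = its size (standing in for the cl::sycl::buffer held by the real code).
class BufferMap {
 public:
  const std::uint8_t* add(const std::uint8_t* base, std::size_t size) {
    std::lock_guard<std::mutex> lock(mutex_);
    map_.emplace(base, size);
    return base;
  }
  // Find the allocation containing p: either an exact key match or a linear scan
  // checking base < p < base + size, mirroring the restored find_buffer.
  std::map<const std::uint8_t*, std::size_t>::const_iterator find(const void* p) const {
    std::lock_guard<std::mutex> lock(mutex_);
    const std::uint8_t* q = static_cast<const std::uint8_t*>(p);
    auto it = map_.find(q);
    if (it != map_.end()) return it;
    for (it = map_.begin(); it != map_.end(); ++it)
      if (it->first < q && q < it->first + it->second) return it;
    throw std::runtime_error("no buffer found for this pointer");
  }
  void remove(const void* p) {
    std::lock_guard<std::mutex> lock(mutex_);
    map_.erase(static_cast<const std::uint8_t*>(p));
  }
 private:
  mutable std::mutex mutex_;
  std::map<const std::uint8_t*, std::size_t> map_;
};

int main() {
  static std::uint8_t storage[64];
  BufferMap m;
  m.add(storage, sizeof(storage));
  auto it = m.find(storage + 10);  // an interior pointer resolves to the same entry
  m.remove(it->first);
  return 0;
}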
EIGEN_STRONG_INLINE void deallocate(void *p) const { std::lock_guard lock(mutex_); - return codeplay::legacy::free(p); + auto it = buffer_map.find(static_cast(p)); + if (it != buffer_map.end()) { + buffer_map.erase(it); + } } EIGEN_STRONG_INLINE void deallocate_all() const { std::lock_guard lock(mutex_); - codeplay::legacy::clear(); + buffer_map.clear(); } - EIGEN_STRONG_INLINE codeplay::legacy::PointerMapper& pointerMapper() const { + EIGEN_STRONG_INLINE std::map>::iterator find_buffer(const void* ptr) const { std::lock_guard lock(mutex_); - return codeplay::legacy::getPointerMapper(); + auto it1 = buffer_map.find(static_cast(ptr)); + if (it1 != buffer_map.end()){ + return it1; + } + else{ + for(std::map>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ + auto size = it->second.get_size(); + if((it->first < (static_cast(ptr))) && ((static_cast(ptr)) < (it->first + size)) ) return it; + } + } + std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl; + abort(); } - EIGEN_STRONG_INLINE cl::sycl::buffer get_buffer(void* ptr) const { - std::lock_guard lock(mutex_); - return pointerMapper().get_buffer(pointerMapper().get_buffer_id(ptr)); - } - - EIGEN_STRONG_INLINE size_t get_buffer_offset(void* ptr) const { - std::lock_guard lock(mutex_); - return pointerMapper().get_offset(ptr); - } - - /*EIGEN_STRONG_INLINE void* get_buffer_id(void* ptr) const { - std::lock_guard lock(mutex_); - return static_cast(pointerMapper().get_buffer_id(ptr)); - }*/ - // This function checks if the runtime recorded an error for the // underlying stream device. EIGEN_STRONG_INLINE bool ok() const { @@ -165,7 +165,7 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { } // destructor - ~QueueInterface() { codeplay::legacy::clear(); } + ~QueueInterface() { buffer_map.clear(); } }; struct SyclDevice { @@ -183,11 +183,10 @@ struct SyclDevice { } /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer get_sycl_buffer(const void * ptr) const { - return m_queue_stream->get_buffer(const_cast(ptr)); + EIGEN_STRONG_INLINE cl::sycl::buffer& get_sycl_buffer(const void * ptr) const { + return m_queue_stream->find_buffer(ptr)->second; } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels template EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { @@ -274,8 +273,6 @@ struct SyclDevice { if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); } } - - /// allocate device memory EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { return m_queue_stream->allocate(num_bytes); @@ -290,15 +287,17 @@ struct SyclDevice { /// the memcpy function template EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - auto offset= m_queue_stream->get_buffer_offset((void*)src); - auto i= m_queue_stream->get_buffer_offset(dst); + auto it1 = m_queue_stream->find_buffer((void*)src); + auto it2 = m_queue_stream->find_buffer(dst); + auto offset= (static_cast(static_cast(src))) - it1->first; + auto i= (static_cast(dst)) - it2->first; offset/=sizeof(Index); i/=sizeof(Index); size_t rng, GRange, tileSize; parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc =get_sycl_accessor(cgh, src); - auto dst_acc =get_sycl_accessor(cgh, dst); + auto src_acc =it1->second.template get_access(cgh); + auto 
dst_acc =it2->second.template get_access(cgh); typedef decltype(src_acc) read_accessor; typedef decltype(dst_acc) write_accessor; cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, i, offset)); @@ -311,11 +310,10 @@ struct SyclDevice { /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that /// this buffer is accessed, the data will be copied to the device. - template EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const { + template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { auto host_acc= get_sycl_buffer(dst). template get_access(); ::memcpy(host_acc.get_pointer(), src, n); } - /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination @@ -323,14 +321,15 @@ struct SyclDevice { /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back /// to the cpu only once per function call. template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - auto offset =m_queue_stream->get_buffer_offset((void *)src); + auto it = m_queue_stream->find_buffer(src); + auto offset =static_cast(static_cast(src))- it->first; offset/=sizeof(Index); size_t rng, GRange, tileSize; parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); // Assuming that the dst is the start of the destination pointer auto dest_buf = cl::sycl::buffer >(static_cast(dst), cl::sycl::range<1>(n)); sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc= get_sycl_accessor(cgh, src); + auto src_acc= it->second.template get_access(cgh); auto dst_acc =dest_buf.template get_access(cgh); typedef decltype(src_acc) read_accessor; typedef decltype(dst_acc) write_accessor; @@ -344,8 +343,7 @@ struct SyclDevice { EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { size_t rng, GRange, tileSize; parallel_for_setup(n, tileSize, rng, GRange); - auto buf =get_sycl_buffer(static_cast(static_cast(data))); - sycl_queue().submit(memsetCghFunctor(buf,rng, GRange, tileSize, c )); + sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast(static_cast(data))),rng, GRange, tileSize, c )); synchronize(); } @@ -411,6 +409,7 @@ struct SyclDevice { }; + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLegacyPointer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLegacyPointer.h deleted file mode 100644 index b723592cd..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLegacyPointer.h +++ /dev/null @@ -1,244 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Ruyman Reyes Codeplay Software Ltd -// Mehdi Goli Codeplay Software Ltd. -// Contact: -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorSyclLegacyPointer.h - * - * \brief: - * Interface for SYCL buffers to behave as a non-deferrenciable pointer - * This can be found in Codeplay's ComputeCpp SDK : legacy_pointer.h - * - **************************************************************************/ - -namespace codeplay { -namespace legacy { - -/** - * PointerMapper - * Associates fake pointers with buffers. - * - */ -class PointerMapper { - public: - /* pointer information definitions - */ - static const unsigned long ADDRESS_BITS = sizeof(void *) * 8; - static const unsigned long BUFFER_ID_BITSIZE = 16u; - static const unsigned long MAX_NUMBER_BUFFERS = (1UL << BUFFER_ID_BITSIZE)-1; - static const unsigned long MAX_OFFSET = (1UL << (ADDRESS_BITS - BUFFER_ID_BITSIZE))-1; - - using base_ptr_t = uintptr_t; - - /* Fake Pointers are constructed using an integer indexing plus - * the offset: - * - * |== MAX_BUFFERS ==|======== MAX_OFFSET ========| - * | Buffer Id | Offset in buffer | - * |=================|============================| - */ - struct legacy_pointer_t { - /* Type for the pointers - */ - base_ptr_t _contents; - - /** Conversions from legacy_pointer_t to - * the void * should just reinterpret_cast the integer - * number - */ - operator void *() const { return reinterpret_cast(_contents); } - - /** - * Convert back to the integer number. - */ - operator base_ptr_t() const { return _contents; } - - /** - * Converts a void * into a legacy pointer structure. - * Note that this will only work if the void * was - * already a legacy_pointer_t, but we have no way of - * checking - */ - legacy_pointer_t(void *ptr) - : _contents(reinterpret_cast(ptr)){}; - - /** - * Creates a legacy_pointer_t from the given integer - * number - */ - legacy_pointer_t(base_ptr_t u) : _contents(u){}; - }; - - /* Whether if a pointer is null or not. - * - * A pointer is nullptr if the buffer id is 0, - * i.e the first BUFFER_ID_BITSIZE are zero - */ - static inline bool is_nullptr(legacy_pointer_t ptr) { - return ((MAX_OFFSET & ptr) == ptr); - } - - /* Base nullptr - */ - const legacy_pointer_t null_legacy_ptr = nullptr; - - /* Data type to create buffer of byte-size elements - */ - using buffer_data_type = uint8_t; - - /* basic type for all buffers - */ - using buffer_t = cl::sycl::buffer; - - /* id of a buffer in the map - */ - typedef short buffer_id; - - /* get_buffer_id - */ - inline buffer_id get_buffer_id(legacy_pointer_t ptr) const { - return ptr >> (ADDRESS_BITS - BUFFER_ID_BITSIZE); - } - - /* - * get_buffer_offset - */ - inline off_t get_offset(legacy_pointer_t ptr) const { - return ptr & MAX_OFFSET;; - } - - /** - * Constructs the PointerMapper structure. - */ - PointerMapper() - : __pointer_list{}, rng_(std::random_device()()), uni_(1, 256){}; - - /** - * PointerMapper cannot be copied or moved - */ - PointerMapper(const PointerMapper &) = delete; - - /** - * empty the pointer list - */ - inline void clear() { - __pointer_list.clear(); - } - - /* generate_id - * Generates a unique id for a buffer. - */ - buffer_id generate_id() { - // Limit the number of attempts to half the combinations - // just to avoid an infinite loop - int numberOfAttempts = 1ul << (BUFFER_ID_BITSIZE / 2); - buffer_id bId; - do { - bId = uni_(rng_); - } while (__pointer_list.find(bId) != __pointer_list.end() && - numberOfAttempts--); - return bId; - } - - /* add_pointer. 
- * Adds a pointer to the map and returns the fake pointer id. - * This will be the bufferId on the most significant bytes and 0 elsewhere. - */ - legacy_pointer_t add_pointer(buffer_t &&b) { - auto nextNumber = __pointer_list.size(); - buffer_id bId = generate_id(); - __pointer_list.emplace(bId, b); - if (nextNumber > MAX_NUMBER_BUFFERS) { - return null_legacy_ptr; - } - base_ptr_t retVal = bId; - retVal <<= (ADDRESS_BITS - BUFFER_ID_BITSIZE); - return retVal; - } - - /* get_buffer. - * Returns a buffer from the map using the buffer id - */ - buffer_t get_buffer(buffer_id bId) const { - auto it = __pointer_list.find(bId); - if (it != __pointer_list.end()) - return it->second; - std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl; - abort(); - } - - /* remove_pointer. - * Removes the given pointer from the map. - */ - void remove_pointer(void *ptr) { - buffer_id bId = this->get_buffer_id(ptr); - __pointer_list.erase(bId); - } - - /* count. - * Return the number of active pointers (i.e, pointers that - * have been malloc but not freed). - */ - size_t count() const { return __pointer_list.size(); } - - private: - /* Maps the buffer id numbers to the actual buffer - * instances. - */ - std::map __pointer_list; - - /* Random number generator for the buffer ids - */ - std::mt19937 rng_; - - /* Random-number engine - */ - std::uniform_int_distribution uni_; -}; - -/** - * Singleton interface to the pointer mapper to implement - * the generic malloc/free C interface without extra - * parameters. - */ -inline PointerMapper &getPointerMapper() { - static PointerMapper thePointerMapper; - return thePointerMapper; -} - -/** - * Malloc-like interface to the pointer-mapper. - * Given a size, creates a byte-typed buffer and returns a - * fake pointer to keep track of it. - */ -inline void *malloc(size_t size) { - // Create a generic buffer of the given size - auto thePointer = getPointerMapper().add_pointer( - PointerMapper::buffer_t(cl::sycl::range<1>{size})); - // Store the buffer on the global list - return static_cast(thePointer); -} - -/** - * Free-like interface to the pointer mapper. - * Given a fake-pointer created with the legacy-pointer malloc, - * destroys the buffer and remove it from the list. - */ -inline void free(void *ptr) { getPointerMapper().remove_pointer(ptr); } - -/** - *clear the pointer list - */ -inline void clear() { - getPointerMapper().clear(); -} - -} // legacy -} // codeplay From bf44fed9b7f8f2716fd433d4121d0a91d0d84883 Mon Sep 17 00:00:00 2001 From: Luke Iwanski Date: Mon, 23 Jan 2017 15:56:45 +0000 Subject: [PATCH 11/54] Allows AMD APU --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 722a5d894..b70b2ff79 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -59,7 +59,7 @@ EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device /// get_devices returns all the available opencl devices. 
Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU ) auto s= (*it).template get_info(); std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if((*it).is_cpu() && s.find("amd")!=std::string::npos){ // remove amd cpu as it is not supported by computecpp + if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs it=devices.erase(it); } else{ From 82ce92419e25d8b9902c0f39e2e3b01787bf8687 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Mon, 30 Jan 2017 11:38:20 +0000 Subject: [PATCH 12/54] Fixing the buffer type in memcpy. --- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index b70b2ff79..9858d0560 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -287,7 +287,7 @@ struct SyclDevice { /// the memcpy function template EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - auto it1 = m_queue_stream->find_buffer((void*)src); + auto it1 = m_queue_stream->find_buffer(static_cast(src)); auto it2 = m_queue_stream->find_buffer(dst); auto offset= (static_cast(static_cast(src))) - it1->first; auto i= (static_cast(dst)) - it2->first; @@ -297,7 +297,7 @@ struct SyclDevice { parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); sycl_queue().submit([&](cl::sycl::handler &cgh) { auto src_acc =it1->second.template get_access(cgh); - auto dst_acc =it2->second.template get_access(cgh); + auto dst_acc =it2->second.template get_access(cgh); typedef decltype(src_acc) read_accessor; typedef decltype(dst_acc) write_accessor; cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, i, offset)); From c86911ac7358058aad4366a69de2db1aa5c44c49 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 30 Jan 2017 13:38:24 +0100 Subject: [PATCH 13/54] bug #1384: fix evaluation of "sparse/scalar" that used the wrong evaluation path. 
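The fix above routes "sparse / scalar" through the sparse evaluation path, so only the stored coefficients are divided and the sparsity pattern survives, which is what the new nonZeros() checks added to sparse_basic assert. A small usage sketch with illustrative values:

#include <Eigen/SparseCore>
#include <iostream>

int main() {
  Eigen::SparseMatrix<double> m(3, 3);
  m.insert(0, 0) = 2.0;
  m.insert(2, 1) = 4.0;
  m.makeCompressed();

  // Dividing a sparse matrix by a scalar keeps the sparsity pattern intact.
  Eigen::SparseMatrix<double> q = m / 2.0;

  std::cout << "nonzeros before: " << m.nonZeros()
            << ", after: " << q.nonZeros() << "\n";          // both 2
  std::cout << q.coeff(0, 0) << " " << q.coeff(2, 1) << "\n"; // 1 2
  return 0;
}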
--- Eigen/src/SparseCore/SparseCwiseBinaryOp.h | 10 ++++++++++ test/sparse_basic.cpp | 19 ++++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h index 145a7389e..c41c07af1 100644 --- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h @@ -357,6 +357,16 @@ struct binary_evaluator, Lhs, Rhs>, Itera explicit binary_evaluator(const XprType& xpr) : Base(xpr) {} }; +// "sparse ./ dense" +template +struct binary_evaluator, Lhs, Rhs>, IteratorBased, IndexBased> + : sparse_conjunction_evaluator, Lhs, Rhs> > +{ + typedef CwiseBinaryOp, Lhs, Rhs> XprType; + typedef sparse_conjunction_evaluator Base; + explicit binary_evaluator(const XprType& xpr) : Base(xpr) {} +}; + // "sparse && sparse" template struct binary_evaluator, IteratorBased, IteratorBased> diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index 91b7cb335..384985028 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp @@ -161,17 +161,21 @@ template void sparse_basic(const SparseMatrixType& re if(internal::random()) m1.makeCompressed(); + Index m1_nnz = m1.nonZeros(); + VERIFY_IS_APPROX(m1*s1, refM1*s1); VERIFY_IS_APPROX(m1+m2, refM1+refM2); VERIFY_IS_APPROX(m1+m2+m3, refM1+refM2+refM3); VERIFY_IS_APPROX(m3.cwiseProduct(m1+m2), refM3.cwiseProduct(refM1+refM2)); VERIFY_IS_APPROX(m1*s1-m2, refM1*s1-refM2); + VERIFY_IS_APPROX(m4=m1/s1, refM1/s1); + VERIFY_IS_EQUAL(m4.nonZeros(), m1_nnz); if(SparseMatrixType::IsRowMajor) VERIFY_IS_APPROX(m1.innerVector(0).dot(refM2.row(0)), refM1.row(0).dot(refM2.row(0))); else VERIFY_IS_APPROX(m1.innerVector(0).dot(refM2.col(0)), refM1.col(0).dot(refM2.col(0))); - + DenseVector rv = DenseVector::Random(m1.cols()); DenseVector cv = DenseVector::Random(m1.rows()); Index r = internal::random(0,m1.rows()-2); @@ -208,8 +212,12 @@ template void sparse_basic(const SparseMatrixType& re VERIFY_IS_APPROX(m1.sum(), refM1.sum()); + m4 = m1; refM4 = m4; + VERIFY_IS_APPROX(m1*=s1, refM1*=s1); + VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz); VERIFY_IS_APPROX(m1/=s1, refM1/=s1); + VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz); VERIFY_IS_APPROX(m1+=m2, refM1+=refM2); VERIFY_IS_APPROX(m1-=m2, refM1-=refM2); @@ -220,13 +228,22 @@ template void sparse_basic(const SparseMatrixType& re VERIFY_RAISES_ASSERT( m1 -= m1.innerVector(0) ); VERIFY_RAISES_ASSERT( refM1 -= m1.innerVector(0) ); VERIFY_RAISES_ASSERT( refM1 += m1.innerVector(0) ); + m1 = m4; refM1 = refM4; } // test aliasing VERIFY_IS_APPROX((m1 = -m1), (refM1 = -refM1)); + VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz); + m1 = m4; refM1 = refM4; VERIFY_IS_APPROX((m1 = m1.transpose()), (refM1 = refM1.transpose().eval())); + VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz); + m1 = m4; refM1 = refM4; VERIFY_IS_APPROX((m1 = -m1.transpose()), (refM1 = -refM1.transpose().eval())); + VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz); + m1 = m4; refM1 = refM4; VERIFY_IS_APPROX((m1 += -m1), (refM1 += -refM1)); + VERIFY_IS_EQUAL(m1.nonZeros(), m1_nnz); + m1 = m4; refM1 = refM4; if(m1.isCompressed()) { From 63de19c0004933c7b2b1e418292b9f2ae6c138f4 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 30 Jan 2017 13:55:27 +0100 Subject: [PATCH 14/54] bug #1380: fix matrix exponential with Map<> --- .../Eigen/src/MatrixFunctions/MatrixExponential.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h index 
4bb1852b6..9ad2b9cc8 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h @@ -204,7 +204,8 @@ struct matrix_exp_computeUV template struct matrix_exp_computeUV { - static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings) + template + static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings) { using std::frexp; using std::pow; @@ -227,7 +228,8 @@ struct matrix_exp_computeUV template struct matrix_exp_computeUV { - static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings) + template + static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings) { using std::frexp; using std::pow; @@ -254,7 +256,8 @@ struct matrix_exp_computeUV template struct matrix_exp_computeUV { - static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings) + template + static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings) { #if LDBL_MANT_DIG == 53 // double precision matrix_exp_computeUV::run(arg, U, V, squarings); @@ -351,11 +354,11 @@ void matrix_exp_compute(const MatrixType& arg, ResultType &result) return; } #endif - MatrixType U, V; + typename MatrixType::PlainObject U, V; int squarings; matrix_exp_computeUV::run(arg, U, V, squarings); // Pade approximant is (U+V) / (-U+V) - MatrixType numer = U + V; - MatrixType denom = -U + V; + typename MatrixType::PlainObject numer = U + V; + typename MatrixType::PlainObject denom = -U + V; result = denom.partialPivLu().solve(numer); for (int i=0; i Date: Tue, 31 Jan 2017 14:22:42 +0100 Subject: [PATCH 15/54] bug #478: fix regression in the eigen decomposition of zero matrices. --- Eigen/src/Eigenvalues/ComplexEigenSolver.h | 6 ++++-- Eigen/src/Eigenvalues/RealSchur.h | 12 ++++++++++++ test/eigensolver_complex.cpp | 9 +++++++++ test/eigensolver_generic.cpp | 9 +++++++++ test/eigensolver_selfadjoint.cpp | 9 +++++++++ 5 files changed, 43 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/Eigen/src/Eigenvalues/ComplexEigenSolver.h index ec3b1633e..dc5fae06a 100644 --- a/Eigen/src/Eigenvalues/ComplexEigenSolver.h +++ b/Eigen/src/Eigenvalues/ComplexEigenSolver.h @@ -250,7 +250,7 @@ template class ComplexEigenSolver EigenvectorType m_matX; private: - void doComputeEigenvectors(const RealScalar& matrixnorm); + void doComputeEigenvectors(RealScalar matrixnorm); void sortEigenvalues(bool computeEigenvectors); }; @@ -284,10 +284,12 @@ ComplexEigenSolver::compute(const EigenBase& matrix, bool template -void ComplexEigenSolver::doComputeEigenvectors(const RealScalar& matrixnorm) +void ComplexEigenSolver::doComputeEigenvectors(RealScalar matrixnorm) { const Index n = m_eivalues.size(); + matrixnorm = numext::maxi(matrixnorm,(std::numeric_limits::min)()); + // Compute X such that T = X D X^(-1), where D is the diagonal of T. // The matrix X is unit triangular. 
m_matX = EigenvectorType::Zero(n, n); diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h index d6a339f07..f5c86041d 100644 --- a/Eigen/src/Eigenvalues/RealSchur.h +++ b/Eigen/src/Eigenvalues/RealSchur.h @@ -248,12 +248,24 @@ template template RealSchur& RealSchur::compute(const EigenBase& matrix, bool computeU) { + const Scalar considerAsZero = (std::numeric_limits::min)(); + eigen_assert(matrix.cols() == matrix.rows()); Index maxIters = m_maxIters; if (maxIters == -1) maxIters = m_maxIterationsPerRow * matrix.rows(); Scalar scale = matrix.derived().cwiseAbs().maxCoeff(); + if(scale void eigensolver(const MatrixType& m) ComplexEigenSolver eig(a.adjoint() * a); eig.compute(a.adjoint() * a); } + + // regression test for bug 478 + { + a.setZero(); + ComplexEigenSolver ei3(a); + VERIFY_IS_EQUAL(ei3.info(), Success); + VERIFY_IS_MUCH_SMALLER_THAN(ei3.eigenvalues().norm(),RealScalar(1)); + VERIFY((ei3.eigenvectors().transpose()*ei3.eigenvectors().transpose()).eval().isIdentity()); + } } template void eigensolver_verify_assert(const MatrixType& m) diff --git a/test/eigensolver_generic.cpp b/test/eigensolver_generic.cpp index e18fbf687..d0e644d4b 100644 --- a/test/eigensolver_generic.cpp +++ b/test/eigensolver_generic.cpp @@ -76,6 +76,15 @@ template void eigensolver(const MatrixType& m) EigenSolver eig(a.adjoint() * a); eig.compute(a.adjoint() * a); } + + // regression test for bug 478 + { + a.setZero(); + EigenSolver ei3(a); + VERIFY_IS_EQUAL(ei3.info(), Success); + VERIFY_IS_MUCH_SMALLER_THAN(ei3.eigenvalues().norm(),RealScalar(1)); + VERIFY((ei3.eigenvectors().transpose()*ei3.eigenvectors().transpose()).eval().isIdentity()); + } } template void eigensolver_verify_assert(const MatrixType& m) diff --git a/test/eigensolver_selfadjoint.cpp b/test/eigensolver_selfadjoint.cpp index 4ed126116..39ad4130e 100644 --- a/test/eigensolver_selfadjoint.cpp +++ b/test/eigensolver_selfadjoint.cpp @@ -180,6 +180,15 @@ template void selfadjointeigensolver(const MatrixType& m) SelfAdjointEigenSolver eig(a.adjoint() * a); eig.compute(a.adjoint() * a); } + + // regression test for bug 478 + { + a.setZero(); + SelfAdjointEigenSolver ei3(a); + VERIFY_IS_EQUAL(ei3.info(), Success); + VERIFY_IS_MUCH_SMALLER_THAN(ei3.eigenvalues().norm(),RealScalar(1)); + VERIFY((ei3.eigenvectors().transpose()*ei3.eigenvectors().transpose()).eval().isIdentity()); + } } template From 48a20b7d956433713a39e04d39cba443b7a763de Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Tue, 31 Jan 2017 14:06:36 +0000 Subject: [PATCH 16/54] Fixing compiler error on TensorContractionSycl.h; Silencing the compiler unused parameter warning for eval_op_indices in TensorContraction.h --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 828db6d8b..71f086426 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -682,6 +682,9 @@ protected: } m_can_use_xsmm = true; + #else + // silence the compiler warning + (void) eval_op_indices; #endif } @@ -842,10 +845,6 @@ protected: TensorEvaluator m_rightImpl; const Device& m_device; Scalar* m_result; - - /// required for sycl - const Indices m_expr_indices; - bool m_can_use_xsmm; }; From 645a8e32a556f2dff312c7c31d3622709d4960ad Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 31 Jan 2017 16:22:54 +0100 Subject: 
[PATCH 17/54] Fix compilation of JacobiSVD for vectors type --- Eigen/src/SVD/JacobiSVD.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index e0cfb6283..1337ae987 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -200,10 +200,12 @@ public: ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - Options = MatrixType::Options + TrOptions = RowsAtCompileTime==1 ? (MatrixType::Options & ~(RowMajor)) + : ColsAtCompileTime==1 ? (MatrixType::Options | RowMajor) + : MatrixType::Options }; - typedef Matrix + typedef Matrix TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) From bab29936a1cf0a68ffe4ccb1fd9b4807a3ec87ae Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 1 Feb 2017 15:29:53 +0000 Subject: [PATCH 18/54] Reducing warnings in Sycl backend. --- cmake/FindComputeCpp.cmake | 2 +- .../CXX11/src/Tensor/TensorContractionSycl.h | 113 +++++----- .../CXX11/src/Tensor/TensorConvolutionSycl.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 18 +- .../Eigen/CXX11/src/Tensor/TensorSyclRun.h | 5 +- .../test/cxx11_tensor_broadcast_sycl.cpp | 5 - .../test/cxx11_tensor_builtins_sycl.cpp | 169 +++++++-------- .../test/cxx11_tensor_concatenation_sycl.cpp | 106 +++++----- .../test/cxx11_tensor_contract_sycl.cpp | 198 +++++++++--------- .../test/cxx11_tensor_convolution_sycl.cpp | 84 ++++---- unsupported/test/cxx11_tensor_device_sycl.cpp | 28 +-- .../test/cxx11_tensor_forced_eval_sycl.cpp | 34 +-- .../test/cxx11_tensor_morphing_sycl.cpp | 119 +++++------ .../test/cxx11_tensor_padding_sycl.cpp | 22 +- .../test/cxx11_tensor_reduction_sycl.cpp | 88 ++++---- .../test/cxx11_tensor_reverse_sycl.cpp | 110 +++++----- .../test/cxx11_tensor_shuffling_sycl.cpp | 49 ++--- .../test/cxx11_tensor_striding_sycl.cpp | 42 ++-- unsupported/test/cxx11_tensor_sycl.cpp | 144 ++++++------- 19 files changed, 666 insertions(+), 672 deletions(-) diff --git a/cmake/FindComputeCpp.cmake b/cmake/FindComputeCpp.cmake index 07ebed61b..27e5c9b1f 100644 --- a/cmake/FindComputeCpp.cmake +++ b/cmake/FindComputeCpp.cmake @@ -138,7 +138,7 @@ else() message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}") endif() -set(COMPUTECPP_DEVICE_COMPILER_FLAGS ${COMPUTECPP_DEVICE_COMPILER_FLAGS} -sycl-compress-name -no-serial-memop -DEIGEN_NO_ASSERTION_CHECKING=1) +set(COMPUTECPP_DEVICE_COMPILER_FLAGS ${COMPUTECPP_DEVICE_COMPILER_FLAGS} -sycl-compress-name -Wall -no-serial-memop -DEIGEN_NO_ASSERTION_CHECKING=1) # Check if the platform is supported execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index dc16f89e0..e87de0c57 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -22,7 +22,7 @@ #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H namespace Eigen { -template struct LaunchSyclKernels; +template struct LaunchSyclKernels; template struct TensorEvaluator, const Eigen::SyclDevice> : public TensorContractionEvaluatorBase, const Eigen::SyclDevice> > { @@ -146,7 +146,7 @@ struct TensorEvaluatorm_device.memset(buffer, 0, m * n * sizeof(Scalar)); - LaunchSyclKernels::Run(*this, buffer, m, n, k, + LaunchSyclKernels::Run(*this, buffer, m, n, k, this->m_k_strides, 
this->m_left_contracting_strides, this->m_right_contracting_strides, this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides); } @@ -162,8 +162,8 @@ struct TensorEvaluator struct KernelConstructor{ +typename HostExpr::Index TileSizeDimM, typename HostExpr::Index TileSizeDimN,typename HostExpr::Index TileSizeDimK, typename HostExpr::Index WorkLoadPerThreadM,typename HostExpr::Index WorkLoadPerThreadN, +typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadSizeN, typename HostExpr::Index LoadPerThreadLhs, typename HostExpr::Index LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{ typedef typename Eigen::internal::traits::_LhsNested LHSHostExpr; typedef typename Eigen::internal::traits::_RhsNested RHSHostExpr; typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression::Type LHSPlaceHolderExpr; @@ -224,84 +224,83 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res); // Matmul Kernel // Thread identifiers - const int mLocalThreadId = itemID.get_local(0); // Local ID row - const int nLocalThreadId = itemID.get_local(1); // Local ID col - const int mGroupId = itemID.get_group(0); // Work-group ID row - const int nGroupId = itemID.get_group(1); // Work-group ID localCol - const int linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID + const Index mLocalThreadId = itemID.get_local(0); // Local ID row + const Index nLocalThreadId = itemID.get_local(1); // Local ID col + const Index mGroupId = itemID.get_group(0); // Work-group ID row + const Index nGroupId = itemID.get_group(1); // Work-group ID localCol + const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID // Allocate register space float privateLhs; float privateRhs[WorkLoadPerThreadN]; float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; // Initialise the privateResumulation registers - for (int wLPTM=0; wLPTM(0); } // Tile Rhs - for (int lPTR=0; lPTR(0); } // Loop over all tiles - const int numTiles = roundUpK/TileSizeDimK; - int firstHalf=0; + const Index numTiles = roundUpK/TileSizeDimK; + Index firstHalf=0; do { // Synchronise itemID.barrier(cl::sycl::access::fence_space::local_space); // Load the next tile of Lhs and Rhs into local memory - int nextHalf = firstHalf + 1; + Index nextHalf = firstHalf + 1; if (nextHalf < numTiles) { // Tile A - for (int lPTL=0; lPTL(0); } // Tile B - for (int lPTR=0; lPTR(0); } } // Loop over the values of a single tile - for (int k=0; k struct LaunchSyclKernels { +template struct LaunchSyclKernels { -static const int TileSizeDimM = 32; // Tile size for dimension M -static const int TileSizeDimN = 32; // Tile size for dimension N -static const int TileSizeDimK = 16; // Tile size for dimension K -static const int WorkLoadPerThreadM = 4; // Work load per thread in dimension M -static const int WorkLoadPerThreadN = 4; // work load per thread in dimension N -static const int LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here) -static const int LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here) -static const int LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression -static const int 
LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression +static const Index TileSizeDimM = 32ul; // Tile size for dimension M +static const Index TileSizeDimN = 32ul; // Tile size for dimension N +static const Index TileSizeDimK = 16ul; // Tile size for dimension K +static const Index WorkLoadPerThreadM = 4ul; // Work load per thread in dimension M +static const Index WorkLoadPerThreadN = 4ul; // work load per thread in dimension N +static const Index LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here) +static const Index LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here) +static const Index LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression +static const Index LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression // RoundUp function to make sure that the global threadId is divisable by local threadId -static int RoundUp(int x, int y) { +static Index RoundUp(Index x, Index y) { return ((((x) + (y) - 1) / (y))*(y)); } -template< typename Self, typename OutScalar, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT> +template< typename Self, typename OutScalar, typename ContractT, typename LeftNocontractT, typename RightNocontractT> static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K, ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides, LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){ diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index e2569e1bf..c3e095b8a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -352,7 +352,7 @@ struct TensorEvaluator(GRange_x, GRange_y); // global range auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); - const array indices{m_indices[0]}; + const array indices{{m_indices[0]}}; const array kernel_dims{{m_kernelImpl.dimensions()[0]}}; internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range), diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 9858d0560..e209799bb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -194,7 +194,7 @@ struct SyclDevice { auto s= sycl_queue().get_device().template get_info(); std::transform(s.begin(), s.end(), s.begin(), ::tolower); if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - tileSize=std::min(static_cast(256), static_cast(tileSize)); + tileSize=std::min(static_cast(256), static_cast(tileSize)); } rng = n; if (rng==0) rng=static_cast(1); @@ -211,10 +211,10 @@ struct SyclDevice { EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) 
const { Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); + max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); } - size_t pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); + Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); + tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); rng1=dim1; if (rng1==0 ) rng1=static_cast(1); GRange1=rng1; @@ -241,10 +241,10 @@ struct SyclDevice { EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); + max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); } - size_t pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize2 =static_cast(std::pow(2, static_cast(pow_of_2/3))); + Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); + tileSize2 =static_cast(std::pow(2, static_cast(pow_of_2/3))); rng2=dim2; if (rng2==0 ) rng1=static_cast(1); GRange2=rng2; @@ -253,8 +253,8 @@ struct SyclDevice { Index xMode = static_cast(GRange2 % tileSize2); if (xMode != 0) GRange2 += static_cast(tileSize2 - xMode); } - pow_of_2 = static_cast(std::log2(static_cast(max_workgroup_Size/tileSize2))); - tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); + pow_of_2 = static_cast(std::log2(static_cast(max_workgroup_Size/tileSize2))); + tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); rng1=dim1; if (rng1==0 ) rng1=static_cast(1); GRange1=rng1; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h index 94692be56..cac785540 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h @@ -50,10 +50,9 @@ template struct ExecEx /// creates the expression tree for the device with accessor to buffers; /// construct the kernel and submit it to the sycl queue. /// std::array does not have TotalSize. So I have to get the size through template specialisation. 
-template struct DimensionSize{ - static Index getDimSize(const Dimensions& dim){ +template struct DimensionSize{ + static auto getDimSize(const Dimensions& dim)->decltype(dim.TotalSize()){ return dim.TotalSize(); - } }; #define DIMSIZEMACRO(CVQual)\ diff --git a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp index c426549f1..21fdfca22 100644 --- a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp +++ b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp @@ -131,11 +131,6 @@ template void sycl_broadcast_test_per_device(const cl::sycl:: std::cout << "Running on " << d.template get_info() << std::endl; QueueInterface queueInterface(d); auto sycl_device = Eigen::SyclDevice(&queueInterface); - - test_broadcast_sycl_fixed(sycl_device); - test_broadcast_sycl(sycl_device); - test_broadcast_sycl_fixed(sycl_device); - test_broadcast_sycl(sycl_device); test_broadcast_sycl(sycl_device); test_broadcast_sycl(sycl_device); test_broadcast_sycl_fixed(sycl_device); diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp index d5193d1ea..400a31d09 100644 --- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp +++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp @@ -14,7 +14,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_builtins_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" @@ -32,20 +32,20 @@ template T cube(T x) { return x * x * x; } template T inverse(T x) { return 1 / x; } } -#define TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR) \ +#define TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR, Layout) \ { \ /* out OPERATOR in.FUNC() */ \ - Tensor in(tensorRange); \ - Tensor out(tensorRange); \ + Tensor in(tensorRange); \ + Tensor out(tensorRange); \ in = in.random() + static_cast(0.01); \ out = out.random() + static_cast(0.01); \ - Tensor reference(out); \ + Tensor reference(out); \ SCALAR *gpu_data = static_cast( \ sycl_device.allocate(in.size() * sizeof(SCALAR))); \ SCALAR *gpu_data_out = static_cast( \ sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap> gpu(gpu_data, tensorRange); \ - TensorMap> gpu_out(gpu_data_out, tensorRange); \ + TensorMap> gpu(gpu_data, tensorRange); \ + TensorMap> gpu_out(gpu_data_out, tensorRange); \ sycl_device.memcpyHostToDevice(gpu_data, in.data(), \ (in.size()) * sizeof(SCALAR)); \ sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \ @@ -53,7 +53,7 @@ template T inverse(T x) { return 1 / x; } gpu_out.device(sycl_device) OPERATOR gpu.FUNC(); \ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ (out.size()) * sizeof(SCALAR)); \ - for (int i = 0; i < out.size(); ++i) { \ + for (int64_t i = 0; i < out.size(); ++i) { \ SCALAR ver = reference(i); \ ver OPERATOR std::FUNC(in(i)); \ VERIFY_IS_APPROX(out(i), ver); \ @@ -63,18 +63,18 @@ template T inverse(T x) { return 1 / x; } } \ { \ /* out OPERATOR out.FUNC() */ \ - Tensor out(tensorRange); \ + Tensor out(tensorRange); \ out = out.random() + static_cast(0.01); \ - Tensor reference(out); \ + Tensor reference(out); \ SCALAR *gpu_data_out = static_cast( \ sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap> gpu_out(gpu_data_out, tensorRange); \ + TensorMap> gpu_out(gpu_data_out, tensorRange); \ sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \ (out.size()) * sizeof(SCALAR)); \ gpu_out.device(sycl_device) OPERATOR 
gpu_out.FUNC(); \ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ (out.size()) * sizeof(SCALAR)); \ - for (int i = 0; i < out.size(); ++i) { \ + for (int64_t i = 0; i < out.size(); ++i) { \ SCALAR ver = reference(i); \ ver OPERATOR std::FUNC(reference(i)); \ VERIFY_IS_APPROX(out(i), ver); \ @@ -82,61 +82,62 @@ template T inverse(T x) { return 1 / x; } sycl_device.deallocate(gpu_data_out); \ } -#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR) +#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR , Layout) \ + TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR , Layout) -#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC) \ +#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC, Layout) \ { \ /* out = in.FUNC() */ \ - Tensor in(tensorRange); \ - Tensor out(tensorRange); \ + Tensor in(tensorRange); \ + Tensor out(tensorRange); \ in = in.random() + static_cast(0.01); \ SCALAR *gpu_data = static_cast( \ sycl_device.allocate(in.size() * sizeof(SCALAR))); \ bool *gpu_data_out = \ static_cast(sycl_device.allocate(out.size() * sizeof(bool))); \ - TensorMap> gpu(gpu_data, tensorRange); \ - TensorMap> gpu_out(gpu_data_out, tensorRange); \ + TensorMap> gpu(gpu_data, tensorRange); \ + TensorMap> gpu_out(gpu_data_out, tensorRange); \ sycl_device.memcpyHostToDevice(gpu_data, in.data(), \ (in.size()) * sizeof(SCALAR)); \ gpu_out.device(sycl_device) = gpu.FUNC(); \ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ (out.size()) * sizeof(bool)); \ - for (int i = 0; i < out.size(); ++i) { \ + for (int64_t i = 0; i < out.size(); ++i) { \ VERIFY_IS_EQUAL(out(i), std::FUNC(in(i))); \ } \ sycl_device.deallocate(gpu_data); \ sycl_device.deallocate(gpu_data_out); \ } -#define 
TEST_UNARY_BUILTINS(SCALAR) \ - TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=) \ - TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf) +#define TEST_UNARY_BUILTINS(SCALAR, Layout) \ + TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=, Layout) \ + TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =, Layout) \ + TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan, Layout) \ + TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite, Layout) \ + TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf, Layout) static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 10; - int sizeDim2 = 10; - int sizeDim3 = 10; - array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + int64_t sizeDim1 = 10; + int64_t sizeDim2 = 10; + int64_t sizeDim3 = 10; + array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - TEST_UNARY_BUILTINS(float) + TEST_UNARY_BUILTINS(float, RowMajor) + TEST_UNARY_BUILTINS(float, ColMajor) } namespace std { @@ -144,24 +145,24 @@ template T cwiseMax(T x, T y) { return std::max(x, y); } template T cwiseMin(T x, T y) { return std::min(x, y); } } -#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC) \ +#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC, Layout) \ { \ /* out = in_1.FUNC(in_2) */ \ - Tensor in_1(tensorRange); \ - Tensor in_2(tensorRange); \ - Tensor out(tensorRange); \ + Tensor in_1(tensorRange); \ + Tensor in_2(tensorRange); \ + Tensor out(tensorRange); \ in_1 = in_1.random() + static_cast(0.01); \ in_2 = in_2.random() + static_cast(0.01); \ - Tensor reference(out); \ + Tensor reference(out); \ SCALAR *gpu_data_1 = static_cast( \ sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ SCALAR *gpu_data_2 = static_cast( \ sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \ SCALAR *gpu_data_out = static_cast( \ sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap> gpu_1(gpu_data_1, tensorRange); \ - TensorMap> gpu_2(gpu_data_2, tensorRange); \ - TensorMap> gpu_out(gpu_data_out, tensorRange); \ + TensorMap> gpu_1(gpu_data_1, tensorRange); \ + TensorMap> gpu_2(gpu_data_2, tensorRange); \ + TensorMap> gpu_out(gpu_data_out, tensorRange); \ sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ (in_1.size()) * sizeof(SCALAR)); \ sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \ @@ -169,7 +170,7 @@ template T cwiseMin(T x, T y) { return std::min(x, y); } gpu_out.device(sycl_device) = gpu_1.FUNC(gpu_2); \ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ (out.size()) * sizeof(SCALAR)); \ - for (int i = 0; i < out.size(); ++i) { \ + for (int64_t i = 0; i < out.size(); ++i) { \ SCALAR ver = reference(i); \ ver = std::FUNC(in_1(i), in_2(i)); \ VERIFY_IS_APPROX(out(i), ver); \ @@ -179,24 +180,24 @@ template T cwiseMin(T x, T y) { return std::min(x, y); } sycl_device.deallocate(gpu_data_out); \ } -#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR) \ +#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR, Layout) \ { \ /* out = in_1 OPERATOR in_2 */ \ - Tensor in_1(tensorRange); \ - Tensor in_2(tensorRange); \ - Tensor out(tensorRange); \ + Tensor in_1(tensorRange); \ + Tensor in_2(tensorRange); \ + Tensor out(tensorRange); \ in_1 = in_1.random() + static_cast(0.01); \ in_2 = in_2.random() + static_cast(0.01); \ - Tensor reference(out); \ + Tensor reference(out); \ SCALAR *gpu_data_1 = static_cast( \ sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ SCALAR *gpu_data_2 = static_cast( \ sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \ SCALAR *gpu_data_out 
= static_cast( \ sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap> gpu_1(gpu_data_1, tensorRange); \ - TensorMap> gpu_2(gpu_data_2, tensorRange); \ - TensorMap> gpu_out(gpu_data_out, tensorRange); \ + TensorMap> gpu_1(gpu_data_1, tensorRange); \ + TensorMap> gpu_2(gpu_data_2, tensorRange); \ + TensorMap> gpu_out(gpu_data_out, tensorRange); \ sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ (in_1.size()) * sizeof(SCALAR)); \ sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \ @@ -204,7 +205,7 @@ template T cwiseMin(T x, T y) { return std::min(x, y); } gpu_out.device(sycl_device) = gpu_1 OPERATOR gpu_2; \ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ (out.size()) * sizeof(SCALAR)); \ - for (int i = 0; i < out.size(); ++i) { \ + for (int64_t i = 0; i < out.size(); ++i) { \ VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR in_2(i)); \ } \ sycl_device.deallocate(gpu_data_1); \ @@ -212,46 +213,48 @@ template T cwiseMin(T x, T y) { return std::min(x, y); } sycl_device.deallocate(gpu_data_out); \ } -#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR) \ +#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR, Layout) \ { \ /* out = in_1 OPERATOR 2 */ \ - Tensor in_1(tensorRange); \ - Tensor out(tensorRange); \ + Tensor in_1(tensorRange); \ + Tensor out(tensorRange); \ in_1 = in_1.random() + static_cast(0.01); \ - Tensor reference(out); \ + Tensor reference(out); \ SCALAR *gpu_data_1 = static_cast( \ sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ SCALAR *gpu_data_out = static_cast( \ sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap> gpu_1(gpu_data_1, tensorRange); \ - TensorMap> gpu_out(gpu_data_out, tensorRange); \ + TensorMap> gpu_1(gpu_data_1, tensorRange); \ + TensorMap> gpu_out(gpu_data_out, tensorRange); \ sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ (in_1.size()) * sizeof(SCALAR)); \ gpu_out.device(sycl_device) = gpu_1 OPERATOR 2; \ sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ (out.size()) * sizeof(SCALAR)); \ - for (int i = 0; i < out.size(); ++i) { \ + for (int64_t i = 0; i < out.size(); ++i) { \ VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR 2); \ } \ sycl_device.deallocate(gpu_data_1); \ sycl_device.deallocate(gpu_data_out); \ } -#define TEST_BINARY_BUILTINS(SCALAR) \ - TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax) \ - TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, +) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, -) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, *) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, /) +#define TEST_BINARY_BUILTINS(SCALAR, Layout) \ + TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax , Layout) \ + TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin , Layout) \ + TEST_BINARY_BUILTINS_OPERATORS(SCALAR, + , Layout) \ + TEST_BINARY_BUILTINS_OPERATORS(SCALAR, - , Layout) \ + TEST_BINARY_BUILTINS_OPERATORS(SCALAR, * , Layout) \ + TEST_BINARY_BUILTINS_OPERATORS(SCALAR, / , Layout) static void test_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 10; - int sizeDim2 = 10; - int sizeDim3 = 10; - array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - TEST_BINARY_BUILTINS(float) - TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %) + int64_t sizeDim1 = 10; + int64_t sizeDim2 = 10; + int64_t sizeDim3 = 10; + array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + TEST_BINARY_BUILTINS(float, RowMajor) + TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, RowMajor) + TEST_BINARY_BUILTINS(float, 
ColMajor) + TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, ColMajor) } void test_cxx11_tensor_builtins_sycl() { diff --git a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp index 5a324b44c..e3023a368 100644 --- a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp +++ b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp @@ -14,7 +14,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_concatenation_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" @@ -22,39 +22,39 @@ using Eigen::Tensor; -template +template static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) { - Index leftDim1 = 2; - Index leftDim2 = 3; - Index leftDim3 = 1; - Eigen::array leftRange = {{leftDim1, leftDim2, leftDim3}}; - Index rightDim1 = 2; - Index rightDim2 = 3; - Index rightDim3 = 1; - Eigen::array rightRange = {{rightDim1, rightDim2, rightDim3}}; + IndexType leftDim1 = 2; + IndexType leftDim2 = 3; + IndexType leftDim3 = 1; + Eigen::array leftRange = {{leftDim1, leftDim2, leftDim3}}; + IndexType rightDim1 = 2; + IndexType rightDim2 = 3; + IndexType rightDim3 = 1; + Eigen::array rightRange = {{rightDim1, rightDim2, rightDim3}}; - //Index concatDim1 = 3; -// Index concatDim2 = 3; -// Index concatDim3 = 1; - //Eigen::array concatRange = {{concatDim1, concatDim2, concatDim3}}; + //IndexType concatDim1 = 3; +// IndexType concatDim2 = 3; +// IndexType concatDim3 = 1; + //Eigen::array concatRange = {{concatDim1, concatDim2, concatDim3}}; - Tensor left(leftRange); - Tensor right(rightRange); + Tensor left(leftRange); + Tensor right(rightRange); left.setRandom(); right.setRandom(); DataType * gpu_in1_data = static_cast(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType))); DataType * gpu_in2_data = static_cast(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType))); - Eigen::TensorMap> gpu_in1(gpu_in1_data, leftRange); - Eigen::TensorMap> gpu_in2(gpu_in2_data, rightRange); + Eigen::TensorMap> gpu_in1(gpu_in1_data, leftRange); + Eigen::TensorMap> gpu_in2(gpu_in2_data, rightRange); sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType)); sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType)); /// - Tensor concatenation1(leftDim1+rightDim1, leftDim2, leftDim3); + Tensor concatenation1(leftDim1+rightDim1, leftDim2, leftDim3); DataType * gpu_out_data1 = static_cast(sycl_device.allocate(concatenation1.dimensions().TotalSize()*sizeof(DataType))); - Eigen::TensorMap> gpu_out1(gpu_out_data1, concatenation1.dimensions()); + Eigen::TensorMap> gpu_out1(gpu_out_data1, concatenation1.dimensions()); //concatenation = left.concatenate(right, 0); gpu_out1.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 0); @@ -63,19 +63,19 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) VERIFY_IS_EQUAL(concatenation1.dimension(0), 4); VERIFY_IS_EQUAL(concatenation1.dimension(1), 3); VERIFY_IS_EQUAL(concatenation1.dimension(2), 1); - for (int j = 0; j < 3; ++j) { - for (int i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType i = 0; i < 2; ++i) { VERIFY_IS_EQUAL(concatenation1(i, j, 0), left(i, j, 0)); } - for (int i = 2; i < 4; ++i) { + for (IndexType i = 2; i < 4; ++i) { VERIFY_IS_EQUAL(concatenation1(i, j, 0), right(i - 2, j, 0)); } } 
sycl_device.deallocate(gpu_out_data1); - Tensor concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3); + Tensor concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3); DataType * gpu_out_data2 = static_cast(sycl_device.allocate(concatenation2.dimensions().TotalSize()*sizeof(DataType))); - Eigen::TensorMap> gpu_out2(gpu_out_data2, concatenation2.dimensions()); + Eigen::TensorMap> gpu_out2(gpu_out_data2, concatenation2.dimensions()); gpu_out2.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 1); sycl_device.memcpyDeviceToHost(concatenation2.data(), gpu_out_data2,(concatenation2.dimensions().TotalSize())*sizeof(DataType)); @@ -83,18 +83,18 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) VERIFY_IS_EQUAL(concatenation2.dimension(0), 2); VERIFY_IS_EQUAL(concatenation2.dimension(1), 6); VERIFY_IS_EQUAL(concatenation2.dimension(2), 1); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { VERIFY_IS_EQUAL(concatenation2(i, j, 0), left(i, j, 0)); } - for (int j = 3; j < 6; ++j) { + for (IndexType j = 3; j < 6; ++j) { VERIFY_IS_EQUAL(concatenation2(i, j, 0), right(i, j - 3, 0)); } } sycl_device.deallocate(gpu_out_data2); - Tensor concatenation3(leftDim1, leftDim2, leftDim3+rightDim3); + Tensor concatenation3(leftDim1, leftDim2, leftDim3+rightDim3); DataType * gpu_out_data3 = static_cast(sycl_device.allocate(concatenation3.dimensions().TotalSize()*sizeof(DataType))); - Eigen::TensorMap> gpu_out3(gpu_out_data3, concatenation3.dimensions()); + Eigen::TensorMap> gpu_out3(gpu_out_data3, concatenation3.dimensions()); gpu_out3.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 2); sycl_device.memcpyDeviceToHost(concatenation3.data(), gpu_out_data3,(concatenation3.dimensions().TotalSize())*sizeof(DataType)); @@ -102,8 +102,8 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) VERIFY_IS_EQUAL(concatenation3.dimension(0), 2); VERIFY_IS_EQUAL(concatenation3.dimension(1), 3); VERIFY_IS_EQUAL(concatenation3.dimension(2), 2); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { VERIFY_IS_EQUAL(concatenation3(i, j, 0), left(i, j, 0)); VERIFY_IS_EQUAL(concatenation3(i, j, 1), right(i, j, 0)); } @@ -112,25 +112,25 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) sycl_device.deallocate(gpu_in1_data); sycl_device.deallocate(gpu_in2_data); } -template +template static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device) { - Index leftDim1 = 2; - Index leftDim2 = 3; - Eigen::array leftRange = {{leftDim1, leftDim2}}; + IndexType leftDim1 = 2; + IndexType leftDim2 = 3; + Eigen::array leftRange = {{leftDim1, leftDim2}}; - Index rightDim1 = 2; - Index rightDim2 = 3; - Eigen::array rightRange = {{rightDim1, rightDim2}}; + IndexType rightDim1 = 2; + IndexType rightDim2 = 3; + Eigen::array rightRange = {{rightDim1, rightDim2}}; - Index concatDim1 = 4; - Index concatDim2 = 3; - Eigen::array resRange = {{concatDim1, concatDim2}}; + IndexType concatDim1 = 4; + IndexType concatDim2 = 3; + Eigen::array resRange = {{concatDim1, concatDim2}}; - Tensor left(leftRange); - Tensor right(rightRange); - Tensor result(resRange); + Tensor left(leftRange); + Tensor right(rightRange); + Tensor result(resRange); left.setRandom(); right.setRandom(); @@ -141,9 +141,9 @@ static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device) DataType * gpu_out_data 
= static_cast(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType))); - Eigen::TensorMap> gpu_in1(gpu_in1_data, leftRange); - Eigen::TensorMap> gpu_in2(gpu_in2_data, rightRange); - Eigen::TensorMap> gpu_out(gpu_out_data, resRange); + Eigen::TensorMap> gpu_in1(gpu_in1_data, leftRange); + Eigen::TensorMap> gpu_in2(gpu_in2_data, rightRange); + Eigen::TensorMap> gpu_out(gpu_out_data, resRange); sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType)); sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType)); @@ -154,8 +154,8 @@ static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device) sycl_device.memcpyDeviceToHost(left.data(), gpu_in1_data,(left.dimensions().TotalSize())*sizeof(DataType)); sycl_device.memcpyDeviceToHost(right.data(), gpu_in2_data,(right.dimensions().TotalSize())*sizeof(DataType)); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { VERIFY_IS_EQUAL(left(i, j), result(i, j)); VERIFY_IS_EQUAL(right(i, j), result(i+2, j)); } @@ -169,9 +169,9 @@ static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device) template void tensorConcat_perDevice(Dev_selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_simple_concatenation(sycl_device); - test_simple_concatenation(sycl_device); - test_concatenation_as_lvalue(sycl_device); + test_simple_concatenation(sycl_device); + test_simple_concatenation(sycl_device); + test_concatenation_as_lvalue(sycl_device); } void test_cxx11_tensor_concatenation_sycl() { for (const auto& device :Eigen::get_sycl_supported_devices()) { diff --git a/unsupported/test/cxx11_tensor_contract_sycl.cpp b/unsupported/test/cxx11_tensor_contract_sycl.cpp index cb8fcb74c..41acd5579 100644 --- a/unsupported/test/cxx11_tensor_contract_sycl.cpp +++ b/unsupported/test/cxx11_tensor_contract_sycl.cpp @@ -14,7 +14,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_contract_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include @@ -28,39 +28,39 @@ using Eigen::array; using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -static const float error_threshold =1e-4f; -typedef Tensor::DimensionPair DimPair; -template -void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, int n_size) +template +void static test_sycl_contraction(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size) { + typedef typename Tensor::DimensionPair DimPair; + static const DataType error_threshold =1e-4f; // std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU - Tensor t_left(m_size, k_size); - Tensor t_right(k_size, n_size); - Tensor t_result(m_size, n_size); - Tensor t_result_gpu(m_size, n_size); + Tensor t_left(m_size, k_size); + Tensor t_right(k_size, n_size); + Tensor t_result(m_size, n_size); + Tensor t_result_gpu(m_size, n_size); // Eigen::array dims(DimPair(1, 0)); Eigen::array dims = {{DimPair(1, 0)}}; - Eigen::array left_dims = {{m_size, k_size}}; - Eigen::array right_dims = {{k_size, n_size}}; - Eigen::array result_dims 
= {{m_size, n_size}}; + Eigen::array left_dims = {{m_size, k_size}}; + Eigen::array right_dims = {{k_size, n_size}}; + Eigen::array result_dims = {{m_size, n_size}}; t_left.setRandom(); t_right.setRandom(); - std::size_t t_left_bytes = t_left.size() * sizeof(float); - std::size_t t_right_bytes = t_right.size() * sizeof(float); - std::size_t t_result_bytes = t_result.size() * sizeof(float); + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); - float * d_t_left = static_cast(sycl_device.allocate(t_left_bytes)); - float * d_t_right = static_cast(sycl_device.allocate(t_right_bytes)); - float * d_t_result = static_cast(sycl_device.allocate(t_result_bytes)); + DataType * d_t_left = static_cast(sycl_device.allocate(t_left_bytes)); + DataType * d_t_right = static_cast(sycl_device.allocate(t_right_bytes)); + DataType * d_t_result = static_cast(sycl_device.allocate(t_result_bytes)); - Eigen::TensorMap > gpu_t_left(d_t_left, left_dims); - Eigen::TensorMap > gpu_t_right(d_t_right, right_dims); - Eigen::TensorMap > gpu_t_result(d_t_result, result_dims); + Eigen::TensorMap > gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap > gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap > gpu_t_result(d_t_result, result_dims); sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); @@ -70,14 +70,14 @@ void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, in t_result = t_left.contract(t_right, dims); - for (DenseIndex i = 0; i < t_result.size(); i++) { - if (static_cast(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) { + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) { continue; } if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) { continue; } - std::cout << "mismatch detected at index " << i << ": " << t_result(i) + std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i) << " vs " << t_result_gpu(i) << std::endl; assert(false); } @@ -86,19 +86,21 @@ void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, in sycl_device.deallocate(d_t_result); } -template +template void test_TF(const Device& sycl_device) { - Eigen::array left_dims = {{2, 3}}; - Eigen::array right_dims = {{3, 1}}; - Eigen::array res_dims = {{2, 1}}; + typedef typename Tensor::DimensionPair DimPair; + static const DataType error_threshold =1e-4f; + Eigen::array left_dims = {{2, 3}}; + Eigen::array right_dims = {{3, 1}}; + Eigen::array res_dims = {{2, 1}}; Eigen::array dims = {{DimPair(1, 0)}}; - Tensor t_left(left_dims); - Tensor t_right(right_dims); - Tensor t_result_gpu(res_dims); - Tensor t_result(res_dims); + Tensor t_left(left_dims); + Tensor t_right(right_dims); + Tensor t_result_gpu(res_dims); + Tensor t_result(res_dims); t_left.data()[0] = 1.0f; t_left.data()[1] = 2.0f; @@ -111,18 +113,18 @@ void test_TF(const Device& sycl_device) t_right.data()[1] = 0.5f; t_right.data()[2] = 2.0f; - std::size_t t_left_bytes = t_left.size() * sizeof(float); - std::size_t t_right_bytes = t_right.size() * sizeof(float); - std::size_t t_result_bytes = t_result.size()*sizeof(float); + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = 
t_result.size()*sizeof(DataType); - float * d_t_left = static_cast(sycl_device.allocate(t_left_bytes)); - float * d_t_right = static_cast(sycl_device.allocate(t_right_bytes)); - float * d_t_result = static_cast(sycl_device.allocate(t_result_bytes)); + DataType * d_t_left = static_cast(sycl_device.allocate(t_left_bytes)); + DataType * d_t_right = static_cast(sycl_device.allocate(t_right_bytes)); + DataType * d_t_result = static_cast(sycl_device.allocate(t_result_bytes)); - Eigen::TensorMap > gpu_t_left(d_t_left, left_dims); - Eigen::TensorMap > gpu_t_right(d_t_right, right_dims); - Eigen::TensorMap > gpu_t_result(d_t_result, res_dims); + Eigen::TensorMap > gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap > gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap > gpu_t_result(d_t_result, res_dims); sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); @@ -132,14 +134,14 @@ void test_TF(const Device& sycl_device) t_result = t_left.contract(t_right, dims); - for (DenseIndex i = 0; i < t_result.size(); i++) { - if (static_cast(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) { + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) { continue; } if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) { continue; } - std::cout << "mismatch detected at index " << i << ": " << t_result(i) + std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i) << " vs " << t_result_gpu(i) << std::endl; assert(false); } @@ -150,35 +152,37 @@ void test_TF(const Device& sycl_device) } -template -void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size) +template +void test_scalar(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size) { //std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU - Tensor t_left(m_size, k_size); - Tensor t_right(k_size, n_size); - Tensor t_result; - Tensor t_result_gpu; + typedef typename Tensor::DimensionPair DimPair; + static const DataType error_threshold =1e-4f; + Tensor t_left(m_size, k_size); + Tensor t_right(k_size, n_size); + Tensor t_result; + Tensor t_result_gpu; Eigen::array dims = {{DimPair(0, 0), DimPair(1, 1)}}; - Eigen::array left_dims = {{m_size, k_size}}; - Eigen::array right_dims = {{k_size, n_size}}; + Eigen::array left_dims = {{m_size, k_size}}; + Eigen::array right_dims = {{k_size, n_size}}; t_left.setRandom(); t_right.setRandom(); - std::size_t t_left_bytes = t_left.size() * sizeof(float); - std::size_t t_right_bytes = t_right.size() * sizeof(float); - std::size_t t_result_bytes = sizeof(float); + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = sizeof(DataType); - float * d_t_left = static_cast(sycl_device.allocate(t_left_bytes)); - float * d_t_right = static_cast(sycl_device.allocate(t_right_bytes)); - float * d_t_result = static_cast(sycl_device.allocate(t_result_bytes)); + DataType * d_t_left = static_cast(sycl_device.allocate(t_left_bytes)); + DataType * d_t_right = static_cast(sycl_device.allocate(t_right_bytes)); + DataType * d_t_result = static_cast(sycl_device.allocate(t_result_bytes)); - 
Eigen::TensorMap > gpu_t_left(d_t_left, left_dims); - Eigen::TensorMap > gpu_t_right(d_t_right, right_dims); - Eigen::TensorMap > gpu_t_result(d_t_result); + Eigen::TensorMap > gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap > gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap > gpu_t_result(d_t_result); sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); @@ -188,7 +192,7 @@ void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size) t_result = t_left.contract(t_right, dims); - if (static_cast(fabs(t_result() - t_result_gpu())) > error_threshold && + if (static_cast(fabs(t_result() - t_result_gpu())) > error_threshold && !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) { std::cout << "mismatch detected: " << t_result() << " vs " << t_result_gpu() << std::endl; @@ -201,47 +205,47 @@ void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size) } -template +template void test_sycl_contraction_m(const Device& sycl_device) { - for (int k = 32; k < 256; k++) { - test_sycl_contraction(sycl_device, k, 128, 128); + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction(sycl_device, k, 128, 128); } } -template +template void test_sycl_contraction_k(const Device& sycl_device) { - for (int k = 32; k < 256; k++) { - test_sycl_contraction(sycl_device, 128, k, 128); + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction(sycl_device, 128, k, 128); } } -template +template void test_sycl_contraction_n(const Device& sycl_device) { - for (int k = 32; k < 256; k++) { - test_sycl_contraction(sycl_device, 128, 128, k); + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction(sycl_device, 128, 128, k); } } -template +template void test_sycl_contraction_sizes(const Device& sycl_device) { - int m_sizes[] = { 31, 39, 63, 64, 65, + IndexType m_sizes[] = { 31, 39, 63, 64, 65, 127, 129, 255, 257 , 511, 512, 513, 1023, 1024, 1025}; - int n_sizes[] = { 31, 39, 63, 64, 65, + IndexType n_sizes[] = { 31, 39, 63, 64, 65, 127, 129, 255, 257, 511, 512, 513, 1023, 1024, 1025}; - int k_sizes[] = { 31, 39, 63, 64, 65, + IndexType k_sizes[] = { 31, 39, 63, 64, 65, 95, 96, 127, 129, 255, 257, 511, 512, 513, 1023, 1024, 1025}; - for (int i = 0; i < 15; i++) { - for (int j = 0; j < 15; j++) { - for (int k = 0; k < 17; k++) { - test_sycl_contraction(sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]); + for (IndexType i = 0; i < 15; i++) { + for (IndexType j = 0; j < 15; j++) { + for (IndexType k = 0; k < 17; k++) { + test_sycl_contraction(sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]); } } } @@ -250,26 +254,26 @@ void test_sycl_contraction_sizes(const Device& sycl_device) { template void tensorContractionPerDevice(Dev_selector& s){ QueueInterface queueInterface(s); auto sycl_device=Eigen::SyclDevice(&queueInterface); - test_sycl_contraction(sycl_device, 32, 32, 32); - test_sycl_contraction(sycl_device, 32, 32, 32); - test_scalar(sycl_device, 32, 32, 32); - test_scalar(sycl_device, 32, 32, 32); + test_sycl_contraction(sycl_device, 32, 32, 32); + test_sycl_contraction(sycl_device, 32, 32, 32); + test_scalar(sycl_device, 32, 32, 32); + test_scalar(sycl_device, 32, 32, 32); std::chrono::time_point start, end; start = std::chrono::system_clock::now(); - test_sycl_contraction(sycl_device, 128, 128, 128); - test_sycl_contraction(sycl_device, 128, 128, 128); - test_scalar(sycl_device, 128, 128, 128); - test_scalar(sycl_device, 128, 128, 128); - 
test_sycl_contraction_m(sycl_device); - test_sycl_contraction_m(sycl_device); - test_sycl_contraction_n(sycl_device); - test_sycl_contraction_n(sycl_device); - test_sycl_contraction_k(sycl_device); - test_sycl_contraction_k(sycl_device); - test_sycl_contraction_sizes(sycl_device); - test_sycl_contraction_sizes(sycl_device); - test_TF(sycl_device); - test_TF(sycl_device); + test_sycl_contraction(sycl_device, 128, 128, 128); + test_sycl_contraction(sycl_device, 128, 128, 128); + test_scalar(sycl_device, 128, 128, 128); + test_scalar(sycl_device, 128, 128, 128); + test_sycl_contraction_m(sycl_device); + test_sycl_contraction_m(sycl_device); + test_sycl_contraction_n(sycl_device); + test_sycl_contraction_n(sycl_device); + test_sycl_contraction_k(sycl_device); + test_sycl_contraction_k(sycl_device); + test_sycl_contraction_sizes(sycl_device); + test_sycl_contraction_sizes(sycl_device); + test_TF(sycl_device); + test_TF(sycl_device); end = std::chrono::system_clock::now(); std::chrono::duration elapsed_seconds = end-start; diff --git a/unsupported/test/cxx11_tensor_convolution_sycl.cpp b/unsupported/test/cxx11_tensor_convolution_sycl.cpp index f7e0a2742..a4226a63a 100644 --- a/unsupported/test/cxx11_tensor_convolution_sycl.cpp +++ b/unsupported/test/cxx11_tensor_convolution_sycl.cpp @@ -14,7 +14,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_convolution_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include @@ -35,12 +35,12 @@ static const float error_threshold =1e-4f; template static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device) { - int indim0 =53; - int indim1= 55; - int indim2= 51; - int outdim0=50; - int outdim1=55; - int outdim2=51; + IndexType indim0 =53; + IndexType indim1= 55; + IndexType indim2= 51; + IndexType outdim0=50; + IndexType outdim1=55; + IndexType outdim2=51; Eigen::array input_dims = {{indim0, indim1, indim2}}; Eigen::array kernel_dims = {{4}}; Eigen::array result_dims = {{outdim0, outdim1, outdim2}}; @@ -76,9 +76,9 @@ static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device) result_host=input.convolve(kernel, dims3); -for(int i=0; i< outdim0; i++ ){ - for(int j=0; j< outdim1; j++ ){ - for(int k=0; k< outdim2; k++ ){ +for(IndexType i=0; i< outdim0; i++ ){ + for(IndexType j=0; j< outdim1; j++ ){ + for(IndexType k=0; k< outdim2; k++ ){ if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { std::cout < static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device) { - int indim0 =53; - int indim1= 55; - int indim2= 51; - int outdim0=50; - int outdim1=51; - int outdim2=51; + IndexType indim0 =53; + IndexType indim1= 55; + IndexType indim2= 51; + IndexType outdim0=50; + IndexType outdim1=51; + IndexType outdim2=51; Eigen::array input_dims = {{indim0, indim1, indim2}}; Eigen::array kernel_dims = {{4,5}}; Eigen::array result_dims = {{outdim0, outdim1, outdim2}}; @@ -137,9 +137,9 @@ static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device) result_host=input.convolve(kernel, dims3); -for(int i=0; i< outdim0; i++ ){ - for(int j=0; j< outdim1; j++ ){ - for(int k=0; k< outdim2; k++ ){ +for(IndexType i=0; i< outdim0; i++ ){ + for(IndexType j=0; j< outdim1; j++ ){ + for(IndexType k=0; k< outdim2; k++ ){ if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { std::cout < static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device) { - int indim0 =53; - 
int indim1= 55; - int indim2= 51; - int outdim0=50; - int outdim1=51; - int outdim2=49; + IndexType indim0 =53; + IndexType indim1= 55; + IndexType indim2= 51; + IndexType outdim0=50; + IndexType outdim1=51; + IndexType outdim2=49; Eigen::array input_dims = {{indim0, indim1, indim2}}; Eigen::array kernel_dims = {{4,5,3}}; Eigen::array result_dims = {{outdim0, outdim1, outdim2}}; @@ -198,9 +198,9 @@ static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device) result_host=input.convolve(kernel, dims3); -for(int i=0; i< outdim0; i++ ){ - for(int j=0; j< outdim1; j++ ){ - for(int k=0; k< outdim2; k++ ){ +for(IndexType i=0; i< outdim0; i++ ){ + for(IndexType j=0; j< outdim1; j++ ){ + for(IndexType k=0; k< outdim2; k++ ){ if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { std::cout < void tensorConvolutionPerDevice(Dev_selector& s){ QueueInterface queueInterface(s); auto sycl_device=Eigen::SyclDevice(&queueInterface); - test_larg_expr1D(sycl_device); - test_larg_expr1D(sycl_device); - test_larg_expr2D(sycl_device); - test_larg_expr2D(sycl_device); - test_larg_expr3D(sycl_device); - test_larg_expr3D(sycl_device); - test_evals(sycl_device); - test_evals(sycl_device); - test_expr(sycl_device); - test_expr(sycl_device); - test_modes(sycl_device); - test_modes(sycl_device); - test_strides(sycl_device); - test_strides(sycl_device); + test_larg_expr1D(sycl_device); + test_larg_expr1D(sycl_device); + test_larg_expr2D(sycl_device); + test_larg_expr2D(sycl_device); + test_larg_expr3D(sycl_device); + test_larg_expr3D(sycl_device); + test_evals(sycl_device); + test_evals(sycl_device); + test_expr(sycl_device); + test_expr(sycl_device); + test_modes(sycl_device); + test_modes(sycl_device); + test_strides(sycl_device); + test_strides(sycl_device); } void test_cxx11_tensor_convolution_sycl() { diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp index 190dba862..3ecc68df0 100644 --- a/unsupported/test/cxx11_tensor_device_sycl.cpp +++ b/unsupported/test/cxx11_tensor_device_sycl.cpp @@ -14,7 +14,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_device_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" @@ -22,35 +22,35 @@ #include #include -template +template void test_device_memory(const Eigen::SyclDevice &sycl_device) { std::cout << "Running on : " << sycl_device.sycl_queue().get_device(). 
template get_info() < tensorRange = {{sizeDim1}}; - Tensor in(tensorRange); - Tensor in1(tensorRange); + IndexType sizeDim1 = 100; + array tensorRange = {{sizeDim1}}; + Tensor in(tensorRange); + Tensor in1(tensorRange); memset(in1.data(), 1, in1.size() * sizeof(DataType)); DataType* gpu_in_data = static_cast(sycl_device.allocate(in.size()*sizeof(DataType))); sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType)); sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType)); - for (int i=0; i +template void test_device_exceptions(const Eigen::SyclDevice &sycl_device) { VERIFY(sycl_device.ok()); - int sizeDim1 = 100; - array tensorDims = {{sizeDim1}}; + IndexType sizeDim1 = 100; + array tensorDims = {{sizeDim1}}; DataType* gpu_data = static_cast(sycl_device.allocate(sizeDim1*sizeof(DataType))); sycl_device.memset(gpu_data, 1, sizeDim1*sizeof(DataType)); - TensorMap> in(gpu_data, tensorDims); - TensorMap> out(gpu_data, tensorDims); + TensorMap> in(gpu_data, tensorDims); + TensorMap> out(gpu_data, tensorDims); out.device(sycl_device) = in / in.constant(0); sycl_device.synchronize(); @@ -62,8 +62,8 @@ template void sycl_device_test_per_device(const cl::sycl::dev std::cout << "Running on " << d.template get_info() << std::endl; QueueInterface queueInterface(d); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_device_memory(sycl_device); - test_device_memory(sycl_device); + test_device_memory(sycl_device); + test_device_memory(sycl_device); /// this test throw an exception. enable it if you want to see the exception //test_device_exceptions(sycl_device); /// this test throw an exception. enable it if you want to see the exception diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp index 4d19a3b2a..aca036cde 100644 --- a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp +++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp @@ -14,23 +14,23 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_forced_eval_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" #include using Eigen::Tensor; -template +template void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 100; - int sizeDim2 = 20; - int sizeDim3 = 20; - Eigen::array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - Eigen::Tensor in1(tensorRange); - Eigen::Tensor in2(tensorRange); - Eigen::Tensor out(tensorRange); + IndexType sizeDim1 = 100; + IndexType sizeDim2 = 20; + IndexType sizeDim3 = 20; + Eigen::array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Eigen::Tensor in1(tensorRange); + Eigen::Tensor in2(tensorRange); + Eigen::Tensor out(tensorRange); DataType * gpu_in1_data = static_cast(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType))); DataType * gpu_in2_data = static_cast(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType))); @@ -40,17 +40,17 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { in2 = in2.random() + in2.constant(10.0f); // creating TensorMap from tensor - Eigen::TensorMap> gpu_in1(gpu_in1_data, tensorRange); - Eigen::TensorMap> gpu_in2(gpu_in2_data, tensorRange); - Eigen::TensorMap> gpu_out(gpu_out_data, tensorRange); + Eigen::TensorMap> gpu_in1(gpu_in1_data, tensorRange); + Eigen::TensorMap> gpu_in2(gpu_in2_data, tensorRange); + Eigen::TensorMap> gpu_out(gpu_out_data, tensorRange); 
sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); /// c=(a+b)*b gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2; sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) + in2(i, j, k)) * in2(i, j, k)); } @@ -66,8 +66,8 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { template void tensorForced_evalperDevice(Dev_selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_forced_eval_sycl(sycl_device); - test_forced_eval_sycl(sycl_device); + test_forced_eval_sycl(sycl_device); + test_forced_eval_sycl(sycl_device); } void test_cxx11_tensor_forced_eval_sycl() { for (const auto& device :Eigen::get_sycl_supported_devices()) { diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp index 91353b81a..9b521bc6b 100644 --- a/unsupported/test/cxx11_tensor_morphing_sycl.cpp +++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp @@ -16,7 +16,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_morphing_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL @@ -28,18 +28,18 @@ using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -template +template static void test_simple_reshape(const Eigen::SyclDevice& sycl_device) { - typename Tensor::Dimensions dim1(2,3,1,7,1); - typename Tensor::Dimensions dim2(2,3,7); - typename Tensor::Dimensions dim3(6,7); - typename Tensor::Dimensions dim4(2,21); + typename Tensor::Dimensions dim1(2,3,1,7,1); + typename Tensor::Dimensions dim2(2,3,7); + typename Tensor::Dimensions dim3(6,7); + typename Tensor::Dimensions dim4(2,21); - Tensor tensor1(dim1); - Tensor tensor2(dim2); - Tensor tensor3(dim3); - Tensor tensor4(dim4); + Tensor tensor1(dim1); + Tensor tensor2(dim2); + Tensor tensor3(dim3); + Tensor tensor4(dim4); tensor1.setRandom(); @@ -48,10 +48,10 @@ static void test_simple_reshape(const Eigen::SyclDevice& sycl_device) DataType* gpu_data3 = static_cast(sycl_device.allocate(tensor3.size()*sizeof(DataType))); DataType* gpu_data4 = static_cast(sycl_device.allocate(tensor4.size()*sizeof(DataType))); - TensorMap> gpu1(gpu_data1, dim1); - TensorMap> gpu2(gpu_data2, dim2); - TensorMap> gpu3(gpu_data3, dim3); - TensorMap> gpu4(gpu_data4, dim4); + TensorMap> gpu1(gpu_data1, dim1); + TensorMap> gpu2(gpu_data2, dim2); + TensorMap> gpu3(gpu_data3, dim3); + TensorMap> gpu4(gpu_data4, dim4); sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); @@ -63,9 +63,9 @@ static void test_simple_reshape(const Eigen::SyclDevice& sycl_device) gpu4.device(sycl_device)=gpu1.reshape(dim2).reshape(dim4); sycl_device.memcpyDeviceToHost(tensor4.data(), gpu_data4,(tensor4.size())*sizeof(DataType)); - for (int i = 0; i < 2; ++i){ - for (int j = 0; j < 3; ++j){ - for (int k = 0; k < 7; ++k){ + for (IndexType i = 0; i < 2; ++i){ + for (IndexType j = 0; j < 3; ++j){ + for (IndexType k = 0; k < 7; 
++k){ VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k)); ///ColMajor if (static_cast(DataLayout) == static_cast(ColMajor)) { VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k)); ///ColMajor @@ -86,15 +86,15 @@ static void test_simple_reshape(const Eigen::SyclDevice& sycl_device) } -template +template static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device) { - typename Tensor::Dimensions dim1(2,3,7); - typename Tensor::Dimensions dim2(6,7); - typename Tensor::Dimensions dim3(2,3,1,7,1); - Tensor tensor(dim1); - Tensor tensor2d(dim2); - Tensor tensor5d(dim3); + typename Tensor::Dimensions dim1(2,3,7); + typename Tensor::Dimensions dim2(6,7); + typename Tensor::Dimensions dim3(2,3,1,7,1); + Tensor tensor(dim1); + Tensor tensor2d(dim2); + Tensor tensor5d(dim3); tensor.setRandom(); @@ -102,9 +102,9 @@ static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device) DataType* gpu_data2 = static_cast(sycl_device.allocate(tensor2d.size()*sizeof(DataType))); DataType* gpu_data3 = static_cast(sycl_device.allocate(tensor5d.size()*sizeof(DataType))); - TensorMap< Tensor > gpu1(gpu_data1, dim1); - TensorMap< Tensor > gpu2(gpu_data2, dim2); - TensorMap< Tensor > gpu3(gpu_data3, dim3); + TensorMap< Tensor > gpu1(gpu_data1, dim1); + TensorMap< Tensor > gpu2(gpu_data2, dim2); + TensorMap< Tensor > gpu3(gpu_data3, dim3); sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); @@ -115,9 +115,9 @@ static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device) sycl_device.memcpyDeviceToHost(tensor5d.data(), gpu_data3,(tensor5d.size())*sizeof(DataType)); - for (int i = 0; i < 2; ++i){ - for (int j = 0; j < 3; ++j){ - for (int k = 0; k < 7; ++k){ + for (IndexType i = 0; i < 2; ++i){ + for (IndexType j = 0; j < 3; ++j){ + for (IndexType k = 0; k < 7; ++k){ VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k)); if (static_cast(DataLayout) == static_cast(ColMajor)) { VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k)); ///ColMajor @@ -134,43 +134,43 @@ static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device) } -template +template static void test_simple_slice(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 2; - int sizeDim2 = 3; - int sizeDim3 = 5; - int sizeDim4 = 7; - int sizeDim5 = 11; - array tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; - Tensor tensor(tensorRange); + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + array tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + Tensor tensor(tensorRange); tensor.setRandom(); - array slice1_range ={{1, 1, 1, 1, 1}}; - Tensor slice1(slice1_range); + array slice1_range ={{1, 1, 1, 1, 1}}; + Tensor slice1(slice1_range); DataType* gpu_data1 = static_cast(sycl_device.allocate(tensor.size()*sizeof(DataType))); DataType* gpu_data2 = static_cast(sycl_device.allocate(slice1.size()*sizeof(DataType))); - TensorMap> gpu1(gpu_data1, tensorRange); - TensorMap> gpu2(gpu_data2, slice1_range); - Eigen::DSizes indices(1,2,3,4,5); - Eigen::DSizes sizes(1,1,1,1,1); + TensorMap> gpu1(gpu_data1, tensorRange); + TensorMap> gpu2(gpu_data2, slice1_range); + Eigen::DSizes indices(1,2,3,4,5); + Eigen::DSizes sizes(1,1,1,1,1); sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); gpu2.device(sycl_device)=gpu1.slice(indices, sizes); sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType)); 
VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); - array slice2_range ={{1,1,2,2,3}}; - Tensor slice2(slice2_range); + array slice2_range ={{1,1,2,2,3}}; + Tensor slice2(slice2_range); DataType* gpu_data3 = static_cast(sycl_device.allocate(slice2.size()*sizeof(DataType))); - TensorMap> gpu3(gpu_data3, slice2_range); - Eigen::DSizes indices2(1,1,3,4,5); - Eigen::DSizes sizes2(1,1,2,2,3); + TensorMap> gpu3(gpu_data3, slice2_range); + Eigen::DSizes indices2(1,1,3,4,5); + Eigen::DSizes sizes2(1,1,2,2,3); gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2); sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType)); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 2; ++j) { - for (int k = 0; k < 3; ++k) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 2; ++j) { + for (IndexType k = 0; k < 3; ++k) { VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); } } @@ -219,7 +219,8 @@ static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device) sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data1,(tensor.size())*sizeof(DataType)); sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType)); - for(int i=0;i void sycl_morphing_test_per_device(dev_Selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_simple_slice(sycl_device); - test_simple_slice(sycl_device); - test_simple_reshape(sycl_device); - test_simple_reshape(sycl_device); - test_reshape_as_lvalue(sycl_device); - test_reshape_as_lvalue(sycl_device); + test_simple_slice(sycl_device); + test_simple_slice(sycl_device); + test_simple_reshape(sycl_device); + test_simple_reshape(sycl_device); + test_reshape_as_lvalue(sycl_device); + test_reshape_as_lvalue(sycl_device); test_strided_slice_write_sycl(sycl_device); test_strided_slice_write_sycl(sycl_device); } diff --git a/unsupported/test/cxx11_tensor_padding_sycl.cpp b/unsupported/test/cxx11_tensor_padding_sycl.cpp index 9e86e4b52..dc748b73e 100644 --- a/unsupported/test/cxx11_tensor_padding_sycl.cpp +++ b/unsupported/test/cxx11_tensor_padding_sycl.cpp @@ -16,7 +16,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_padding_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL @@ -69,10 +69,10 @@ static void test_simple_padding(const Eigen::SyclDevice& sycl_device) sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); gpu2.device(sycl_device)=gpu1.pad(paddings); sycl_device.memcpyDeviceToHost(padded.data(), gpu_data2,(padded.size())*sizeof(DataType)); - for (int i = 0; i < padedSizeDim1; ++i) { - for (int j = 0; j < padedSizeDim2; ++j) { - for (int k = 0; k < padedSizeDim3; ++k) { - for (int l = 0; l < padedSizeDim4; ++l) { + for (IndexType i = 0; i < padedSizeDim1; ++i) { + for (IndexType j = 0; j < padedSizeDim2; ++j) { + for (IndexType k = 0; k < padedSizeDim3; ++k) { + for (IndexType l = 0; l < padedSizeDim4; ++l) { if (j >= 2 && j < 5 && k >= 3 && k < 8) { VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l)); } else { @@ -121,10 +121,10 @@ static void test_padded_expr(const Eigen::SyclDevice& sycl_device) gpu2.device(sycl_device)=gpu1.pad(paddings).reshape(reshape_dims); sycl_device.memcpyDeviceToHost(result.data(), gpu_data2,(result.size())*sizeof(DataType)); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 6; ++j) { - for (int k = 0; k < 12; ++k) { - for (int l = 
0; l < 7; ++l) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 6; ++j) { + for (IndexType k = 0; k < 12; ++k) { + for (IndexType l = 0; l < 7; ++l) { const float result_value = DataLayout == ColMajor ? result(i+2*j,k+12*l) : result(j+6*i,l+7*k); if (j >= 2 && j < 5 && k >= 3 && k < 8) { @@ -143,10 +143,6 @@ static void test_padded_expr(const Eigen::SyclDevice& sycl_device) template void sycl_padding_test_per_device(dev_Selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_simple_padding(sycl_device); - test_simple_padding(sycl_device); - test_padded_expr(sycl_device); - test_padded_expr(sycl_device); test_simple_padding(sycl_device); test_simple_padding(sycl_device); test_padded_expr(sycl_device); diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp index 941469029..98a59a14c 100644 --- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp +++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp @@ -14,23 +14,23 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_reduction_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" #include -template +template static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) { - const int num_rows = 452; - const int num_cols = 765; - array tensorRange = {{num_rows, num_cols}}; + const IndexType num_rows = 452; + const IndexType num_cols = 765; + array tensorRange = {{num_rows, num_cols}}; - Tensor in(tensorRange); - Tensor full_redux; - Tensor full_redux_gpu; + Tensor in(tensorRange); + Tensor full_redux; + Tensor full_redux_gpu; in.setRandom(); @@ -39,8 +39,8 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) { DataType* gpu_in_data = static_cast(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType)); - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu(gpu_out_data); + TensorMap > in_gpu(gpu_in_data, tensorRange); + TensorMap > out_gpu(gpu_out_data); sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.sum(); @@ -51,21 +51,21 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) { sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } -template +template static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) { - int dim_x = 145; - int dim_y = 1; - int dim_z = 67; + IndexType dim_x = 145; + IndexType dim_y = 1; + IndexType dim_z = 67; - array tensorRange = {{dim_x, dim_y, dim_z}}; - Eigen::array red_axis; + array tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array red_axis; red_axis[0] = 0; - array reduced_tensorRange = {{dim_y, dim_z}}; + array reduced_tensorRange = {{dim_y, dim_z}}; - Tensor in(tensorRange); - Tensor redux(reduced_tensorRange); - Tensor redux_gpu(reduced_tensorRange); + Tensor in(tensorRange); + Tensor redux(reduced_tensorRange); + Tensor redux_gpu(reduced_tensorRange); in.setRandom(); @@ -74,37 +74,37 @@ static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) DataType* gpu_in_data = static_cast(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); DataType* gpu_out_data = 
static_cast(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType))); - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu(gpu_out_data, reduced_tensorRange); + TensorMap > in_gpu(gpu_in_data, tensorRange); + TensorMap > out_gpu(gpu_out_data, reduced_tensorRange); sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.sum(red_axis); sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. - for(int j=0; j +template static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) { - int dim_x = 567; - int dim_y = 1; - int dim_z = 47; + IndexType dim_x = 567; + IndexType dim_y = 1; + IndexType dim_z = 47; - array tensorRange = {{dim_x, dim_y, dim_z}}; - Eigen::array red_axis; + array tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array red_axis; red_axis[0] = 2; - array reduced_tensorRange = {{dim_x, dim_y}}; + array reduced_tensorRange = {{dim_x, dim_y}}; - Tensor in(tensorRange); - Tensor redux(reduced_tensorRange); - Tensor redux_gpu(reduced_tensorRange); + Tensor in(tensorRange); + Tensor redux(reduced_tensorRange); + Tensor redux_gpu(reduced_tensorRange); in.setRandom(); @@ -113,15 +113,15 @@ static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) DataType* gpu_in_data = static_cast(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); DataType* gpu_out_data = static_cast(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType))); - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu(gpu_out_data, reduced_tensorRange); + TensorMap > in_gpu(gpu_in_data, tensorRange); + TensorMap > out_gpu(gpu_out_data, reduced_tensorRange); sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.sum(red_axis); sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. 
- for(int j=0; j void sycl_reduction_test_per_device(const cl::sycl:: QueueInterface queueInterface(d); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_full_reductions_sycl(sycl_device); - test_first_dim_reductions_sycl(sycl_device); - test_last_dim_reductions_sycl(sycl_device); - test_full_reductions_sycl(sycl_device); - test_first_dim_reductions_sycl(sycl_device); - test_last_dim_reductions_sycl(sycl_device); + test_full_reductions_sycl(sycl_device); + test_first_dim_reductions_sycl(sycl_device); + test_last_dim_reductions_sycl(sycl_device); + test_full_reductions_sycl(sycl_device); + test_first_dim_reductions_sycl(sycl_device); + test_last_dim_reductions_sycl(sycl_device); } void test_cxx11_tensor_reduction_sycl() { for (const auto& device :Eigen::get_sycl_supported_devices()) { diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp index 73b394c18..2f5484484 100644 --- a/unsupported/test/cxx11_tensor_reverse_sycl.cpp +++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp @@ -14,24 +14,24 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_reverse_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" #include -template +template static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { - int dim1 = 2; - int dim2 = 3; - int dim3 = 5; - int dim4 = 7; + IndexType dim1 = 2; + IndexType dim2 = 3; + IndexType dim3 = 5; + IndexType dim4 = 7; - array tensorRange = {{dim1, dim2, dim3, dim4}}; - Tensor tensor(tensorRange); - Tensor reversed_tensor(tensorRange); + array tensorRange = {{dim1, dim2, dim3, dim4}}; + Tensor tensor(tensorRange); + Tensor reversed_tensor(tensorRange); tensor.setRandom(); array dim_rev; @@ -43,17 +43,17 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { DataType* gpu_in_data = static_cast(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType))); DataType* gpu_out_data =static_cast(sycl_device.allocate(reversed_tensor.dimensions().TotalSize()*sizeof(DataType))); - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu(gpu_out_data, tensorRange); + TensorMap > in_gpu(gpu_in_data, tensorRange); + TensorMap > out_gpu(gpu_out_data, tensorRange); sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. 
- for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l)); } } @@ -67,10 +67,10 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l)); } } @@ -84,10 +84,10 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l)); } } @@ -100,18 +100,18 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { -template +template static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue) { - int dim1 = 2; - int dim2 = 3; - int dim3 = 5; - int dim4 = 7; + IndexType dim1 = 2; + IndexType dim2 = 3; + IndexType dim3 = 5; + IndexType dim4 = 7; - array tensorRange = {{dim1, dim2, dim3, dim4}}; - Tensor tensor(tensorRange); - Tensor expected(tensorRange); - Tensor result(tensorRange); + array tensorRange = {{dim1, dim2, dim3, dim4}}; + Tensor tensor(tensorRange); + Tensor expected(tensorRange); + Tensor result(tensorRange); tensor.setRandom(); array dim_rev; @@ -124,9 +124,9 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue DataType* gpu_out_data_expected =static_cast(sycl_device.allocate(expected.dimensions().TotalSize()*sizeof(DataType))); DataType* gpu_out_data_result =static_cast(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType))); - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu_expected(gpu_out_data_expected, tensorRange); - TensorMap > out_gpu_result(gpu_out_data_result, tensorRange); + TensorMap > in_gpu(gpu_in_data, tensorRange); + TensorMap > out_gpu_expected(gpu_out_data_expected, tensorRange); + TensorMap > out_gpu_result(gpu_out_data_result, tensorRange); sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType)); @@ -139,20 +139,20 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue sycl_device.memcpyDeviceToHost(expected.data(), gpu_out_data_expected, expected.dimensions().TotalSize()*sizeof(DataType)); - array src_slice_dim; + array src_slice_dim; src_slice_dim[0] = 2; src_slice_dim[1] = 3; src_slice_dim[2] = 1; src_slice_dim[3] = 7; - array src_slice_start; + array src_slice_start; src_slice_start[0] = 0; src_slice_start[1] = 0; src_slice_start[2] = 0; 
src_slice_start[3] = 0; - array dst_slice_dim = src_slice_dim; - array dst_slice_start = src_slice_start; + array dst_slice_dim = src_slice_dim; + array dst_slice_start = src_slice_start; - for (int i = 0; i < 5; ++i) { + for (IndexType i = 0; i < 5; ++i) { if (LValue) { out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) = in_gpu.slice(src_slice_start, src_slice_dim); @@ -165,10 +165,10 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue } sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType)); - for (int i = 0; i < expected.dimension(0); ++i) { - for (int j = 0; j < expected.dimension(1); ++j) { - for (int k = 0; k < expected.dimension(2); ++k) { - for (int l = 0; l < expected.dimension(3); ++l) { + for (IndexType i = 0; i < expected.dimension(0); ++i) { + for (IndexType j = 0; j < expected.dimension(1); ++j) { + for (IndexType k = 0; k < expected.dimension(2); ++k) { + for (IndexType l = 0; l < expected.dimension(3); ++l) { VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); } } @@ -178,7 +178,7 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue dst_slice_start[2] = 0; result.setRandom(); sycl_device.memcpyHostToDevice(gpu_out_data_result, result.data(),(result.dimensions().TotalSize())*sizeof(DataType)); - for (int i = 0; i < 5; ++i) { + for (IndexType i = 0; i < 5; ++i) { if (LValue) { out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) = in_gpu.slice(dst_slice_start, dst_slice_dim); @@ -190,10 +190,10 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue } sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType)); - for (int i = 0; i < expected.dimension(0); ++i) { - for (int j = 0; j < expected.dimension(1); ++j) { - for (int k = 0; k < expected.dimension(2); ++k) { - for (int l = 0; l < expected.dimension(3); ++l) { + for (IndexType i = 0; i < expected.dimension(0); ++i) { + for (IndexType j = 0; j < expected.dimension(1); ++j) { + for (IndexType k = 0; k < expected.dimension(2); ++k) { + for (IndexType l = 0; l < expected.dimension(3); ++l) { VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); } } @@ -207,12 +207,12 @@ template void sycl_reverse_test_per_device(const cl::sycl::de std::cout << "Running on " << d.template get_info() << std::endl; QueueInterface queueInterface(d); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_simple_reverse(sycl_device); - test_simple_reverse(sycl_device); - test_expr_reverse(sycl_device, false); - test_expr_reverse(sycl_device, false); - test_expr_reverse(sycl_device, true); - test_expr_reverse(sycl_device, true); + test_simple_reverse(sycl_device); + test_simple_reverse(sycl_device); + test_expr_reverse(sycl_device, false); + test_expr_reverse(sycl_device, false); + test_expr_reverse(sycl_device, true); + test_expr_reverse(sycl_device, true); } void test_cxx11_tensor_reverse_sycl() { for (const auto& device :Eigen::get_sycl_supported_devices()) { diff --git a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp index c4521aac8..c88db7c72 100644 --- a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp +++ b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp @@ -16,7 +16,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_shuffling_sycl -#define 
EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL @@ -28,20 +28,20 @@ using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -template +template static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) { - IndexTypes sizeDim1 = 2; - IndexTypes sizeDim2 = 3; - IndexTypes sizeDim3 = 5; - IndexTypes sizeDim4 = 7; - array tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; - Tensor tensor(tensorRange); - Tensor no_shuffle(tensorRange); + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor tensor(tensorRange); + Tensor no_shuffle(tensorRange); tensor.setRandom(); const size_t buffSize =tensor.size()*sizeof(DataType); - array shuffles; + array shuffles; shuffles[0] = 0; shuffles[1] = 1; shuffles[2] = 2; @@ -50,8 +50,8 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) DataType* gpu_data2 = static_cast(sycl_device.allocate(buffSize)); - TensorMap> gpu1(gpu_data1, tensorRange); - TensorMap> gpu2(gpu_data2, tensorRange); + TensorMap> gpu1(gpu_data1, tensorRange); + TensorMap> gpu2(gpu_data2, tensorRange); sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), buffSize); @@ -64,10 +64,10 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) VERIFY_IS_EQUAL(no_shuffle.dimension(2), sizeDim3); VERIFY_IS_EQUAL(no_shuffle.dimension(3), sizeDim4); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { - for (int l = 0; l < sizeDim4; ++l) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim4; ++l) { VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l)); } } @@ -78,10 +78,10 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) shuffles[1] = 3; shuffles[2] = 1; shuffles[3] = 0; - array tensorrangeShuffle = {{sizeDim3, sizeDim4, sizeDim2, sizeDim1}}; - Tensor shuffle(tensorrangeShuffle); + array tensorrangeShuffle = {{sizeDim3, sizeDim4, sizeDim2, sizeDim1}}; + Tensor shuffle(tensorrangeShuffle); DataType* gpu_data3 = static_cast(sycl_device.allocate(buffSize)); - TensorMap> gpu3(gpu_data3, tensorrangeShuffle); + TensorMap> gpu3(gpu_data3, tensorrangeShuffle); gpu3.device(sycl_device)=gpu1.shuffle(shuffles); sycl_device.memcpyDeviceToHost(shuffle.data(), gpu_data3, buffSize); @@ -92,10 +92,10 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) VERIFY_IS_EQUAL(shuffle.dimension(2), sizeDim2); VERIFY_IS_EQUAL(shuffle.dimension(3), sizeDim1); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { - for (int l = 0; l < sizeDim4; ++l) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim4; ++l) { VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i)); } } @@ -107,9 +107,6 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) template void sycl_shuffling_test_per_device(dev_Selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_simple_shuffling_sycl(sycl_device); - test_simple_shuffling_sycl(sycl_device); - test_simple_shuffling_sycl(sycl_device); 
test_simple_shuffling_sycl(sycl_device); diff --git a/unsupported/test/cxx11_tensor_striding_sycl.cpp b/unsupported/test/cxx11_tensor_striding_sycl.cpp index 2cbb18f1c..603c3746f 100644 --- a/unsupported/test/cxx11_tensor_striding_sycl.cpp +++ b/unsupported/test/cxx11_tensor_striding_sycl.cpp @@ -14,7 +14,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_striding_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include @@ -72,10 +72,10 @@ static void test_simple_striding(const Eigen::SyclDevice& sycl_device) VERIFY_IS_EQUAL(no_stride.dimension(2), 5); VERIFY_IS_EQUAL(no_stride.dimension(3), 7); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l)); } } @@ -97,10 +97,10 @@ static void test_simple_striding(const Eigen::SyclDevice& sycl_device) VERIFY_IS_EQUAL(stride.dimension(2), 3); VERIFY_IS_EQUAL(stride.dimension(3), 3); - for (int i = 0; i < 1; ++i) { - for (int j = 0; j < 1; ++j) { - for (int k = 0; k < 3; ++k) { - for (int l = 0; l < 3; ++l) { + for (IndexType i = 0; i < 1; ++i) { + for (IndexType j = 0; j < 1; ++j) { + for (IndexType k = 0; k < 3; ++k) { + for (IndexType l = 0; l < 3; ++l) { VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l)); } } @@ -151,10 +151,10 @@ static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device) gpu_stride.stride(strides).device(sycl_device)=gpu_tensor; sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { VERIFY_IS_EQUAL(tensor(i,j,k,l), stride(2*i,4*j,2*k,3*l)); } } @@ -172,10 +172,10 @@ static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device) gpu_no_stride.stride(strides).device(sycl_device)=gpu_tensor.stride(no_strides); sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(2*i,4*j,2*k,3*l)); } } @@ -190,10 +190,10 @@ static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device) template void tensorStridingPerDevice(Dev_selector& s){ QueueInterface queueInterface(s); auto sycl_device=Eigen::SyclDevice(&queueInterface); - test_simple_striding(sycl_device); - test_simple_striding(sycl_device); - test_striding_as_lvalue(sycl_device); - test_striding_as_lvalue(sycl_device); + test_simple_striding(sycl_device); + test_simple_striding(sycl_device); + test_striding_as_lvalue(sycl_device); + test_striding_as_lvalue(sycl_device); } void test_cxx11_tensor_striding_sycl() { diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp index 6f7e29890..5cd0f4c71 100644 --- a/unsupported/test/cxx11_tensor_sycl.cpp +++ 
b/unsupported/test/cxx11_tensor_sycl.cpp @@ -16,7 +16,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" @@ -27,24 +27,24 @@ using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -template +template void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 100; - int sizeDim2 = 10; - int sizeDim3 = 20; - array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - Tensor in1(tensorRange); - Tensor out1(tensorRange); - Tensor out2(tensorRange); - Tensor out3(tensorRange); + IndexType sizeDim1 = 100; + IndexType sizeDim2 = 10; + IndexType sizeDim3 = 20; + array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Tensor in1(tensorRange); + Tensor out1(tensorRange); + Tensor out2(tensorRange); + Tensor out3(tensorRange); in1 = in1.random(); DataType* gpu_data1 = static_cast(sycl_device.allocate(in1.size()*sizeof(DataType))); DataType* gpu_data2 = static_cast(sycl_device.allocate(out1.size()*sizeof(DataType))); - TensorMap> gpu1(gpu_data1, tensorRange); - TensorMap> gpu2(gpu_data2, tensorRange); + TensorMap> gpu1(gpu_data1, tensorRange); + TensorMap> gpu2(gpu_data2, tensorRange); sycl_device.memcpyHostToDevice(gpu_data1, in1.data(),(in1.size())*sizeof(DataType)); sycl_device.memcpyHostToDevice(gpu_data2, in1.data(),(in1.size())*sizeof(DataType)); @@ -55,7 +55,7 @@ void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) { sycl_device.memcpyDeviceToHost(out3.data(), gpu_data2,(out3.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < in1.size(); ++i) { + for (IndexType i = 0; i < in1.size(); ++i) { VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f); VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f); VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f); @@ -65,20 +65,20 @@ void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) { sycl_device.deallocate(gpu_data2); } -template +template void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) { - int size = 20; - array tensorRange = {{size}}; - Tensor in1(tensorRange); - Tensor in2(tensorRange); - Tensor out(tensorRange); + IndexType size = 20; + array tensorRange = {{size}}; + Tensor in1(tensorRange); + Tensor in2(tensorRange); + Tensor out(tensorRange); in1 = in1.random(); in2 = in1; DataType* gpu_data = static_cast(sycl_device.allocate(in1.size()*sizeof(DataType))); - TensorMap> gpu1(gpu_data, tensorRange); + TensorMap> gpu1(gpu_data, tensorRange); sycl_device.memcpyHostToDevice(gpu_data, in1.data(),(in1.size())*sizeof(DataType)); sycl_device.synchronize(); in1.setZero(); @@ -86,24 +86,24 @@ void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) { sycl_device.memcpyDeviceToHost(out.data(), gpu_data, out.size()*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < in1.size(); ++i) { + for (IndexType i = 0; i < in1.size(); ++i) { VERIFY_IS_APPROX(out(i), in2(i)); } sycl_device.deallocate(gpu_data); } -template +template void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 100; - int sizeDim2 = 10; - int sizeDim3 = 20; - array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - Tensor in1(tensorRange); - Tensor in2(tensorRange); - Tensor in3(tensorRange); - Tensor out(tensorRange); + IndexType sizeDim1 = 100; + IndexType sizeDim2 = 10; + IndexType sizeDim3 = 20; + array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Tensor in1(tensorRange); + Tensor 
in2(tensorRange); + Tensor in3(tensorRange); + Tensor out(tensorRange); in2 = in2.random(); in3 = in3.random(); @@ -113,19 +113,19 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { DataType * gpu_in3_data = static_cast(sycl_device.allocate(in3.size()*sizeof(DataType))); DataType * gpu_out_data = static_cast(sycl_device.allocate(out.size()*sizeof(DataType))); - TensorMap> gpu_in1(gpu_in1_data, tensorRange); - TensorMap> gpu_in2(gpu_in2_data, tensorRange); - TensorMap> gpu_in3(gpu_in3_data, tensorRange); - TensorMap> gpu_out(gpu_out_data, tensorRange); + TensorMap> gpu_in1(gpu_in1_data, tensorRange); + TensorMap> gpu_in2(gpu_in2_data, tensorRange); + TensorMap> gpu_in3(gpu_in3_data, tensorRange); + TensorMap> gpu_out(gpu_out_data, tensorRange); /// a=1.2f gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f); sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(in1(i,j,k), 1.2f); } } @@ -137,9 +137,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * 1.2f); } @@ -153,9 +153,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * in2(i,j,k)); @@ -168,9 +168,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { gpu_out.device(sycl_device) = gpu_in1 + gpu_in2; sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k)); @@ -183,9 +183,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { gpu_out.device(sycl_device) = gpu_in1 * gpu_in1; sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * in1(i,j,k)); @@ -198,9 +198,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { gpu_out.device(sycl_device) = gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * 
gpu_in2.constant(2.7f); sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * 3.14f + in2(i,j,k) * 2.7f); @@ -214,9 +214,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { gpu_out.device(sycl_device) =(gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3); sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); sycl_device.synchronize(); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) > 0.5f) ? in2(i, j, k) : in3(i, j, k)); @@ -229,26 +229,26 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { sycl_device.deallocate(gpu_in3_data); sycl_device.deallocate(gpu_out_data); } -template +template static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){ - int size = 20; - array tensorRange = {{size}}; - Tensor in(tensorRange); - Tensor out(tensorRange); - Tensor out_host(tensorRange); + IndexType size = 20; + array tensorRange = {{size}}; + Tensor in(tensorRange); + Tensor out(tensorRange); + Tensor out_host(tensorRange); in = in.random(); Scalar1* gpu_in_data = static_cast(sycl_device.allocate(in.size()*sizeof(Scalar1))); Scalar2 * gpu_out_data = static_cast(sycl_device.allocate(out.size()*sizeof(Scalar2))); - TensorMap> gpu_in(gpu_in_data, tensorRange); - TensorMap> gpu_out(gpu_out_data, tensorRange); + TensorMap> gpu_in(gpu_in_data, tensorRange); + TensorMap> gpu_out(gpu_out_data, tensorRange); sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.size())*sizeof(Scalar1)); gpu_out.device(sycl_device) = gpu_in. template cast(); sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size()*sizeof(Scalar2)); out_host = in. template cast(); - for(int i=0; i< size; i++) + for(IndexType i=0; i< size; i++) { VERIFY_IS_APPROX(out(i), out_host(i)); } @@ -259,14 +259,14 @@ static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){ template void sycl_computing_test_per_device(dev_Selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_sycl_mem_transfers(sycl_device); - test_sycl_computations(sycl_device); - test_sycl_mem_sync(sycl_device); - test_sycl_mem_transfers(sycl_device); - test_sycl_computations(sycl_device); - test_sycl_mem_sync(sycl_device); - test_sycl_cast(sycl_device); - test_sycl_cast(sycl_device); + test_sycl_mem_transfers(sycl_device); + test_sycl_computations(sycl_device); + test_sycl_mem_sync(sycl_device); + test_sycl_mem_transfers(sycl_device); + test_sycl_computations(sycl_device); + test_sycl_mem_sync(sycl_device); + test_sycl_cast(sycl_device); + test_sycl_cast(sycl_device); } void test_cxx11_tensor_sycl() { From ff530500344e63d4db39090d268a24852bffdea4 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 1 Feb 2017 15:36:03 +0000 Subject: [PATCH 19/54] Converting ptrdiff_t type to int64_t type in cxx11_tensor_contract_sycl.cpp in order to be the same as other tests. 
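The diff that follows standardises the contraction test on the same 64-bit dense index type as the other SYCL tests. As a minimal sketch of that convention (not part of the patch; the helper name contract_once and the CPU-only setup are illustrative assumptions), the test-side pattern is to define EIGEN_DEFAULT_DENSE_INDEX_TYPE before including the Tensor module and to template each helper on DataType and IndexType, so that dimensions, loop counters and Tensor instantiations all share one index type:

    // Sketch only: CPU contraction, no SYCL device plumbing. Compile with -std=c++11.
    #include <cstdint>
    #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
    #include <unsupported/Eigen/CXX11/Tensor>
    #include <cassert>

    template <typename DataType, typename IndexType>
    void contract_once(IndexType m, IndexType k, IndexType n) {
      typedef Eigen::Tensor<DataType, 2, Eigen::ColMajor, IndexType> Tensor2;
      typedef typename Tensor2::DimensionPair DimPair;
      Tensor2 lhs(m, k);
      Tensor2 rhs(k, n);
      lhs.setRandom();
      rhs.setRandom();
      // Contract the columns of lhs with the rows of rhs: an ordinary matrix product.
      Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
      Tensor2 result = lhs.contract(rhs, dims);
      assert(result.dimension(0) == m);
      assert(result.dimension(1) == n);
    }

    int main() {
      contract_once<float, int64_t>(32, 32, 32);  // same starting shape as the SYCL test
      return 0;
    }

Instantiating with float and int64_t mirrors the DataType/IndexType pairs the SYCL tests pass to test_sycl_contraction and friends; the device allocation and memcpy calls are omitted here to keep the sketch self-contained.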
--- .../test/cxx11_tensor_contract_sycl.cpp | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/unsupported/test/cxx11_tensor_contract_sycl.cpp b/unsupported/test/cxx11_tensor_contract_sycl.cpp index 41acd5579..5bace66c5 100644 --- a/unsupported/test/cxx11_tensor_contract_sycl.cpp +++ b/unsupported/test/cxx11_tensor_contract_sycl.cpp @@ -254,26 +254,26 @@ void test_sycl_contraction_sizes(const Device& sycl_device) { template void tensorContractionPerDevice(Dev_selector& s){ QueueInterface queueInterface(s); auto sycl_device=Eigen::SyclDevice(&queueInterface); - test_sycl_contraction(sycl_device, 32, 32, 32); - test_sycl_contraction(sycl_device, 32, 32, 32); - test_scalar(sycl_device, 32, 32, 32); - test_scalar(sycl_device, 32, 32, 32); + test_sycl_contraction(sycl_device, 32, 32, 32); + test_sycl_contraction(sycl_device, 32, 32, 32); + test_scalar(sycl_device, 32, 32, 32); + test_scalar(sycl_device, 32, 32, 32); std::chrono::time_point start, end; start = std::chrono::system_clock::now(); - test_sycl_contraction(sycl_device, 128, 128, 128); - test_sycl_contraction(sycl_device, 128, 128, 128); - test_scalar(sycl_device, 128, 128, 128); - test_scalar(sycl_device, 128, 128, 128); - test_sycl_contraction_m(sycl_device); - test_sycl_contraction_m(sycl_device); - test_sycl_contraction_n(sycl_device); - test_sycl_contraction_n(sycl_device); - test_sycl_contraction_k(sycl_device); - test_sycl_contraction_k(sycl_device); - test_sycl_contraction_sizes(sycl_device); - test_sycl_contraction_sizes(sycl_device); - test_TF(sycl_device); - test_TF(sycl_device); + test_sycl_contraction(sycl_device, 128, 128, 128); + test_sycl_contraction(sycl_device, 128, 128, 128); + test_scalar(sycl_device, 128, 128, 128); + test_scalar(sycl_device, 128, 128, 128); + test_sycl_contraction_m(sycl_device); + test_sycl_contraction_m(sycl_device); + test_sycl_contraction_n(sycl_device); + test_sycl_contraction_n(sycl_device); + test_sycl_contraction_k(sycl_device); + test_sycl_contraction_k(sycl_device); + test_sycl_contraction_sizes(sycl_device); + test_sycl_contraction_sizes(sycl_device); + test_TF(sycl_device); + test_TF(sycl_device); end = std::chrono::system_clock::now(); std::chrono::duration elapsed_seconds = end-start; From 0eceea4efd47bb2a1bfb72903fbd14a5d32c5ced Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 1 Feb 2017 23:36:40 +0100 Subject: [PATCH 20/54] Define EIGEN_COMP_GNUC to reflect version number: 47, 48, 49, 50, 60, ... 
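Encoding the GCC version as __GNUC__*10 + __GNUC_MINOR__ lets later patches gate workarounds on a single integer comparison; patch 21 below uses exactly that to skip a construct GCC 4.8 rejects. A standalone sketch of the idiom, reusing the patch's formula under a local macro name so it does not collide with Eigen's own header:

    #include <cstdio>

    // Same encoding as the patched Macros.h: 48 for GCC 4.8, 50 for GCC 5.0, 62 for GCC 6.2.
    #if defined(__GNUC__)
      #define COMPAT_COMP_GNUC (__GNUC__ * 10 + __GNUC_MINOR__)
    #else
      #define COMPAT_COMP_GNUC 0
    #endif

    int main() {
    #if COMPAT_COMP_GNUC && COMPAT_COMP_GNUC < 49
      std::printf("GCC %d.%d: pre-4.9 workaround path\n", __GNUC__, __GNUC_MINOR__);
    #else
      std::printf("not GCC, or GCC >= 4.9: full feature path\n");
    #endif
      return 0;
    }

Because clang and other compilers also define __GNUC__, Eigen pairs this macro with EIGEN_COMP_GNUC_STRICT; the guard added in patch 21 checks both before disabling the affected test.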
--- Eigen/src/Core/util/Macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index ab0550895..5db9e4fe5 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -23,7 +23,7 @@ /// \internal EIGEN_COMP_GNUC set to 1 for all compilers compatible with GCC #ifdef __GNUC__ - #define EIGEN_COMP_GNUC 1 + #define EIGEN_COMP_GNUC (__GNUC__*10+__GNUC_MINOR__) #else #define EIGEN_COMP_GNUC 0 #endif From 84090027c49638cdc0025ef1baba1855bcbcd858 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 1 Feb 2017 23:37:44 +0100 Subject: [PATCH 21/54] Disable a part of the unit test for gcc 4.8 --- test/indexed_view.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/indexed_view.cpp b/test/indexed_view.cpp index 909d2351d..86342dc0a 100644 --- a/test/indexed_view.cpp +++ b/test/indexed_view.cpp @@ -297,7 +297,7 @@ void check_indexed_view() VERIFY_IS_APPROX( (A(std::array{{1,3,5}}, std::array{{9,6,3,0}})), A(seqN(1,3,2), seqN(9,4,-3)) ); -#if (!EIGEN_COMP_CLANG) || (EIGEN_COMP_CLANG>=308 && !defined(__apple_build_version__)) +#if !( EIGEN_COMP_CLANG && ((EIGEN_COMP_CLANG<309) || defined(__apple_build_version__)) || EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<49) VERIFY_IS_APPROX( A({3, 1, 6, 5}, all), A(std::array{{3, 1, 6, 5}}, all) ); VERIFY_IS_APPROX( A(all,{3, 1, 6, 5}), A(all,std::array{{3, 1, 6, 5}}) ); VERIFY_IS_APPROX( A({1,3,5},{3, 1, 6, 5}), A(std::array{{1,3,5}},std::array{{3, 1, 6, 5}}) ); From fcd257039b7fba59d3c968f62c7e7d0f37cbaf3b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 1 Feb 2017 15:30:49 -0800 Subject: [PATCH 22/54] Replaced EIGEN_DEVICE_FUNC template with template EIGEN_DEVICE_FUNC to make the code compile with nvcc8. --- Eigen/src/plugins/BlockMethods.h | 46 ++++++++++++++++---------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h index 2d5a4e507..c116f0e0f 100644 --- a/Eigen/src/plugins/BlockMethods.h +++ b/Eigen/src/plugins/BlockMethods.h @@ -78,8 +78,8 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// /// \sa class Block, fix, fix(int) /// -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type #else @@ -92,8 +92,8 @@ block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols) } /// This is the const version of block(Index,Index,NRowsType,NColsType) -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type #else @@ -124,8 +124,8 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type #else @@ -138,8 +138,8 @@ topRightCorner(NRowsType cRows, NColsType cCols) } /// This is the const version of topRightCorner(NRowsType, NColsType). 
-EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type #else @@ -229,8 +229,8 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type #else @@ -243,8 +243,8 @@ topLeftCorner(NRowsType cRows, NColsType cCols) } /// This is the const version of topLeftCorner(Index, Index). -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type #else @@ -333,8 +333,8 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type #else @@ -348,8 +348,8 @@ bottomRightCorner(NRowsType cRows, NColsType cCols) } /// This is the const version of bottomRightCorner(NRowsType, NColsType). -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type #else @@ -454,8 +454,8 @@ bottomLeftCorner(NRowsType cRows, NColsType cCols) } /// This is the const version of bottomLeftCorner(NRowsType, NColsType). -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type #else @@ -558,8 +558,8 @@ topRows(NRowsType n) } /// This is the const version of topRows(NRowsType). -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstNRowsBlockXpr::value>::Type #else @@ -619,8 +619,8 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename NRowsBlockXpr::value>::Type #else @@ -633,8 +633,8 @@ bottomRows(NRowsType n) } /// This is the const version of bottomRows(NRowsType). -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstNRowsBlockXpr::value>::Type #else @@ -709,8 +709,8 @@ middleRows(Index startRow, NRowsType n) } /// This is the const version of middleRows(Index,NRowsType). -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstNRowsBlockXpr::value>::Type #else @@ -771,8 +771,8 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename NColsBlockXpr::value>::Type #else @@ -785,8 +785,8 @@ leftCols(NColsType n) } /// This is the const version of leftCols(NColsType). 
-EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstNColsBlockXpr::value>::Type #else @@ -846,8 +846,8 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename NColsBlockXpr::value>::Type #else @@ -922,8 +922,8 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename NColsBlockXpr::value>::Type #else @@ -936,8 +936,8 @@ middleCols(Index startCol, NColsType numCols) } /// This is the const version of middleCols(Index,NColsType). -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstNColsBlockXpr::value>::Type #else @@ -1130,8 +1130,8 @@ inline ConstRowXpr row(Index i) const /// /// \sa block(Index,Index,NRowsType,NColsType), fix, fix(int), class Block /// -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename FixedSegmentReturnType::value>::Type #else @@ -1146,8 +1146,8 @@ segment(Index start, NType n) /// This is the const version of segment(Index,NType). -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstFixedSegmentReturnType::value>::Type #else @@ -1180,8 +1180,8 @@ segment(Index start, NType n) const /// /// \sa class Block, block(Index,Index) /// -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename FixedSegmentReturnType::value>::Type #else @@ -1195,8 +1195,8 @@ head(NType n) } /// This is the const version of head(NType). -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstFixedSegmentReturnType::value>::Type #else @@ -1244,8 +1244,8 @@ tail(NType n) } /// This is the const version of tail(Index). -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstFixedSegmentReturnType::value>::Type #else From 2db75c07a608ab07fbbdd6a3215e39c7e7943445 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 1 Feb 2017 15:41:29 -0800 Subject: [PATCH 23/54] fixed the ordering of the template and EIGEN_DEVICE_FUNC keywords in a few more places to get more of the Eigen codebase to compile with nvcc again. 
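A stand-alone sketch of the reordering (DEVICE_FUNC is a local stand-in for EIGEN_DEVICE_FUNC, which expands to __host__ __device__ when compiling with nvcc):

    #if defined(__CUDACC__)
    #define DEVICE_FUNC __host__ __device__
    #else
    #define DEVICE_FUNC
    #endif

    // nvcc 8 only accepts the qualifier after the template header ...
    template <typename T>
    DEVICE_FUNC inline T twice(const T& x) { return x + x; }

    // ... whereas the previous ordering,
    //   DEVICE_FUNC template <typename T> inline T twice(const T& x);
    // is rejected.
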
--- Eigen/src/plugins/BlockMethods.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h index c116f0e0f..5caf14469 100644 --- a/Eigen/src/plugins/BlockMethods.h +++ b/Eigen/src/plugins/BlockMethods.h @@ -439,8 +439,8 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type #else @@ -544,8 +544,8 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename NRowsBlockXpr::value>::Type #else @@ -695,8 +695,8 @@ EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// /// \sa block(Index,Index,NRowsType,NColsType), class Block /// -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename NRowsBlockXpr::value>::Type #else @@ -860,8 +860,8 @@ rightCols(NColsType n) } /// This is the const version of rightCols(NColsType). -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline const typename ConstNColsBlockXpr::value>::Type #else @@ -1229,8 +1229,8 @@ head(NType n) const /// /// \sa class Block, block(Index,Index) /// -EIGEN_DEVICE_FUNC template +EIGEN_DEVICE_FUNC #ifndef EIGEN_PARSED_BY_DOXYGEN inline typename FixedSegmentReturnType::value>::Type #else From 442e9cbb307ece9225a061a5661909a47737585e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 1 Feb 2017 15:50:58 -0800 Subject: [PATCH 24/54] Silenced several compilation warnings --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 442c14fac..d32c20b5e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -64,9 +64,9 @@ void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index template struct libxsmm_wrapper { libxsmm_wrapper() {} - libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) {} - void operator()(const LhsScalar* a, const RhsScalar* b, Scalar* c) {} - void operator()(const LhsScalar* a, const RhsScalar* b, Scalar* c, const LhsScalar* ap, const RhsScalar* bp, const Scalar* cp) {} + libxsmm_wrapper(int, int, int, int, int, int, int, float, float, int) {} + void operator()(const LhsScalar*, const RhsScalar*, Scalar*) {} + void operator()(const LhsScalar*, const RhsScalar*, Scalar*, const LhsScalar*, const RhsScalar*, const Scalar*) {} }; template<> @@ -682,7 +682,9 @@ protected: } m_can_use_xsmm = true; - #endif +#else + EIGEN_UNUSED_VARIABLE(eval_op_indices); +#endif } #if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) From bc128f9f3beff5a2a3225f03ec2e5eb111a15b87 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Thu, 2 Feb 2017 10:43:47 +0000 Subject: [PATCH 25/54] Reducing the warnings in Sycl backend. 
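The warning fixes in this patch and the previous one use common idioms: unnamed parameters, explicit integer casts, and marking intentionally unused variables. A minimal sketch (MY_UNUSED_VARIABLE mirrors Eigen's EIGEN_UNUSED_VARIABLE; the rest is illustrative only):

    #include <cstddef>

    #define MY_UNUSED_VARIABLE(x) (void)(x)

    struct NoOpGemm {
      // leaving parameters unnamed keeps the signature but avoids -Wunused-parameter
      void operator()(const float*, const float*, float*) {}
    };

    std::size_t offset_example(std::size_t base, int local) {
      // an explicit cast avoids a signed/unsigned comparison or conversion warning
      return base + static_cast<std::size_t>(local);
    }

    void setup(int eval_op_indices) {
      // mark a variable that is only used on some preprocessor paths
      MY_UNUSED_VARIABLE(eval_op_indices);
    }
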
--- unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index c3e095b8a..4247c1c4a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -66,7 +66,7 @@ EigenConvolutionKernel1D(internal::IndexMapper(0); const size_t index = plane_kernel_offset+ itemID.get_local(0); From 4254b3eda34346a28518f6b2b6a8ff8c8368d3d3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 3 Feb 2017 15:22:35 +0100 Subject: [PATCH 26/54] bug #1389: MSVC's std containers do not properly align in 64 bits mode if the requested alignment is larger than 16 bytes (e.g., with AVX) --- Eigen/StdDeque | 2 +- Eigen/StdList | 2 +- Eigen/StdVector | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/StdDeque b/Eigen/StdDeque index be3a7f82b..bc68397be 100644 --- a/Eigen/StdDeque +++ b/Eigen/StdDeque @@ -14,7 +14,7 @@ #include "Core" #include -#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */ +#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */ #define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...) diff --git a/Eigen/StdList b/Eigen/StdList index 07ba1297b..4c6262c08 100644 --- a/Eigen/StdList +++ b/Eigen/StdList @@ -13,7 +13,7 @@ #include "Core" #include -#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */ +#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */ #define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...) diff --git a/Eigen/StdVector b/Eigen/StdVector index fdfc37766..0c4697ad5 100644 --- a/Eigen/StdVector +++ b/Eigen/StdVector @@ -14,7 +14,7 @@ #include "Core" #include -#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */ +#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */ #define EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(...) From 42bd5c4e7b8f4b5875ae256e7ac20310161d8470 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Mon, 6 Feb 2017 18:05:23 +0000 Subject: [PATCH 27/54] Fixing TensorReductionSycl for min and max. 
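The essential change is that the generic tree reduction can no longer hard-code "+=": for min and max the combine step has to go through the reducer functor. A reduced sketch of one combine step, with Op modelling the reducer interface used here (initialize/reduce/finalize):

    // the identity element is 0 for a sum, the type's highest value for a min,
    // and its lowest value for a max, so the same code serves all reducers
    template <typename Scalar, typename Op>
    Scalar combine(const Scalar& a, const Scalar& b, Op op) {
      Scalar accum = op.initialize();
      op.reduce(a, &accum);
      op.reduce(b, &accum);
      return op.finalize(accum);
    }
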
--- .../CXX11/src/Tensor/TensorReductionSycl.h | 10 +++---- .../Eigen/CXX11/src/Tensor/TensorSycl.h | 2 +- .../CXX11/src/Tensor/TensorSyclFunctors.h | 30 ++++++++++++++----- .../test/cxx11_tensor_reduction_sycl.cpp | 9 +++--- 4 files changed, 33 insertions(+), 18 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index 8ecef59a8..9dcb42904 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -26,10 +26,10 @@ namespace Eigen { namespace internal { template struct syclGenericBufferReducer{ -template -static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ +template +static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ do { - auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable { + auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable { cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)}, cl::sycl::range<1>{std::min(length, local)}}; /* Two accessors are used: one to the buffer that is being reduced, @@ -43,7 +43,7 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de /* The parallel_for invocation chosen is the variant with an nd_item * parameter, since the code requires barriers for correctness. */ - h.parallel_for(r, TensorSycl::internal::GenericKernelReducer< CoeffReturnType, OutputAccessor, InputAccessor, LocalAccessor>(aOut, aI, scratch, length, local)); + h.parallel_for(r, TensorSycl::internal::GenericKernelReducer(op, aOut, aI, scratch, length, local)); }; dev.sycl_queue().submit(f); dev.asynchronousExec(); @@ -123,7 +123,7 @@ struct FullReducer { // getting final out buffer at the moment the created buffer is true because there is no need for assign auto out_buffer =dev.get_sycl_buffer(output); /// This is used to recursively reduce the tmp value to an element of 1; - syclGenericBufferReducer::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize); + syclGenericBufferReducer::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h index 84f660597..9d5a6d4c1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h @@ -35,7 +35,7 @@ namespace Eigen { namespace TensorSycl { namespace internal { - template struct GenericKernelReducer; + template struct GenericKernelReducer; /// This struct is used for special expression nodes with no operations (for example assign and selectOP). 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h index 710e22474..a77f408de 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h @@ -18,13 +18,14 @@ namespace Eigen { namespace TensorSycl { namespace internal { - template struct GenericKernelReducer{ + template struct GenericKernelReducer{ + OP op; OutputAccessor aOut; InputAccessor aI; LocalAccessor scratch; size_t length, local; - GenericKernelReducer(OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_) - : aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){} + GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_) + : op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){} void operator()(cl::sycl::nd_item<1> itemID) { size_t globalid = itemID.get_global(0); size_t localid = itemID.get_local(0); @@ -44,7 +45,12 @@ namespace internal { auto min = (length < local) ? length : local; for (size_t offset = min / 2; offset > 0; offset /= 2) { if (localid < offset) { - scratch[localid] += scratch[localid + offset]; + auto accum = op.initialize(); + op.reduce(scratch[localid], &accum); + op.reduce(scratch[localid + offset], &accum); + op.finalize(accum); + scratch[localid]=accum; + //scratch[localid] += scratch[localid + offset]; } itemID.barrier(cl::sycl::access::fence_space::local_space); } @@ -131,11 +137,21 @@ public: if(globalid::reduce(device_self_evaluator, static_cast(red_factor*globalid), red_factor, const_cast(op)); else - tmp_global_accessor.get_pointer()[globalid]=static_cast(0); + tmp_global_accessor.get_pointer()[globalid]=static_cast(op.initialize()); - if(remaining!=0 && globalid==0 ) + if(remaining!=0 && globalid==0 ){ // this will add the rest of input buffer when the input size is not devidable to red_factor. 
- tmp_global_accessor.get_pointer()[0]+=Eigen::internal::InnerMostDimReducer::reduce(device_self_evaluator, static_cast(red_factor*(rng)), static_cast(remaining), const_cast(op)); + // tmp_global_accessor.get_pointer()[0]+= + auto remaining_reduce =Eigen::internal::InnerMostDimReducer:: + reduce(device_self_evaluator, static_cast(red_factor*(rng)), static_cast(remaining), const_cast(op)); + auto accum = op.initialize(); + op.reduce(tmp_global_accessor.get_pointer()[0], &accum); + op.reduce(remaining_reduce, &accum); + op.finalize(accum); + tmp_global_accessor.get_pointer()[0]=accum; + + + } } }; diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp index 98a59a14c..251091f5b 100644 --- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp +++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp @@ -34,7 +34,7 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) { in.setRandom(); - full_redux = in.sum(); + full_redux = in.minimum(); DataType* gpu_in_data = static_cast(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType)); @@ -43,11 +43,10 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) { TensorMap > out_gpu(gpu_out_data); sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); - out_gpu.device(sycl_device) = in_gpu.sum(); + out_gpu.device(sycl_device) = in_gpu.minimum(); sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); - sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } @@ -69,7 +68,7 @@ static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) in.setRandom(); - redux= in.sum(red_axis); + redux= in.maximum(red_axis); DataType* gpu_in_data = static_cast(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); DataType* gpu_out_data = static_cast(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType))); @@ -78,7 +77,7 @@ static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) TensorMap > out_gpu(gpu_out_data, reduced_tensorRange); sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); - out_gpu.device(sycl_device) = in_gpu.sum(red_axis); + out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. 
From 0ee97b60c256b31a98838324ce1909247a0133d2 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Tue, 7 Feb 2017 15:43:17 +0000 Subject: [PATCH 28/54] Adding mean to TensorReductionSycl.h --- .../CXX11/src/Tensor/TensorReductionSycl.h | 23 +++-- .../CXX11/src/Tensor/TensorSyclFunctors.h | 98 +++++++++++++++++-- .../test/cxx11_tensor_reduction_sycl.cpp | 53 ++++++++-- 3 files changed, 148 insertions(+), 26 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index 9dcb42904..c3ca129e2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -25,8 +25,8 @@ namespace Eigen { namespace internal { -template struct syclGenericBufferReducer{ -template +template struct syclGenericBufferReducer{ +template static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ do { auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable { @@ -54,13 +54,18 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev length = length / local; } while (length > 1); - - - } }; +template struct syclGenericBufferReducer, CoeffReturnType>{ +template +static void run(Eigen::internal::MeanReducer, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ + syclGenericBufferReducer, CoeffReturnType>::run(Eigen::internal::SumReducer(), + bufOut, bufI, dev, length, local); +} +}; + /// Self is useless here because in expression construction we are going to treat reduction as a leafnode. /// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the /// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as @@ -123,7 +128,7 @@ struct FullReducer { // getting final out buffer at the moment the created buffer is true because there is no need for assign auto out_buffer =dev.get_sycl_buffer(output); /// This is used to recursively reduce the tmp value to an element of 1; - syclGenericBufferReducer::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize); + syclGenericBufferReducer::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize); } }; @@ -135,7 +140,7 @@ struct InnerReducer { typedef typename Self::CoeffReturnType CoeffReturnType; static const bool HasOptimizedImplementation = false; - static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) { + static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index num_values_to_reduce, typename Self::Index num_coeffs_to_preserve) { typedef const typename Self::ChildType HostExpr; /// this is the child of reduction typedef Eigen::TensorSycl::internal::FunctorExtractor > FunctorExpr; FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl()); @@ -153,10 +158,10 @@ struct InnerReducer { // create a tuple of accessors from Evaluator Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); auto output_accessor = dev.template get_sycl_accessor(cgh, output); - + Index red_size = (num_values_to_reduce!=0)? 
num_values_to_reduce : static_cast(1); cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), TensorSycl::internal::ReductionFunctor - (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range)); + (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size)); }); dev.asynchronousExec(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h index a77f408de..2f7779036 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h @@ -72,7 +72,7 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen public: typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; typedef cl::sycl::accessor write_accessor; - ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_) + ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index) :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {} void operator()(cl::sycl::nd_item<1> itemID) { @@ -105,6 +105,46 @@ template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typen Index range; }; +template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Index> +class ReductionFunctor, Index> { + public: + typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; + typedef cl::sycl::accessor write_accessor; + typedef Eigen::internal::SumReducer Op; + ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, + Eigen::internal::MeanReducer, Index range_, Index num_values_to_reduce_) + :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {} + void operator()(cl::sycl::nd_item<1> itemID) { + + typedef typename ConvertToDeviceExpression::Type DevExpr; + auto device_expr = createDeviceExpression(functors, tuple_of_accessors); + /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour + /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the + /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here. + const auto device_self_expr= Eigen::TensorReductionOp(device_expr.expr, dims, functor); + /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is + /// the device_evaluator is detectable and recognisable on the device. 
+ typedef Eigen::TensorEvaluator DeviceSelf; + auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::DefaultDevice()); + auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor); + /// const cast added as a naive solution to solve the qualifier drop error + auto globalid=static_cast(itemID.get_global_linear_id()); + if (globalid< range) { + typename DeviceSelf::CoeffReturnType accum = functor.initialize(); + Eigen::internal::GenericDimReducer::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast(globalid)),const_cast(functor), &accum); + functor.finalize(accum); + output_accessor_ptr[globalid]= accum/num_values_to_reduce; + } + } + private: + write_accessor output_accessor; + FunctorExpr functors; + Tuple_of_Acc tuple_of_accessors; + Dims dims; + Op functor; + Index range; + Index num_values_to_reduce; +}; template class FullReductionKernelFunctor{ @@ -134,14 +174,11 @@ public: /// const cast added as a naive solution to solve the qualifier drop error auto globalid=itemID.get_global_linear_id(); - if(globalid::reduce(device_self_evaluator, static_cast(red_factor*globalid), red_factor, const_cast(op)); - else - tmp_global_accessor.get_pointer()[globalid]=static_cast(op.initialize()); + tmp_global_accessor.get_pointer()[globalid]=(globalid::reduce(device_self_evaluator, static_cast(red_factor*globalid), red_factor, const_cast(op)) + : static_cast(op.initialize()); if(remaining!=0 && globalid==0 ){ // this will add the rest of input buffer when the input size is not devidable to red_factor. - // tmp_global_accessor.get_pointer()[0]+= auto remaining_reduce =Eigen::internal::InnerMostDimReducer:: reduce(device_self_evaluator, static_cast(red_factor*(rng)), static_cast(remaining), const_cast(op)); auto accum = op.initialize(); @@ -150,13 +187,58 @@ public: op.finalize(accum); tmp_global_accessor.get_pointer()[0]=accum; + } + } +}; + +template +class FullReductionKernelFunctor, Dims, Index, TupleType>{ +public: + typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; + typedef Eigen::internal::SumReducer Op; + + OutAccessor tmp_global_accessor; + Index rng , remaining, red_factor; + Op op; + Dims dims; + FunctorExpr functors; + TupleType tuple_of_accessors; + + FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Eigen::internal::MeanReducer, Dims dims_, FunctorExpr functors_, TupleType t_acc) + :tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(Op()), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){} + + void operator()(cl::sycl::nd_item<1> itemID) { + + typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; + auto device_expr = TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); + /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour + /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the + /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here. + const auto device_self_expr= Eigen::TensorReductionOp(device_expr.expr, dims, op); + /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. 
The difference is + /// the device_evaluator is detectable and recognisable on the device. + auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::DefaultDevice()); + /// const cast added as a naive solution to solve the qualifier drop error + auto globalid=itemID.get_global_linear_id(); + auto scale = (rng*red_factor) + remaining; + + tmp_global_accessor.get_pointer()[globalid]= (globalid::reduce(device_self_evaluator, static_cast(red_factor*globalid), red_factor, const_cast(op)))/scale) + :static_cast(op.initialize())/scale; + + if(remaining!=0 && globalid==0 ){ + // this will add the rest of input buffer when the input size is not devidable to red_factor. + auto remaining_reduce =Eigen::internal::InnerMostDimReducer::reduce(device_self_evaluator, static_cast(red_factor*(rng)), static_cast(remaining), const_cast(op)); + auto accum = op.initialize(); + tmp_global_accessor.get_pointer()[0]= tmp_global_accessor.get_pointer()[0]*scale; + op.reduce(tmp_global_accessor.get_pointer()[0], &accum); + op.reduce(remaining_reduce, &accum); + op.finalize(accum); + tmp_global_accessor.get_pointer()[0]=accum/scale; } } }; - - } } } diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp index 251091f5b..440d48bca 100644 --- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp +++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp @@ -22,7 +22,7 @@ template -static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) { +static void test_full_reductions_mean_sycl(const Eigen::SyclDevice& sycl_device) { const IndexType num_rows = 452; const IndexType num_cols = 765; @@ -34,6 +34,37 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) { in.setRandom(); + full_redux = in.mean(); + + DataType* gpu_in_data = static_cast(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); + DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType)); + + TensorMap > in_gpu(gpu_in_data, tensorRange); + TensorMap > out_gpu(gpu_out_data); + + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.mean(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. 
+ VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + + +template +static void test_full_reductions_min_sycl(const Eigen::SyclDevice& sycl_device) { + + const IndexType num_rows = 876; + const IndexType num_cols = 953; + array tensorRange = {{num_rows, num_cols}}; + + Tensor in(tensorRange); + Tensor full_redux; + Tensor full_redux_gpu; + + in.setRandom(); + full_redux = in.minimum(); DataType* gpu_in_data = static_cast(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); @@ -50,8 +81,10 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) { sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } + + template -static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) { +static void test_first_dim_reductions_max_sycl(const Eigen::SyclDevice& sycl_device) { IndexType dim_x = 145; IndexType dim_y = 1; @@ -90,7 +123,7 @@ static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) } template -static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) { +static void test_last_dim_reductions_sum_sycl(const Eigen::SyclDevice &sycl_device) { IndexType dim_x = 567; IndexType dim_y = 1; @@ -132,12 +165,14 @@ template void sycl_reduction_test_per_device(const cl::sycl:: QueueInterface queueInterface(d); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_full_reductions_sycl(sycl_device); - test_first_dim_reductions_sycl(sycl_device); - test_last_dim_reductions_sycl(sycl_device); - test_full_reductions_sycl(sycl_device); - test_first_dim_reductions_sycl(sycl_device); - test_last_dim_reductions_sycl(sycl_device); + test_full_reductions_mean_sycl(sycl_device); + test_full_reductions_min_sycl(sycl_device); + test_first_dim_reductions_max_sycl(sycl_device); + test_last_dim_reductions_sum_sycl(sycl_device); + test_full_reductions_mean_sycl(sycl_device); + test_full_reductions_min_sycl(sycl_device); + test_first_dim_reductions_max_sycl(sycl_device); + test_last_dim_reductions_sum_sycl(sycl_device); } void test_cxx11_tensor_reduction_sycl() { for (const auto& device :Eigen::get_sycl_supported_devices()) { From fc8fd5fd24d3dce28b7fafa538b67e61dd667f6e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 7 Feb 2017 17:19:59 +0100 Subject: [PATCH 29/54] Improve multi-threading heuristic for matrix products with a small number of columns. --- Eigen/src/Core/products/Parallelizer.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 3477d7182..c2f084c82 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -104,13 +104,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, // - the sizes are large enough // compute the maximal number of threads from the size of the product: - // FIXME this has to be fine tuned + // This first heuristic takes into account that the product kernel is fully optimized when working with nr columns at once. Index size = transpose ? rows : cols; - Index pb_max_threads = std::max(1,size / 32); + Index pb_max_threads = std::max(1,size / Functor::Traits::nr); + // compute the maximal number of threads from the total amount of work: double work = static_cast(rows) * static_cast(cols) * static_cast(depth); - double kMinTaskSize = 50000; // Heuristic. 
+ double kMinTaskSize = 50000; // FIXME improve this heuristic. pb_max_threads = std::max(1, std::min(pb_max_threads, work / kMinTaskSize)); // compute the number of threads we are going to use From dd58462e63b0738842da5e509558ea12cabecee2 Mon Sep 17 00:00:00 2001 From: Alexander Neumann Date: Wed, 8 Feb 2017 23:50:38 +0100 Subject: [PATCH 30/54] fixed inlining issue with clang-cl on visual studio (grafted from 7962ac1a5855e8b7a60d5d90e61365b71f5501a5 ) --- Eigen/src/Core/AssignEvaluator.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 489935b83..b0ec7b7ca 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -515,7 +515,7 @@ struct dense_assignment_loop template struct dense_assignment_loop { - EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel) + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) { typedef typename Kernel::Scalar Scalar; typedef typename Kernel::PacketType PacketType; @@ -563,7 +563,7 @@ struct dense_assignment_loop template struct dense_assignment_loop { - EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel) + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) { typedef typename Kernel::DstEvaluatorType::XprType DstXprType; typedef typename Kernel::PacketType PacketType; From 0256c52359281f6685532ba8f1d517fbb91b46c6 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 10 Feb 2017 13:41:52 +0100 Subject: [PATCH 31/54] Include clang in the list of non strict MSVC (just to be sure) --- Eigen/src/Core/util/Macros.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 5db9e4fe5..bc033959c 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -80,8 +80,8 @@ // 2015 14 1900 // "15" 15 1900 -/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC -#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC) +/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC or clang-cl +#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG) #define EIGEN_COMP_MSVC_STRICT _MSC_VER #else #define EIGEN_COMP_MSVC_STRICT 0 From a1ff24f96a1280cd7d7395f739d8f265150879bb Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 10 Feb 2017 13:59:32 +0100 Subject: [PATCH 32/54] Fix prunning in (sparse*sparse).pruned() when the result is nearly dense. 
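For context, a usage sketch of the affected API (types and thresholds are arbitrary):

    #include <Eigen/Sparse>

    Eigen::SparseMatrix<double> prunedProduct(const Eigen::SparseMatrix<double>& A,
                                              const Eigen::SparseMatrix<double>& B) {
      // keep the product sparse by dropping zero / negligible coefficients on the fly;
      // the AmbiVector iterator patched below implements the skipping when the
      // intermediate result is stored in dense or linked-list mode
      return (A * B).pruned();
      // an explicit threshold can also be given, e.g. (A * B).pruned(ref, eps),
      // which (roughly) drops coefficients c with |c| <= ref * eps
    }
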
--- Eigen/src/SparseCore/AmbiVector.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/SparseCore/AmbiVector.h b/Eigen/src/SparseCore/AmbiVector.h index 1233e164e..8a5cc91f2 100644 --- a/Eigen/src/SparseCore/AmbiVector.h +++ b/Eigen/src/SparseCore/AmbiVector.h @@ -336,7 +336,7 @@ class AmbiVector<_Scalar,_StorageIndex>::Iterator { do { ++m_cachedIndex; - } while (m_cachedIndex::Iterator ListEl* EIGEN_RESTRICT llElements = reinterpret_cast(m_vector.m_buffer); do { m_currentEl = llElements[m_currentEl].next; - } while (m_currentEl>=0 && abs(llElements[m_currentEl].value)=0 && abs(llElements[m_currentEl].value)<=m_epsilon); if (m_currentEl<0) { m_cachedIndex = -1; @@ -363,9 +363,9 @@ class AmbiVector<_Scalar,_StorageIndex>::Iterator protected: const AmbiVector& m_vector; // the target vector - StorageIndex m_currentEl; // the current element in sparse/linked-list mode + StorageIndex m_currentEl; // the current element in sparse/linked-list mode RealScalar m_epsilon; // epsilon used to prune zero coefficients - StorageIndex m_cachedIndex; // current coordinate + StorageIndex m_cachedIndex; // current coordinate Scalar m_cachedValue; // current value bool m_isDense; // mode of the vector }; From 8b3cc54c42d6f2cc7db6f2a56da0e6510782b747 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 10 Feb 2017 13:08:49 -0800 Subject: [PATCH 33/54] Added a new EIGEN_HAS_INDEXED_VIEW define that set to 0 for older compilers that are known to fail to compile the indexed views (I used the define from the indexed_views.cpp test). Only include the indexed view methods when the compiler supports the code. This makes it possible to use Eigen again in complex code bases such as TensorFlow and older compilers such as gcc 4.8 --- Eigen/src/Core/util/Macros.h | 6 ++++++ Eigen/src/plugins/IndexedViewMethods.h | 5 ++--- test/indexed_view.cpp | 4 ++-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index bc033959c..0e2863306 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -349,6 +349,12 @@ # define __has_feature(x) 0 #endif +#if !( EIGEN_COMP_CLANG && ((EIGEN_COMP_CLANG<309) || defined(__apple_build_version__)) || EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<49) +#define EIGEN_HAS_INDEXED_VIEW 1 +#else +#define EIGEN_HAS_INDEXED_VIEW 0 +#endif + // Upperbound on the C++ version to use. // Expected values are 03, 11, 14, 17, etc. // By default, let's use an arbitrarily large C++ version. diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h index b2cc2944a..5e28ec71c 100644 --- a/Eigen/src/plugins/IndexedViewMethods.h +++ b/Eigen/src/plugins/IndexedViewMethods.h @@ -7,7 +7,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-#ifndef EIGEN_PARSED_BY_DOXYGEN +#if !defined(EIGEN_PARSED_BY_DOXYGEN) && EIGEN_HAS_INDEXED_VIEW // This file is automatically included twice to generate const and non-const versions @@ -256,5 +256,4 @@ template IndexedView_or_VectorBlock operator()(const Indices& indices); -#endif // EIGEN_PARSED_BY_DOXYGEN - +#endif // EIGEN_PARSED_BY_DOXYGEN && EIGEN_HAS_INDEXED_VIEW diff --git a/test/indexed_view.cpp b/test/indexed_view.cpp index 86342dc0a..4cbc00639 100644 --- a/test/indexed_view.cpp +++ b/test/indexed_view.cpp @@ -79,6 +79,7 @@ is_same_seq_type(const T1& a, const T2& b) void check_indexed_view() { +#if EIGEN_HAS_INDEXED_VIEW using Eigen::placeholders::all; using Eigen::placeholders::last; using Eigen::placeholders::end; @@ -297,7 +298,6 @@ void check_indexed_view() VERIFY_IS_APPROX( (A(std::array{{1,3,5}}, std::array{{9,6,3,0}})), A(seqN(1,3,2), seqN(9,4,-3)) ); -#if !( EIGEN_COMP_CLANG && ((EIGEN_COMP_CLANG<309) || defined(__apple_build_version__)) || EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<49) VERIFY_IS_APPROX( A({3, 1, 6, 5}, all), A(std::array{{3, 1, 6, 5}}, all) ); VERIFY_IS_APPROX( A(all,{3, 1, 6, 5}), A(all,std::array{{3, 1, 6, 5}}) ); VERIFY_IS_APPROX( A({1,3,5},{3, 1, 6, 5}), A(std::array{{1,3,5}},std::array{{3, 1, 6, 5}}) ); @@ -310,7 +310,6 @@ void check_indexed_view() VERIFY_IS_APPROX( b({3, 1, 6, 5}), b(std::array{{3, 1, 6, 5}}) ); VERIFY_IS_EQUAL( b({1,3,5}).SizeAtCompileTime, 3 ); -#endif #endif @@ -366,6 +365,7 @@ void check_indexed_view() VERIFY( is_same_eq( cA.middleRows<3>(1), cA.middleRows(1,fix<3>)) ); } +#endif // EIGEN_HAS_INDEXED_VIEW } void test_indexed_view() From 1ef30b8090ad56bac5d1b0553a2b0a894a77405e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 10 Feb 2017 13:35:10 -0800 Subject: [PATCH 34/54] Fixed bug introduced in previous commit --- Eigen/src/plugins/IndexedViewMethods.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h index 5e28ec71c..81e463623 100644 --- a/Eigen/src/plugins/IndexedViewMethods.h +++ b/Eigen/src/plugins/IndexedViewMethods.h @@ -7,7 +7,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#if !defined(EIGEN_PARSED_BY_DOXYGEN) && EIGEN_HAS_INDEXED_VIEW +#if EIGEN_HAS_INDEXED_VIEW +#if !defined(EIGEN_PARSED_BY_DOXYGEN) // This file is automatically included twice to generate const and non-const versions @@ -256,4 +257,5 @@ template IndexedView_or_VectorBlock operator()(const Indices& indices); -#endif // EIGEN_PARSED_BY_DOXYGEN && EIGEN_HAS_INDEXED_VIEW +#endif // EIGEN_PARSED_BY_DOXYGEN +#endif // EIGEN_HAS_INDEXED_VIEW From 4a4a72951fc6175e7e5ee3bcdc96a2c8b7160e68 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 11 Feb 2017 10:28:44 +0100 Subject: [PATCH 35/54] Fix previous commits: disbale only problematic indexed view methods for old compilers instead of disabling everything. Tested with gcc 4.7 (c++03) and gcc 4.8 (c++03 & c++11) --- Eigen/src/plugins/IndexedViewMethods.h | 10 ++++++++-- test/indexed_view.cpp | 4 ++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h index 81e463623..22c1666c5 100644 --- a/Eigen/src/plugins/IndexedViewMethods.h +++ b/Eigen/src/plugins/IndexedViewMethods.h @@ -7,7 +7,6 @@ // Public License v. 2.0. 
If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#if EIGEN_HAS_INDEXED_VIEW #if !defined(EIGEN_PARSED_BY_DOXYGEN) // This file is automatically included twice to generate const and non-const versions @@ -113,6 +112,8 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND return Base::operator()(internal::eval_expr_given_size(rowIndices,rows()),internal::eval_expr_given_size(colIndices,cols())); } +#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE + // The folowing three overloads are needed to handle raw Index[N] arrays. template @@ -139,6 +140,8 @@ operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndicesT (&col (derived(), rowIndices, colIndices); } +#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE + // Overloads for 1D vectors/arrays template @@ -182,6 +185,8 @@ operator()(const IndexType& id) EIGEN_INDEXED_VIEW_METHOD_CONST return Base::operator()(internal::eval_expr_given_size(id,size())); } +#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE + template typename internal::enable_if >::type @@ -202,6 +207,8 @@ operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST (derived(), indices, IvcIndex(0)); } +#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE + #undef EIGEN_INDEXED_VIEW_METHOD_CONST #undef EIGEN_INDEXED_VIEW_METHOD_TYPE @@ -258,4 +265,3 @@ IndexedView_or_VectorBlock operator()(const Indices& indices); #endif // EIGEN_PARSED_BY_DOXYGEN -#endif // EIGEN_HAS_INDEXED_VIEW diff --git a/test/indexed_view.cpp b/test/indexed_view.cpp index 4cbc00639..7245cf378 100644 --- a/test/indexed_view.cpp +++ b/test/indexed_view.cpp @@ -79,7 +79,6 @@ is_same_seq_type(const T1& a, const T2& b) void check_indexed_view() { -#if EIGEN_HAS_INDEXED_VIEW using Eigen::placeholders::all; using Eigen::placeholders::last; using Eigen::placeholders::end; @@ -298,6 +297,7 @@ void check_indexed_view() VERIFY_IS_APPROX( (A(std::array{{1,3,5}}, std::array{{9,6,3,0}})), A(seqN(1,3,2), seqN(9,4,-3)) ); +#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE VERIFY_IS_APPROX( A({3, 1, 6, 5}, all), A(std::array{{3, 1, 6, 5}}, all) ); VERIFY_IS_APPROX( A(all,{3, 1, 6, 5}), A(all,std::array{{3, 1, 6, 5}}) ); VERIFY_IS_APPROX( A({1,3,5},{3, 1, 6, 5}), A(std::array{{1,3,5}},std::array{{3, 1, 6, 5}}) ); @@ -310,6 +310,7 @@ void check_indexed_view() VERIFY_IS_APPROX( b({3, 1, 6, 5}), b(std::array{{3, 1, 6, 5}}) ); VERIFY_IS_EQUAL( b({1,3,5}).SizeAtCompileTime, 3 ); +#endif #endif @@ -365,7 +366,6 @@ void check_indexed_view() VERIFY( is_same_eq( cA.middleRows<3>(1), cA.middleRows(1,fix<3>)) ); } -#endif // EIGEN_HAS_INDEXED_VIEW } void test_indexed_view() From 6486d4fc959a91743f9330e460c13ee4b2e10723 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 11 Feb 2017 10:29:10 +0100 Subject: [PATCH 36/54] Worakound gcc 4.7 issue in c++11. 
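In short, the C++11-only code path is now additionally conditioned on gcc >= 4.8. A simplified sketch of such a guard (the real one uses EIGEN_HAS_CXX11 and EIGEN_COMP_GNUC, which also covers other gcc-compatible front ends):

    #if __cplusplus >= 201103L && (!defined(__GNUC__) || (__GNUC__ * 10 + __GNUC_MINOR__) >= 48)
      // auto/decltype based implementation (e.g. the reverse() member below)
    #else
      // C++03-compatible fallback
    #endif
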
--- Eigen/src/Core/ArithmeticSequence.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/ArithmeticSequence.h b/Eigen/src/Core/ArithmeticSequence.h index 99b954432..ada1571f1 100644 --- a/Eigen/src/Core/ArithmeticSequence.h +++ b/Eigen/src/Core/ArithmeticSequence.h @@ -14,7 +14,7 @@ namespace Eigen { namespace internal { -#if !EIGEN_HAS_CXX11 +#if (!EIGEN_HAS_CXX11) || !((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48) template struct aseq_negate {}; template<> struct aseq_negate { @@ -138,7 +138,7 @@ protected: public: -#if EIGEN_HAS_CXX11 +#if EIGEN_HAS_CXX11 && ((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48) auto reverse() const -> decltype(Eigen::seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr)) { return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr); } From e43016367af7b2a4b27c9a1633eee52d6c00dc76 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 11 Feb 2017 10:34:18 +0100 Subject: [PATCH 37/54] Forgot to include a file in previous commit --- Eigen/src/Core/util/Macros.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 0e2863306..12531e342 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -349,10 +349,12 @@ # define __has_feature(x) 0 #endif +// Some old compilers do not support template specializations like: +// template void foo(const T x[N]); #if !( EIGEN_COMP_CLANG && ((EIGEN_COMP_CLANG<309) || defined(__apple_build_version__)) || EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<49) -#define EIGEN_HAS_INDEXED_VIEW 1 +#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 1 #else -#define EIGEN_HAS_INDEXED_VIEW 0 +#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 0 #endif // Upperbound on the C++ version to use. From c16ee72b2093ae635b1f07ffd95626c3a86dbbfe Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 11 Feb 2017 10:35:01 +0100 Subject: [PATCH 38/54] bug #1392: fix #include with mpl2-only --- Eigen/Sparse | 2 ++ test/mpl2only.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/Eigen/Sparse b/Eigen/Sparse index a2ef7a665..136e681a1 100644 --- a/Eigen/Sparse +++ b/Eigen/Sparse @@ -25,7 +25,9 @@ #include "SparseCore" #include "OrderingMethods" +#ifndef EIGEN_MPL2_ONLY #include "SparseCholesky" +#endif #include "SparseLU" #include "SparseQR" #include "IterativeLinearSolvers" diff --git a/test/mpl2only.cpp b/test/mpl2only.cpp index 5ef0d2b2e..7d04d6bba 100644 --- a/test/mpl2only.cpp +++ b/test/mpl2only.cpp @@ -12,7 +12,9 @@ #include #include #include +#include #include +#include int main() { From b3750990d551bc55949c0312a68233e99fc7961a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 11 Feb 2017 23:24:06 +0100 Subject: [PATCH 39/54] Workaround some gcc 4.7 warnings --- Eigen/src/Core/IndexedView.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h index 63878428e..8c57a277c 100644 --- a/Eigen/src/Core/IndexedView.h +++ b/Eigen/src/Core/IndexedView.h @@ -19,8 +19,8 @@ struct traits > : traits { enum { - RowsAtCompileTime = array_size::value, - ColsAtCompileTime = array_size::value, + RowsAtCompileTime = int(array_size::value), + ColsAtCompileTime = int(array_size::value), MaxRowsAtCompileTime = RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime) : int(traits::MaxRowsAtCompileTime), MaxColsAtCompileTime = ColsAtCompileTime != Dynamic ? 
int(ColsAtCompileTime) : int(traits::MaxColsAtCompileTime), @@ -29,8 +29,8 @@ struct traits > : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0 : XprTypeIsRowMajor, - RowIncr = get_compile_time_incr::value, - ColIncr = get_compile_time_incr::value, + RowIncr = int(get_compile_time_incr::value), + ColIncr = int(get_compile_time_incr::value), InnerIncr = IsRowMajor ? ColIncr : RowIncr, OuterIncr = IsRowMajor ? RowIncr : ColIncr, @@ -51,7 +51,7 @@ struct traits > // FIXME we deal with compile-time strides if and only if we have DirectAccessBit flag, // but this is too strict regarding negative strides... - DirectAccessMask = (InnerIncr!=UndefinedIncr && OuterIncr!=UndefinedIncr && InnerIncr>=0 && OuterIncr>=0) ? DirectAccessBit : 0, + DirectAccessMask = (int(InnerIncr)!=UndefinedIncr && int(OuterIncr)!=UndefinedIncr && InnerIncr>=0 && OuterIncr>=0) ? DirectAccessBit : 0, FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0, FlagsLvalueBit = is_lvalue::value ? LvalueBit : 0, Flags = (traits::Flags & (HereditaryBits | DirectAccessMask)) | FlagsLvalueBit | FlagsRowMajorBit From e7ebe52bfb4b0653e69217d9beac75ca7949e165 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 13 Feb 2017 09:46:20 +0100 Subject: [PATCH 40/54] bug #1391: include IO.h before DenseBase to enable its usage in DenseBase plugins. --- Eigen/Core | 2 +- Eigen/src/Core/DenseBase.h | 12 +++++++++++- Eigen/src/Core/IO.h | 14 -------------- 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 9f1c63826..d18835613 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -432,6 +432,7 @@ using std::ptrdiff_t; #include "src/Core/util/IndexedViewHelper.h" #include "src/Core/ArithmeticSequence.h" +#include "src/Core/IO.h" #include "src/Core/DenseCoeffsBase.h" #include "src/Core/DenseBase.h" #include "src/Core/MatrixBase.h" @@ -480,7 +481,6 @@ using std::ptrdiff_t; #include "src/Core/Redux.h" #include "src/Core/Visitor.h" #include "src/Core/Fuzzy.h" -#include "src/Core/IO.h" #include "src/Core/Swap.h" #include "src/Core/CommaInitializer.h" #include "src/Core/GeneralProduct.h" diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index a8229cf03..fc807577b 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -463,7 +463,17 @@ template class DenseBase EIGEN_DEVICE_FUNC void visit(Visitor& func) const; - inline const WithFormat format(const IOFormat& fmt) const; + /** \returns a WithFormat proxy object allowing to print a matrix the with given + * format \a fmt. + * + * See class IOFormat for some examples. + * + * \sa class IOFormat, class WithFormat + */ + inline const WithFormat format(const IOFormat& fmt) const + { + return WithFormat(derived(), fmt); + } /** \returns the unique coefficient of a 1x1 expression */ EIGEN_DEVICE_FUNC diff --git a/Eigen/src/Core/IO.h b/Eigen/src/Core/IO.h index 644228c3f..da7fd6cce 100644 --- a/Eigen/src/Core/IO.h +++ b/Eigen/src/Core/IO.h @@ -109,20 +109,6 @@ class WithFormat IOFormat m_format; }; -/** \returns a WithFormat proxy object allowing to print a matrix the with given - * format \a fmt. - * - * See class IOFormat for some examples. 
- * - * \sa class IOFormat, class WithFormat - */ -template -inline const WithFormat -DenseBase::format(const IOFormat& fmt) const -{ - return WithFormat(derived(), fmt); -} - namespace internal { // NOTE: This helper is kept for backward compatibility with previous code specializing From 3453b00a1ef895a4b2eb5f349ab0bf2d50ca0535 Mon Sep 17 00:00:00 2001 From: Jonathan Hseu Date: Sat, 11 Feb 2017 21:45:32 -0800 Subject: [PATCH 41/54] Fix vector indexing with uint64_t --- Eigen/src/Core/util/Meta.h | 24 +++++++++++++----------- test/basicstuff.cpp | 13 +++++++++++++ 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index ee0531b32..11c089020 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -97,17 +97,19 @@ template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; -template struct is_integral { enum { value = false }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; +template struct is_integral { enum { value = false }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; template struct add_const { typedef const T type; }; template struct add_const { typedef T& type; }; diff --git a/test/basicstuff.cpp b/test/basicstuff.cpp index 99d91f9da..52b89c0d6 100644 --- a/test/basicstuff.cpp +++ b/test/basicstuff.cpp @@ -49,6 +49,19 @@ template void basicStuff(const MatrixType& m) v1[r] = x; VERIFY_IS_APPROX(x, v1[r]); + // test fetching with various index types. + x = v1(static_cast(r)); + x = v1(static_cast(r)); + x = v1(static_cast(r)); + x = v1(static_cast(r)); + x = v1(static_cast(r)); + x = v1(static_cast(r)); + x = v1(static_cast(r)); + x = v1(static_cast(r)); + x = v1(static_cast(r)); + x = v1(static_cast(r)); + x = v1(static_cast(r)); + VERIFY_IS_APPROX( v1, v1); VERIFY_IS_NOT_APPROX( v1, 2*v1); VERIFY_IS_MUCH_SMALLER_THAN( vzero, v1); From 707343094637b112d49089372a0a8e0a06b0b34c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 13 Feb 2017 17:14:04 +0100 Subject: [PATCH 42/54] Fix overflow and make use of long long in c++11 only. 
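Two separate issues are addressed in the test: the index fed to the narrow integer types is now drawn from [0, min(127, rows-1)] so it cannot overflow a signed 8-bit value, and the long long casts are only compiled in C++11 mode (long long is not part of C++03). A condensed, stand-alone sketch of the same pattern, using a plain std::vector and hypothetical names:

    #include <algorithm>
    #include <vector>

    template <typename Scalar>
    Scalar fetch_with_small_index_types(const std::vector<Scalar>& v, long rows) {
      const long r1 = std::min(127L, rows - 1);        // always fits in a signed 8-bit type
      Scalar x = v[static_cast<unsigned char>(r1)];
      x = v[static_cast<signed char>(r1)];
      x = v[static_cast<short>(r1)];
    #if __cplusplus >= 201103L
      x = v[static_cast<long long>(r1)];               // C++11 only
      x = v[static_cast<unsigned long long>(r1)];
    #endif
      return x;
    }
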
--- test/basicstuff.cpp | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/test/basicstuff.cpp b/test/basicstuff.cpp index 52b89c0d6..c346ce6cb 100644 --- a/test/basicstuff.cpp +++ b/test/basicstuff.cpp @@ -50,17 +50,20 @@ template void basicStuff(const MatrixType& m) VERIFY_IS_APPROX(x, v1[r]); // test fetching with various index types. - x = v1(static_cast(r)); - x = v1(static_cast(r)); - x = v1(static_cast(r)); - x = v1(static_cast(r)); - x = v1(static_cast(r)); - x = v1(static_cast(r)); - x = v1(static_cast(r)); - x = v1(static_cast(r)); - x = v1(static_cast(r)); - x = v1(static_cast(r)); - x = v1(static_cast(r)); + Index r1 = internal::random(0, numext::mini(Index(127),rows-1)); + x = v1(static_cast(r1)); + x = v1(static_cast(r1)); + x = v1(static_cast(r1)); + x = v1(static_cast(r1)); + x = v1(static_cast(r1)); + x = v1(static_cast(r1)); + x = v1(static_cast(r1)); + x = v1(static_cast(r1)); + x = v1(static_cast(r1)); +#if EIGEN_HAS_CXX11 + x = v1(static_cast(r1)); + x = v1(static_cast(r1)); +#endif VERIFY_IS_APPROX( v1, v1); VERIFY_IS_NOT_APPROX( v1, 2*v1); From 5937c4ae32feec178d56282694f06ed16cfe7352 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 13 Feb 2017 17:14:26 +0100 Subject: [PATCH 43/54] Fall back is_integral to std::is_integral in c++11 --- Eigen/src/Core/util/Meta.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 11c089020..90eda6e70 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -97,6 +97,9 @@ template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; +#if EIGEN_HAS_CXX11 +using std::is_integral; +#else template struct is_integral { enum { value = false }; }; template<> struct is_integral { enum { value = true }; }; template<> struct is_integral { enum { value = true }; }; @@ -108,8 +111,8 @@ template<> struct is_integral { enum { value = true }; } template<> struct is_integral { enum { value = true }; }; template<> struct is_integral { enum { value = true }; }; template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; +#endif + template struct add_const { typedef const T type; }; template struct add_const { typedef T& type; }; From 0d153ded29022021c4f7ac24b73a0adb1e423013 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Mon, 13 Feb 2017 17:25:12 +0000 Subject: [PATCH 44/54] Adding TensorChippingOP for sycl backend; fixing the index value in the verification operation for cxx11_tensorChipping.cpp test --- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 29 +- .../TensorSyclConvertToDeviceExpression.h | 12 + .../src/Tensor/TensorSyclExprConstructor.h | 17 + .../src/Tensor/TensorSyclExtractAccessor.h | 17 +- .../src/Tensor/TensorSyclExtractFunctors.h | 16 + .../CXX11/src/Tensor/TensorSyclLeafCount.h | 11 + .../src/Tensor/TensorSyclPlaceHolderExpr.h | 12 + unsupported/test/CMakeLists.txt | 1 + unsupported/test/cxx11_tensor_chipping.cpp | 8 +- .../test/cxx11_tensor_chipping_sycl.cpp | 622 ++++++++++++++++++ 10 files changed, 732 insertions(+), 13 deletions(-) create mode 100644 unsupported/test/cxx11_tensor_chipping_sycl.cpp diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 1ba7ef170..f335edf7d 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -150,7 +150,7 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) + : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset()) { EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(NumInputDims > m_dim.actualDim()); @@ -206,7 +206,7 @@ struct TensorEvaluator, Device> eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); Index inputIndex = index * m_inputStride + m_inputOffset; @@ -218,7 +218,7 @@ struct TensorEvaluator, Device> PacketReturnType rslt = internal::pload(values); return rslt; } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. eigen_assert(m_stride > index); return m_impl.template packet(index + m_inputOffset); @@ -274,17 +274,29 @@ struct TensorEvaluator, Device> } } + /// used by sycl + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex dimId() const { + return m_dim.actualDim(); + } + + /// used by sycl + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DenseIndex& offset() const { + return m_offset; + } + /// required by sycl in order to extract the accessor + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex; if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); inputIndex = index * m_inputStride + m_inputOffset; } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims-1) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. 
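          // Illustrative example for this branch (not part of the patch): here the chipped
          // dimension is the one with the largest stride, so the chip is a contiguous block
          // of the input and the mapping is a plain shift by m_inputOffset (== offset * m_stride).
          // E.g. chipping dim 2 at offset 4 of a col-major 2x3x5 tensor gives m_stride = 2*3 = 6,
          // m_inputOffset = 24; output index 3 -> coords (1,1) -> input (1,1,4) -> 1 + 1*2 + 4*6 = 27 = 3 + 24.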
eigen_assert(m_stride > index); inputIndex = index + m_inputOffset; @@ -304,6 +316,9 @@ struct TensorEvaluator, Device> TensorEvaluator m_impl; const internal::DimensionId m_dim; const Device& m_device; +// required by sycl + const DenseIndex m_offset; + }; @@ -344,7 +359,7 @@ struct TensorEvaluator, Device> EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == 0) || - (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { + (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(this->m_stride == 1); EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; @@ -355,7 +370,7 @@ struct TensorEvaluator, Device> inputIndex += this->m_inputStride; } } else if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) || - (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == 0)) { + (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. eigen_assert(this->m_stride > index); this->m_impl.template writePacket(index + this->m_inputOffset, x); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index c0bcf26cd..ee8f3c9c2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -146,6 +146,18 @@ KERNELBROKERCONVERTERSLICESTRIDEOP() #undef KERNELBROKERCONVERTERSLICESTRIDEOP +/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp +#define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\ +template \ +struct ConvertToDeviceExpression > {\ + typedef CVQual TensorChippingOp::Type> Type;\ +}; +KERNELBROKERCONVERTCHIPPINGOP(const) +KERNELBROKERCONVERTCHIPPINGOP() +#undef KERNELBROKERCONVERTCHIPPINGOP + + + } // namespace internal } // namespace TensorSycl } // namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index 56ba82805..3b83b1d2c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -368,6 +368,23 @@ SYCLPADDINGOPEXPRCONST(TensorPaddingOp, ) #undef SYCLPADDINGOPEXPRCONST +// TensorChippingOp +#define SYCLTENSORCHIPPINGOPEXPR(CVQual)\ +template\ +struct ExprConstructor , CVQual TensorChippingOp, Params... 
>{\ + typedef ExprConstructor my_xpr_type;\ + typedef CVQual TensorChippingOp Type;\ + my_xpr_type xprExpr;\ + Type expr;\ + template \ + ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ + : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.offset(), funcD.dimId()) {}\ +}; + +SYCLTENSORCHIPPINGOPEXPR(const) +SYCLTENSORCHIPPINGOPEXPR() +#undef SYCLTENSORCHIPPINGOPEXPR + /// template deduction for \ref ExprConstructor struct template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index 3fd607941..b512d43f6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -188,7 +188,7 @@ SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp) /// specialisation of the \ref ExtractAccessor struct when the node type is -/// const TensorSlicingOp. +/// const TensorSlicingOp. #define SYCLSLICEOPEXTACC(CVQual)\ template \ struct ExtractAccessor, Dev> > {\ @@ -200,7 +200,7 @@ SYCLSLICEOPEXTACC(const) SYCLSLICEOPEXTACC() #undef SYCLSLICEOPEXTACC // specialisation of the \ref ExtractAccessor struct when the node type is -/// const TensorStridingSlicingOp. +/// TensorStridingSlicingOp. #define SYCLSLICESTRIDEOPEXTACC(CVQual)\ template\ struct ExtractAccessor, Dev> >{\ @@ -212,6 +212,19 @@ SYCLSLICESTRIDEOPEXTACC(const) SYCLSLICESTRIDEOPEXTACC() #undef SYCLSLICESTRIDEOPEXTACC +// specialisation of the \ref ExtractAccessor struct when the node type is +/// TensorChippingOp. +#define SYCLTENSORCHIPPINGOPEXTACC(CVQual)\ +template\ +struct ExtractAccessor, Dev> >{\ + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ + RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ +}; + +SYCLTENSORCHIPPINGOPEXTACC(const) +SYCLTENSORCHIPPINGOPEXTACC() +#undef SYCLTENSORCHIPPINGOPEXTACC + /// template deduction for \ref ExtractAccessor template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index e26cbdf6d..ee020184b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -290,6 +290,22 @@ SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const) SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),) #undef SYCLEXTRFUNCCONTRACTCONCAT +//TensorChippingOp +#define SYCLEXTRFUNCCHIPPINGOP(CVQual)\ +template\ +struct FunctorExtractor, Device>>{\ + FunctorExtractor > xprExpr;\ + const DenseIndex m_dim;\ + const DenseIndex m_offset;\ + EIGEN_STRONG_INLINE const DenseIndex& dimId() const { return m_dim; }\ + EIGEN_STRONG_INLINE const DenseIndex& offset() const { return m_offset; }\ + FunctorExtractor(const TensorEvaluator, Device>& expr)\ + : xprExpr(expr.impl()), m_dim(expr.dimId()), m_offset(expr.offset()) {}\ +}; + +SYCLEXTRFUNCCHIPPINGOP(const) +SYCLEXTRFUNCCHIPPINGOP() +#undef SYCLEXTRFUNCCHIPPINGOP /// template deduction function for FunctorExtractor template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h index 0ac51e7bf..a1c112f4d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h @@ -139,6 +139,17 @@ SLICEOPLEAFCOUNT(const) SLICEOPLEAFCOUNT() #undef SLICEOPLEAFCOUNT + +/// specialisation of the \ref LeafCount struct when 
the node type is TensorChippingOp +#define CHIPPINGOPLEAFCOUNT(CVQual)\ +template \ +struct LeafCount >:CategoryCount{}; + +CHIPPINGOPLEAFCOUNT(const) +CHIPPINGOPLEAFCOUNT() +#undef CHIPPINGOPLEAFCOUNT + + #define SLICESTRIDEOPLEAFCOUNT(CVQual)\ template\ struct LeafCount >:CategoryCount{}; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h index f6e3b4766..74566dcee 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h @@ -156,6 +156,18 @@ EVALTO() #undef EVALTO +/// specialisation of the \ref PlaceHolderExpression when the node is +/// TensorChippingOp +#define CHIPPINGOP(CVQual)\ +template \ +struct PlaceHolderExpression, N> {\ + typedef CVQual TensorChippingOp< DimId, typename CalculateIndex ::ArgType> Type;\ +}; + +CHIPPINGOP(const) +CHIPPINGOP() +#undef CHIPPINGOP + /// specialisation of the \ref PlaceHolderExpression when the node is /// TensorReductionOp #define SYCLREDUCTION(CVQual)\ diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 9fa479f52..003c9de0b 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -166,6 +166,7 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_reverse_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_convolution_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11") + ei_add_test_sycl(cxx11_tensor_chipping_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp index 1832dec8b..89cf5c7b7 100644 --- a/unsupported/test/cxx11_tensor_chipping.cpp +++ b/unsupported/test/cxx11_tensor_chipping.cpp @@ -43,7 +43,7 @@ static void test_simple_chip() VERIFY_IS_EQUAL(chip2.dimension(2), 7); VERIFY_IS_EQUAL(chip2.dimension(3), 11); for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { + for (int j = 0; j < 5; ++j) { for (int k = 0; k < 7; ++k) { for (int l = 0; l < 11; ++l) { VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l)); @@ -75,7 +75,7 @@ static void test_simple_chip() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (int l = 0; l < 11; ++l) { VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l)); } } @@ -126,7 +126,7 @@ static void test_dynamic_chip() VERIFY_IS_EQUAL(chip2.dimension(2), 7); VERIFY_IS_EQUAL(chip2.dimension(3), 11); for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { + for (int j = 0; j < 5; ++j) { for (int k = 0; k < 7; ++k) { for (int l = 0; l < 11; ++l) { VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l)); @@ -158,7 +158,7 @@ static void test_dynamic_chip() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (int l = 0; l < 11; ++l) { VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l)); } } diff --git a/unsupported/test/cxx11_tensor_chipping_sycl.cpp b/unsupported/test/cxx11_tensor_chipping_sycl.cpp new file mode 100644 index 000000000..39e4f0a7f --- /dev/null +++ b/unsupported/test/cxx11_tensor_chipping_sycl.cpp @@ -0,0 +1,622 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. 
+// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_chipping_sycl +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" + +#include + +using Eigen::Tensor; + +template +static void test_static_chip_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + + array tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + array chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + + Tensor tensor(tensorRange); + Tensor chip1(chip1TensorRange); + + tensor.setRandom(); + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType); + DataType* gpu_data_tensor = static_cast(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_chip1 = static_cast(sycl_device.allocate(chip1TensorBuffSize)); + + TensorMap> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap> gpu_chip1(gpu_data_chip1, chip1TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); + gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(1l); + sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize); + + VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2); + VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3); + VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim2; ++i) { + for (IndexType j = 0; j < sizeDim3; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l)); + } + } + } + } + + array chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}}; + Tensor chip2(chip2TensorRange); + const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType); + DataType* gpu_data_chip2 = static_cast(sycl_device.allocate(chip2TensorBuffSize)); + TensorMap> gpu_chip2(gpu_data_chip2, chip2TensorRange); + + gpu_chip2.device(sycl_device)=gpu_tensor.template chip<1l>(1l); + sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize); + + VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3); + VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim3; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l)); + } + } + } + } + + array chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}}; + Tensor chip3(chip3TensorRange); + const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType); + DataType* gpu_data_chip3 = static_cast(sycl_device.allocate(chip3TensorBuffSize)); + TensorMap> gpu_chip3(gpu_data_chip3, chip3TensorRange); + + gpu_chip3.device(sycl_device)=gpu_tensor.template chip<2l>(2l); + sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize); + + 
VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l)); + } + } + } + } + + array chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}}; + Tensor chip4(chip4TensorRange); + const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType); + DataType* gpu_data_chip4 = static_cast(sycl_device.allocate(chip4TensorBuffSize)); + TensorMap> gpu_chip4(gpu_data_chip4, chip4TensorRange); + + gpu_chip4.device(sycl_device)=gpu_tensor.template chip<3l>(5l); + sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize); + + VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3); + VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l)); + } + } + } + } + + + array chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor chip5(chip5TensorRange); + const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType); + DataType* gpu_data_chip5 = static_cast(sycl_device.allocate(chip5TensorBuffSize)); + TensorMap> gpu_chip5(gpu_data_chip5, chip5TensorRange); + + gpu_chip5.device(sycl_device)=gpu_tensor.template chip<4l>(7l); + sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize); + + VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3); + VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim4; ++l) { + VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l)); + } + } + } + } + + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_chip1); + sycl_device.deallocate(gpu_data_chip2); + sycl_device.deallocate(gpu_data_chip3); + sycl_device.deallocate(gpu_data_chip4); + sycl_device.deallocate(gpu_data_chip5); +} + +template +static void test_dynamic_chip_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + + array tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + array chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + + Tensor tensor(tensorRange); + Tensor chip1(chip1TensorRange); + + tensor.setRandom(); + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType); + DataType* gpu_data_tensor = static_cast(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_chip1 = static_cast(sycl_device.allocate(chip1TensorBuffSize)); + + TensorMap> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap> gpu_chip1(gpu_data_chip1, chip1TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); + 
gpu_chip1.device(sycl_device)=gpu_tensor.chip(1l,0l); + sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize); + + VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2); + VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3); + VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim2; ++i) { + for (IndexType j = 0; j < sizeDim3; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l)); + } + } + } + } + + array chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}}; + Tensor chip2(chip2TensorRange); + const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType); + DataType* gpu_data_chip2 = static_cast(sycl_device.allocate(chip2TensorBuffSize)); + TensorMap> gpu_chip2(gpu_data_chip2, chip2TensorRange); + + gpu_chip2.device(sycl_device)=gpu_tensor.chip(1l,1l); + sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize); + + VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3); + VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim3; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l)); + } + } + } + } + + array chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}}; + Tensor chip3(chip3TensorRange); + const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType); + DataType* gpu_data_chip3 = static_cast(sycl_device.allocate(chip3TensorBuffSize)); + TensorMap> gpu_chip3(gpu_data_chip3, chip3TensorRange); + + gpu_chip3.device(sycl_device)=gpu_tensor.chip(2l,2l); + sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize); + + VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l)); + } + } + } + } + + array chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}}; + Tensor chip4(chip4TensorRange); + const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType); + DataType* gpu_data_chip4 = static_cast(sycl_device.allocate(chip4TensorBuffSize)); + TensorMap> gpu_chip4(gpu_data_chip4, chip4TensorRange); + + gpu_chip4.device(sycl_device)=gpu_tensor.chip(5l,3l); + sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize); + + VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3); + VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l)); + } + } + } + } + + + array chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor chip5(chip5TensorRange); + const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType); + DataType* gpu_data_chip5 = 
static_cast(sycl_device.allocate(chip5TensorBuffSize)); + TensorMap> gpu_chip5(gpu_data_chip5, chip5TensorRange); + + gpu_chip5.device(sycl_device)=gpu_tensor.chip(7l,4l); + sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize); + + VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3); + VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim4; ++l) { + VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l)); + } + } + } + } + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_chip1); + sycl_device.deallocate(gpu_data_chip2); + sycl_device.deallocate(gpu_data_chip3); + sycl_device.deallocate(gpu_data_chip4); + sycl_device.deallocate(gpu_data_chip5); +} + +template +static void test_chip_in_expr(const Eigen::SyclDevice& sycl_device) { + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + + array tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + array chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + + Tensor tensor(tensorRange); + + Tensor chip1(chip1TensorRange); + Tensor tensor1(chip1TensorRange); + tensor.setRandom(); + tensor1.setRandom(); + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType); + DataType* gpu_data_tensor = static_cast(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_chip1 = static_cast(sycl_device.allocate(chip1TensorBuffSize)); + DataType* gpu_data_tensor1 = static_cast(sycl_device.allocate(chip1TensorBuffSize)); + + TensorMap> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap> gpu_chip1(gpu_data_chip1, chip1TensorRange); + TensorMap> gpu_tensor1(gpu_data_tensor1, chip1TensorRange); + + + sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); + sycl_device.memcpyHostToDevice(gpu_data_tensor1, tensor1.data(), chip1TensorBuffSize); + gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(0l) + gpu_tensor1; + sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize); + + for (int i = 0; i < sizeDim2; ++i) { + for (int j = 0; j < sizeDim3; ++j) { + for (int k = 0; k < sizeDim4; ++k) { + for (int l = 0; l < sizeDim5; ++l) { + float expected = tensor(0l,i,j,k,l) + tensor1(i,j,k,l); + VERIFY_IS_EQUAL(chip1(i,j,k,l), expected); + } + } + } + } + + array chip2TensorRange = {{sizeDim2, sizeDim4, sizeDim5}}; + Tensor tensor2(chip2TensorRange); + Tensor chip2(chip2TensorRange); + tensor2.setRandom(); + const size_t chip2TensorBuffSize =tensor2.size()*sizeof(DataType); + DataType* gpu_data_tensor2 = static_cast(sycl_device.allocate(chip2TensorBuffSize)); + DataType* gpu_data_chip2 = static_cast(sycl_device.allocate(chip2TensorBuffSize)); + TensorMap> gpu_tensor2(gpu_data_tensor2, chip2TensorRange); + TensorMap> gpu_chip2(gpu_data_chip2, chip2TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_tensor2, tensor2.data(), chip2TensorBuffSize); + gpu_chip2.device(sycl_device)=gpu_tensor.template chip<0l>(0l).template chip<1l>(2l) + gpu_tensor2; + sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize); + + for (int i = 0; i < sizeDim2; ++i) { + for (int j = 0; j < sizeDim4; ++j) { + for (int k = 0; k < 
sizeDim5; ++k) { + float expected = tensor(0l,i,2l,j,k) + tensor2(i,j,k); + VERIFY_IS_EQUAL(chip2(i,j,k), expected); + } + } + } + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_tensor1); + sycl_device.deallocate(gpu_data_chip1); + sycl_device.deallocate(gpu_data_tensor2); + sycl_device.deallocate(gpu_data_chip2); +} + +template +static void test_chip_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device) +{ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + + array tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + array input2TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + + Tensor tensor(tensorRange); + Tensor input1(tensorRange); + Tensor input2(input2TensorRange); + input1.setRandom(); + input2.setRandom(); + + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + const size_t input2TensorBuffSize =input2.size()*sizeof(DataType); + DataType* gpu_data_tensor = static_cast(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_input1 = static_cast(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_input2 = static_cast(sycl_device.allocate(input2TensorBuffSize)); + + TensorMap> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap> gpu_input1(gpu_data_input1, tensorRange); + TensorMap> gpu_input2(gpu_data_input2, input2TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input1, input1.data(), tensorBuffSize); + gpu_tensor.device(sycl_device)=gpu_input1; + sycl_device.memcpyHostToDevice(gpu_data_input2, input2.data(), input2TensorBuffSize); + gpu_tensor.template chip<0l>(1l).device(sycl_device)=gpu_input2; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k < sizeDim3; ++k) { + for (int l = 0; l < sizeDim4; ++l) { + for (int m = 0; m < sizeDim5; ++m) { + if (i != 1) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input2(j,k,l,m)); + } + } + } + } + } + } + + gpu_tensor.device(sycl_device)=gpu_input1; + array input3TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}}; + Tensor input3(input3TensorRange); + input3.setRandom(); + + const size_t input3TensorBuffSize =input3.size()*sizeof(DataType); + DataType* gpu_data_input3 = static_cast(sycl_device.allocate(input3TensorBuffSize)); + TensorMap> gpu_input3(gpu_data_input3, input3TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input3, input3.data(), input3TensorBuffSize); + gpu_tensor.template chip<1l>(1l).device(sycl_device)=gpu_input3; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k input4TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}}; + Tensor input4(input4TensorRange); + input4.setRandom(); + + const size_t input4TensorBuffSize =input4.size()*sizeof(DataType); + DataType* gpu_data_input4 = static_cast(sycl_device.allocate(input4TensorBuffSize)); + TensorMap> gpu_input4(gpu_data_input4, input4TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input4, input4.data(), input4TensorBuffSize); + gpu_tensor.template chip<2l>(3l).device(sycl_device)=gpu_input4; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k 
= 0; k input5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}}; + Tensor input5(input5TensorRange); + input5.setRandom(); + + const size_t input5TensorBuffSize =input5.size()*sizeof(DataType); + DataType* gpu_data_input5 = static_cast(sycl_device.allocate(input5TensorBuffSize)); + TensorMap> gpu_input5(gpu_data_input5, input5TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input5, input5.data(), input5TensorBuffSize); + gpu_tensor.template chip<3l>(4l).device(sycl_device)=gpu_input5; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k input6TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor input6(input6TensorRange); + input6.setRandom(); + + const size_t input6TensorBuffSize =input6.size()*sizeof(DataType); + DataType* gpu_data_input6 = static_cast(sycl_device.allocate(input6TensorBuffSize)); + TensorMap> gpu_input6(gpu_data_input6, input6TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input6, input6.data(), input6TensorBuffSize); + gpu_tensor.template chip<4l>(5l).device(sycl_device)=gpu_input6; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k input7(tensorRange); + input7.setRandom(); + + DataType* gpu_data_input7 = static_cast(sycl_device.allocate(tensorBuffSize)); + TensorMap> gpu_input7(gpu_data_input7, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input7, input7.data(), tensorBuffSize); + gpu_tensor.chip(0l,0l).device(sycl_device)=gpu_input7.chip(0l,0l); + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k void sycl_chipping_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_static_chip_sycl(sycl_device); + test_static_chip_sycl(sycl_device); + test_dynamic_chip_sycl(sycl_device); + test_dynamic_chip_sycl(sycl_device); + test_chip_in_expr(sycl_device); + test_chip_in_expr(sycl_device); + test_chip_as_lvalue_sycl(sycl_device); + test_chip_as_lvalue_sycl(sycl_device); +} +void test_cxx11_tensor_chipping_sycl() +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_chipping_test_per_device(device)); + } +} From cfa0568ef7fc86f1cc0c18c8e36cdfd757523cc2 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Thu, 16 Feb 2017 10:13:34 -0800 Subject: [PATCH 45/54] Size indices are signed. 
--- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 4cfe300eb..23a74460e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -54,7 +54,7 @@ struct is_input_scalar > { static const bool value = true; }; #ifndef EIGEN_EMULATE_CXX11_META_H -template +template struct is_input_scalar > { static const bool value = (Sizes::total_size == 1); }; From 582b5e39bf5515a0277b8ecf9c40e09748dc7d98 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 17 Feb 2017 14:10:57 +0100 Subject: [PATCH 46/54] bug #1393: enable Matrix/Array explicit ctor from types with conversion operators (was ok with 3.2) --- Eigen/src/Core/DenseStorage.h | 29 ++++++----- Eigen/src/Core/PlainObjectBase.h | 10 +++- test/CMakeLists.txt | 1 + test/constructor.cpp | 84 ++++++++++++++++++++++++++++++++ 4 files changed, 112 insertions(+), 12 deletions(-) create mode 100644 test/constructor.cpp diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index 82201d96a..02f29cba9 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -13,9 +13,9 @@ #define EIGEN_MATRIXSTORAGE_H #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN - #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_DENSE_STORAGE_CTOR_PLUGIN; + #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) X; EIGEN_DENSE_STORAGE_CTOR_PLUGIN; #else - #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) #endif namespace Eigen { @@ -184,12 +184,16 @@ template class DenseSt { internal::plain_array m_data; public: - EIGEN_DEVICE_FUNC DenseStorage() {} + EIGEN_DEVICE_FUNC DenseStorage() { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size) + } EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(internal::constructor_without_unaligned_array_assert()) {} EIGEN_DEVICE_FUNC - DenseStorage(const DenseStorage& other) : m_data(other.m_data) {} + DenseStorage(const DenseStorage& other) : m_data(other.m_data) { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size) + } EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) { @@ -197,7 +201,7 @@ template class DenseSt return *this; } EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN() eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols); EIGEN_UNUSED_VARIABLE(size); EIGEN_UNUSED_VARIABLE(rows); @@ -343,7 +347,7 @@ template class DenseStorage(size)), m_rows(rows), m_cols(cols) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN() eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0); } EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) @@ -351,6 +355,7 @@ template class DenseStorage class DenseStorage(size); else m_data = 0; - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN() } m_rows = rows; m_cols = cols; @@ -422,7 +427,7 @@ template class DenseStorage(size)), m_cols(cols) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN() eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0); EIGEN_UNUSED_VARIABLE(rows); } @@ -430,6 +435,7 @@ template class 
DenseStorage(_Rows*other.m_cols)) , m_cols(other.m_cols) { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_cols*_Rows) internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data); } EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) @@ -477,7 +483,7 @@ template class DenseStorage(size); else m_data = 0; - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN() } m_cols = cols; } @@ -495,7 +501,7 @@ template class DenseStorage(size)), m_rows(rows) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN() eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols); EIGEN_UNUSED_VARIABLE(cols); } @@ -503,6 +509,7 @@ template class DenseStorage(other.m_rows*_Cols)) , m_rows(other.m_rows) { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*_Cols) internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data); } EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) @@ -550,7 +557,7 @@ template class DenseStorage(size); else m_data = 0; - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN() } m_rows = rows; } diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index 639fb92bf..77f4f6066 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h @@ -812,6 +812,13 @@ class PlainObjectBase : public internal::dense_xpr_base::type this->_set_noalias(other); } + // Initialize an arbitrary matrix from an object convertible to the Derived type. + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void _init1(const Derived& other){ + this->_set_noalias(other); + } + // Initialize an arbitrary matrix from a generic Eigen expression template EIGEN_DEVICE_FUNC @@ -834,7 +841,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type this->derived() = r; } - // For fixed -size arrays: + // For fixed-size Array template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Scalar& val0, @@ -846,6 +853,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type Base::setConstant(val0); } + // For fixed-size Array template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Index& val0, diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 84a21b3df..d337594f5 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -151,6 +151,7 @@ ei_add_test(packetmath "-DEIGEN_FAST_MATH=1") ei_add_test(unalignedassert) ei_add_test(vectorization_logic) ei_add_test(basicstuff) +ei_add_test(constructor) ei_add_test(linearstructure) ei_add_test(integer_types) ei_add_test(unalignedcount) diff --git a/test/constructor.cpp b/test/constructor.cpp new file mode 100644 index 000000000..eec9e2192 --- /dev/null +++ b/test/constructor.cpp @@ -0,0 +1,84 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
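// Note on the test machinery (descriptive only, not part of the patch): defining
// TEST_ENABLE_TEMPORARY_TRACKING before including main.h installs the
// EIGEN_DENSE_STORAGE_CTOR_PLUGIN hook, so every non-empty DenseStorage construction
// increments nb_temporaries and VERIFY_EVALUATION_COUNT can assert how many copies
// each constructor form exercised below performs.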
+ + +#define TEST_ENABLE_TEMPORARY_TRACKING + +#include "main.h" + +template struct Wrapper +{ + MatrixType m_mat; + inline Wrapper(const MatrixType &x) : m_mat(x) {} + inline operator const MatrixType& () const { return m_mat; } + inline operator MatrixType& () { return m_mat; } +}; + +template void ctor_init1(const MatrixType& m) +{ + // Check logic in PlainObjectBase::_init1 + Index rows = m.rows(); + Index cols = m.cols(); + + MatrixType m0 = MatrixType::Random(rows,cols); + + VERIFY_EVALUATION_COUNT( MatrixType m1(m0), 1); + VERIFY_EVALUATION_COUNT( MatrixType m2(m0+m0), 1); + VERIFY_EVALUATION_COUNT( MatrixType m2(m0.block(0,0,rows,cols)) , 1); + + Wrapper wrapper(m0); + VERIFY_EVALUATION_COUNT( MatrixType m3(wrapper) , 1); +} + + +void test_constructor() +{ + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST_1( ctor_init1(Matrix()) ); + CALL_SUBTEST_1( ctor_init1(Matrix4d()) ); + CALL_SUBTEST_1( ctor_init1(MatrixXcf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_1( ctor_init1(MatrixXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + } + { + Matrix a(123); + VERIFY_IS_EQUAL(a[0], 123); + } + { + Matrix a(123.0); + VERIFY_IS_EQUAL(a[0], 123); + } + { + Matrix a(123); + VERIFY_IS_EQUAL(a[0], 123.f); + } + { + Array a(123); + VERIFY_IS_EQUAL(a[0], 123); + } + { + Array a(123.0); + VERIFY_IS_EQUAL(a[0], 123); + } + { + Array a(123); + VERIFY_IS_EQUAL(a[0], 123.f); + } + { + Array a(123); + VERIFY_IS_EQUAL(a(4), 123); + } + { + Array a(123.0); + VERIFY_IS_EQUAL(a(4), 123); + } + { + Array a(123); + VERIFY_IS_EQUAL(a(4), 123.f); + } +} From cbbf88c4d7bf5c7c74658ae3294e3880106b83b0 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 17 Feb 2017 14:39:02 +0100 Subject: [PATCH 47/54] Use int32_t instead of int in NEON code. Some platforms with 16 bytes int supports ARM NEON. 
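The NEON intrinsics behind Packet4i (int32x4_t, vld1q_s32, vdupq_n_s32, ...) operate on lanes that are exactly 32 bits wide, so the scalar type tied to that packet has to be int32_t rather than plain int, which is not guaranteed to be 32 bits on every target. A rough sketch of the pairing the patch enforces (illustrative only; see the hunks below for the actual changes):

    typedef int32x4_t Packet4i;                                   // four 32-bit lanes
    template<> struct packet_traits<int32_t> : default_packet_traits { typedef Packet4i type; /* ... */ };
    template<> struct unpacket_traits<Packet4i> { typedef int32_t type; /* ... */ };
    template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); }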
--- Eigen/src/Core/arch/NEON/PacketMath.h | 52 +++++++++++++-------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index d392bf3ff..84a56bdcc 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -46,7 +46,7 @@ typedef uint32x4_t Packet4ui; const Packet4f p4f_##NAME = pset1(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1(X)) + const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1(X)) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) @@ -83,7 +83,7 @@ template<> struct packet_traits : default_packet_traits HasSqrt = 0 }; }; -template<> struct packet_traits : default_packet_traits +template<> struct packet_traits : default_packet_traits { typedef Packet4i type; typedef Packet4i half; // Packet2i intrinsics not implemented yet @@ -105,11 +105,11 @@ EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } #endif -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return vdupq_n_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { return vdupq_n_s32(from); } template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { @@ -117,7 +117,7 @@ template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) Packet4f countdown = vld1q_f32(f); return vaddq_f32(pset1(a), countdown); } -template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) +template<> EIGEN_STRONG_INLINE Packet4i plset(const int32_t& a) { const int32_t i[] = {0, 1, 2, 3}; Packet4i countdown = vld1q_s32(i); @@ -240,20 +240,20 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, con } template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } 
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) { float32x2_t lo, hi; lo = vld1_dup_f32(from); hi = vld1_dup_f32(from+1); return vcombine_f32(lo, hi); } -template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) +template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int32_t* from) { int32x2_t lo, hi; lo = vld1_dup_s32(from); @@ -261,11 +261,11 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) return vcombine_s32(lo, hi); } -template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } +template<> EIGEN_STRONG_INLINE void pstore (float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } +template<> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu (float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { @@ -276,7 +276,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const floa res = vsetq_lane_f32(from[3*stride], res, 3); return res; } -template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int32_t* from, Index stride) { Packet4i res = pset1(0); res = vsetq_lane_s32(from[0*stride], res, 0); @@ -293,7 +293,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, co to[stride*2] = vgetq_lane_f32(from, 2); to[stride*3] = vgetq_lane_f32(from, 3); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) +template<> EIGEN_DEVICE_FUNC inline void pscatter(int32_t* to, const Packet4i& from, Index stride) { to[stride*0] = vgetq_lane_s32(from, 0); to[stride*1] = vgetq_lane_s32(from, 1); @@ -301,12 +301,12 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const to[stride*3] = vgetq_lane_s32(from, 3); } -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ARM_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch (const float* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); } // FIXME only store the 2 first elements ? 
-template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { float32x2_t a_lo, a_hi; @@ -361,7 +361,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) return sum; } -template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t predux(const Packet4i& a) { int32x2_t a_lo, a_hi, sum; @@ -408,7 +408,7 @@ template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) return vget_lane_f32(prod, 0); } -template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet4i& a) { int32x2_t a_lo, a_hi, prod; @@ -436,7 +436,7 @@ template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) return vget_lane_f32(min, 0); } -template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t predux_min(const Packet4i& a) { int32x2_t a_lo, a_hi, min; @@ -461,7 +461,7 @@ template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) return vget_lane_f32(max, 0); } -template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) { int32x2_t a_lo, a_hi, max; From f8a55cc062a3cba8230e621e0d5e855418a2d5e9 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 18 Feb 2017 10:08:13 +0100 Subject: [PATCH 48/54] Fix compilation. --- unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index f8121d17b..dcd95de91 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -126,7 +126,7 @@ class TensorStorage, Options_> } else m_data = 0; - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN() } m_dimensions = nbDimensions; } From deefa54a5419cc197bc43a04df1187f140da1efe Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sun, 19 Feb 2017 10:32:54 +0100 Subject: [PATCH 49/54] Fix tracking of temporaries in unit tests --- .../Core/products/GeneralMatrixMatrixTriangular.h | 2 +- Eigen/src/Core/products/TriangularMatrixMatrix.h | 4 ++-- test/main.h | 3 +++ test/permutationmatrices.cpp | 13 ++++++------- test/redux.cpp | 6 ++++-- test/vectorwiseop.cpp | 4 ++-- 6 files changed, 18 insertions(+), 14 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 5cd2794a4..7122efa60 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -148,7 +148,7 @@ struct tribb_kernel ResMapper res(_res, resStride); gebp_kernel gebp_kernel; - Matrix buffer; + Matrix buffer((internal::constructor_without_unaligned_array_assert())); // let's process the block per panel of actual_mc x BlockSize, // again, each is split into three parts, etc. 
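      // Illustrative note on the change above (not part of the patch itself): the extra
      // parentheses are required because
      //   Matrix<...> buffer(internal::constructor_without_unaligned_array_assert());
      // would be parsed as a function declaration (the "most vexing parse"); the double
      // parentheses force a variable definition. This constructor is also the one that skips
      // EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN, so these internal kernel buffers no longer
      // show up in the temporary counts checked by the tests adjusted below.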
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 8a2f7cd78..6ec5a8a0b 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -137,7 +137,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer; + Matrix triangularBuffer((internal::constructor_without_unaligned_array_assert())); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); @@ -284,7 +284,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer; + Matrix triangularBuffer((internal::constructor_without_unaligned_array_assert())); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); diff --git a/test/main.h b/test/main.h index 1d5bdc1c4..25d2dcf43 100644 --- a/test/main.h +++ b/test/main.h @@ -41,6 +41,7 @@ #include #include #include +#include #include #if __cplusplus >= 201103L #include @@ -79,10 +80,12 @@ #ifdef TEST_ENABLE_TEMPORARY_TRACKING static long int nb_temporaries; +static long int nb_temporaries_on_assert = -1; inline void on_temporary_creation(long int size) { // here's a great place to set a breakpoint when debugging failures in this test! if(size!=0) nb_temporaries++; + if(nb_temporaries_on_assert>0) assert(nb_temporaries void permutationmatrices(const MatrixType& m) RightPermutationType rp(rv); MatrixType m_permuted = MatrixType::Random(rows,cols); - const int one_if_dynamic = MatrixType::SizeAtCompileTime==Dynamic ? 1 : 0; - VERIFY_EVALUATION_COUNT(m_permuted = lp * m_original * rp, one_if_dynamic); // 1 temp for sub expression "lp * m_original" + VERIFY_EVALUATION_COUNT(m_permuted = lp * m_original * rp, 1); // 1 temp for sub expression "lp * m_original" for (int i=0; i void permutationmatrices(const MatrixType& m) VERIFY_IS_APPROX(m_permuted, lm*m_original*rm); m_permuted = m_original; - VERIFY_EVALUATION_COUNT(m_permuted = lp * m_permuted * rp, one_if_dynamic); + VERIFY_EVALUATION_COUNT(m_permuted = lp * m_permuted * rp, 1); VERIFY_IS_APPROX(m_permuted, lm*m_original*rm); VERIFY_IS_APPROX(lp.inverse()*m_permuted*rp.inverse(), m_original); @@ -75,19 +74,19 @@ template void permutationmatrices(const MatrixType& m) // check inplace permutations m_permuted = m_original; - VERIFY_EVALUATION_COUNT(m_permuted.noalias()= lp.inverse() * m_permuted, one_if_dynamic); // 1 temp to allocate the mask + VERIFY_EVALUATION_COUNT(m_permuted.noalias()= lp.inverse() * m_permuted, 1); // 1 temp to allocate the mask VERIFY_IS_APPROX(m_permuted, lp.inverse()*m_original); m_permuted = m_original; - VERIFY_EVALUATION_COUNT(m_permuted.noalias() = m_permuted * rp.inverse(), one_if_dynamic); // 1 temp to allocate the mask + VERIFY_EVALUATION_COUNT(m_permuted.noalias() = m_permuted * rp.inverse(), 1); // 1 temp to allocate the mask VERIFY_IS_APPROX(m_permuted, m_original*rp.inverse()); m_permuted = m_original; - VERIFY_EVALUATION_COUNT(m_permuted.noalias() = lp * m_permuted, one_if_dynamic); // 1 temp to allocate the mask + VERIFY_EVALUATION_COUNT(m_permuted.noalias() = lp * m_permuted, 1); // 1 temp to allocate the mask VERIFY_IS_APPROX(m_permuted, lp*m_original); m_permuted = m_original; - VERIFY_EVALUATION_COUNT(m_permuted.noalias() = m_permuted * rp, one_if_dynamic); // 1 temp to allocate the mask + VERIFY_EVALUATION_COUNT(m_permuted.noalias() = m_permuted * rp, 1); // 1 temp to allocate the mask VERIFY_IS_APPROX(m_permuted, m_original*rp); if(rows>1 && cols>1) diff --git 
a/test/redux.cpp b/test/redux.cpp index 6ddc59c18..989e1057b 100644 --- a/test/redux.cpp +++ b/test/redux.cpp @@ -70,10 +70,10 @@ template void matrixRedux(const MatrixType& m) VERIFY_IS_APPROX(m1.block(r0,c0,0,0).prod(), Scalar(1)); // test nesting complex expression - VERIFY_EVALUATION_COUNT( (m1.matrix()*m1.matrix().transpose()).sum(), (MatrixType::SizeAtCompileTime==Dynamic ? 1 : 0) ); + VERIFY_EVALUATION_COUNT( (m1.matrix()*m1.matrix().transpose()).sum(), (MatrixType::IsVectorAtCompileTime && MatrixType::SizeAtCompileTime!=1 ? 0 : 1) ); Matrix m2(rows,rows); m2.setRandom(); - VERIFY_EVALUATION_COUNT( ((m1.matrix()*m1.matrix().transpose())+m2).sum(), (MatrixType::SizeAtCompileTime==Dynamic ? 1 : 0) ); + VERIFY_EVALUATION_COUNT( ((m1.matrix()*m1.matrix().transpose())+m2).sum(),(MatrixType::IsVectorAtCompileTime && MatrixType::SizeAtCompileTime!=1 ? 0 : 1)); } template void vectorRedux(const VectorType& w) @@ -156,8 +156,10 @@ void test_redux() CALL_SUBTEST_1( matrixRedux(Array()) ); CALL_SUBTEST_2( matrixRedux(Matrix2f()) ); CALL_SUBTEST_2( matrixRedux(Array2f()) ); + CALL_SUBTEST_2( matrixRedux(Array22f()) ); CALL_SUBTEST_3( matrixRedux(Matrix4d()) ); CALL_SUBTEST_3( matrixRedux(Array4d()) ); + CALL_SUBTEST_3( matrixRedux(Array44d()) ); CALL_SUBTEST_4( matrixRedux(MatrixXcf(internal::random(1,maxsize), internal::random(1,maxsize))) ); CALL_SUBTEST_4( matrixRedux(ArrayXXcf(internal::random(1,maxsize), internal::random(1,maxsize))) ); CALL_SUBTEST_5( matrixRedux(MatrixXd (internal::random(1,maxsize), internal::random(1,maxsize))) ); diff --git a/test/vectorwiseop.cpp b/test/vectorwiseop.cpp index 739eacaf3..f3ab561ee 100644 --- a/test/vectorwiseop.cpp +++ b/test/vectorwiseop.cpp @@ -231,12 +231,12 @@ template void vectorwiseop_matrix(const MatrixType& m) Matrix m1m1 = m1 * m1.transpose(); VERIFY_IS_APPROX( (m1 * m1.transpose()).colwise().sum(), m1m1.colwise().sum()); Matrix tmp(rows); - VERIFY_EVALUATION_COUNT( tmp = (m1 * m1.transpose()).colwise().sum(), (MatrixType::RowsAtCompileTime==Dynamic ? 1 : 0)); + VERIFY_EVALUATION_COUNT( tmp = (m1 * m1.transpose()).colwise().sum(), 1); m2 = m1.rowwise() - (m1.colwise().sum()/RealScalar(m1.rows())).eval(); m1 = m1.rowwise() - (m1.colwise().sum()/RealScalar(m1.rows())); VERIFY_IS_APPROX( m1, m2 ); - VERIFY_EVALUATION_COUNT( m2 = (m1.rowwise() - m1.colwise().sum()/RealScalar(m1.rows())), (MatrixType::RowsAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime!=1 ? 1 : 0) ); + VERIFY_EVALUATION_COUNT( m2 = (m1.rowwise() - m1.colwise().sum()/RealScalar(m1.rows())), (MatrixType::RowsAtCompileTime!=1 ? 1 : 0) ); } void test_vectorwiseop() From 63798df0384d2368ec3cdb6cbc5d7543b34e2f1c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 20 Feb 2017 08:16:36 +0100 Subject: [PATCH 50/54] Fix usage of CUDACC_VER --- Eigen/src/Core/util/Macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 12531e342..29c796647 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -837,7 +837,7 @@ namespace Eigen { // just an empty macro ! 
#define EIGEN_EMPTY -#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || __CUDACC_VER__) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || defined(__CUDACC_VER__)) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) From a811a0469655f7ad24a01d219c0afe752214255e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 20 Feb 2017 10:14:21 +0100 Subject: [PATCH 51/54] Silent warning. --- Eigen/src/Core/DenseStorage.h | 14 +++++++------- unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index 02f29cba9..7958feeb9 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -201,7 +201,7 @@ template class DenseSt return *this; } EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN() + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols); EIGEN_UNUSED_VARIABLE(size); EIGEN_UNUSED_VARIABLE(rows); @@ -347,7 +347,7 @@ template class DenseStorage(size)), m_rows(rows), m_cols(cols) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN() + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0); } EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) @@ -408,7 +408,7 @@ template class DenseStorage(size); else m_data = 0; - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN() + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) } m_rows = rows; m_cols = cols; @@ -427,7 +427,7 @@ template class DenseStorage(size)), m_cols(cols) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN() + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0); EIGEN_UNUSED_VARIABLE(rows); } @@ -483,7 +483,7 @@ template class DenseStorage(size); else m_data = 0; - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN() + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) } m_cols = cols; } @@ -501,7 +501,7 @@ template class DenseStorage(size)), m_rows(rows) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN() + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols); EIGEN_UNUSED_VARIABLE(cols); } @@ -557,7 +557,7 @@ template class DenseStorage(size); else m_data = 0; - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN() + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) } m_rows = rows; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index dcd95de91..2854a4a17 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -126,7 +126,7 @@ class TensorStorage, Options_> } else m_data = 0; - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN() + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) } m_dimensions = nbDimensions; } From 65728257036652fe1cb337a19ee68d8ec01462a3 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 20 Feb 2017 13:44:37 +0100 Subject: [PATCH 52/54] bug #1395: fix the use of compile-time vectors as inputs of JacobiSVD. 
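Editor's note (hedged sketch, not part of the original commit message): JacobiSVD internally
builds a work matrix with the transposed compile-time dimensions; the new TrOptions enum forces
that type to be column-major when it degenerates to a column vector and row-major when it
degenerates to a row vector, which is what Eigen's fixed-size vector types require. With the
change, code like the following, mirroring the CALL_SUBTEST_13 cases added below, compiles for
fixed-size vector inputs:

    #include <iostream>
    #include <Eigen/Dense>

    int main() {
      Eigen::Matrix<double, 6, 1> v = Eigen::Matrix<double, 6, 1>::Random();
      Eigen::JacobiSVD<Eigen::Matrix<double, 6, 1> > svd(v);  // singular values only
      std::cout << svd.singularValues() << std::endl;         // equals v.norm() here
    }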
--- Eigen/src/Householder/BlockHouseholder.h | 3 ++- Eigen/src/SVD/JacobiSVD.h | 6 ++++-- test/jacobisvd.cpp | 6 ++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Householder/BlockHouseholder.h b/Eigen/src/Householder/BlockHouseholder.h index 39bf8c83d..01a7ed188 100644 --- a/Eigen/src/Householder/BlockHouseholder.h +++ b/Eigen/src/Householder/BlockHouseholder.h @@ -87,7 +87,8 @@ void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vec const TriangularView V(vectors); // A -= V T V^* A - Matrix tmp = V.adjoint() * mat; // FIXME add .noalias() once the triangular product can work inplace if(forward) tmp = T.template triangularView() * tmp; diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index 1337ae987..43488b1e0 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -112,9 +112,11 @@ public: ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - Options = MatrixType::Options + TrOptions = RowsAtCompileTime==1 ? (MatrixType::Options & ~(RowMajor)) + : ColsAtCompileTime==1 ? (MatrixType::Options | RowMajor) + : MatrixType::Options }; - typedef Matrix + typedef Matrix TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) diff --git a/test/jacobisvd.cpp b/test/jacobisvd.cpp index 3d8d0203d..7f5f71562 100644 --- a/test/jacobisvd.cpp +++ b/test/jacobisvd.cpp @@ -101,6 +101,12 @@ void test_jacobisvd() // Test on inf/nan matrix CALL_SUBTEST_7( (svd_inf_nan, MatrixXf>()) ); CALL_SUBTEST_10( (svd_inf_nan, MatrixXd>()) ); + + // bug1395 test compile-time vectors as input + CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix()) )); + CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix()) )); + CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix(r)) )); + CALL_SUBTEST_13(( jacobisvd_verify_assert(Matrix(c)) )); } CALL_SUBTEST_7(( jacobisvd(MatrixXf(internal::random(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2), internal::random(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) )); From d8b1f6cebd653a72657388d5d6e37821b294c509 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 20 Feb 2017 14:06:06 +0100 Subject: [PATCH 53/54] bug #1380: for Map<> as input of matrix exponential --- .../src/MatrixFunctions/MatrixExponential.h | 43 +++++++++++-------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h index 9ad2b9cc8..bb6d9e1fe 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h @@ -61,10 +61,11 @@ struct MatrixExponentialScalingOp * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. 
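 *
 * Editor's note (clarification, not in the original patch): in these helpers U gathers the odd
 * powers of A and V the even powers, so with \f$ p \f$ the Padé numerator one has
 * \f$ p(A) = V + U \f$ and, since the denominator of a diagonal Padé approximant of the
 * exponential is \f$ p(-A) \f$, the quotient \f$ (V+U)(V-U)^{-1} \f$ quoted above. For the
 * degree-3 case below, \f$ U = A^3 + 60A \f$ and \f$ V = 12A^2 + 120I \f$.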
*/ -template -void matrix_exp_pade3(const MatrixType &A, MatrixType &U, MatrixType &V) +template +void matrix_exp_pade3(const MatA& A, MatU& U, MatV& V) { - typedef typename NumTraits::Scalar>::Real RealScalar; + typedef typename MatA::PlainObject MatrixType; + typedef typename NumTraits::Scalar>::Real RealScalar; const RealScalar b[] = {120.L, 60.L, 12.L, 1.L}; const MatrixType A2 = A * A; const MatrixType tmp = b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols()); @@ -77,9 +78,10 @@ void matrix_exp_pade3(const MatrixType &A, MatrixType &U, MatrixType &V) * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. */ -template -void matrix_exp_pade5(const MatrixType &A, MatrixType &U, MatrixType &V) +template +void matrix_exp_pade5(const MatA& A, MatU& U, MatV& V) { + typedef typename MatA::PlainObject MatrixType; typedef typename NumTraits::Scalar>::Real RealScalar; const RealScalar b[] = {30240.L, 15120.L, 3360.L, 420.L, 30.L, 1.L}; const MatrixType A2 = A * A; @@ -94,9 +96,10 @@ void matrix_exp_pade5(const MatrixType &A, MatrixType &U, MatrixType &V) * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. */ -template -void matrix_exp_pade7(const MatrixType &A, MatrixType &U, MatrixType &V) +template +void matrix_exp_pade7(const MatA& A, MatU& U, MatV& V) { + typedef typename MatA::PlainObject MatrixType; typedef typename NumTraits::Scalar>::Real RealScalar; const RealScalar b[] = {17297280.L, 8648640.L, 1995840.L, 277200.L, 25200.L, 1512.L, 56.L, 1.L}; const MatrixType A2 = A * A; @@ -114,9 +117,10 @@ void matrix_exp_pade7(const MatrixType &A, MatrixType &U, MatrixType &V) * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. */ -template -void matrix_exp_pade9(const MatrixType &A, MatrixType &U, MatrixType &V) +template +void matrix_exp_pade9(const MatA& A, MatU& U, MatV& V) { + typedef typename MatA::PlainObject MatrixType; typedef typename NumTraits::Scalar>::Real RealScalar; const RealScalar b[] = {17643225600.L, 8821612800.L, 2075673600.L, 302702400.L, 30270240.L, 2162160.L, 110880.L, 3960.L, 90.L, 1.L}; @@ -135,9 +139,10 @@ void matrix_exp_pade9(const MatrixType &A, MatrixType &U, MatrixType &V) * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. */ -template -void matrix_exp_pade13(const MatrixType &A, MatrixType &U, MatrixType &V) +template +void matrix_exp_pade13(const MatA& A, MatU& U, MatV& V) { + typedef typename MatA::PlainObject MatrixType; typedef typename NumTraits::Scalar>::Real RealScalar; const RealScalar b[] = {64764752532480000.L, 32382376266240000.L, 7771770303897600.L, 1187353796428800.L, 129060195264000.L, 10559470521600.L, 670442572800.L, @@ -162,9 +167,10 @@ void matrix_exp_pade13(const MatrixType &A, MatrixType &U, MatrixType &V) * This function activates only if your long double is double-double or quadruple. 
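 *
 * Editor's note (assumption about typical platforms, not part of the original patch):
 * LDBL_MANT_DIG is 64 for the x86 80-bit extended format, 106 for IBM double-double and 113
 * for IEEE-754 binary128, so the guard below enables matrix_exp_pade17 only on the latter two.
 * A quick way to check the local value:
 *
 *   #include <cfloat>
 *   #include <cstdio>
 *   int main() { std::printf("LDBL_MANT_DIG = %d\n", LDBL_MANT_DIG); }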
*/ #if LDBL_MANT_DIG > 64 -template -void matrix_exp_pade17(const MatrixType &A, MatrixType &U, MatrixType &V) +template +void matrix_exp_pade17(const MatA& A, MatU& U, MatV& V) { + typedef typename MatA::PlainObject MatrixType; typedef typename NumTraits::Scalar>::Real RealScalar; const RealScalar b[] = {830034394580628357120000.L, 415017197290314178560000.L, 100610229646136770560000.L, 15720348382208870400000.L, @@ -342,9 +348,10 @@ struct matrix_exp_computeUV * \param arg argument of matrix exponential (should be plain object) * \param result variable in which result will be stored */ -template -void matrix_exp_compute(const MatrixType& arg, ResultType &result) +template +void matrix_exp_compute(const ArgType& arg, ResultType &result) { + typedef typename ArgType::PlainObject MatrixType; #if LDBL_MANT_DIG > 112 // rarely happens typedef typename traits::Scalar Scalar; typedef typename NumTraits::Real RealScalar; @@ -354,11 +361,11 @@ void matrix_exp_compute(const MatrixType& arg, ResultType &result) return; } #endif - typename MatrixType::PlainObject U, V; + MatrixType U, V; int squarings; matrix_exp_computeUV::run(arg, U, V, squarings); // Pade approximant is (U+V) / (-U+V) - typename MatrixType::PlainObject numer = U + V; - typename MatrixType::PlainObject denom = -U + V; + MatrixType numer = U + V; + MatrixType denom = -U + V; result = denom.partialPivLu().solve(numer); for (int i=0; i Date: Mon, 20 Feb 2017 14:27:26 +0100 Subject: [PATCH 54/54] bug #1394: fix compilation of SelfAdjointEigenSolver(sparse*sparse); --- Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index a9f56c4f5..9ddd553f2 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -414,7 +414,8 @@ SelfAdjointEigenSolver& SelfAdjointEigenSolver if(n==1) { - m_eivalues.coeffRef(0,0) = numext::real(matrix.diagonal()[0]); + m_eivec = matrix; + m_eivalues.coeffRef(0,0) = numext::real(m_eivec.coeff(0,0)); if(computeEigenvectors) m_eivec.setOnes(n,n); m_info = Success;
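// Editor's notes at the end of this patch series (hedged sketches, not part of PATCH 53/54):
//
// With the MatA/MatU/MatV generalization of PATCH 53, matrix_exp_compute derives its work
// matrices from the argument's PlainObject, so exp() now compiles when it is called on a
// Map<> expression (bug 1380). Illustrative usage only:
//
//   #include <iostream>
//   #include <Eigen/Dense>
//   #include <unsupported/Eigen/MatrixFunctions>
//
//   int main() {
//     double data[4] = {0.0, 1.0, -1.0, 0.0};   // column-major storage: M = [0 -1; 1 0]
//     Eigen::Map<Eigen::Matrix2d> M(data);
//     Eigen::Matrix2d E = M.exp();              // previously required copying into a plain Matrix
//     std::cout << E << std::endl;              // a rotation by 1 radian
//   }
//
// The PATCH 54 hunk above appears to fix the 1x1 case by first evaluating the (possibly
// sparse-product) input into m_eivec and reading the single coefficient from there, rather
// than taking .diagonal()[0] of the unevaluated expression.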