diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 7dfbc92d5..1ac0b2473 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -826,7 +826,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
 
 namespace numext {
 
-#ifndef __CUDA_ARCH__
+#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
@@ -842,6 +842,84 @@ EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
   EIGEN_USING_STD_MATH(max);
   return max EIGEN_NOT_A_MACRO (x,y);
 }
+
+
+#elif defined(__SYCL_DEVICE_ONLY__)
+template<typename T>
+EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
+{
+
+  return y < x ? y : x;
+}
+
+template<typename T>
+EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
+{
+
+  return x < y ? y : x;
+}
+
+EIGEN_ALWAYS_INLINE int mini(const int& x, const int& y)
+{
+  return cl::sycl::min(x,y);
+}
+
+EIGEN_ALWAYS_INLINE int maxi(const int& x, const int& y)
+{
+  return cl::sycl::max(x,y);
+}
+
+EIGEN_ALWAYS_INLINE unsigned int mini(const unsigned int& x, const unsigned int& y)
+{
+  return cl::sycl::min(x,y);
+}
+
+EIGEN_ALWAYS_INLINE unsigned int maxi(const unsigned int& x, const unsigned int& y)
+{
+  return cl::sycl::max(x,y);
+}
+
+EIGEN_ALWAYS_INLINE long mini(const long & x, const long & y)
+{
+  return cl::sycl::min(x,y);
+}
+
+EIGEN_ALWAYS_INLINE long maxi(const long & x, const long & y)
+{
+  return cl::sycl::max(x,y);
+}
+
+EIGEN_ALWAYS_INLINE unsigned long mini(const unsigned long& x, const unsigned long& y)
+{
+  return cl::sycl::min(x,y);
+}
+
+EIGEN_ALWAYS_INLINE unsigned long maxi(const unsigned long& x, const unsigned long& y)
+{
+  return cl::sycl::max(x,y);
+}
+
+
+EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
+{
+  return cl::sycl::fmin(x,y);
+}
+
+EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
+{
+  return cl::sycl::fmax(x,y);
+}
+
+EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
+{
+  return cl::sycl::fmin(x,y);
+}
+
+EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
+{
+  return cl::sycl::fmax(x,y);
+}
+
 #else
 template<typename T>
 EIGEN_DEVICE_FUNC
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
index 9d141395b..ccaaa6cb2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -45,7 +45,7 @@ struct DefaultDevice {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
-#ifndef __CUDA_ARCH__
+#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
     // Running on the host CPU
     return l1CacheSize();
 #else
@@ -55,7 +55,7 @@ struct DefaultDevice {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-#ifndef __CUDA_ARCH__
+#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
     // Running single threaded on the host CPU
     return l3CacheSize();
 #else
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 1fd00d4f6..40dd5d81a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -17,6 +17,32 @@
 namespace Eigen {
 
+  #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<Scalar>::pointer_t>((&(*buf_acc.get_pointer())))
+
+  template <typename Scalar> class MemCopyFunctor {
+  public:
+    typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> read_accessor;
+    typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
+
+    MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset): m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {}
+
+    void operator()(cl::sycl::nd_item<1> itemID) {
+      auto src_ptr = ConvertToActualTypeSycl(Scalar, m_src_acc);
+      auto dst_ptr = ConvertToActualTypeSycl(Scalar, m_dst_acc);
+      auto globalid = itemID.get_global_linear_id();
+      if (globalid < m_rng) {
+        dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset];
+      }
+    }
+
+  private:
+    read_accessor m_src_acc;
+    write_accessor m_dst_acc;
+    size_t m_rng;
+    size_t m_i;
+    size_t m_offset;
+  };
+
 EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){
   auto devices = cl::sycl::device::get_devices();
   std::vector<cl::sycl::device>::iterator it = devices.begin();
@@ -33,7 +59,6 @@ EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device
   }
   return devices;
 }
-#define ConvertToActualTypeSycl(T, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<T>::pointer_t>((&(*buf_acc.get_pointer())))
 
 struct QueueInterface {
   /// class members:
@@ -170,30 +195,6 @@ struct SyclDevice {
   // some runtime conditions that can be applied here
   EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
 
-  template <typename T> class MemCopyFunctor {
-  public:
-    typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> read_accessor;
-    typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> write_accessor;
-
-    MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset): m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {}
-
-    void operator()(cl::sycl::nd_item<1> itemID) {
-      auto src_ptr = ConvertToActualTypeSycl(T, m_src_acc);
-      auto dst_ptr = ConvertToActualTypeSycl(T, m_dst_acc);
-      auto globalid = itemID.get_global_linear_id();
-      if (globalid < m_rng) {
-        dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset];
-      }
-    }
-
-  private:
-    read_accessor m_src_acc;
-    write_accessor m_dst_acc;
-    size_t m_rng;
-    size_t m_i;
-    size_t m_offset;
-  };
-
   /// the memcpy function
   template<typename T> EIGEN_STRONG_INLINE void memcpy(void *dst, const T *src, size_t n) const {
     auto it1 = m_queue_stream->find_buffer((void*)src);
@@ -260,6 +261,17 @@ struct SyclDevice {
     });
     synchronize();
   }
+
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+    // FIXME
+    return 48*1024;
+  }
+
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // We won't try to take advantage of the l2 cache for the time being, and
+    // there is no l3 cache on sycl devices.
+    return firstLevelCacheSize();
+  }
 
   /// No need for sycl it should act the same as CPU version
   EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index eea25ac33..485a082e2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -124,7 +124,7 @@ namespace {
   template <typename T>
   struct DividerHelper<64, T> {
     static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
-#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__)
+#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
       return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
 #else
       const uint64_t shift = 1ULL << log_div;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 284f29345..d582ccbe1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -723,7 +723,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_device(device), m_strides(op.strides())
+      : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()), m_exprStartIndices(op.startIndices()), m_exprStopIndices(op.stopIndices())
   {
     // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
     DSizes<Index, NumDims> startIndicesClamped, stopIndicesClamped;
@@ -828,6 +828,15 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
+  /// used by sycl
+  EIGEN_STRONG_INLINE const StartIndices& exprStartIndices() const { return m_exprStartIndices; }
+  /// used by sycl
+  EIGEN_STRONG_INLINE const StopIndices& exprStopIndices() const { return m_exprStopIndices; }
+  /// used by sycl
+  EIGEN_STRONG_INLINE const Strides& strides() const { return m_strides; }
+  /// used by sycl
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{return m_impl;}
+
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
@@ -862,6 +871,10 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
   DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
   const Strides m_strides;
   std::size_t m_block_total_size_max;
+  // used by sycl
+  const StartIndices m_exprStartIndices;
+  // used by sycl
+  const StopIndices m_exprStopIndices;
 };
 
 // Eval as lvalue
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 647bcf108..a8e255246 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -200,6 +200,13 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
+  /// used by sycl
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PaddingDimensions& padding() const { return m_padding; }
+  /// used by sycl
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& padding_value() const { return m_paddingValue; }
+  /// used by sycl
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{return m_impl;}
+
  private:
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
       Index index, int dim_index) const {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index abb8420a6..48c5f9a47 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -200,9 +200,6 @@ struct InnerReducer {
 /// creating the shared memory for calculating reduction.
 /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
 /// recursively apply reduction on it in order to reduce the whole.
-    // Dims dims= self.xprDims();
-    //Op functor = reducer;
-
     dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
     dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
       // create a tuple of accessors from Evaluator
@@ -214,28 +211,6 @@ struct InnerReducer {
                               TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
                               (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range));
-
-      //  [=](cl::sycl::nd_item<1> itemID) {
-      //    typedef typename TensorSycl::internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
-      //    auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
-      /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
-      /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
-      /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
-      //    const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
-      /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
-      /// the device_evaluator is detectable and recognisable on the device.
-      //    typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
-      //    auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
-      //    auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
-      /// const cast added as a naive solution to solve the qualifier drop error
-      //    auto globalid=itemID.get_global_linear_id();
-      //    if (globalid< range) {
-      //      typename DeviceSelf::CoeffReturnType accum = functor.initialize();
-      //      GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)), const_cast<Op&>(functor), &accum);
-      //      functor.finalize(accum);
-      //      output_accessor_ptr[globalid]= accum;
-      //    }
-      //  });
     });
     dev.synchronize();
     return false;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
index cc13ca963..e940c8a9d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
@@ -125,18 +125,31 @@ KERNELBROKERCONVERTSLICEOP(const)
 KERNELBROKERCONVERTSLICEOP()
 #undef KERNELBROKERCONVERTSLICEOP
 
-#define KERNELBROKERCONVERTRESHAPEANDSHUFFLEOP(OPEXPR, CVQual)\
+#define KERNELBROKERCONVERTERSLICESTRIDEOP(CVQual)\
+template <typename StartIndices, typename StopIndices, typename Strides, typename Expr>\
+struct ConvertToDeviceExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, Expr> >{\
+  typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename ConvertToDeviceExpression<Expr>::Type> Type;\
+};
+
+KERNELBROKERCONVERTERSLICESTRIDEOP(const)
+KERNELBROKERCONVERTERSLICESTRIDEOP()
+#undef KERNELBROKERCONVERTERSLICESTRIDEOP
+
+#define KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(OPEXPR, CVQual)\
 template <typename Param, typename Expr>\
 struct ConvertToDeviceExpression<CVQual OPEXPR<Param, Expr> >{\
   typedef CVQual OPEXPR<Param, typename ConvertToDeviceExpression<Expr>::Type> Type;\
 };
 
-KERNELBROKERCONVERTRESHAPEANDSHUFFLEOP(TensorReshapingOp, const)
-KERNELBROKERCONVERTRESHAPEANDSHUFFLEOP(TensorReshapingOp, )
+KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(TensorPaddingOp, const)
+KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(TensorPaddingOp, )
 
-KERNELBROKERCONVERTRESHAPEANDSHUFFLEOP(TensorShufflingOp, const)
-KERNELBROKERCONVERTRESHAPEANDSHUFFLEOP(TensorShufflingOp, )
-#undef KERNELBROKERCONVERTRESHAPEANDSHUFFLEOP
+KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(TensorReshapingOp, const)
+KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(TensorReshapingOp, )
+
+KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(TensorShufflingOp, const)
+KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP(TensorShufflingOp, )
+#undef KERNELBROKERCONVERTPADDINGANDRESHAPEANDSHUFFLEOP
 
 }  // namespace internal
 }  // namespace TensorSycl
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
index 4433fec01..7b15f93fe 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
@@ -231,7 +231,7 @@ SYCLREDUCTIONEXPR()
 template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\
 struct ExprConstructor<CVQual TensorSlicingOp<StartIndices, Sizes, OrigXprType>, CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Params... >{\
   typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
-  typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename my_xpr_type::Type> Type ;\
+  typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename my_xpr_type::Type> Type;\
   my_xpr_type xprExpr;\
   Type expr;\
   template <typename FuncDetector>\
@@ -244,6 +244,22 @@ SYCLSLICEOPEXPR(const)
 SYCLSLICEOPEXPR()
 #undef SYCLSLICEOPEXPR
 
+#define SYCLSLICESTRIDEOPEXPR(CVQual)\
+template<typename StartIndices, typename StopIndices, typename Strides, typename OrigXprType, typename XprType, typename... Params>\
+struct ExprConstructor<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, OrigXprType>, CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Params... >{\
+  typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
+  typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename my_xpr_type::Type> Type;\
+  my_xpr_type xprExpr;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
+  : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.startIndices(), funcD.stopIndices(), funcD.strides()) {}\
+};
+
+SYCLSLICESTRIDEOPEXPR(const)
+SYCLSLICESTRIDEOPEXPR()
+#undef SYCLSLICESTRIDEOPEXPR
+
 #define SYCLRESHAPEANDSHUFFLEOPEXPRCONST(OPEXPR, CVQual)\
 template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
 struct ExprConstructor<CVQual OPEXPR<Param, OrigXprType>, CVQual OPEXPR<Param, XprType>, Params... >{\
@@ -263,6 +279,23 @@ SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, const)
 SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, )
 #undef SYCLRESHAPEANDSHUFFLEOPEXPRCONST
 
+#define SYCLPADDINGOPEXPRCONST(OPEXPR, CVQual)\
+template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
+struct ExprConstructor<CVQual OPEXPR<Param, OrigXprType>, CVQual OPEXPR<Param, XprType>, Params... >{\
+  typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
+  typedef CVQual OPEXPR<Param, typename my_xpr_type::Type> Type;\
+  my_xpr_type xprExpr;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
+  : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param(), funcD.scalar_param()) {}\
+};
+
+SYCLPADDINGOPEXPRCONST(TensorPaddingOp, const)
+SYCLPADDINGOPEXPRCONST(TensorPaddingOp, )
+#undef SYCLPADDINGOPEXPRCONST
+
+
 /// template deduction for \ref ExprConstructor struct
 template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
index f5ef05e36..dc8356cf4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
@@ -209,7 +209,21 @@ SYCLSLICEOPEXTACC(const)
 SYCLSLICEOPEXTACC()
 #undef SYCLSLICEOPEXTACC
 
-#define RESHAPEANDSHUFFOPEXTRACC(OPEXPR, CVQual)\
+#define SYCLSLICESTRIDEOPEXTACC(CVQual)\
+template<typename StartIndices, typename StopIndices, typename Strides, typename Expr, typename Dev>\
+struct ExtractAccessor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, Expr>, Dev> >{\
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, Expr>, Dev>& eval)\
+  -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\
+    return AccessorConstructor::getTuple(cgh, eval.impl());\
+  }\
+};
+
+SYCLSLICESTRIDEOPEXTACC(const)
+SYCLSLICESTRIDEOPEXTACC()
+#undef SYCLSLICESTRIDEOPEXTACC
+
+
+#define PADDINGRESHAPEANDSHUFFOPEXTRACC(OPEXPR, CVQual)\
 template<typename Param, typename Expr, typename Dev>\
 struct ExtractAccessor<TensorEvaluator<CVQual OPEXPR<Param, Expr>, Dev> > {\
   static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual OPEXPR<Param, Expr>, Dev>& eval)\
@@ -217,13 +231,17 @@ struct ExtractAccessor<TensorEvaluator<CVQual OPEXPR<Param, Expr>, Dev> > {\
     return AccessorConstructor::getTuple(cgh, eval.impl());\
   }\
 };
+
+// tensor padding
+PADDINGRESHAPEANDSHUFFOPEXTRACC(TensorPaddingOp, const)
+PADDINGRESHAPEANDSHUFFOPEXTRACC(TensorPaddingOp, )
 // tensor reshaping
-RESHAPEANDSHUFFOPEXTRACC(TensorReshapingOp, const)
-RESHAPEANDSHUFFOPEXTRACC(TensorReshapingOp, )
+PADDINGRESHAPEANDSHUFFOPEXTRACC(TensorReshapingOp, const)
+PADDINGRESHAPEANDSHUFFOPEXTRACC(TensorReshapingOp, )
 /// Tensor shuffling
-RESHAPEANDSHUFFOPEXTRACC(TensorShufflingOp, const)
-RESHAPEANDSHUFFOPEXTRACC(TensorShufflingOp, )
-#undef RESHAPEANDSHUFFOPEXTRACC
+PADDINGRESHAPEANDSHUFFOPEXTRACC(TensorShufflingOp, const)
+PADDINGRESHAPEANDSHUFFOPEXTRACC(TensorShufflingOp, )
+#undef PADDINGRESHAPEANDSHUFFOPEXTRACC
 
 /// template deduction for \ref ExtractAccessor
 template <typename Evaluator>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
index 5bc57b59a..1293b14e2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
@@ -176,6 +176,24 @@ SYCLEXTRFUNCTSLICEOP(const)
 SYCLEXTRFUNCTSLICEOP()
 #undef SYCLEXTRFUNCTSLICEOP
 
+#define SYCLEXTRFUNCTSLICESTRIDEOP(CVQual)\
+template<typename StartIndices, typename StopIndices, typename Strides, typename Expr, typename Dev>\
+struct FunctorExtractor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, Expr>, Dev> >{\
+  FunctorExtractor<Eigen::TensorEvaluator<Expr, Dev> > xprExpr;\
+  const StartIndices m_startIndices;\
+  const StopIndices m_stopIndices;\
+  const Strides m_strides;\
+  FunctorExtractor(const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, Expr>, Dev>& expr)\
+  : xprExpr(expr.impl()), m_startIndices(expr.exprStartIndices()), m_stopIndices(expr.exprStopIndices()), m_strides(expr.strides()) {}\
+  EIGEN_STRONG_INLINE const StartIndices& startIndices() const { return m_startIndices; }\
+  EIGEN_STRONG_INLINE const StopIndices& stopIndices() const { return m_stopIndices; }\
+  EIGEN_STRONG_INLINE const Strides& strides() const { return m_strides; }\
+};
+
+SYCLEXTRFUNCTSLICESTRIDEOP(const)
+SYCLEXTRFUNCTSLICESTRIDEOP()
+#undef SYCLEXTRFUNCTSLICESTRIDEOP
+
 // Had to separate reshapeOP otherwise it will be mistaken by UnaryCategory
 #define SYCLRESHAPEANDSHUFFLEOPFUNCEXT(OPEXPR, FUNCCALL, CVQual)\
 template<typename Param, typename Expr, typename Dev>\
@@ -192,7 +210,25 @@ SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), const)
 SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), )
 SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), const)
 SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), )
-#undef SYCLRESHAPEOPEXPR
+#undef SYCLRESHAPEANDSHUFFLEOPFUNCEXT
+
+// Like reshapeOP, the padding op had to be separated out; otherwise it will be mistaken by UnaryCategory
+#define PADDINGOPFUNCEXT(OPEXPR, FUNCCALL, SCALARFUNCCALL, CVQual)\
+template<typename Param, typename Expr, typename Dev>\
+struct FunctorExtractor<Eigen::TensorEvaluator<CVQual OPEXPR<Param, Expr>, Dev> > {\
+  FunctorExtractor<Eigen::TensorEvaluator<Expr, Dev> > xprExpr;\
+  const Param m_param;\
+  typedef typename Eigen::TensorEvaluator<CVQual OPEXPR<Param, Expr>, Dev>::Scalar Scalar;\
+  const Scalar m_scalar_param;\
+  EIGEN_STRONG_INLINE const Param& param() const { return m_param; }\
+  EIGEN_STRONG_INLINE const Scalar& scalar_param() const { return m_scalar_param; }\
+  FunctorExtractor(const Eigen::TensorEvaluator<CVQual OPEXPR<Param, Expr>, Dev>& expr)\
+  : xprExpr(expr.impl()), m_param(expr.FUNCCALL), m_scalar_param(expr.SCALARFUNCCALL) {}\
+};
+
+PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), const)
+PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), )
+#undef PADDINGOPFUNCEXT
 
 /// template deduction function for FunctorExtractor
 template <typename Evaluator>
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
index a548aab29..5d392218e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
@@ -124,17 +124,27 @@ SLICEOPLEAFCOUNT(const)
 SLICEOPLEAFCOUNT()
 #undef SLICEOPLEAFCOUNT
 
-#define RESHAPEANDSHUFFLELEAFCOUNT(OPEXPR, CVQual)\
+#define SLICESTRIDEOPLEAFCOUNT(CVQual)\
+template<typename StartIndices, typename StopIndices, typename Strides, typename Expr>\
+struct LeafCount<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, Expr> >:CategoryCount<Expr>{};
+
+SLICESTRIDEOPLEAFCOUNT(const)
+SLICESTRIDEOPLEAFCOUNT()
+#undef SLICESTRIDEOPLEAFCOUNT
+
+#define PADDINGRESHAPEANDSHUFFLELEAFCOUNT(OPEXPR, CVQual)\
 template<typename Param, typename Expr>\
 struct LeafCount<CVQual OPEXPR<Param, Expr> >:CategoryCount<Expr>{};
 
-RESHAPEANDSHUFFLELEAFCOUNT(TensorReshapingOp, const)
-RESHAPEANDSHUFFLELEAFCOUNT(TensorReshapingOp, )
+PADDINGRESHAPEANDSHUFFLELEAFCOUNT(TensorPaddingOp, const)
+PADDINGRESHAPEANDSHUFFLELEAFCOUNT(TensorPaddingOp, )
 
-RESHAPEANDSHUFFLELEAFCOUNT(TensorShufflingOp, const)
-RESHAPEANDSHUFFLELEAFCOUNT(TensorShufflingOp, )
-#undef RESHAPEANDSHUFFLELEAFCOUNT
+PADDINGRESHAPEANDSHUFFLELEAFCOUNT(TensorReshapingOp, const)
+PADDINGRESHAPEANDSHUFFLELEAFCOUNT(TensorReshapingOp, )
+
+PADDINGRESHAPEANDSHUFFLELEAFCOUNT(TensorShufflingOp, const)
+PADDINGRESHAPEANDSHUFFLELEAFCOUNT(TensorShufflingOp, )
+#undef PADDINGRESHAPEANDSHUFFLELEAFCOUNT
 
 } /// namespace TensorSycl
 } /// namespace internal
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
index bb042ade2..e1dbd0c6c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
@@ -180,18 +180,32 @@ SLICEOPEXPR(const)
 SLICEOPEXPR()
 #undef SLICEOPEXPR
 
-#define RESHAPEANDSHUFFLEOPPLH(OPEXP , CVQual)\
+
+#define SYCLSLICESTRIDEOPPLH(CVQual)\
+template<typename StartIndices, typename StopIndices, typename Strides, typename Expr, size_t N>\
+struct PlaceHolderExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, Expr>, N> {\
+  typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename CalculateIndex<N, Expr>::ArgType> Type;\
+};
+
+SYCLSLICESTRIDEOPPLH(const)
+SYCLSLICESTRIDEOPPLH()
+#undef SYCLSLICESTRIDEOPPLH
+
+#define PADDINGRESHAPEANDSHUFFLEOPPLH(OPEXP , CVQual)\
 template<typename Param, typename Expr, size_t N>\
 struct PlaceHolderExpression<CVQual OPEXP<Param, Expr>, N > {\
   typedef CVQual OPEXP<Param, typename CalculateIndex<N, Expr>::ArgType> Type;\
 };
 
-RESHAPEANDSHUFFLEOPPLH(TensorReshapingOp, const)
-RESHAPEANDSHUFFLEOPPLH(TensorReshapingOp, )
+PADDINGRESHAPEANDSHUFFLEOPPLH(TensorPaddingOp, const)
+PADDINGRESHAPEANDSHUFFLEOPPLH(TensorPaddingOp,)
 
-RESHAPEANDSHUFFLEOPPLH(TensorShufflingOp, const)
-RESHAPEANDSHUFFLEOPPLH(TensorShufflingOp,)
-#undef RESHAPEANDSHUFFLEOPPLH
+PADDINGRESHAPEANDSHUFFLEOPPLH(TensorReshapingOp, const)
+PADDINGRESHAPEANDSHUFFLEOPPLH(TensorReshapingOp, )
+
+PADDINGRESHAPEANDSHUFFLEOPPLH(TensorShufflingOp, const)
+PADDINGRESHAPEANDSHUFFLEOPPLH(TensorShufflingOp,)
+#undef PADDINGRESHAPEANDSHUFFLEOPPLH
 
 /// template deduction for \ref PlaceHolderExpression struct
 template <typename Expr>
diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
index 197fddab6..e7c1a1bae 100644
--- a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
+++ b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
@@ -49,6 +49,11 @@ struct numeric_list { static constexpr std::size_t count = sizeof...(nn); const
 
+template<typename T>
+struct numeric_list<T>{
+  static constexpr std::size_t count = 0;
+  // An array of size zero is strictly forbidden in ISO C++
+};
 
 #endif
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 0ffa329f5..2fe03e24f 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -148,6 +148,7 @@ if(EIGEN_TEST_CXX11)
     ei_add_test_sycl(cxx11_tensor_reduction_sycl "-std=c++11")
     ei_add_test_sycl(cxx11_tensor_morphing_sycl "-std=c++11")
     ei_add_test_sycl(cxx11_tensor_shuffling_sycl "-std=c++11")
+    ei_add_test_sycl(cxx11_tensor_padding_sycl "-std=c++11")
     ei_add_test_sycl(cxx11_tensor_builtins_sycl "-std=c++11")
   endif(EIGEN_TEST_SYCL)
   # It should be safe to always run these tests as there is some fallback code for
diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
index d7f4e8cff..91353b81a 100644
--- a/unsupported/test/cxx11_tensor_morphing_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp
@@ -180,6 +180,53 @@ static void test_simple_slice(const Eigen::SyclDevice &sycl_device)
   sycl_device.deallocate(gpu_data3);
 }
 
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  typedef Tensor<DataType, 2, DataLayout, IndexType> Tensor2f;
+  typedef Eigen::DSizes<IndexType, 2> Index2;
+  IndexType sizeDim1 = 7L;
+  IndexType sizeDim2 = 11L;
+  array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
+  Tensor<DataType, 2, DataLayout, IndexType> tensor(tensorRange),tensor2(tensorRange);
+  IndexType sliceDim1 = 2;
+  IndexType sliceDim2 = 3;
+  array<IndexType, 2> sliceRange = {{sliceDim1, sliceDim2}};
+  Tensor2f slice(sliceRange);
+  Index2 strides(1L,1L);
+  Index2 indicesStart(3L,4L);
+  Index2 indicesStop(5L,7L);
+  Index2 lengths(2L,3L);
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
+  DataType* gpu_data3  = static_cast<DataType*>(sycl_device.allocate(slice.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu3(gpu_data3, sliceRange);
+
+
+  tensor.setRandom();
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1;
+
+  slice.setRandom();
+  sycl_device.memcpyHostToDevice(gpu_data3, slice.data(),(slice.size())*sizeof(DataType));
+
+
+  gpu1.slice(indicesStart,lengths).device(sycl_device)=gpu3;
+  gpu2.stridedSlice(indicesStart,indicesStop,strides).device(sycl_device)=gpu3;
+  sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data1,(tensor.size())*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType));
+
+  for(int i=0;i<sizeDim1;i++)
+    for(int j=0;j<sizeDim2;j++){
+      VERIFY_IS_EQUAL(tensor(i,j), tensor2(i,j));
+    }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+  sycl_device.deallocate(gpu_data3);
+}
 template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_device(dev_Selector s){
   QueueInterface queueInterface(s);
   auto sycl_device = Eigen::SyclDevice(&queueInterface);
@@ -189,6 +236,8 @@ template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_d
   test_simple_reshape<DataType, RowMajor, int64_t>(sycl_device);
   test_reshape_as_lvalue<DataType, RowMajor, int64_t>(sycl_device);
   test_reshape_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
+  test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device);
 }
 void test_cxx11_tensor_morphing_sycl()
 {
diff --git a/unsupported/test/cxx11_tensor_padding_sycl.cpp b/unsupported/test/cxx11_tensor_padding_sycl.cpp
new file mode 100644
index 000000000..9e86e4b52
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_padding_sycl.cpp
@@ -0,0 +1,161 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_padding_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_SYCL
+
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_simple_padding(const Eigen::SyclDevice& sycl_device)
+{
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  tensor.setRandom();
+
+  array<std::pair<IndexType, IndexType>, 4> paddings;
+  paddings[0] = std::make_pair(0, 0);
+  paddings[1] = std::make_pair(2, 1);
+  paddings[2] = std::make_pair(3, 4);
+  paddings[3] = std::make_pair(0, 0);
+
+  IndexType padedSizeDim1 = 2;
+  IndexType padedSizeDim2 = 6;
+  IndexType padedSizeDim3 = 12;
+  IndexType padedSizeDim4 = 7;
+  array<IndexType, 4> padedtensorRange = {{padedSizeDim1, padedSizeDim2, padedSizeDim3, padedSizeDim4}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> padded(padedtensorRange);
+
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(padded.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu2(gpu_data2, padedtensorRange);
+
+  VERIFY_IS_EQUAL(padded.dimension(0), 2+0);
+  VERIFY_IS_EQUAL(padded.dimension(1), 3+3);
+  VERIFY_IS_EQUAL(padded.dimension(2), 5+7);
+  VERIFY_IS_EQUAL(padded.dimension(3), 7+0);
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1.pad(paddings);
+  sycl_device.memcpyDeviceToHost(padded.data(), gpu_data2,(padded.size())*sizeof(DataType));
+  for (int i = 0; i < padedSizeDim1; ++i) {
+    for (int j = 0; j < padedSizeDim2; ++j) {
+      for (int k = 0; k < padedSizeDim3; ++k) {
+        for (int l = 0; l < padedSizeDim4; ++l) {
+          if (j >= 2 && j < 5 && k >= 3 && k < 8) {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l));
+          } else {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f);
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+}
+
+template<typename DataType, int DataLayout, typename IndexType>
+static void test_padded_expr(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  tensor.setRandom();
+
+  array<std::pair<IndexType, IndexType>, 4> paddings;
+  paddings[0] = std::make_pair(0, 0);
+  paddings[1] = std::make_pair(2, 1);
+  paddings[2] = std::make_pair(3, 4);
+  paddings[3] = std::make_pair(0, 0);
+
+  Eigen::DSizes<IndexType, 2> reshape_dims;
+  reshape_dims[0] = 12;
+  reshape_dims[1] = 84;
+
+
+  Tensor<DataType, 2, DataLayout, IndexType> result(reshape_dims);
+
+  DataType* gpu_data1  = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
+  DataType* gpu_data2  = static_cast<DataType*>(sycl_device.allocate(result.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu2(gpu_data2, reshape_dims);
+
+
+  sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
+  gpu2.device(sycl_device)=gpu1.pad(paddings).reshape(reshape_dims);
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data2,(result.size())*sizeof(DataType));
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      for (int k = 0; k < 12; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          const float result_value = DataLayout == ColMajor ?
+              result(i+2*j,k+12*l) : result(j+6*i,l+7*k);
+          if (j >= 2 && j < 5 && k >= 3 && k < 8) {
+            VERIFY_IS_EQUAL(result_value, tensor(i,j-2,k-3,l));
+          } else {
+            VERIFY_IS_EQUAL(result_value, 0.0f);
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data1);
+  sycl_device.deallocate(gpu_data2);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_padding_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_simple_padding<DataType, RowMajor, int>(sycl_device);
+  test_simple_padding<DataType, ColMajor, int>(sycl_device);
+  test_padded_expr<DataType, RowMajor, int>(sycl_device);
+  test_padded_expr<DataType, ColMajor, int>(sycl_device);
+  test_simple_padding<DataType, RowMajor, int64_t>(sycl_device);
+  test_simple_padding<DataType, ColMajor, int64_t>(sycl_device);
+  test_padded_expr<DataType, RowMajor, int64_t>(sycl_device);
+  test_padded_expr<DataType, ColMajor, int64_t>(sycl_device);
+
+}
+void test_cxx11_tensor_padding_sycl()
+{
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_padding_test_per_device<float>(device));
+  }
+}
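
---

Note for reviewers unfamiliar with the SYCL backend: the snippet below is a minimal, self-contained host-side sketch of the workflow this patch enables (a TensorPaddingOp evaluated on a SyclDevice), distilled from the new cxx11_tensor_padding_sycl.cpp test above. The device pick, tensor sizes, and padding amounts are illustrative only, and the same SYCL toolchain setup as for the other cxx11_tensor_*_sycl tests is assumed.

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Pick the first SYCL device the runtime exposes (same helper the tests use).
  auto devices = Eigen::get_sycl_supported_devices();
  Eigen::QueueInterface queueInterface(devices[0]);
  Eigen::SyclDevice sycl_device(&queueInterface);

  typedef Eigen::Tensor<float, 2, Eigen::ColMajor, int64_t> Tensor2f;
  Eigen::array<int64_t, 2> inRange  = {{3, 4}};
  Eigen::array<int64_t, 2> outRange = {{6, 4}};  // 3 rows + 1 before + 2 after
  Tensor2f in(inRange), out(outRange);
  in.setRandom();

  Eigen::array<std::pair<int64_t, int64_t>, 2> paddings;
  paddings[0] = std::make_pair(1, 2);  // one zero row before, two after
  paddings[1] = std::make_pair(0, 0);  // columns unchanged

  // Stage the input on the device, evaluate the padding expression there,
  // and read the result back, mirroring the pattern used in the tests.
  float* gpu_in  = static_cast<float*>(sycl_device.allocate(in.size()*sizeof(float)));
  float* gpu_out = static_cast<float*>(sycl_device.allocate(out.size()*sizeof(float)));
  Eigen::TensorMap<Tensor2f> gpu1(gpu_in, inRange);
  Eigen::TensorMap<Tensor2f> gpu2(gpu_out, outRange);

  sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size()*sizeof(float));
  gpu2.device(sycl_device) = gpu1.pad(paddings);
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out, out.size()*sizeof(float));

  sycl_device.deallocate(gpu_in);
  sycl_device.deallocate(gpu_out);
  return 0;
}

As in the tests, the expression tree built on the right-hand side (here just pad(), but pad().reshape() works the same way) is what the TensorSycl machinery in this patch converts, extracts accessors/functors for, and reconstructs on the device.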