From b52312068766eff6c96ff6d2534ab240ccb48a76 Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Mon, 16 Jan 2023 07:04:08 +0000 Subject: [PATCH] [SYCL-2020 Support] Enabling Intel DPCPP Compiler support to Eigen --- CMakeLists.txt | 15 +- Eigen/Core | 2 + Eigen/src/Core/arch/SYCL/PacketMath.h | 283 ++++-------------- Eigen/src/Core/arch/SYCL/SyclMemoryModel.h | 10 +- Eigen/src/Core/arch/SYCL/TypeCasting.h | 2 +- bench/tensors/README | 6 + bench/tensors/tensor_contract_sycl_bench.cc | 14 +- cmake/FindDPCPP.cmake | 62 ++++ test/main.h | 12 +- .../Eigen/CXX11/src/Tensor/TensorAssign.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorBase.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 4 +- .../CXX11/src/Tensor/TensorConcatenation.h | 4 +- .../CXX11/src/Tensor/TensorContraction.h | 4 +- .../CXX11/src/Tensor/TensorContractionSycl.h | 20 +- .../CXX11/src/Tensor/TensorConvolutionSycl.h | 12 +- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 5 +- .../Eigen/CXX11/src/Tensor/TensorEvalTo.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 15 +- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 6 +- .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 10 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- .../CXX11/src/Tensor/TensorReductionSycl.h | 22 +- .../Eigen/CXX11/src/Tensor/TensorReverse.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorScanSycl.h | 14 +- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 4 +- unsupported/doc/examples/SYCL/CMakeLists.txt | 7 +- unsupported/test/CMakeLists.txt | 9 +- .../test/cxx11_tensor_builtins_sycl.cpp | 64 +++- unsupported/test/cxx11_tensor_device_sycl.cpp | 7 + 32 files changed, 305 insertions(+), 332 deletions(-) create mode 100644 cmake/FindDPCPP.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index a57caee72..013d53346 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -505,14 +505,19 @@ endif() # add SYCL option(EIGEN_TEST_SYCL "Add Sycl 
support." OFF) -option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation (ComputeCPP by default)." OFF) if(EIGEN_TEST_SYCL) + option(EIGEN_SYCL_DPCPP "Use the DPCPP Sycl implementation (DPCPP is default SYCL-Compiler)." ON) + option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation." OFF) + option(EIGEN_SYCL_ComputeCpp "Use the DPCPP Sycl implementation." OFF) + set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations -Wno-shorten-64-to-32 -Wno-cast-align") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-copy-with-user-provided-copy -Wno-unused-variable") set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}") find_package(Threads REQUIRED) if(EIGEN_SYCL_TRISYCL) message(STATUS "Using triSYCL") include(FindTriSYCL) - else() + elseif(EIGEN_SYCL_ComputeCpp) message(STATUS "Using ComputeCPP SYCL") include(FindComputeCpp) set(COMPUTECPP_DRIVER_DEFAULT_VALUE OFF) @@ -523,8 +528,12 @@ if(EIGEN_TEST_SYCL) "Use ComputeCpp driver instead of a 2 steps compilation" ${COMPUTECPP_DRIVER_DEFAULT_VALUE} ) + else() #Default SYCL compiler is DPCPP (EIGEN_SYCL_DPCPP) + set(DPCPP_SYCL_TARGET "spir64" CACHE STRING "Defualt target for Intel CPU/GPU") + message(STATUS "Using DPCPP") + find_package(DPCPP) + add_definitions(-DSYCL_COMPILER_IS_DPCPP) endif(EIGEN_SYCL_TRISYCL) - option(EIGEN_DONT_VECTORIZE_SYCL "Don't use vectorisation in the SYCL tests." 
OFF) if(EIGEN_DONT_VECTORIZE_SYCL) message(STATUS "Disabling SYCL vectorization in tests/examples") # When disabling SYCL vectorization, also disable Eigen default vectorization diff --git a/Eigen/Core b/Eigen/Core index 623d735d6..34d6c4ef5 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -82,7 +82,9 @@ #include #include #include +#ifndef __SYCL_DEVICE_ONLY__ #include +#endif #include #ifndef EIGEN_NO_IO #include diff --git a/Eigen/src/Core/arch/SYCL/PacketMath.h b/Eigen/src/Core/arch/SYCL/PacketMath.h index 5bc323589..062d50efb 100644 --- a/Eigen/src/Core/arch/SYCL/PacketMath.h +++ b/Eigen/src/Core/arch/SYCL/PacketMath.h @@ -21,193 +21,53 @@ #ifndef EIGEN_PACKET_MATH_SYCL_H #define EIGEN_PACKET_MATH_SYCL_H #include + #include "../../InternalHeaderCheck.h" namespace Eigen { namespace internal { #ifdef SYCL_DEVICE_ONLY - -#define SYCL_PLOADT_RO(address_space_target) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt_ro( \ - typename cl::sycl::multi_ptr< \ - const typename unpacket_traits::type, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - from) { \ - typedef typename unpacket_traits::type scalar; \ - typedef cl::sycl::multi_ptr< \ - scalar, cl::sycl::access::address_space::address_space_target> \ - multi_ptr; \ - auto res = packet_type( \ - static_cast::type>(0)); \ - res.load(0, multi_ptr(const_cast(from))); \ - return res; \ +#define SYCL_PLOAD(packet_type, AlignedType) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \ + pload##AlignedType( \ + const typename unpacket_traits::type* from) { \ + using scalar = typename unpacket_traits::type; \ + typedef cl::sycl::multi_ptr< \ + const scalar, cl::sycl::access::address_space::generic_space, \ + cl::sycl::access::decorated::no> \ + multi_ptr; \ + packet_type res{}; \ + res.load(0, multi_ptr(from)); \ + return res; \ } -SYCL_PLOADT_RO(global_space) -SYCL_PLOADT_RO(local_space) -#undef SYCL_PLOADT_RO -#endif - -template -EIGEN_DEVICE_FUNC 
EIGEN_ALWAYS_INLINE packet_type -ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess< - cl::sycl::access::mode::read_write, T>& from) { - return ploadt_ro(from.get_pointer()); -} - -#ifdef SYCL_DEVICE_ONLY -#define SYCL_PLOAD(address_space_target, Alignment, AlignedType) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \ - typename cl::sycl::multi_ptr< \ - const typename unpacket_traits::type, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - from) { \ - return ploadt_ro(from); \ - } - -// global space -SYCL_PLOAD(global_space, Unaligned, u) -SYCL_PLOAD(global_space, Aligned, ) -// local space -SYCL_PLOAD(local_space, Unaligned, u) -SYCL_PLOAD(local_space, Aligned, ) +SYCL_PLOAD(cl::sycl::cl_float4, u) +SYCL_PLOAD(cl::sycl::cl_float4, ) +SYCL_PLOAD(cl::sycl::cl_double2, u) +SYCL_PLOAD(cl::sycl::cl_double2, ) #undef SYCL_PLOAD -#endif -#define SYCL_PLOAD(Alignment, AlignedType) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \ - const Eigen::TensorSycl::internal::RangeAccess< \ - cl::sycl::access::mode::read_write, \ - typename unpacket_traits::type> \ - from) { \ - return ploadt_ro(from); \ - } -SYCL_PLOAD(Unaligned, u) -SYCL_PLOAD(Aligned, ) -#undef SYCL_PLOAD - -#ifdef SYCL_DEVICE_ONLY -/** \internal \returns a packet version of \a *from. - * The pointer \a from must be aligned on a \a Alignment bytes boundary. 
*/ -#define SYCL_PLOADT(address_space_target) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt( \ - typename cl::sycl::multi_ptr< \ - const typename unpacket_traits::type, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - from) { \ - if (Alignment >= unpacket_traits::alignment) \ - return pload(from); \ - else \ - return ploadu(from); \ +#define SYCL_PSTORE(scalar, packet_type, alignment) \ + template <> \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \ + scalar* to, const packet_type& from) { \ + typedef cl::sycl::multi_ptr< \ + scalar, cl::sycl::access::address_space::generic_space, \ + cl::sycl::access::decorated::no> \ + multi_ptr; \ + from.store(0, multi_ptr(to)); \ } -// global space -SYCL_PLOADT(global_space) -// local space -SYCL_PLOADT(local_space) -#undef SYCL_PLOADT -#endif +SYCL_PSTORE(float, cl::sycl::cl_float4, ) +SYCL_PSTORE(float, cl::sycl::cl_float4, u) +SYCL_PSTORE(double, cl::sycl::cl_double2, ) +SYCL_PSTORE(double, cl::sycl::cl_double2, u) -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type -ploadt(const Eigen::TensorSycl::internal::RangeAccess< - cl::sycl::access::mode::read_write, - typename unpacket_traits::type>& from) { - return ploadt(from.get_pointer()); -} -#ifdef SYCL_DEVICE_ONLY - -// private_space -#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \ - ploadt_ro( \ - const typename unpacket_traits::type* from) { \ - typedef typename unpacket_traits::type scalar; \ - auto res = packet_type(static_cast(0)); \ - res.template load( \ - 0, const_cast(from)); \ - return res; \ - } - -SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned) -SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned) -SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned) -SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned) - -#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type) \ - template <> \ - 
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##alignment_type( \ - const typename unpacket_traits::type* from) { \ - typedef typename unpacket_traits::type scalar; \ - auto res = packet_type(static_cast(0)); \ - res.template load( \ - 0, const_cast(from)); \ - return res; \ - } -SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, ) -SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, ) -SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u) -SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u) - -#undef SYCL_PLOAD_SPECIAL - -#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \ - typename cl::sycl::multi_ptr< \ - scalar, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - to, \ - const packet_type& from) { \ - typedef cl::sycl::multi_ptr< \ - scalar, cl::sycl::access::address_space::address_space_target> \ - multi_ptr; \ - from.store(0, multi_ptr(to)); \ - } - -// global space -SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, ) -SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u) -SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, ) -SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u) -SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, ) -SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u) -SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, ) -SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u) - -SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, ) -SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u) -SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, ) -SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u) #undef SYCL_PSTORE -#define SYCL_PSTORE_T(address_space_target) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret( \ - typename cl::sycl::multi_ptr< \ - scalar, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - to, \ - const packet_type& from) { \ - if (Alignment) \ - 
pstore(to, from); \ - else \ - pstoreu(to, from); \ - } - -SYCL_PSTORE_T(global_space) - -SYCL_PSTORE_T(local_space) - -#undef SYCL_PSTORE_T - #define SYCL_PSET1(packet_type) \ template <> \ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1( \ @@ -291,22 +151,6 @@ struct get_base_packet { } }; -#define SYCL_PLOAD_DUP(address_space_target) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup( \ - typename cl::sycl::multi_ptr< \ - const typename unpacket_traits::type, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - from) { \ - return get_base_packet::get_ploaddup(from); \ - } - -// global space -SYCL_PLOAD_DUP(global_space) -// local_space -SYCL_PLOAD_DUP(local_space) -#undef SYCL_PLOAD_DUP - #define SYCL_PLOAD_DUP_SPECILIZE(packet_type) \ template <> \ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup( \ @@ -325,30 +169,11 @@ SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2) const typename unpacket_traits::type& a) { \ return get_base_packet::set_plset(a); \ } - SYCL_PLSET(cl::sycl::cl_float4) SYCL_PLSET(cl::sycl::cl_double2) #undef SYCL_PLSET -#define SYCL_PGATHER(address_space_target) \ - template \ - EIGEN_DEVICE_FUNC inline packet_type pgather( \ - typename cl::sycl::multi_ptr< \ - const typename unpacket_traits::type, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - from, \ - Index stride) { \ - return get_base_packet::get_pgather(from, stride); \ - } - -// global space -SYCL_PGATHER(global_space) -// local space -SYCL_PGATHER(local_space) - -#undef SYCL_PGATHER - #define SYCL_PGATHER_SPECILIZE(scalar, packet_type) \ template <> \ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \ @@ -362,24 +187,6 @@ SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2) #undef SYCL_PGATHER_SPECILIZE -#define SYCL_PSCATTER(address_space_target) \ - template \ - EIGEN_DEVICE_FUNC inline void pscatter( \ - typename cl::sycl::multi_ptr< \ - typename unpacket_traits::type, \ - 
cl::sycl::access::address_space::address_space_target>::pointer_t \ - to, \ - const packet_type& from, Index stride) { \ - get_base_packet::set_pscatter(to, from, stride); \ - } - -// global space -SYCL_PSCATTER(global_space) -// local space -SYCL_PSCATTER(local_space) - -#undef SYCL_PSCATTER - #define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \ template <> \ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter( \ @@ -563,6 +370,34 @@ inline cl::sycl::cl_double2 pblend( } #endif // SYCL_DEVICE_ONLY +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type +ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, T>& from) { + return ploadt_ro(from.get_pointer()); +} + +#define SYCL_PLOAD(Alignment, AlignedType) \ + template \ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \ + const Eigen::TensorSycl::internal::RangeAccess< \ + cl::sycl::access::mode::read_write, \ + typename unpacket_traits::type> \ + from) { \ + return ploadt_ro(from); \ + } +SYCL_PLOAD(Unaligned, u) +SYCL_PLOAD(Aligned, ) +#undef SYCL_PLOAD + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type +ploadt(const Eigen::TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, + typename unpacket_traits::type>& from) { + return ploadt(from.get_pointer()); +} + #define SYCL_PSTORE(alignment) \ template \ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \ diff --git a/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h b/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h index 54eedfaa0..c532c1867 100644 --- a/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +++ b/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h @@ -223,7 +223,6 @@ class PointerMapper { m_pointerMap.clear(); EIGEN_THROW_X( std::out_of_range("The pointer is not registered in the map\n")); - } --node; } @@ -550,7 +549,7 @@ struct RangeAccess { static const auto is_place_holder = cl::sycl::access::placeholder::true_t; typedef T scalar_t; typedef scalar_t &ref_t; - 
typedef typename cl::sycl::global_ptr::pointer_t ptr_t; + typedef scalar_t *ptr_t; // the accessor type does not necessarily the same as T typedef cl::sycl::accessor @@ -570,7 +569,12 @@ struct RangeAccess { RangeAccess(std::nullptr_t) : RangeAccess() {} // This template parameter must be removed and scalar_t should be replaced EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t get_pointer() const { - return (access_.get_pointer().get() + offset_); + typedef cl::sycl::multi_ptr + multi_ptr; + multi_ptr p(access_); + return (p + offset_).get_raw(); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator+=(Index offset) { diff --git a/Eigen/src/Core/arch/SYCL/TypeCasting.h b/Eigen/src/Core/arch/SYCL/TypeCasting.h index 613e82331..f6f057b32 100644 --- a/Eigen/src/Core/arch/SYCL/TypeCasting.h +++ b/Eigen/src/Core/arch/SYCL/TypeCasting.h @@ -64,7 +64,7 @@ pcast( cl::sycl::rounding_mode::automatic>(); auto b1 = b.template convert(); - return cl::sycl::float4(a1.x(), a1.y(), b1.x(), b1.y()); + return cl::sycl::cl_float4(a1.x(), a1.y(), b1.x(), b1.y()); } template <> diff --git a/bench/tensors/README b/bench/tensors/README index dcbf0217a..acce5a12b 100644 --- a/bench/tensors/README +++ b/bench/tensors/README @@ -16,5 +16,11 @@ following commands: 1. export COMPUTECPP_PACKAGE_ROOT_DIR={PATH TO COMPUTECPP ROOT DIRECTORY} 2. bash eigen_sycl_bench.sh +To compile the floating point GPU benchmarks using Intel DPCPP compiler +/path/to/dpcpp/bin/clang+ -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -I ../../ -O3 -DNDEBUG -fsycl -fsycl-targets="supported backend in DPCPP. i.e. spir64 or nvptx64-nvidia-cuda" -std=c++17 tensor_benchmarks_sycl.cc benchmark_main.cc -lpthread -o eigen_dpcpp_sycl + Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. 
To compile these benchmarks, call g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu + +To compile the contraction with DPCPP: +/path/to/dpcpp/bin/clang++ -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -I ../../ -O3 -DNDEBUG -fsycl -fsycl-targets="supported backend in DPCPP. i.e. spir64 or nvptx64-nvidia-cuda" -std=c++17 tensor_contract_sycl_bench.cc -lpthread -o eigen_dpcpp_contract diff --git a/bench/tensors/tensor_contract_sycl_bench.cc b/bench/tensors/tensor_contract_sycl_bench.cc index c2d098ecc..b1d89d204 100644 --- a/bench/tensors/tensor_contract_sycl_bench.cc +++ b/bench/tensors/tensor_contract_sycl_bench.cc @@ -15,7 +15,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t -#include +#include #include #include #include @@ -56,9 +56,9 @@ void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, T // Initialize the content of the memory pools to prevent asan from // complaining. - device_.fill(a_, m_ * k_, T(12)); - device_.fill(b_, k_ * n_, T(23)); - device_.fill(c_, m_ * n_, T(31)); + device_.fill(a_, a_ + (m_ * k_), T(12)); + device_.fill(b_, b_ + (k_ * n_), T(23)); + device_.fill(c_, c_ + (m_ * n_), T(31)); Eigen::array sizeA; sizeA[0] = m_; @@ -110,9 +110,9 @@ void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorInd // Initialize the content of the memory pools to prevent asan from // complaining. 
- device_.memset(a_, 12, m_ * k_ * sizeof(T)); - device_.memset(b_, 23, k_ * n_ * sizeof(T)); - device_.memset(c_, 31, m_ * n_ * sizeof(T)); + device_.memset(a_, T(12), T(m_ * k_ * sizeof(T))); + device_.memset(b_, T(23), T(k_ * n_ * sizeof(T))); + device_.memset(c_, T(31), T(m_ * n_ * sizeof(T))); Eigen::array sizeA; sizeA[0] = m_; diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake new file mode 100644 index 000000000..73aa30f65 --- /dev/null +++ b/cmake/FindDPCPP.cmake @@ -0,0 +1,62 @@ +include_guard() + +include(CheckCXXCompilerFlag) +include(FindPackageHandleStandardArgs) + +if("${DPCPP_SYCL_TARGET}" STREQUAL "amdgcn-amd-amdhsa" AND + "${DPCPP_SYCL_ARCH}" STREQUAL "") + message(FATAL_ERROR "Architecture required for AMD DPCPP builds," + " please specify in DPCPP_SYCL_ARCH") +endif() + +set(DPCPP_USER_FLAGS "" CACHE STRING + "Additional user-specified compiler flags for DPC++") + +get_filename_component(DPCPP_BIN_DIR ${CMAKE_CXX_COMPILER} DIRECTORY) +find_library(DPCPP_LIB_DIR NAMES sycl sycl6 PATHS "${DPCPP_BIN_DIR}/../lib") + +add_library(DPCPP::DPCPP INTERFACE IMPORTED) + +set(DPCPP_FLAGS "-fsycl;-fsycl-targets=${DPCPP_SYCL_TARGET};-fsycl-unnamed-lambda;${DPCPP_USER_FLAGS};-ftemplate-backtrace-limit=0") +if(NOT "${DPCPP_SYCL_ARCH}" STREQUAL "") + if("${DPCPP_SYCL_TARGET}" STREQUAL "amdgcn-amd-amdhsa") + list(APPEND DPCPP_FLAGS "-Xsycl-target-backend") + list(APPEND DPCPP_FLAGS "--offload-arch=${DPCPP_SYCL_ARCH}") + elseif("${DPCPP_SYCL_TARGET}" STREQUAL "nvptx64-nvidia-cuda") + list(APPEND DPCPP_FLAGS "-Xsycl-target-backend") + list(APPEND DPCPP_FLAGS "--cuda-gpu-arch=${DPCPP_SYCL_ARCH}") + endif() +endif() + +if(UNIX) + set_target_properties(DPCPP::DPCPP PROPERTIES + INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS}" + INTERFACE_LINK_OPTIONS "${DPCPP_FLAGS}" + INTERFACE_LINK_LIBRARIES ${DPCPP_LIB_DIR} + INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl;${DPCPP_BIN_DIR}/../include") + message(STATUS ">>>>>>>>> DPCPP INCLUDE DIR: 
${DPCPP_BIN_DIR}/../include/sycl") +else() + set_target_properties(DPCPP::DPCPP PROPERTIES + INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS}" + INTERFACE_LINK_LIBRARIES ${DPCPP_LIB_DIR} + INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl") +endif() + +function(add_sycl_to_target) + set(options) + set(one_value_args TARGET) + set(multi_value_args SOURCES) + cmake_parse_arguments(SB_ADD_SYCL + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN} + ) + target_compile_options(${SB_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS}) + target_link_libraries(${SB_ADD_SYCL_TARGET} DPCPP::DPCPP) + target_compile_features(${SB_ADD_SYCL_TARGET} PRIVATE cxx_std_17) + get_target_property(target_type ${SB_ADD_SYCL_TARGET} TYPE) + if (NOT target_type STREQUAL "OBJECT_LIBRARY") + target_link_options(${SB_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS}) + endif() +endfunction() diff --git a/test/main.h b/test/main.h index a52da9eec..853858173 100644 --- a/test/main.h +++ b/test/main.h @@ -18,6 +18,9 @@ #include #include #include +#ifdef EIGEN_USE_SYCL +#include +#endif // The following includes of STL headers have to be done _before_ the // definition of macros min() and max(). The reason is that many STL @@ -121,9 +124,7 @@ struct imag {}; #define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes // B0 is defined in POSIX header termios.h #define B0 FORBIDDEN_IDENTIFIER -// `I` may be defined by complex.h: #define I FORBIDDEN_IDENTIFIER - // Unit tests calling Eigen's blas library must preserve the default blocking size // to avoid troubles. 
#ifndef EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS @@ -301,15 +302,16 @@ namespace Eigen } #endif //EIGEN_EXCEPTIONS - #elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(SYCL_DEVICE_ONLY) // EIGEN_DEBUG_ASSERTS + #elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(__SYCL_DEVICE_ONLY__) // EIGEN_DEBUG_ASSERTS #define eigen_assert(a) \ if( (!(a)) && (!no_more_assert) ) \ { \ Eigen::no_more_assert = true; \ - if(report_on_cerr_on_assert_failure) \ + if(report_on_cerr_on_assert_failure) { \ eigen_plain_assert(a); \ - else \ + } else { \ EIGEN_THROW_X(Eigen::eigen_assert_exception()); \ + } \ } #ifdef EIGEN_EXCEPTIONS diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 0bd3a001b..3365b72dd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -170,10 +170,10 @@ struct TensorEvaluator, Device> m_rightImpl.cleanup(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const { m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const { const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; const int RhsLoadMode = TensorEvaluator::IsAligned ? 
Aligned : Unaligned; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index a4ac2ad6d..e3fc8f4fc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -1185,7 +1185,6 @@ class TensorBase : public TensorBase { internal::TensorExecutor::run(assign, DefaultDevice()); return derived(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } EIGEN_DEVICE_FUNC @@ -1195,3 +1194,4 @@ class TensorBase : public TensorBase { } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H + diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index ef01e306d..e5f3d5eea 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -438,13 +438,13 @@ struct TensorEvaluator, Device> : Base(op, device) { } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { return this->m_impl.coeffRef(this->srcCoeff(index)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) + void writePacket(Index index, const PacketReturnType& x) const { if (this->isInnerChipping()) { // m_stride is equal to 1, so let's avoid the integer division. 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 073be8146..dea41dfd2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -331,7 +331,7 @@ template::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { // Collect dimension-wise indices (subs). array subs; @@ -360,7 +360,7 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) + void writePacket(Index index, const PacketReturnType& x) const { const int packetSize = PacketType::size; EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index c629e44cf..3a917a067 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -362,10 +362,8 @@ class TensorContractionOp : public TensorBase -struct TensorContractionEvaluatorBase : internal::no_assignment_operator -{ +struct TensorContractionEvaluatorBase { typedef typename internal::traits::Indices Indices; typedef typename internal::traits::LeftArgType LeftArgType; typedef typename internal::traits::RightArgType RightArgType; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index c6c40776f..526fc816c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -597,7 +597,7 @@ class TensorContractionKernel { const TripleDim triple_dim_) : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {} - 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { const StorageIndex linearLocalThreadId = itemID.get_local_id(0); const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM; const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM; @@ -636,7 +636,7 @@ class TensorContractionKernel { // privateRes memory of Each computation the compute block function is independent of local and no local concepts as // it only compute the block on each thread's private memory space EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr, - PacketReturnType *privateRes) { + PacketReturnType *privateRes) const { StorageIndex idx = 0; EIGEN_CONSTEXPR StorageIndex lhs_stride = contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1; @@ -661,7 +661,7 @@ class TensorContractionKernel { // class. template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes, - StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) { + StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) const { auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC { return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N); }; @@ -713,7 +713,7 @@ class TensorContractionKernel { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair &, - const StorageIndex &ncOffset, const StorageIndex cOffset) { + const StorageIndex &ncOffset, const StorageIndex cOffset) const { EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = InputBlockProperties::is_rhs ? 
Properties::LocalThreadSizeN : Properties::LocalThreadSizeM; EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = @@ -833,7 +833,8 @@ class TensorContractionKernel { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID, ThreadProperties &thread_properties, TiledMemory &tiled_input_block, - PacketReturnType *privateRes, bool &db_offset) { + PacketReturnType *privateRes, bool &db_offset) const { + // Tiling the Rhs block from global to local memory extract_block( rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR), @@ -871,7 +872,7 @@ class TensorContractionKernel { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID, ThreadProperties &thread_properties, - OutPtr out_ptr) { + OutPtr out_ptr) const { auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()}; // Allocate register space PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = { @@ -897,7 +898,7 @@ class TensorContractionKernel { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t extract_block(const Input &inpt, Local local_ptr, const std::pair& local_index, - const StorageIndex &ncOffset, const StorageIndex cOffset) { + const StorageIndex &ncOffset, const StorageIndex cOffset) const { EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM; EIGEN_CONSTEXPR StorageIndex LoadPerThread = @@ -1035,7 +1036,7 @@ struct GeneralVectorTensor { nonContractDim(nonContractDim_), contractDim(contractDim_) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { auto scratch_ptr = scratch.get_pointer(); const StorageIndex linearLocalThreadId = itemID.get_local_id(0); StorageIndex nonContractId = is_lhs_vec ? 
linearLocalThreadId / Properties::LocalThreadSizeC @@ -1252,7 +1253,8 @@ struct GeneralScalarContraction { const StorageIndex rng_) : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {} - EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) { + EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) const { + auto out_ptr = out_res.get_pointer(); auto scratch_ptr = scratch.get_pointer().get(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 3cbc1ab60..7ccb17423 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -57,10 +57,10 @@ struct EigenConvolutionKernel - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) const { return (boolean_check[0] && boolean_check[1]); } - void operator()(cl::sycl::nd_item<2> itemID) { + void operator()(cl::sycl::nd_item<2> itemID) const { auto buffer_ptr = buffer_acc.get_pointer(); auto kernel_ptr = kernel_filter.get_pointer(); // the required row to be calculated for the for each plane in shered memory @@ -123,11 +123,11 @@ struct EigenConvolutionKernel - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const { return (boolean_check[0] && boolean_check[1] && boolean_check[2]); } - void operator()(cl::sycl::nd_item<3> itemID) { + void operator()(cl::sycl::nd_item<3> itemID) const { auto buffer_ptr = buffer_acc.get_pointer(); auto kernel_ptr = kernel_filter.get_pointer(); // the required row to be calculated for the for each plane in shered memory @@ -212,10 +212,10 @@ struct EigenConvolutionKernel - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool 
boundary_check(const BooleanDim3 boolean_check) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const { return (boolean_check[0] && boolean_check[1] && boolean_check[2]); } - void operator()(cl::sycl::nd_item<3> itemID) { + void operator()(cl::sycl::nd_item<3> itemID) const { auto buffer_ptr = buffer_acc.get_pointer(); auto kernel_ptr = kernel_filter.get_pointer(); const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1}; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index 84ebe38e8..8fdc8ba5f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -31,8 +31,7 @@ struct SyclDeviceInfo { .template get_info()), max_work_item_sizes( queue.get_device() - .template get_info< - cl::sycl::info::device::max_work_item_sizes>()), + .template get_info>()), max_mem_alloc_size( queue.get_device() .template get_info< @@ -356,7 +355,7 @@ class QueueInterface { return; } const ptrdiff_t count = end - begin; - auto f = [&](cl::sycl::handler &cgh) { + auto f = [&](cl::sycl::handler &cgh) { auto dst_acc = get_typed_range_accessor(cgh, begin, count); cgh.fill(dst_acc, value); }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 0e9cdfe41..7e13c69e3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -159,10 +159,10 @@ struct TensorEvaluator, Device> } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const { m_buffer[i] = m_impl.coeff(i); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const { internal::pstoret(m_buffer + i, m_impl.template 
packet::IsAligned ? Aligned : Unaligned>(i)); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 92ad0f545..2bd94c308 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -98,7 +98,7 @@ struct TensorEvaluator return m_data[index]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const{ eigen_assert(m_data != NULL); return m_data[index]; } @@ -122,7 +122,7 @@ struct TensorEvaluator } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) + void writePacket(Index index, const PacketReturnType& x) const { return internal::pstoret(m_data + index, x); } @@ -137,7 +137,7 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& - coeffRef(const array& coords) { + coeffRef(const array& coords) const { eigen_assert(m_data != NULL); if (static_cast(Layout) == static_cast(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; @@ -978,7 +978,14 @@ struct TensorEvaluator TensorEvaluator m_elseImpl; }; - } // end namespace Eigen +#if defined(EIGEN_USE_SYCL) && defined(SYCL_COMPILER_IS_DPCPP) +template +struct cl::sycl::is_device_copyable< + Eigen::TensorEvaluator, + std::enable_if_t>::value>> : std::true_type {}; +#endif + #endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index de9bed4ed..f961b4066 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -688,12 +688,12 @@ struct ExecExprFunctorKernel { : evaluator(evaluator_), range(range_) {} EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()( - cl::sycl::nd_item<1> itemID) { + cl::sycl::nd_item<1> 
itemID) const { compute(itemID); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t - compute(const cl::sycl::nd_item<1>& itemID) { + compute(const cl::sycl::nd_item<1>& itemID) const { Index gId = static_cast(itemID.get_global_linear_id()); Index total_threads = itemID.get_global_range(0); @@ -703,7 +703,7 @@ struct ExecExprFunctorKernel { } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t - compute(const cl::sycl::nd_item<1>& itemID) { + compute(const cl::sycl::nd_item<1>& itemID) const { const Index vectorizedRange = (range / Evaluator::PacketSize) * Evaluator::PacketSize; Index gId = static_cast(itemID.get_global_linear_id()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index d12bd6dd5..8b1408599 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -202,12 +202,12 @@ template typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { return this->m_impl.coeffRef(index); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) + void writePacket(Index index, const PacketReturnType& x) const { this->m_impl.template writePacket(index, x); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 020571040..b696190b2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -267,13 +267,13 @@ template TensorBlockDesc; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { return this->m_impl.coeffRef(index); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) + void writePacket(Index index, const PacketReturnType& x) const { this->m_impl.template writePacket(index, x); } @@ -733,7 +733,7 @@ struct TensorEvaluator, Device> : Base(op, device) { } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { if (this->m_is_identity) { return this->m_impl.coeffRef(index); @@ -743,7 +743,7 @@ struct TensorEvaluator, Device> } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) + void writePacket(Index index, const PacketReturnType& x) const { if (this->m_is_identity) { this->m_impl.template writePacket(index, x); @@ -1085,7 +1085,7 @@ struct TensorEvaluator::type PacketReturnType; typedef Strides Dimensions; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { if (this->m_is_identity) { return this->m_impl.coeffRef(index); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index e62397a62..ae03ba52d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -895,7 +895,7 @@ static constexpr bool RunningOnGPU = false; // binding placeholder accessors to a command group handler for SYCL EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { m_impl.bind(cgh); - m_result.bind(cgh); + if(m_result) m_result.bind(cgh); } #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index 
397870fcf..715797d62 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -87,7 +87,7 @@ struct SecondStepFullReducer { SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_) : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {} - void operator()(cl::sycl::nd_item<1> itemID) { + void operator()(cl::sycl::nd_item<1> itemID) const { // Our empirical research shows that the best performance will be achieved // when there is only one element per thread to reduce in the second step. // in this step the second step reduction time is almost negligible. @@ -141,11 +141,11 @@ class FullReductionKernelFunctor { Index rng_, OpType op_) : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {} - void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); } + void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t compute_reduction( - const cl::sycl::nd_item<1> &itemID) { + const cl::sycl::nd_item<1> &itemID) const { auto output_ptr = final_output.get_pointer(); Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize; Index globalid = itemID.get_global_id(0); @@ -184,7 +184,7 @@ class FullReductionKernelFunctor { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t compute_reduction( - const cl::sycl::nd_item<1> &itemID) { + const cl::sycl::nd_item<1> &itemID) const { auto output_ptr = final_output.get_pointer(); Index globalid = itemID.get_global_id(0); Index localid = itemID.get_local_id(0); @@ -228,14 +228,16 @@ class GenericNondeterministicReducer { range(range_), num_values_to_reduce(num_values_to_reduce_) {} - void operator()(cl::sycl::nd_item<1> itemID) { + void operator()(cl::sycl::nd_item<1> itemID) const { + //This is to bypass the 
stateful condition in Eigen meanReducer + Op non_const_functor; + std::memcpy(&non_const_functor, &functor, sizeof (Op)); auto output_accessor_ptr = output_accessor.get_pointer(); - /// const cast added as a naive solution to solve the qualifier drop error Index globalid = static_cast(itemID.get_global_linear_id()); if (globalid < range) { CoeffReturnType accum = functor.initialize(); Eigen::internal::GenericDimReducer::reduce( - evaluator, evaluator.firstInput(globalid), functor, &accum); + evaluator, evaluator.firstInput(globalid), non_const_functor, &accum); output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce); } } @@ -281,7 +283,7 @@ struct PartialReductionKernel { num_coeffs_to_reduce(num_coeffs_to_reduce_) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId, - CoeffReturnType &accumulator) { + CoeffReturnType &accumulator) const { if (globalPId >= num_coeffs_to_preserve) { return; } @@ -298,7 +300,7 @@ struct PartialReductionKernel { global_offset += per_thread_global_stride; } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { const Index linearLocalThreadId = itemID.get_local_id(0); Index pLocalThreadId = rt == reduction_dim::outer_most ? 
linearLocalThreadId % PannelParameters::LocalThreadSizeP : linearLocalThreadId / PannelParameters::LocalThreadSizeR; @@ -380,7 +382,7 @@ struct SecondStepPartialReduction { num_coeffs_to_preserve(num_coeffs_to_preserve_), num_coeffs_to_reduce(num_coeffs_to_reduce_) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { const Index globalId = itemID.get_global_id(0); if (globalId >= num_coeffs_to_preserve) return; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index b5e66b3b8..342d9e9b5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -441,12 +441,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return this->m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const { return this->m_impl.coeffRef(this->reverseIndex(index)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) { + void writePacket(Index index, const PacketReturnType& x) const { eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); // This code is pilfered from TensorMorphing.h diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h index 636fb7d71..ecc872b3a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h @@ -109,28 +109,28 @@ struct ScanKernelFunctor { template std::enable_if_t EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - read(const Input &inpt, Index global_id) { + read(const Input &inpt, Index global_id) const { return inpt.coeff(global_id); } template std::enable_if_t 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - read(const Input &inpt, Index global_id) { + read(const Input &inpt, Index global_id) const { return inpt[global_id]; } template std::enable_if_t EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - first_step_inclusive_Operation(InclusiveOp inclusive_op) { + first_step_inclusive_Operation(InclusiveOp inclusive_op) const { inclusive_op(); } template std::enable_if_t EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - first_step_inclusive_Operation(InclusiveOp) {} + first_step_inclusive_Operation(InclusiveOp) const {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { auto out_ptr = out_accessor.get_pointer(); auto tmp_ptr = temp_accessor.get_pointer(); auto scratch_ptr = scratch.get_pointer().get(); @@ -307,7 +307,7 @@ struct ScanAdjustmentKernelFunctor { scanParameters(scanParameters_), accumulator(accumulator_) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { auto in_ptr = in_accessor.get_pointer(); auto out_ptr = out_accessor.get_pointer(); @@ -473,7 +473,7 @@ struct ScanLauncher { typedef typename Self::CoeffReturnType CoeffReturnType; typedef typename Self::Storage Storage; typedef typename Self::EvaluatorPointerType EvaluatorPointerType; - void operator()(Self &self, EvaluatorPointerType data) { + void operator()(Self &self, EvaluatorPointerType data) const { const Index total_size = internal::array_prod(self.dimensions()); const Index scan_size = self.size(); const Index scan_stride = self.stride(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 977263aa5..3d97e6684 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -390,13 +390,13 
@@ struct TensorEvaluator, Device> : Base(op, device) { } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const { return this->m_impl.coeffRef(this->srcCoeff(index)); } template EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) + void writePacket(Index index, const PacketReturnType& x) const { EIGEN_ALIGN_MAX std::remove_const_t values[PacketSize]; internal::pstore(values, x); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 609afe3b9..681c4e5df 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -288,13 +288,13 @@ struct TensorEvaluator, Device> typedef typename PacketType::type PacketReturnType; static constexpr int PacketSize = PacketType::size; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const { return this->m_impl.coeffRef(this->srcCoeff(index)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) + void writePacket(Index index, const PacketReturnType& x) const { EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize()); diff --git a/unsupported/doc/examples/SYCL/CMakeLists.txt b/unsupported/doc/examples/SYCL/CMakeLists.txt index 8ee96f51b..4fe94c6ec 100644 --- a/unsupported/doc/examples/SYCL/CMakeLists.txt +++ b/unsupported/doc/examples/SYCL/CMakeLists.txt @@ -2,9 +2,8 @@ FILE(GLOB examples_SRCS "*.cpp") set(EIGEN_SYCL ON) list(APPEND CMAKE_EXE_LINKER_FLAGS -pthread) -if(EIGEN_SYCL_TRISYCL) - set(CMAKE_CXX_STANDARD 17) -else(EIGEN_SYCL_TRISYCL) +set(CMAKE_CXX_STANDARD 17) +if(EIGEN_SYCL_ComputeCpp) if(MSVC) list(APPEND COMPUTECPP_USER_FLAGS -DWIN32) 
else() @@ -22,7 +21,7 @@ else(EIGEN_SYCL_TRISYCL) -no-serial-memop -Xclang -cl-mad-enable) -endif(EIGEN_SYCL_TRISYCL) +endif(EIGEN_SYCL_ComputeCpp) FOREACH(example_src ${examples_SRCS}) GET_FILENAME_COMPONENT(example ${example_src} NAME_WE) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 21d8c5eb5..45cc4a3d6 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -122,6 +122,7 @@ ei_add_test(special_packetmath "-DEIGEN_FAST_MATH=1") if(EIGEN_TEST_SYCL) set(EIGEN_SYCL ON) + set(CMAKE_CXX_STANDARD 17) # Forward CMake options as preprocessor definitions if(EIGEN_SYCL_USE_DEFAULT_SELECTOR) add_definitions(-DEIGEN_SYCL_USE_DEFAULT_SELECTOR=${EIGEN_SYCL_USE_DEFAULT_SELECTOR}) @@ -172,10 +173,7 @@ if(EIGEN_TEST_SYCL) add_definitions(-DEIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION=${EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION}) endif() - if(EIGEN_SYCL_TRISYCL) - # triSYCL now requires c++17. - set(CMAKE_CXX_STANDARD 17) - else() + if(EIGEN_SYCL_ComputeCpp) if(MSVC) list(APPEND COMPUTECPP_USER_FLAGS -DWIN32) else() @@ -193,7 +191,7 @@ if(EIGEN_TEST_SYCL) -no-serial-memop -Xclang -cl-mad-enable) - endif() + endif(EIGEN_SYCL_ComputeCpp) ei_add_test(cxx11_tensor_sycl) ei_add_test(cxx11_tensor_image_op_sycl) @@ -409,4 +407,3 @@ if (EIGEN_TEST_HIP) endif() endif() - diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp index 27a82540f..be7afd200 100644 --- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp +++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp @@ -27,17 +27,64 @@ using Eigen::TensorMap; // Functions used to compare the TensorMap implementation on the device with // the equivalent on the host -namespace cl { -namespace sycl { -template T abs(T x) { return cl::sycl::fabs(x); } +namespace SYCL { +template T abs(T x) { + return cl::sycl::abs(x); +} + +template <> float abs(float x) { + return cl::sycl::fabs(x); +} + +template <> double 
abs(double x) { + return cl::sycl::fabs(x); +} + template T square(T x) { return x * x; } template T cube(T x) { return x * x * x; } template T inverse(T x) { return T(1) / x; } -template T cwiseMax(T x, T y) { return cl::sycl::max(x, y); } -template T cwiseMin(T x, T y) { return cl::sycl::min(x, y); } +template T cwiseMax(T x, T y) { +return cl::sycl::max(x, y); +} +template T cwiseMin(T x, T y) { + return cl::sycl::min(x, y); } } + +#define DECLARE_UNARY_STRUCT_NON_SYCL(FUNC) \ + struct op_##FUNC { \ + template \ + auto operator()(const T& x) { \ + return SYCL::FUNC(x); \ + } \ + template \ + auto operator()(const TensorMap& x) { \ + return x.FUNC(); \ + } \ + }; + +DECLARE_UNARY_STRUCT_NON_SYCL(abs) +DECLARE_UNARY_STRUCT_NON_SYCL(square) +DECLARE_UNARY_STRUCT_NON_SYCL(cube) +DECLARE_UNARY_STRUCT_NON_SYCL(inverse) + +#define DECLARE_BINARY_STRUCT_NON_SYCL(FUNC) \ + struct op_##FUNC { \ + template \ + auto operator()(const T1& x, const T2& y){ \ + return SYCL::FUNC(x, y); \ + } \ + template \ + auto operator()(const TensorMap& x, const TensorMap& y) { \ + return x.FUNC(y); \ + } \ + }; + +DECLARE_BINARY_STRUCT_NON_SYCL(cwiseMax) +DECLARE_BINARY_STRUCT_NON_SYCL(cwiseMin) + + struct EqualAssignment { template void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; } @@ -119,12 +166,9 @@ void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device, } \ }; -DECLARE_UNARY_STRUCT(abs) + DECLARE_UNARY_STRUCT(sqrt) DECLARE_UNARY_STRUCT(rsqrt) -DECLARE_UNARY_STRUCT(square) -DECLARE_UNARY_STRUCT(cube) -DECLARE_UNARY_STRUCT(inverse) DECLARE_UNARY_STRUCT(tanh) DECLARE_UNARY_STRUCT(exp) DECLARE_UNARY_STRUCT(expm1) @@ -288,8 +332,6 @@ void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device, } \ }; -DECLARE_BINARY_STRUCT(cwiseMax) -DECLARE_BINARY_STRUCT(cwiseMin) #define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR) \ struct op_##NAME { \ diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp index 
d7ff38d34..74e902645 100644 --- a/unsupported/test/cxx11_tensor_device_sycl.cpp +++ b/unsupported/test/cxx11_tensor_device_sycl.cpp @@ -23,6 +23,13 @@ #include #include +#ifdef SYCL_COMPILER_IS_DPCPP +template +struct cl::sycl::is_device_copyable< + const OffByOneScalar, + std::enable_if_t>::value>> : std::true_type {}; +#endif + template void test_device_memory(const Eigen::SyclDevice &sycl_device) { IndexType sizeDim1 = 100;