[SYCL-2020 Support] Enabling Intel DPCPP Compiler support to Eigen

This commit is contained in:
Mehdi Goli 2023-01-16 07:04:08 +00:00 committed by Antonio Sánchez
parent bae119bb7e
commit b523120687
32 changed files with 305 additions and 332 deletions

View File

@ -505,14 +505,19 @@ endif()
# add SYCL
option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation (ComputeCPP by default)." OFF)
if(EIGEN_TEST_SYCL)
option(EIGEN_SYCL_DPCPP "Use the DPCPP Sycl implementation (DPCPP is default SYCL-Compiler)." ON)
option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation." OFF)
option(EIGEN_SYCL_ComputeCpp "Use the ComputeCpp Sycl implementation." OFF)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations -Wno-shorten-64-to-32 -Wno-cast-align")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-copy-with-user-provided-copy -Wno-unused-variable")
set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}")
find_package(Threads REQUIRED)
if(EIGEN_SYCL_TRISYCL)
message(STATUS "Using triSYCL")
include(FindTriSYCL)
else()
elseif(EIGEN_SYCL_ComputeCpp)
message(STATUS "Using ComputeCPP SYCL")
include(FindComputeCpp)
set(COMPUTECPP_DRIVER_DEFAULT_VALUE OFF)
@ -523,8 +528,12 @@ if(EIGEN_TEST_SYCL)
"Use ComputeCpp driver instead of a 2 steps compilation"
${COMPUTECPP_DRIVER_DEFAULT_VALUE}
)
else() #Default SYCL compiler is DPCPP (EIGEN_SYCL_DPCPP)
set(DPCPP_SYCL_TARGET "spir64" CACHE STRING "Default target for Intel CPU/GPU")
message(STATUS "Using DPCPP")
find_package(DPCPP)
add_definitions(-DSYCL_COMPILER_IS_DPCPP)
endif(EIGEN_SYCL_TRISYCL)
option(EIGEN_DONT_VECTORIZE_SYCL "Don't use vectorisation in the SYCL tests." OFF)
if(EIGEN_DONT_VECTORIZE_SYCL)
message(STATUS "Disabling SYCL vectorization in tests/examples")
# When disabling SYCL vectorization, also disable Eigen default vectorization

View File

@ -82,7 +82,9 @@
#include <cstddef>
#include <cstdlib>
#include <cmath>
#ifndef __SYCL_DEVICE_ONLY__
#include <cassert>
#endif
#include <functional>
#ifndef EIGEN_NO_IO
#include <sstream>

View File

@ -21,193 +21,53 @@
#ifndef EIGEN_PACKET_MATH_SYCL_H
#define EIGEN_PACKET_MATH_SYCL_H
#include <type_traits>
#include "../../InternalHeaderCheck.h"
namespace Eigen {
namespace internal {
#ifdef SYCL_DEVICE_ONLY
#define SYCL_PLOADT_RO(address_space_target) \
template <typename packet_type, int Alignment> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt_ro( \
typename cl::sycl::multi_ptr< \
const typename unpacket_traits<packet_type>::type, \
cl::sycl::access::address_space::address_space_target>::pointer_t \
from) { \
typedef typename unpacket_traits<packet_type>::type scalar; \
typedef cl::sycl::multi_ptr< \
scalar, cl::sycl::access::address_space::address_space_target> \
multi_ptr; \
auto res = packet_type( \
static_cast<typename unpacket_traits<packet_type>::type>(0)); \
res.load(0, multi_ptr(const_cast<typename multi_ptr::pointer_t>(from))); \
return res; \
}
SYCL_PLOADT_RO(global_space)
SYCL_PLOADT_RO(local_space)
#undef SYCL_PLOADT_RO
#endif
template <typename packet_type, int Alignment, typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write, T>& from) {
return ploadt_ro<packet_type, Alignment>(from.get_pointer());
}
#ifdef SYCL_DEVICE_ONLY
#define SYCL_PLOAD(address_space_target, Alignment, AlignedType) \
template <typename packet_type> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
typename cl::sycl::multi_ptr< \
const typename unpacket_traits<packet_type>::type, \
cl::sycl::access::address_space::address_space_target>::pointer_t \
from) { \
return ploadt_ro<packet_type, Alignment>(from); \
}
// global space
SYCL_PLOAD(global_space, Unaligned, u)
SYCL_PLOAD(global_space, Aligned, )
// local space
SYCL_PLOAD(local_space, Unaligned, u)
SYCL_PLOAD(local_space, Aligned, )
#undef SYCL_PLOAD
#endif
#define SYCL_PLOAD(Alignment, AlignedType) \
template <typename packet_type> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
const Eigen::TensorSycl::internal::RangeAccess< \
cl::sycl::access::mode::read_write, \
typename unpacket_traits<packet_type>::type> \
from) { \
return ploadt_ro<packet_type, Alignment>(from); \
}
SYCL_PLOAD(Unaligned, u)
SYCL_PLOAD(Aligned, )
#undef SYCL_PLOAD
#ifdef SYCL_DEVICE_ONLY
/** \internal \returns a packet version of \a *from.
* The pointer \a from must be aligned on a \a Alignment bytes boundary. */
#define SYCL_PLOADT(address_space_target) \
template <typename packet_type, int Alignment> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt( \
typename cl::sycl::multi_ptr< \
const typename unpacket_traits<packet_type>::type, \
cl::sycl::access::address_space::address_space_target>::pointer_t \
from) { \
if (Alignment >= unpacket_traits<packet_type>::alignment) \
return pload<packet_type>(from); \
else \
return ploadu<packet_type>(from); \
}
// global space
SYCL_PLOADT(global_space)
// local space
SYCL_PLOADT(local_space)
#undef SYCL_PLOADT
#endif
template <typename packet_type, int Alignment>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
ploadt(const Eigen::TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write,
typename unpacket_traits<packet_type>::type>& from) {
return ploadt<packet_type, Alignment>(from.get_pointer());
}
#ifdef SYCL_DEVICE_ONLY
// private_space
#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment) \
#define SYCL_PLOAD(packet_type, AlignedType) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \
ploadt_ro<packet_type, Alignment>( \
pload##AlignedType<packet_type>( \
const typename unpacket_traits<packet_type>::type* from) { \
typedef typename unpacket_traits<packet_type>::type scalar; \
auto res = packet_type(static_cast<scalar>(0)); \
res.template load<cl::sycl::access::address_space::private_space>( \
0, const_cast<scalar*>(from)); \
using scalar = typename unpacket_traits<packet_type>::type; \
typedef cl::sycl::multi_ptr< \
const scalar, cl::sycl::access::address_space::generic_space, \
cl::sycl::access::decorated::no> \
multi_ptr; \
packet_type res{}; \
res.load(0, multi_ptr(from)); \
return res; \
}
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned)
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned)
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned)
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned)
SYCL_PLOAD(cl::sycl::cl_float4, u)
SYCL_PLOAD(cl::sycl::cl_float4, )
SYCL_PLOAD(cl::sycl::cl_double2, u)
SYCL_PLOAD(cl::sycl::cl_double2, )
#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##alignment_type( \
const typename unpacket_traits<packet_type>::type* from) { \
typedef typename unpacket_traits<packet_type>::type scalar; \
auto res = packet_type(static_cast<scalar>(0)); \
res.template load<cl::sycl::access::address_space::private_space>( \
0, const_cast<scalar*>(from)); \
return res; \
}
SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, )
SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, )
SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u)
SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u)
#undef SYCL_PLOAD
#undef SYCL_PLOAD_SPECIAL
#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment) \
#define SYCL_PSTORE(scalar, packet_type, alignment) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
typename cl::sycl::multi_ptr< \
scalar, \
cl::sycl::access::address_space::address_space_target>::pointer_t \
to, \
const packet_type& from) { \
scalar* to, const packet_type& from) { \
typedef cl::sycl::multi_ptr< \
scalar, cl::sycl::access::address_space::address_space_target> \
scalar, cl::sycl::access::address_space::generic_space, \
cl::sycl::access::decorated::no> \
multi_ptr; \
from.store(0, multi_ptr(to)); \
}
// global space
SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, )
SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, )
SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u)
SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, )
SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, )
SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u)
SYCL_PSTORE(float, cl::sycl::cl_float4, )
SYCL_PSTORE(float, cl::sycl::cl_float4, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, )
SYCL_PSTORE(double, cl::sycl::cl_double2, u)
SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, )
SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, )
SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u)
#undef SYCL_PSTORE
#define SYCL_PSTORE_T(address_space_target) \
template <typename scalar, typename packet_type, int Alignment> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret( \
typename cl::sycl::multi_ptr< \
scalar, \
cl::sycl::access::address_space::address_space_target>::pointer_t \
to, \
const packet_type& from) { \
if (Alignment) \
pstore(to, from); \
else \
pstoreu(to, from); \
}
SYCL_PSTORE_T(global_space)
SYCL_PSTORE_T(local_space)
#undef SYCL_PSTORE_T
#define SYCL_PSET1(packet_type) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>( \
@ -291,22 +151,6 @@ struct get_base_packet<cl::sycl::cl_double2> {
}
};
#define SYCL_PLOAD_DUP(address_space_target) \
template <typename packet_type> \
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup( \
typename cl::sycl::multi_ptr< \
const typename unpacket_traits<packet_type>::type, \
cl::sycl::access::address_space::address_space_target>::pointer_t \
from) { \
return get_base_packet<packet_type>::get_ploaddup(from); \
}
// global space
SYCL_PLOAD_DUP(global_space)
// local_space
SYCL_PLOAD_DUP(local_space)
#undef SYCL_PLOAD_DUP
#define SYCL_PLOAD_DUP_SPECILIZE(packet_type) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup<packet_type>( \
@ -325,30 +169,11 @@ SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
const typename unpacket_traits<packet_type>::type& a) { \
return get_base_packet<packet_type>::set_plset(a); \
}
SYCL_PLSET(cl::sycl::cl_float4)
SYCL_PLSET(cl::sycl::cl_double2)
#undef SYCL_PLSET
#define SYCL_PGATHER(address_space_target) \
template <typename Scalar, typename packet_type> \
EIGEN_DEVICE_FUNC inline packet_type pgather( \
typename cl::sycl::multi_ptr< \
const typename unpacket_traits<packet_type>::type, \
cl::sycl::access::address_space::address_space_target>::pointer_t \
from, \
Index stride) { \
return get_base_packet<packet_type>::get_pgather(from, stride); \
}
// global space
SYCL_PGATHER(global_space)
// local space
SYCL_PGATHER(local_space)
#undef SYCL_PGATHER
#define SYCL_PGATHER_SPECILIZE(scalar, packet_type) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
@ -362,24 +187,6 @@ SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
#undef SYCL_PGATHER_SPECILIZE
#define SYCL_PSCATTER(address_space_target) \
template <typename Scalar, typename packet_type> \
EIGEN_DEVICE_FUNC inline void pscatter( \
typename cl::sycl::multi_ptr< \
typename unpacket_traits<packet_type>::type, \
cl::sycl::access::address_space::address_space_target>::pointer_t \
to, \
const packet_type& from, Index stride) { \
get_base_packet<packet_type>::set_pscatter(to, from, stride); \
}
// global space
SYCL_PSCATTER(global_space)
// local space
SYCL_PSCATTER(local_space)
#undef SYCL_PSCATTER
#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \
template <> \
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<scalar, packet_type>( \
@ -563,6 +370,34 @@ inline cl::sycl::cl_double2 pblend(
}
#endif // SYCL_DEVICE_ONLY
template <typename packet_type, int Alignment, typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write, T>& from) {
return ploadt_ro<packet_type, Alignment>(from.get_pointer());
}
#define SYCL_PLOAD(Alignment, AlignedType) \
template <typename packet_type> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
const Eigen::TensorSycl::internal::RangeAccess< \
cl::sycl::access::mode::read_write, \
typename unpacket_traits<packet_type>::type> \
from) { \
return ploadt_ro<packet_type, Alignment>(from); \
}
SYCL_PLOAD(Unaligned, u)
SYCL_PLOAD(Aligned, )
#undef SYCL_PLOAD
template <typename packet_type, int Alignment>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
ploadt(const Eigen::TensorSycl::internal::RangeAccess<
cl::sycl::access::mode::read_write,
typename unpacket_traits<packet_type>::type>& from) {
return ploadt<packet_type, Alignment>(from.get_pointer());
}
#define SYCL_PSTORE(alignment) \
template <typename packet_type> \
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \

View File

@ -223,7 +223,6 @@ class PointerMapper {
m_pointerMap.clear();
EIGEN_THROW_X(
std::out_of_range("The pointer is not registered in the map\n"));
}
--node;
}
@ -550,7 +549,7 @@ struct RangeAccess {
static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
typedef T scalar_t;
typedef scalar_t &ref_t;
typedef typename cl::sycl::global_ptr<scalar_t>::pointer_t ptr_t;
typedef scalar_t *ptr_t;
// the accessor type is not necessarily the same as T
typedef cl::sycl::accessor<scalar_t, 1, AcMd, global_access, is_place_holder>
@ -570,7 +569,12 @@ struct RangeAccess {
RangeAccess(std::nullptr_t) : RangeAccess() {}
// This template parameter must be removed and scalar_t should be replaced
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t get_pointer() const {
return (access_.get_pointer().get() + offset_);
typedef cl::sycl::multi_ptr<scalar_t,
cl::sycl::access::address_space::generic_space,
cl::sycl::access::decorated::no>
multi_ptr;
multi_ptr p(access_);
return (p + offset_).get_raw();
}
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator+=(Index offset) {

View File

@ -64,7 +64,7 @@ pcast<cl::sycl::cl_double2, cl::sycl::cl_float4>(
cl::sycl::rounding_mode::automatic>();
auto b1 = b.template convert<cl::sycl::cl_float,
cl::sycl::rounding_mode::automatic>();
return cl::sycl::float4(a1.x(), a1.y(), b1.x(), b1.y());
return cl::sycl::cl_float4(a1.x(), a1.y(), b1.x(), b1.y());
}
template <>

View File

@ -16,5 +16,11 @@ following commands:
1. export COMPUTECPP_PACKAGE_ROOT_DIR={PATH TO COMPUTECPP ROOT DIRECTORY}
2. bash eigen_sycl_bench.sh
To compile the floating point GPU benchmarks using Intel DPCPP compiler
/path/to/dpcpp/bin/clang++ -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -I ../../ -O3 -DNDEBUG -fsycl -fsycl-targets="supported backend in DPCPP. i.e. spir64 or nvptx64-nvidia-cuda" -std=c++17 tensor_benchmarks_sycl.cc benchmark_main.cc -lpthread -o eigen_dpcpp_sycl
Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
To compile the contraction with DPCPP:
/path/to/dpcpp/bin/clang++ -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -I ../../ -O3 -DNDEBUG -fsycl -fsycl-targets="supported backend in DPCPP. i.e. spir64 or nvptx64-nvidia-cuda" -std=c++17 tensor_contract_sycl_bench.cc -lpthread -o eigen_dpcpp_contract

View File

@ -15,7 +15,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#include <SYCL/sycl.hpp>
#include <CL/sycl.hpp>
#include <fstream>
#include <iostream>
#include <chrono>
@ -56,9 +56,9 @@ void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, T
// Initialize the content of the memory pools to prevent asan from
// complaining.
device_.fill(a_, m_ * k_, T(12));
device_.fill(b_, k_ * n_, T(23));
device_.fill(c_, m_ * n_, T(31));
device_.fill(a_, a_ + (m_ * k_), T(12));
device_.fill(b_, b_ + (k_ * n_), T(23));
device_.fill(c_, c_ + (m_ * n_), T(31));
Eigen::array<TensorIndex, 2> sizeA;
sizeA[0] = m_;
@ -110,9 +110,9 @@ void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorInd
// Initialize the content of the memory pools to prevent asan from
// complaining.
device_.memset(a_, 12, m_ * k_ * sizeof(T));
device_.memset(b_, 23, k_ * n_ * sizeof(T));
device_.memset(c_, 31, m_ * n_ * sizeof(T));
device_.memset(a_, T(12), T(m_ * k_ * sizeof(T)));
device_.memset(b_, T(23), T(k_ * n_ * sizeof(T)));
device_.memset(c_, T(31), T(m_ * n_ * sizeof(T)));
Eigen::array<TensorIndex, 2> sizeA;
sizeA[0] = m_;

62
cmake/FindDPCPP.cmake Normal file
View File

@ -0,0 +1,62 @@
# FindDPCPP.cmake -- set up an imported DPCPP::DPCPP interface target for the
# Intel DPC++ (SYCL) compiler.  Expects CMAKE_CXX_COMPILER to point at the
# DPC++ driver and honours the cache variables DPCPP_SYCL_TARGET,
# DPCPP_SYCL_ARCH and DPCPP_USER_FLAGS.
include_guard()
include(CheckCXXCompilerFlag)
include(FindPackageHandleStandardArgs)

# AMD offload cannot default the GPU architecture, so require it explicitly.
if("${DPCPP_SYCL_TARGET}" STREQUAL "amdgcn-amd-amdhsa" AND
   "${DPCPP_SYCL_ARCH}" STREQUAL "")
  message(FATAL_ERROR "Architecture required for AMD DPCPP builds,"
                      " please specify in DPCPP_SYCL_ARCH")
endif()

set(DPCPP_USER_FLAGS "" CACHE STRING
    "Additional user-specified compiler flags for DPC++")

# Locate the SYCL runtime library relative to the compiler driver
# (the runtime was renamed sycl -> sycl6 in newer oneAPI releases).
get_filename_component(DPCPP_BIN_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
find_library(DPCPP_LIB_DIR NAMES sycl sycl6 PATHS "${DPCPP_BIN_DIR}/../lib")

# Report found/not-found through the standard mechanism so that
# find_package(DPCPP) sets DPCPP_FOUND like any other Find module.
find_package_handle_standard_args(DPCPP
  REQUIRED_VARS DPCPP_BIN_DIR DPCPP_LIB_DIR)

add_library(DPCPP::DPCPP INTERFACE IMPORTED)

set(DPCPP_FLAGS "-fsycl;-fsycl-targets=${DPCPP_SYCL_TARGET};-fsycl-unnamed-lambda;${DPCPP_USER_FLAGS};-ftemplate-backtrace-limit=0")
if(NOT "${DPCPP_SYCL_ARCH}" STREQUAL "")
  if("${DPCPP_SYCL_TARGET}" STREQUAL "amdgcn-amd-amdhsa")
    list(APPEND DPCPP_FLAGS "-Xsycl-target-backend")
    list(APPEND DPCPP_FLAGS "--offload-arch=${DPCPP_SYCL_ARCH}")
  elseif("${DPCPP_SYCL_TARGET}" STREQUAL "nvptx64-nvidia-cuda")
    list(APPEND DPCPP_FLAGS "-Xsycl-target-backend")
    list(APPEND DPCPP_FLAGS "--cuda-gpu-arch=${DPCPP_SYCL_ARCH}")
  endif()
endif()

if(UNIX)
  # On UNIX the SYCL flags must also appear at link time so the device code
  # is linked by the DPC++ driver.
  set_target_properties(DPCPP::DPCPP PROPERTIES
    INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS}"
    INTERFACE_LINK_OPTIONS "${DPCPP_FLAGS}"
    INTERFACE_LINK_LIBRARIES "${DPCPP_LIB_DIR}"
    INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl;${DPCPP_BIN_DIR}/../include")
  message(VERBOSE "DPCPP include dir: ${DPCPP_BIN_DIR}/../include/sycl")
else()
  set_target_properties(DPCPP::DPCPP PROPERTIES
    INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS}"
    INTERFACE_LINK_LIBRARIES "${DPCPP_LIB_DIR}"
    INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl")
endif()
# add_sycl_to_target(TARGET <tgt> SOURCES <srcs...>)
#
# Attach the DPC++ SYCL compile and link options to an existing target.
# SOURCES is accepted for interface compatibility with the ComputeCpp /
# triSYCL helpers of the same name; DPC++ compiles SYCL code in a single
# pass, so the sources need no per-file treatment here.
function(add_sycl_to_target)
  set(options)
  set(one_value_args TARGET)
  set(multi_value_args SOURCES)
  cmake_parse_arguments(SB_ADD_SYCL
    "${options}"
    "${one_value_args}"
    "${multi_value_args}"
    ${ARGN}
  )
  if(NOT SB_ADD_SYCL_TARGET)
    message(FATAL_ERROR "add_sycl_to_target: TARGET argument is required")
  endif()
  target_compile_options(${SB_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS})
  # Keyword signature: consumers of this target inherit the SYCL usage
  # requirements carried by the imported interface target.
  target_link_libraries(${SB_ADD_SYCL_TARGET} PUBLIC DPCPP::DPCPP)
  target_compile_features(${SB_ADD_SYCL_TARGET} PRIVATE cxx_std_17)
  # Object libraries cannot carry link options.
  get_target_property(target_type ${SB_ADD_SYCL_TARGET} TYPE)
  if(NOT target_type STREQUAL "OBJECT_LIBRARY")
    target_link_options(${SB_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS})
  endif()
endfunction()

View File

@ -18,6 +18,9 @@
#include <vector>
#include <typeinfo>
#include <functional>
#ifdef EIGEN_USE_SYCL
#include <CL/sycl.hpp>
#endif
// The following includes of STL headers have to be done _before_ the
// definition of macros min() and max(). The reason is that many STL
@ -121,9 +124,7 @@ struct imag {};
#define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes
// B0 is defined in POSIX header termios.h
#define B0 FORBIDDEN_IDENTIFIER
// `I` may be defined by complex.h:
#define I FORBIDDEN_IDENTIFIER
// Unit tests calling Eigen's blas library must preserve the default blocking size
// to avoid troubles.
#ifndef EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
@ -301,15 +302,16 @@ namespace Eigen
}
#endif //EIGEN_EXCEPTIONS
#elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(SYCL_DEVICE_ONLY) // EIGEN_DEBUG_ASSERTS
#elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(__SYCL_DEVICE_ONLY__) // EIGEN_DEBUG_ASSERTS
#define eigen_assert(a) \
if( (!(a)) && (!no_more_assert) ) \
{ \
Eigen::no_more_assert = true; \
if(report_on_cerr_on_assert_failure) \
if(report_on_cerr_on_assert_failure) { \
eigen_plain_assert(a); \
else \
} else { \
EIGEN_THROW_X(Eigen::eigen_assert_exception()); \
} \
}
#ifdef EIGEN_EXCEPTIONS

View File

@ -170,10 +170,10 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
m_rightImpl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const {
m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const {
const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;

View File

@ -1185,7 +1185,6 @@ class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
return derived();
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); }
EIGEN_DEVICE_FUNC
@ -1195,3 +1194,4 @@ class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H

View File

@ -438,13 +438,13 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
: Base(op, device)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
return this->m_impl.coeffRef(this->srcCoeff(index));
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
void writePacket(Index index, const PacketReturnType& x) const
{
if (this->isInnerChipping()) {
// m_stride is equal to 1, so let's avoid the integer division.

View File

@ -331,7 +331,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
// Collect dimension-wise indices (subs).
array<Index, Base::NumDims> subs;
@ -360,7 +360,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
void writePacket(Index index, const PacketReturnType& x) const
{
const int packetSize = PacketType<CoeffReturnType, Device>::size;
EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)

View File

@ -362,10 +362,8 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp
const OutputKernelType m_output_kernel;
};
template<typename Derived>
struct TensorContractionEvaluatorBase : internal::no_assignment_operator
{
struct TensorContractionEvaluatorBase {
typedef typename internal::traits<Derived>::Indices Indices;
typedef typename internal::traits<Derived>::LeftArgType LeftArgType;
typedef typename internal::traits<Derived>::RightArgType RightArgType;

View File

@ -597,7 +597,7 @@ class TensorContractionKernel {
const TripleDim triple_dim_)
: TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
@ -636,7 +636,7 @@ class TensorContractionKernel {
// privateRes memory of Each computation the compute block function is independent of local and no local concepts as
// it only compute the block on each thread's private memory space
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
PacketReturnType *privateRes) {
PacketReturnType *privateRes) const {
StorageIndex idx = 0;
EIGEN_CONSTEXPR StorageIndex lhs_stride =
contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
@ -661,7 +661,7 @@ class TensorContractionKernel {
// class.
template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes,
StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) {
StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) const {
auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC {
return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N);
};
@ -713,7 +713,7 @@ class TensorContractionKernel {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::enable_if_t<contract_tp == contraction_type::no_local>
extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
const StorageIndex &ncOffset, const StorageIndex cOffset) {
const StorageIndex &ncOffset, const StorageIndex cOffset) const {
EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =
@ -833,7 +833,8 @@ class TensorContractionKernel {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
ThreadProperties<StorageIndex> &thread_properties,
TiledMemory &tiled_input_block,
PacketReturnType *privateRes, bool &db_offset) {
PacketReturnType *privateRes, bool &db_offset) const {
// Tiling the Rhs block from global to local memory
extract_block<RHSBlockProperties, is_internal_block>(
rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
@ -871,7 +872,7 @@ class TensorContractionKernel {
template <bool is_internal_block, typename OutPtr>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID,
ThreadProperties<StorageIndex> &thread_properties,
OutPtr out_ptr) {
OutPtr out_ptr) const {
auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()};
// Allocate register space
PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = {
@ -897,7 +898,7 @@ class TensorContractionKernel {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::enable_if_t<contract_tp == contraction_type::local>
extract_block(const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex>& local_index,
const StorageIndex &ncOffset, const StorageIndex cOffset) {
const StorageIndex &ncOffset, const StorageIndex cOffset) const {
EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
EIGEN_CONSTEXPR StorageIndex LoadPerThread =
@ -1035,7 +1036,7 @@ struct GeneralVectorTensor {
nonContractDim(nonContractDim_),
contractDim(contractDim_) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
auto scratch_ptr = scratch.get_pointer();
const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC
@ -1252,7 +1253,8 @@ struct GeneralScalarContraction {
const StorageIndex rng_)
: scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}
EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) {
EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) const {
auto out_ptr = out_res.get_pointer();
auto scratch_ptr = scratch.get_pointer().get();

View File

@ -57,10 +57,10 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
input_range(input_range_) {}
template <typename BooleanDim2>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) const {
return (boolean_check[0] && boolean_check[1]);
}
void operator()(cl::sycl::nd_item<2> itemID) {
void operator()(cl::sycl::nd_item<2> itemID) const {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
// the required row to be calculated for the for each plane in shared memory
@ -123,11 +123,11 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
kernel_size(kernel_size_),
input_range(input_range_) {}
template <typename BooleanDim3>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
}
void operator()(cl::sycl::nd_item<3> itemID) {
void operator()(cl::sycl::nd_item<3> itemID) const {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
// the required row to be calculated for the for each plane in shared memory
@ -212,10 +212,10 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
input_range(input_range_),
numP(numP_) {}
template <typename BooleanDim3>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
}
void operator()(cl::sycl::nd_item<3> itemID) {
void operator()(cl::sycl::nd_item<3> itemID) const {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};

View File

@ -31,8 +31,7 @@ struct SyclDeviceInfo {
.template get_info<cl::sycl::info::device::local_mem_type>()),
max_work_item_sizes(
queue.get_device()
.template get_info<
cl::sycl::info::device::max_work_item_sizes>()),
.template get_info<cl::sycl::info::device::max_work_item_sizes<3>>()),
max_mem_alloc_size(
queue.get_device()
.template get_info<

View File

@ -159,10 +159,10 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const {
m_buffer[i] = m_impl.coeff(i);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const {
internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
}

View File

@ -98,7 +98,7 @@ struct TensorEvaluator
return m_data[index];
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const{
eigen_assert(m_data != NULL);
return m_data[index];
}
@ -122,7 +122,7 @@ struct TensorEvaluator
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
void writePacket(Index index, const PacketReturnType& x) const
{
return internal::pstoret<Scalar, PacketReturnType, StoreMode>(m_data + index, x);
}
@ -137,7 +137,7 @@ struct TensorEvaluator
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType&
coeffRef(const array<DenseIndex, NumCoords>& coords) {
coeffRef(const array<DenseIndex, NumCoords>& coords) const {
eigen_assert(m_data != NULL);
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return m_data[m_dims.IndexOfColMajor(coords)];
@ -978,7 +978,14 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
TensorEvaluator<ElseArgType, Device> m_elseImpl;
};
} // end namespace Eigen
#if defined(EIGEN_USE_SYCL) && defined(SYCL_COMPILER_IS_DPCPP)
template <typename Derived, typename Device>
struct cl::sycl::is_device_copyable<
Eigen::TensorEvaluator<Derived, Device>,
std::enable_if_t<!std::is_trivially_copyable<
Eigen::TensorEvaluator<Derived, Device>>::value>> : std::true_type {};
#endif
#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H

View File

@ -688,12 +688,12 @@ struct ExecExprFunctorKernel {
: evaluator(evaluator_), range(range_) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
cl::sycl::nd_item<1> itemID) {
cl::sycl::nd_item<1> itemID) const {
compute(itemID);
}
template <bool is_vec = Evaluator::PacketAccess>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!is_vec>
compute(const cl::sycl::nd_item<1>& itemID) {
compute(const cl::sycl::nd_item<1>& itemID) const {
Index gId = static_cast<Index>(itemID.get_global_linear_id());
Index total_threads = itemID.get_global_range(0);
@ -703,7 +703,7 @@ struct ExecExprFunctorKernel {
}
template <bool is_vec = Evaluator::PacketAccess>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<is_vec>
compute(const cl::sycl::nd_item<1>& itemID) {
compute(const cl::sycl::nd_item<1>& itemID) const {
const Index vectorizedRange =
(range / Evaluator::PacketSize) * Evaluator::PacketSize;
Index gId = static_cast<Index>(itemID.get_global_linear_id());

View File

@ -202,12 +202,12 @@ template<typename ArgType, typename Device>
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
return this->m_impl.coeffRef(index);
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
void writePacket(Index index, const PacketReturnType& x) const
{
this->m_impl.template writePacket<StoreMode>(index, x);
}

View File

@ -267,13 +267,13 @@ template<typename NewDimensions, typename ArgType, typename Device>
TensorBlockDesc;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
return this->m_impl.coeffRef(index);
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
void writePacket(Index index, const PacketReturnType& x) const
{
this->m_impl.template writePacket<StoreMode>(index, x);
}
@ -733,7 +733,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
: Base(op, device)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
if (this->m_is_identity) {
return this->m_impl.coeffRef(index);
@ -743,7 +743,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
void writePacket(Index index, const PacketReturnType& x) const
{
if (this->m_is_identity) {
this->m_impl.template writePacket<StoreMode>(index, x);
@ -1085,7 +1085,7 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef Strides Dimensions;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
if (this->m_is_identity) {
return this->m_impl.coeffRef(index);

View File

@ -895,7 +895,7 @@ static constexpr bool RunningOnGPU = false;
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
m_result.bind(cgh);
if(m_result) m_result.bind(cgh);
}
#endif

View File

@ -87,7 +87,7 @@ struct SecondStepFullReducer {
SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
: scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
void operator()(cl::sycl::nd_item<1> itemID) {
void operator()(cl::sycl::nd_item<1> itemID) const {
// Our empirical research shows that the best performance will be achieved
// when there is only one element per thread to reduce in the second step.
// in this step the second step reduction time is almost negligible.
@ -141,11 +141,11 @@ class FullReductionKernelFunctor {
Index rng_, OpType op_)
: scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); }
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
const cl::sycl::nd_item<1> &itemID) {
const cl::sycl::nd_item<1> &itemID) const {
auto output_ptr = final_output.get_pointer();
Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
Index globalid = itemID.get_global_id(0);
@ -184,7 +184,7 @@ class FullReductionKernelFunctor {
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
const cl::sycl::nd_item<1> &itemID) {
const cl::sycl::nd_item<1> &itemID) const {
auto output_ptr = final_output.get_pointer();
Index globalid = itemID.get_global_id(0);
Index localid = itemID.get_local_id(0);
@ -228,14 +228,16 @@ class GenericNondeterministicReducer {
range(range_),
num_values_to_reduce(num_values_to_reduce_) {}
void operator()(cl::sycl::nd_item<1> itemID) {
void operator()(cl::sycl::nd_item<1> itemID) const {
//This is to bypass the statefull condition in Eigen meanReducer
Op non_const_functor;
std::memcpy(&non_const_functor, &functor, sizeof (Op));
auto output_accessor_ptr = output_accessor.get_pointer();
/// const cast added as a naive solution to solve the qualifier drop error
Index globalid = static_cast<Index>(itemID.get_global_linear_id());
if (globalid < range) {
CoeffReturnType accum = functor.initialize();
Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
evaluator, evaluator.firstInput(globalid), functor, &accum);
evaluator, evaluator.firstInput(globalid), non_const_functor, &accum);
output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
}
}
@ -281,7 +283,7 @@ struct PartialReductionKernel {
num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
CoeffReturnType &accumulator) {
CoeffReturnType &accumulator) const {
if (globalPId >= num_coeffs_to_preserve) {
return;
}
@ -298,7 +300,7 @@ struct PartialReductionKernel {
global_offset += per_thread_global_stride;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
const Index linearLocalThreadId = itemID.get_local_id(0);
Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
: linearLocalThreadId / PannelParameters::LocalThreadSizeR;
@ -380,7 +382,7 @@ struct SecondStepPartialReduction {
num_coeffs_to_preserve(num_coeffs_to_preserve_),
num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
const Index globalId = itemID.get_global_id(0);
if (globalId >= num_coeffs_to_preserve) return;

View File

@ -441,12 +441,12 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Dimensions& dimensions() const { return this->m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const {
return this->m_impl.coeffRef(this->reverseIndex(index));
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x) {
void writePacket(Index index, const PacketReturnType& x) const {
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
// This code is pilfered from TensorMorphing.h

View File

@ -109,28 +109,28 @@ struct ScanKernelFunctor {
template <scan_step sst = stp, typename Input>
std::enable_if_t<sst == scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE
read(const Input &inpt, Index global_id) {
read(const Input &inpt, Index global_id) const {
return inpt.coeff(global_id);
}
template <scan_step sst = stp, typename Input>
std::enable_if_t<sst != scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE
read(const Input &inpt, Index global_id) {
read(const Input &inpt, Index global_id) const {
return inpt[global_id];
}
template <scan_step sst = stp, typename InclusiveOp>
std::enable_if_t<sst == scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
first_step_inclusive_Operation(InclusiveOp inclusive_op) {
first_step_inclusive_Operation(InclusiveOp inclusive_op) const {
inclusive_op();
}
template <scan_step sst = stp, typename InclusiveOp>
std::enable_if_t<sst != scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
first_step_inclusive_Operation(InclusiveOp) {}
first_step_inclusive_Operation(InclusiveOp) const {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
auto out_ptr = out_accessor.get_pointer();
auto tmp_ptr = temp_accessor.get_pointer();
auto scratch_ptr = scratch.get_pointer().get();
@ -307,7 +307,7 @@ struct ScanAdjustmentKernelFunctor {
scanParameters(scanParameters_),
accumulator(accumulator_) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
auto in_ptr = in_accessor.get_pointer();
auto out_ptr = out_accessor.get_pointer();
@ -473,7 +473,7 @@ struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
typedef typename Self::CoeffReturnType CoeffReturnType;
typedef typename Self::Storage Storage;
typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
void operator()(Self &self, EvaluatorPointerType data) {
void operator()(Self &self, EvaluatorPointerType data) const {
const Index total_size = internal::array_prod(self.dimensions());
const Index scan_size = self.size();
const Index scan_stride = self.stride();

View File

@ -390,13 +390,13 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
: Base(op, device)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
{
return this->m_impl.coeffRef(this->srcCoeff(index));
}
template <int StoreMode> EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
void writePacket(Index index, const PacketReturnType& x) const
{
EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);

View File

@ -288,13 +288,13 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const
{
return this->m_impl.coeffRef(this->srcCoeff(index));
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
void writePacket(Index index, const PacketReturnType& x) const
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());

View File

@ -2,9 +2,8 @@ FILE(GLOB examples_SRCS "*.cpp")
set(EIGEN_SYCL ON)
list(APPEND CMAKE_EXE_LINKER_FLAGS -pthread)
if(EIGEN_SYCL_TRISYCL)
set(CMAKE_CXX_STANDARD 17)
else(EIGEN_SYCL_TRISYCL)
if(EIGEN_SYCL_ComputeCpp)
if(MSVC)
list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
else()
@ -22,7 +21,7 @@ else(EIGEN_SYCL_TRISYCL)
-no-serial-memop
-Xclang
-cl-mad-enable)
endif(EIGEN_SYCL_TRISYCL)
endif(EIGEN_SYCL_ComputeCpp)
FOREACH(example_src ${examples_SRCS})
GET_FILENAME_COMPONENT(example ${example_src} NAME_WE)

View File

@ -122,6 +122,7 @@ ei_add_test(special_packetmath "-DEIGEN_FAST_MATH=1")
if(EIGEN_TEST_SYCL)
set(EIGEN_SYCL ON)
set(CMAKE_CXX_STANDARD 17)
# Forward CMake options as preprocessor definitions
if(EIGEN_SYCL_USE_DEFAULT_SELECTOR)
add_definitions(-DEIGEN_SYCL_USE_DEFAULT_SELECTOR=${EIGEN_SYCL_USE_DEFAULT_SELECTOR})
@ -172,10 +173,7 @@ if(EIGEN_TEST_SYCL)
add_definitions(-DEIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION=${EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION})
endif()
if(EIGEN_SYCL_TRISYCL)
# triSYCL now requires c++17.
set(CMAKE_CXX_STANDARD 17)
else()
if(EIGEN_SYCL_ComputeCpp)
if(MSVC)
list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
else()
@ -193,7 +191,7 @@ if(EIGEN_TEST_SYCL)
-no-serial-memop
-Xclang
-cl-mad-enable)
endif()
endif(EIGEN_SYCL_ComputeCpp)
ei_add_test(cxx11_tensor_sycl)
ei_add_test(cxx11_tensor_image_op_sycl)
@ -409,4 +407,3 @@ if (EIGEN_TEST_HIP)
endif()
endif()

View File

@ -27,17 +27,64 @@ using Eigen::TensorMap;
// Functions used to compare the TensorMap implementation on the device with
// the equivalent on the host
namespace cl {
namespace sycl {
template <typename T> T abs(T x) { return cl::sycl::fabs(x); }
namespace SYCL {
template <typename T> T abs(T x) {
return cl::sycl::abs(x);
}
template <> float abs(float x) {
return cl::sycl::fabs(x);
}
template <> double abs(double x) {
return cl::sycl::fabs(x);
}
template <typename T> T square(T x) { return x * x; }
template <typename T> T cube(T x) { return x * x * x; }
template <typename T> T inverse(T x) { return T(1) / x; }
template <typename T> T cwiseMax(T x, T y) { return cl::sycl::max(x, y); }
template <typename T> T cwiseMin(T x, T y) { return cl::sycl::min(x, y); }
template <typename T> T cwiseMax(T x, T y) {
return cl::sycl::max(x, y);
}
template <typename T> T cwiseMin(T x, T y) {
return cl::sycl::min(x, y);
}
}
#define DECLARE_UNARY_STRUCT_NON_SYCL(FUNC) \
struct op_##FUNC { \
template <typename T> \
auto operator()(const T& x) { \
return SYCL::FUNC(x); \
} \
template <typename T> \
auto operator()(const TensorMap<T>& x) { \
return x.FUNC(); \
} \
};
DECLARE_UNARY_STRUCT_NON_SYCL(abs)
DECLARE_UNARY_STRUCT_NON_SYCL(square)
DECLARE_UNARY_STRUCT_NON_SYCL(cube)
DECLARE_UNARY_STRUCT_NON_SYCL(inverse)
#define DECLARE_BINARY_STRUCT_NON_SYCL(FUNC) \
struct op_##FUNC { \
template <typename T1, typename T2> \
auto operator()(const T1& x, const T2& y){ \
return SYCL::FUNC(x, y); \
} \
template <typename T1, typename T2> \
auto operator()(const TensorMap<T1>& x, const TensorMap<T2>& y) { \
return x.FUNC(y); \
} \
};
DECLARE_BINARY_STRUCT_NON_SYCL(cwiseMax)
DECLARE_BINARY_STRUCT_NON_SYCL(cwiseMin)
struct EqualAssignment {
template <typename Lhs, typename Rhs>
void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; }
@ -119,12 +166,9 @@ void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
} \
};
DECLARE_UNARY_STRUCT(abs)
DECLARE_UNARY_STRUCT(sqrt)
DECLARE_UNARY_STRUCT(rsqrt)
DECLARE_UNARY_STRUCT(square)
DECLARE_UNARY_STRUCT(cube)
DECLARE_UNARY_STRUCT(inverse)
DECLARE_UNARY_STRUCT(tanh)
DECLARE_UNARY_STRUCT(exp)
DECLARE_UNARY_STRUCT(expm1)
@ -288,8 +332,6 @@ void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device,
} \
};
DECLARE_BINARY_STRUCT(cwiseMax)
DECLARE_BINARY_STRUCT(cwiseMin)
#define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR) \
struct op_##NAME { \

View File

@ -23,6 +23,13 @@
#include <stdint.h>
#include <iostream>
#ifdef SYCL_COMPILER_IS_DPCPP
template <typename T>
struct cl::sycl::is_device_copyable<
const OffByOneScalar<T>,
std::enable_if_t<!std::is_trivially_copyable<const OffByOneScalar<T>>::value>> : std::true_type {};
#endif
template <typename DataType, int DataLayout, typename IndexType>
void test_device_memory(const Eigen::SyclDevice &sycl_device) {
IndexType sizeDim1 = 100;