[SYCL-2020 Support] Enabling Intel DPCPP Compiler support to Eigen

parent bae119bb7e
commit b523120687
@@ -505,14 +505,19 @@ endif()

# add SYCL
option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation (ComputeCPP by default)." OFF)
if(EIGEN_TEST_SYCL)
  option(EIGEN_SYCL_DPCPP "Use the DPCPP Sycl implementation (DPCPP is the default SYCL compiler)." ON)
  option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation." OFF)
  option(EIGEN_SYCL_ComputeCpp "Use the ComputeCpp Sycl implementation." OFF)
  set(CMAKE_CXX_STANDARD 17)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations -Wno-shorten-64-to-32 -Wno-cast-align")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-copy-with-user-provided-copy -Wno-unused-variable")
  set(CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}")
  find_package(Threads REQUIRED)
  if(EIGEN_SYCL_TRISYCL)
    message(STATUS "Using triSYCL")
    include(FindTriSYCL)
  else()
  elseif(EIGEN_SYCL_ComputeCpp)
    message(STATUS "Using ComputeCPP SYCL")
    include(FindComputeCpp)
    set(COMPUTECPP_DRIVER_DEFAULT_VALUE OFF)

@@ -523,8 +528,12 @@ if(EIGEN_TEST_SYCL)
      "Use ComputeCpp driver instead of a two-step compilation"
      ${COMPUTECPP_DRIVER_DEFAULT_VALUE}
    )
  else() # Default SYCL compiler is DPCPP (EIGEN_SYCL_DPCPP)
    set(DPCPP_SYCL_TARGET "spir64" CACHE STRING "Default target for Intel CPU/GPU")
    message(STATUS "Using DPCPP")
    find_package(DPCPP)
    add_definitions(-DSYCL_COMPILER_IS_DPCPP)
  endif(EIGEN_SYCL_TRISYCL)
  option(EIGEN_DONT_VECTORIZE_SYCL "Don't use vectorisation in the SYCL tests." OFF)
  if(EIGEN_DONT_VECTORIZE_SYCL)
    message(STATUS "Disabling SYCL vectorization in tests/examples")
    # When disabling SYCL vectorization, also disable Eigen default vectorization
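For reference, a typical configuration under these new options might look like the following. This is a hedged sketch, not part of the commit; the compiler path is illustrative and any DPC++-capable clang++ driver is assumed:

    # Hypothetical configure line: build the Eigen SYCL tests with DPC++,
    # targeting SPIR-V for Intel CPUs/GPUs (the default DPCPP_SYCL_TARGET).
    cmake -DCMAKE_CXX_COMPILER=/path/to/dpcpp/bin/clang++ \
          -DEIGEN_TEST_SYCL=ON \
          -DDPCPP_SYCL_TARGET=spir64 \
          path/to/eigen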
@@ -82,7 +82,9 @@
#include <cstddef>
#include <cstdlib>
#include <cmath>
#ifndef __SYCL_DEVICE_ONLY__
#include <cassert>
#endif
#include <functional>
#ifndef EIGEN_NO_IO
#include <sstream>
@@ -21,193 +21,53 @@
#ifndef EIGEN_PACKET_MATH_SYCL_H
#define EIGEN_PACKET_MATH_SYCL_H
#include <type_traits>

#include "../../InternalHeaderCheck.h"

namespace Eigen {

namespace internal {
#ifdef SYCL_DEVICE_ONLY

#define SYCL_PLOADT_RO(address_space_target) \
  template <typename packet_type, int Alignment> \
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt_ro( \
      typename cl::sycl::multi_ptr< \
          const typename unpacket_traits<packet_type>::type, \
          cl::sycl::access::address_space::address_space_target>::pointer_t \
          from) { \
    typedef typename unpacket_traits<packet_type>::type scalar; \
    typedef cl::sycl::multi_ptr< \
        scalar, cl::sycl::access::address_space::address_space_target> \
        multi_ptr; \
    auto res = packet_type( \
        static_cast<typename unpacket_traits<packet_type>::type>(0)); \
    res.load(0, multi_ptr(const_cast<typename multi_ptr::pointer_t>(from))); \
    return res; \
#define SYCL_PLOAD(packet_type, AlignedType) \
  template <> \
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \
  pload##AlignedType<packet_type>( \
      const typename unpacket_traits<packet_type>::type* from) { \
    using scalar = typename unpacket_traits<packet_type>::type; \
    typedef cl::sycl::multi_ptr< \
        const scalar, cl::sycl::access::address_space::generic_space, \
        cl::sycl::access::decorated::no> \
        multi_ptr; \
    packet_type res{}; \
    res.load(0, multi_ptr(from)); \
    return res; \
  }

SYCL_PLOADT_RO(global_space)
SYCL_PLOADT_RO(local_space)
#undef SYCL_PLOADT_RO
#endif

template <typename packet_type, int Alignment, typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess<
          cl::sycl::access::mode::read_write, T>& from) {
  return ploadt_ro<packet_type, Alignment>(from.get_pointer());
}

#ifdef SYCL_DEVICE_ONLY
#define SYCL_PLOAD(address_space_target, Alignment, AlignedType) \
  template <typename packet_type> \
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
      typename cl::sycl::multi_ptr< \
          const typename unpacket_traits<packet_type>::type, \
          cl::sycl::access::address_space::address_space_target>::pointer_t \
          from) { \
    return ploadt_ro<packet_type, Alignment>(from); \
  }

// global space
SYCL_PLOAD(global_space, Unaligned, u)
SYCL_PLOAD(global_space, Aligned, )
// local space
SYCL_PLOAD(local_space, Unaligned, u)
SYCL_PLOAD(local_space, Aligned, )
SYCL_PLOAD(cl::sycl::cl_float4, u)
SYCL_PLOAD(cl::sycl::cl_float4, )
SYCL_PLOAD(cl::sycl::cl_double2, u)
SYCL_PLOAD(cl::sycl::cl_double2, )

#undef SYCL_PLOAD
#endif

#define SYCL_PLOAD(Alignment, AlignedType) \
  template <typename packet_type> \
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
      const Eigen::TensorSycl::internal::RangeAccess< \
          cl::sycl::access::mode::read_write, \
          typename unpacket_traits<packet_type>::type> \
          from) { \
    return ploadt_ro<packet_type, Alignment>(from); \
  }
SYCL_PLOAD(Unaligned, u)
SYCL_PLOAD(Aligned, )
#undef SYCL_PLOAD

#ifdef SYCL_DEVICE_ONLY
/** \internal \returns a packet version of \a *from.
 * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
#define SYCL_PLOADT(address_space_target) \
  template <typename packet_type, int Alignment> \
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt( \
      typename cl::sycl::multi_ptr< \
          const typename unpacket_traits<packet_type>::type, \
          cl::sycl::access::address_space::address_space_target>::pointer_t \
          from) { \
    if (Alignment >= unpacket_traits<packet_type>::alignment) \
      return pload<packet_type>(from); \
    else \
      return ploadu<packet_type>(from); \
#define SYCL_PSTORE(scalar, packet_type, alignment) \
  template <> \
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
      scalar* to, const packet_type& from) { \
    typedef cl::sycl::multi_ptr< \
        scalar, cl::sycl::access::address_space::generic_space, \
        cl::sycl::access::decorated::no> \
        multi_ptr; \
    from.store(0, multi_ptr(to)); \
  }

// global space
SYCL_PLOADT(global_space)
// local space
SYCL_PLOADT(local_space)
#undef SYCL_PLOADT
#endif
SYCL_PSTORE(float, cl::sycl::cl_float4, )
SYCL_PSTORE(float, cl::sycl::cl_float4, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, )
SYCL_PSTORE(double, cl::sycl::cl_double2, u)

template <typename packet_type, int Alignment>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
ploadt(const Eigen::TensorSycl::internal::RangeAccess<
       cl::sycl::access::mode::read_write,
       typename unpacket_traits<packet_type>::type>& from) {
  return ploadt<packet_type, Alignment>(from.get_pointer());
}
#ifdef SYCL_DEVICE_ONLY

// private_space
#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment) \
  template <> \
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \
  ploadt_ro<packet_type, Alignment>( \
      const typename unpacket_traits<packet_type>::type* from) { \
    typedef typename unpacket_traits<packet_type>::type scalar; \
    auto res = packet_type(static_cast<scalar>(0)); \
    res.template load<cl::sycl::access::address_space::private_space>( \
        0, const_cast<scalar*>(from)); \
    return res; \
  }

SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned)
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned)
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned)
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned)

#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type) \
  template <> \
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##alignment_type( \
      const typename unpacket_traits<packet_type>::type* from) { \
    typedef typename unpacket_traits<packet_type>::type scalar; \
    auto res = packet_type(static_cast<scalar>(0)); \
    res.template load<cl::sycl::access::address_space::private_space>( \
        0, const_cast<scalar*>(from)); \
    return res; \
  }
SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, )
SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, )
SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u)
SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u)

#undef SYCL_PLOAD_SPECIAL

#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment) \
  template <> \
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
      typename cl::sycl::multi_ptr< \
          scalar, \
          cl::sycl::access::address_space::address_space_target>::pointer_t \
          to, \
      const packet_type& from) { \
    typedef cl::sycl::multi_ptr< \
        scalar, cl::sycl::access::address_space::address_space_target> \
        multi_ptr; \
    from.store(0, multi_ptr(to)); \
  }

// global space
SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, )
SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, )
SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u)
SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, )
SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, )
SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u)

SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, )
SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, )
SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u)
#undef SYCL_PSTORE

#define SYCL_PSTORE_T(address_space_target) \
  template <typename scalar, typename packet_type, int Alignment> \
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret( \
      typename cl::sycl::multi_ptr< \
          scalar, \
          cl::sycl::access::address_space::address_space_target>::pointer_t \
          to, \
      const packet_type& from) { \
    if (Alignment) \
      pstore(to, from); \
    else \
      pstoreu(to, from); \
  }

SYCL_PSTORE_T(global_space)

SYCL_PSTORE_T(local_space)

#undef SYCL_PSTORE_T

#define SYCL_PSET1(packet_type) \
  template <> \
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>( \

@@ -291,22 +151,6 @@ struct get_base_packet<cl::sycl::cl_double2> {
  }
};

#define SYCL_PLOAD_DUP(address_space_target) \
  template <typename packet_type> \
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup( \
      typename cl::sycl::multi_ptr< \
          const typename unpacket_traits<packet_type>::type, \
          cl::sycl::access::address_space::address_space_target>::pointer_t \
          from) { \
    return get_base_packet<packet_type>::get_ploaddup(from); \
  }

// global space
SYCL_PLOAD_DUP(global_space)
// local_space
SYCL_PLOAD_DUP(local_space)
#undef SYCL_PLOAD_DUP

#define SYCL_PLOAD_DUP_SPECILIZE(packet_type) \
  template <> \
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup<packet_type>( \

@@ -325,30 +169,11 @@ SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
      const typename unpacket_traits<packet_type>::type& a) { \
    return get_base_packet<packet_type>::set_plset(a); \
  }

SYCL_PLSET(cl::sycl::cl_float4)
SYCL_PLSET(cl::sycl::cl_double2)

#undef SYCL_PLSET

#define SYCL_PGATHER(address_space_target) \
  template <typename Scalar, typename packet_type> \
  EIGEN_DEVICE_FUNC inline packet_type pgather( \
      typename cl::sycl::multi_ptr< \
          const typename unpacket_traits<packet_type>::type, \
          cl::sycl::access::address_space::address_space_target>::pointer_t \
          from, \
      Index stride) { \
    return get_base_packet<packet_type>::get_pgather(from, stride); \
  }

// global space
SYCL_PGATHER(global_space)
// local space
SYCL_PGATHER(local_space)

#undef SYCL_PGATHER

#define SYCL_PGATHER_SPECILIZE(scalar, packet_type) \
  template <> \
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \

@@ -362,24 +187,6 @@ SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)

#undef SYCL_PGATHER_SPECILIZE

#define SYCL_PSCATTER(address_space_target) \
  template <typename Scalar, typename packet_type> \
  EIGEN_DEVICE_FUNC inline void pscatter( \
      typename cl::sycl::multi_ptr< \
          typename unpacket_traits<packet_type>::type, \
          cl::sycl::access::address_space::address_space_target>::pointer_t \
          to, \
      const packet_type& from, Index stride) { \
    get_base_packet<packet_type>::set_pscatter(to, from, stride); \
  }

// global space
SYCL_PSCATTER(global_space)
// local space
SYCL_PSCATTER(local_space)

#undef SYCL_PSCATTER

#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \
  template <> \
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<scalar, packet_type>( \

@@ -563,6 +370,34 @@ inline cl::sycl::cl_double2 pblend(
}
#endif // SYCL_DEVICE_ONLY

template <typename packet_type, int Alignment, typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess<
          cl::sycl::access::mode::read_write, T>& from) {
  return ploadt_ro<packet_type, Alignment>(from.get_pointer());
}

#define SYCL_PLOAD(Alignment, AlignedType) \
  template <typename packet_type> \
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
      const Eigen::TensorSycl::internal::RangeAccess< \
          cl::sycl::access::mode::read_write, \
          typename unpacket_traits<packet_type>::type> \
          from) { \
    return ploadt_ro<packet_type, Alignment>(from); \
  }
SYCL_PLOAD(Unaligned, u)
SYCL_PLOAD(Aligned, )
#undef SYCL_PLOAD

template <typename packet_type, int Alignment>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
ploadt(const Eigen::TensorSycl::internal::RangeAccess<
       cl::sycl::access::mode::read_write,
       typename unpacket_traits<packet_type>::type>& from) {
  return ploadt<packet_type, Alignment>(from.get_pointer());
}

#define SYCL_PSTORE(alignment) \
  template <typename packet_type> \
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
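The recurring pattern in this file is the core of the SYCL-2020 migration: the per-address-space multi_ptr overloads (global_space, local_space, private_space) collapse into one overload built on a generic_space, decorated::no multi_ptr. A minimal sketch of what one instantiated load specialization boils down to (illustrative only, not the commit's exact code):

    // Sketch, assuming SYCL 2020: one undecorated generic-space pointer type
    // serves global, local and private memory, so the per-address-space macro
    // instantiations above become unnecessary.
    cl::sycl::cl_float4 load_packet(const float* from) {
      cl::sycl::multi_ptr<const float,
                          cl::sycl::access::address_space::generic_space,
                          cl::sycl::access::decorated::no> p(from);
      cl::sycl::cl_float4 res{};
      res.load(0, p);  // vec::load takes an element offset and a multi_ptr
      return res;
    }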
@@ -223,7 +223,6 @@ class PointerMapper {
      m_pointerMap.clear();
      EIGEN_THROW_X(
          std::out_of_range("The pointer is not registered in the map\n"));

    }
    --node;
  }

@@ -550,7 +549,7 @@ struct RangeAccess {
  static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
  typedef T scalar_t;
  typedef scalar_t &ref_t;
  typedef typename cl::sycl::global_ptr<scalar_t>::pointer_t ptr_t;
  typedef scalar_t *ptr_t;

  // the accessor type is not necessarily the same as T
  typedef cl::sycl::accessor<scalar_t, 1, AcMd, global_access, is_place_holder>

@@ -570,7 +569,12 @@ struct RangeAccess {
  RangeAccess(std::nullptr_t) : RangeAccess() {}
  // This template parameter must be removed and scalar_t should be replaced
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t get_pointer() const {
    return (access_.get_pointer().get() + offset_);
    typedef cl::sycl::multi_ptr<scalar_t,
                                cl::sycl::access::address_space::generic_space,
                                cl::sycl::access::decorated::no>
        multi_ptr;
    multi_ptr p(access_);
    return (p + offset_).get_raw();
  }
  template <typename Index>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator+=(Index offset) {
@@ -64,7 +64,7 @@ pcast<cl::sycl::cl_double2, cl::sycl::cl_float4>(
      cl::sycl::rounding_mode::automatic>();
  auto b1 = b.template convert<cl::sycl::cl_float,
      cl::sycl::rounding_mode::automatic>();
  return cl::sycl::float4(a1.x(), a1.y(), b1.x(), b1.y());
  return cl::sycl::cl_float4(a1.x(), a1.y(), b1.x(), b1.y());
}

template <>
@@ -16,5 +16,11 @@ following commands:
1. export COMPUTECPP_PACKAGE_ROOT_DIR={PATH TO COMPUTECPP ROOT DIRECTORY}
2. bash eigen_sycl_bench.sh

To compile the floating-point GPU benchmarks using the Intel DPCPP compiler:
/path/to/dpcpp/bin/clang++ -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -I ../../ -O3 -fsycl -fsycl-targets=<supported DPCPP backend, e.g. spir64 or nvptx64-nvidia-cuda> -std=c++17 tensor_benchmarks_sycl.cc benchmark_main.cc -lpthread -o eigen_dpcpp_sycl

Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu

To compile the contraction benchmark with DPCPP:
/path/to/dpcpp/bin/clang++ -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -I ../../ -O3 -fsycl -fsycl-targets=<supported DPCPP backend, e.g. spir64 or nvptx64-nvidia-cuda> -std=c++17 tensor_contract_sycl_bench.cc -lpthread -o eigen_dpcpp_contract
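For instance, a concrete invocation targeting the SPIR-V backend named above (illustrative; assumes a DPC++ clang++ on PATH and the working directory being bench/tensors):

    clang++ -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 \
        -I ../../ -O3 -fsycl -fsycl-targets=spir64 -std=c++17 \
        tensor_benchmarks_sycl.cc benchmark_main.cc -lpthread -o eigen_dpcpp_sycl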
@@ -15,7 +15,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#include <SYCL/sycl.hpp>
#include <CL/sycl.hpp>
#include <fstream>
#include <iostream>
#include <chrono>

@@ -56,9 +56,9 @@ void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, T

  // Initialize the content of the memory pools to prevent asan from
  // complaining.
  device_.fill(a_, m_ * k_, T(12));
  device_.fill(b_, k_ * n_, T(23));
  device_.fill(c_, m_ * n_, T(31));
  device_.fill(a_, a_ + (m_ * k_), T(12));
  device_.fill(b_, b_ + (k_ * n_), T(23));
  device_.fill(c_, c_ + (m_ * n_), T(31));

  Eigen::array<TensorIndex, 2> sizeA;
  sizeA[0] = m_;

@@ -110,9 +110,9 @@ void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorInd

  // Initialize the content of the memory pools to prevent asan from
  // complaining.
  device_.memset(a_, 12, m_ * k_ * sizeof(T));
  device_.memset(b_, 23, k_ * n_ * sizeof(T));
  device_.memset(c_, 31, m_ * n_ * sizeof(T));
  device_.memset(a_, T(12), T(m_ * k_ * sizeof(T)));
  device_.memset(b_, T(23), T(k_ * n_ * sizeof(T)));
  device_.memset(c_, T(31), T(m_ * n_ * sizeof(T)));

  Eigen::array<TensorIndex, 2> sizeA;
  sizeA[0] = m_;
cmake/FindDPCPP.cmake (new file, 62 lines)
@@ -0,0 +1,62 @@
include_guard()

include(CheckCXXCompilerFlag)
include(FindPackageHandleStandardArgs)

if("${DPCPP_SYCL_TARGET}" STREQUAL "amdgcn-amd-amdhsa" AND
   "${DPCPP_SYCL_ARCH}" STREQUAL "")
  message(FATAL_ERROR "Architecture required for AMD DPCPP builds,"
                      " please specify in DPCPP_SYCL_ARCH")
endif()

set(DPCPP_USER_FLAGS "" CACHE STRING
    "Additional user-specified compiler flags for DPC++")

get_filename_component(DPCPP_BIN_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
find_library(DPCPP_LIB_DIR NAMES sycl sycl6 PATHS "${DPCPP_BIN_DIR}/../lib")

add_library(DPCPP::DPCPP INTERFACE IMPORTED)

set(DPCPP_FLAGS "-fsycl;-fsycl-targets=${DPCPP_SYCL_TARGET};-fsycl-unnamed-lambda;${DPCPP_USER_FLAGS};-ftemplate-backtrace-limit=0")
if(NOT "${DPCPP_SYCL_ARCH}" STREQUAL "")
  if("${DPCPP_SYCL_TARGET}" STREQUAL "amdgcn-amd-amdhsa")
    list(APPEND DPCPP_FLAGS "-Xsycl-target-backend")
    list(APPEND DPCPP_FLAGS "--offload-arch=${DPCPP_SYCL_ARCH}")
  elseif("${DPCPP_SYCL_TARGET}" STREQUAL "nvptx64-nvidia-cuda")
    list(APPEND DPCPP_FLAGS "-Xsycl-target-backend")
    list(APPEND DPCPP_FLAGS "--cuda-gpu-arch=${DPCPP_SYCL_ARCH}")
  endif()
endif()

if(UNIX)
  set_target_properties(DPCPP::DPCPP PROPERTIES
    INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS}"
    INTERFACE_LINK_OPTIONS "${DPCPP_FLAGS}"
    INTERFACE_LINK_LIBRARIES ${DPCPP_LIB_DIR}
    INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl;${DPCPP_BIN_DIR}/../include")
  message(STATUS ">>>>>>>>> DPCPP INCLUDE DIR: ${DPCPP_BIN_DIR}/../include/sycl")
else()
  set_target_properties(DPCPP::DPCPP PROPERTIES
    INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS}"
    INTERFACE_LINK_LIBRARIES ${DPCPP_LIB_DIR}
    INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl")
endif()

function(add_sycl_to_target)
  set(options)
  set(one_value_args TARGET)
  set(multi_value_args SOURCES)
  cmake_parse_arguments(SB_ADD_SYCL
    "${options}"
    "${one_value_args}"
    "${multi_value_args}"
    ${ARGN}
  )
  target_compile_options(${SB_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS})
  target_link_libraries(${SB_ADD_SYCL_TARGET} DPCPP::DPCPP)
  target_compile_features(${SB_ADD_SYCL_TARGET} PRIVATE cxx_std_17)
  get_target_property(target_type ${SB_ADD_SYCL_TARGET} TYPE)
  if (NOT target_type STREQUAL "OBJECT_LIBRARY")
    target_link_options(${SB_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS})
  endif()
endfunction()
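A minimal sketch of how a project would consume this module (hypothetical target and source names; assumes the CXX compiler is a DPC++ clang++ so the relative lib/include paths above resolve):

    # Hypothetical consumer of cmake/FindDPCPP.cmake
    list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
    set(DPCPP_SYCL_TARGET "spir64" CACHE STRING "DPCPP backend target")
    find_package(DPCPP)

    add_executable(my_sycl_app main.cpp)
    # Attaches -fsycl and friends to both the compile and link steps.
    add_sycl_to_target(TARGET my_sycl_app SOURCES main.cpp)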
test/main.h (12 lines changed)
@@ -18,6 +18,9 @@
#include <vector>
#include <typeinfo>
#include <functional>
#ifdef EIGEN_USE_SYCL
#include <CL/sycl.hpp>
#endif

// The following includes of STL headers have to be done _before_ the
// definition of macros min() and max(). The reason is that many STL

@@ -121,9 +124,7 @@ struct imag {};
#define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes
// B0 is defined in POSIX header termios.h
#define B0 FORBIDDEN_IDENTIFIER
// `I` may be defined by complex.h:
#define I FORBIDDEN_IDENTIFIER

// Unit tests calling Eigen's blas library must preserve the default blocking size
// to avoid troubles.
#ifndef EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS

@@ -301,15 +302,16 @@ namespace Eigen
  }
#endif //EIGEN_EXCEPTIONS

#elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(SYCL_DEVICE_ONLY) // EIGEN_DEBUG_ASSERTS
#elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(__SYCL_DEVICE_ONLY__) // EIGEN_DEBUG_ASSERTS
#define eigen_assert(a) \
  if( (!(a)) && (!no_more_assert) ) \
  { \
    Eigen::no_more_assert = true; \
    if(report_on_cerr_on_assert_failure) \
    if(report_on_cerr_on_assert_failure) { \
      eigen_plain_assert(a); \
    else \
    } else { \
      EIGEN_THROW_X(Eigen::eigen_assert_exception()); \
    } \
  }

#ifdef EIGEN_EXCEPTIONS
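The braces added to eigen_assert are not cosmetic: when the inner macro itself expands to an if-statement, an unbraced else can rebind to that hidden if. A stripped-down illustration of the hazard (hypothetical macro, for exposition only):

    // Hypothetical expansion showing the dangling-else problem the braces fix.
    #define PLAIN_CHECK(x) if (!(x)) report()
    // Unbraced, the `else` below attaches to PLAIN_CHECK's hidden `if`,
    // silently inverting the intended control flow:
    //   if (condition)
    //     PLAIN_CHECK(a);
    //   else
    //     throw_exception();  // runs when PLAIN_CHECK's test succeeds!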
@@ -170,10 +170,10 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
    m_rightImpl.cleanup();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const {
    m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const {

    const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
    const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;

@@ -1185,7 +1185,6 @@ class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
    return derived();
  }

  EIGEN_DEVICE_FUNC
  EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); }
  EIGEN_DEVICE_FUNC

@@ -1195,3 +1194,4 @@ class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H
@@ -438,13 +438,13 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
    : Base(op, device)
  { }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
  {
    return this->m_impl.coeffRef(this->srcCoeff(index));
  }

  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x)
  void writePacket(Index index, const PacketReturnType& x) const
  {
    if (this->isInnerChipping()) {
      // m_stride is equal to 1, so let's avoid the integer division.

@@ -331,7 +331,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
  {
    // Collect dimension-wise indices (subs).
    array<Index, Base::NumDims> subs;

@@ -360,7 +360,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
  }

  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x)
  void writePacket(Index index, const PacketReturnType& x) const
  {
    const int packetSize = PacketType<CoeffReturnType, Device>::size;
    EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
@@ -362,10 +362,8 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp
  const OutputKernelType m_output_kernel;
};

template<typename Derived>
struct TensorContractionEvaluatorBase : internal::no_assignment_operator
{
struct TensorContractionEvaluatorBase {
  typedef typename internal::traits<Derived>::Indices Indices;
  typedef typename internal::traits<Derived>::LeftArgType LeftArgType;
  typedef typename internal::traits<Derived>::RightArgType RightArgType;
@@ -597,7 +597,7 @@ class TensorContractionKernel {
      const TripleDim triple_dim_)
      : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
    const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
    const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;

@@ -636,7 +636,7 @@ class TensorContractionKernel {
  // The compute-block function is independent of the local/no-local distinction,
  // as it only computes its block in each thread's private memory (privateRes).
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
                                                                    PacketReturnType *privateRes) {
                                                                    PacketReturnType *privateRes) const {
    StorageIndex idx = 0;
    EIGEN_CONSTEXPR StorageIndex lhs_stride =
        contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;

@@ -661,7 +661,7 @@ class TensorContractionKernel {
  // class.
  template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes,
                                                   StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) {
                                                   StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) const {
    auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC {
      return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N);
    };

@@ -713,7 +713,7 @@ class TensorContractionKernel {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  std::enable_if_t<contract_tp == contraction_type::no_local>
  extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
                const StorageIndex &ncOffset, const StorageIndex cOffset) {
                const StorageIndex &ncOffset, const StorageIndex cOffset) const {
    EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
        InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
    EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =

@@ -833,7 +833,8 @@ class TensorContractionKernel {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
                                                                    ThreadProperties<StorageIndex> &thread_properties,
                                                                    TiledMemory &tiled_input_block,
                                                                    PacketReturnType *privateRes, bool &db_offset) {
                                                                    PacketReturnType *privateRes, bool &db_offset) const {

    // Tiling the Rhs block from global to local memory
    extract_block<RHSBlockProperties, is_internal_block>(
        rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),

@@ -871,7 +872,7 @@ class TensorContractionKernel {
  template <bool is_internal_block, typename OutPtr>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID,
                                                           ThreadProperties<StorageIndex> &thread_properties,
                                                           OutPtr out_ptr) {
                                                           OutPtr out_ptr) const {
    auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()};
    // Allocate register space
    PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = {

@@ -897,7 +898,7 @@ class TensorContractionKernel {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  std::enable_if_t<contract_tp == contraction_type::local>
  extract_block(const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex>& local_index,
                const StorageIndex &ncOffset, const StorageIndex cOffset) {
                const StorageIndex &ncOffset, const StorageIndex cOffset) const {
    EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
        InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
    EIGEN_CONSTEXPR StorageIndex LoadPerThread =

@@ -1035,7 +1036,7 @@ struct GeneralVectorTensor {
        nonContractDim(nonContractDim_),
        contractDim(contractDim_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    auto scratch_ptr = scratch.get_pointer();
    const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
    StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC

@@ -1252,7 +1253,8 @@ struct GeneralScalarContraction {
      const StorageIndex rng_)
      : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}

  EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) {
  EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) const {

    auto out_ptr = out_res.get_pointer();
    auto scratch_ptr = scratch.get_pointer().get();
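A pattern repeated throughout these kernel hunks: every kernel functor's operator() (and the helpers it calls) gains a const qualifier. SYCL 2020 requires the function-call operator of a device-invocable function object to be const-qualified, and DPC++ enforces this. A minimal sketch (hypothetical kernel, illustrative only):

    // Sketch: SYCL 2020 kernel functors must expose a const operator().
    struct MyKernel {
      int scale;  // state is captured by value and must be device-copyable
      void operator()(cl::sycl::nd_item<1> item) const {  // 'const' is mandatory
        // ...the body may therefore only call other const member functions...
      }
    };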
@@ -57,10 +57,10 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
        input_range(input_range_) {}

  template <typename BooleanDim2>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) const {
    return (boolean_check[0] && boolean_check[1]);
  }
  void operator()(cl::sycl::nd_item<2> itemID) {
  void operator()(cl::sycl::nd_item<2> itemID) const {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    // the required row to be calculated for each plane in shared memory

@@ -123,11 +123,11 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
        kernel_size(kernel_size_),
        input_range(input_range_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }

  void operator()(cl::sycl::nd_item<3> itemID) {
  void operator()(cl::sycl::nd_item<3> itemID) const {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    // the required row to be calculated for each plane in shared memory

@@ -212,10 +212,10 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
        input_range(input_range_),
        numP(numP_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }
  void operator()(cl::sycl::nd_item<3> itemID) {
  void operator()(cl::sycl::nd_item<3> itemID) const {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
@@ -31,8 +31,7 @@ struct SyclDeviceInfo {
            .template get_info<cl::sycl::info::device::local_mem_type>()),
        max_work_item_sizes(
            queue.get_device()
                .template get_info<
                    cl::sycl::info::device::max_work_item_sizes>()),
                .template get_info<cl::sycl::info::device::max_work_item_sizes<3>>()),
        max_mem_alloc_size(
            queue.get_device()
                .template get_info<

@@ -356,7 +355,7 @@ class QueueInterface {
      return;
    }
    const ptrdiff_t count = end - begin;
    auto f = [&](cl::sycl::handler &cgh) {
    auto f = [&](cl::sycl::handler &cgh) {
      auto dst_acc = get_typed_range_accessor<write_mode, T>(cgh, begin, count);
      cgh.fill(dst_acc, value);
    };
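The device-info change above reflects a SYCL 2020 API revision: the max_work_item_sizes descriptor became a template parameterized on the dimension count. A hedged usage sketch (assumes an existing queue q):

    // SYCL 1.2.1 style (removed):
    //   auto sizes = dev.get_info<cl::sycl::info::device::max_work_item_sizes>();
    // SYCL 2020 style (added): the descriptor now takes the dimensionality.
    cl::sycl::device dev = q.get_device();
    auto sizes = dev.get_info<cl::sycl::info::device::max_work_item_sizes<3>>();
    size_t max0 = sizes[0];  // per-dimension work-item limit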
@@ -159,10 +159,10 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
  }
#endif

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const {
    m_buffer[i] = m_impl.coeff(i);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const {
    internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
  }
@@ -98,7 +98,7 @@ struct TensorEvaluator
    return m_data[index];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const {
    eigen_assert(m_data != NULL);
    return m_data[index];
  }

@@ -122,7 +122,7 @@ struct TensorEvaluator
  }

  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x)
  void writePacket(Index index, const PacketReturnType& x) const
  {
    return internal::pstoret<Scalar, PacketReturnType, StoreMode>(m_data + index, x);
  }

@@ -137,7 +137,7 @@ struct TensorEvaluator
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType&
  coeffRef(const array<DenseIndex, NumCoords>& coords) {
  coeffRef(const array<DenseIndex, NumCoords>& coords) const {
    eigen_assert(m_data != NULL);
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      return m_data[m_dims.IndexOfColMajor(coords)];

@@ -978,7 +978,14 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
  TensorEvaluator<ElseArgType, Device> m_elseImpl;
};

} // end namespace Eigen

#if defined(EIGEN_USE_SYCL) && defined(SYCL_COMPILER_IS_DPCPP)
template <typename Derived, typename Device>
struct cl::sycl::is_device_copyable<
    Eigen::TensorEvaluator<Derived, Device>,
    std::enable_if_t<!std::is_trivially_copyable<
        Eigen::TensorEvaluator<Derived, Device>>::value>> : std::true_type {};
#endif

#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
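Background for the specialization just added: SYCL 2020 only accepts kernel arguments that satisfy sycl::is_device_copyable. Trivially copyable types qualify automatically; the partial specialization above explicitly opts in the evaluators that are not trivially copyable. A minimal sketch of the same mechanism for a user-defined type (hypothetical struct, illustrative only):

    // A user-provided copy constructor makes this type non-trivially-copyable,
    // so SYCL 2020 would reject it as a kernel argument by default.
    struct Wrapper {
      int v;
      Wrapper(const Wrapper& o) : v(o.v) {}
    };

    // Opting in: promise the runtime that a byte-wise copy to the device is fine.
    template <>
    struct cl::sycl::is_device_copyable<Wrapper> : std::true_type {};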
@@ -688,12 +688,12 @@ struct ExecExprFunctorKernel {
      : evaluator(evaluator_), range(range_) {}

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
      cl::sycl::nd_item<1> itemID) {
      cl::sycl::nd_item<1> itemID) const {
    compute(itemID);
  }
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!is_vec>
  compute(const cl::sycl::nd_item<1>& itemID) {
  compute(const cl::sycl::nd_item<1>& itemID) const {
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    Index total_threads = itemID.get_global_range(0);

@@ -703,7 +703,7 @@ struct ExecExprFunctorKernel {
  }
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<is_vec>
  compute(const cl::sycl::nd_item<1>& itemID) {
  compute(const cl::sycl::nd_item<1>& itemID) const {
    const Index vectorizedRange =
        (range / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
@@ -202,12 +202,12 @@ template<typename ArgType, typename Device>
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
  {
    return this->m_impl.coeffRef(index);
  }
  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x)
  void writePacket(Index index, const PacketReturnType& x) const
  {
    this->m_impl.template writePacket<StoreMode>(index, x);
  }
@@ -267,13 +267,13 @@ template<typename NewDimensions, typename ArgType, typename Device>
      TensorBlockDesc;
  //===--------------------------------------------------------------------===//

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
  {
    return this->m_impl.coeffRef(index);
  }

  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x)
  void writePacket(Index index, const PacketReturnType& x) const
  {
    this->m_impl.template writePacket<StoreMode>(index, x);
  }

@@ -733,7 +733,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
    : Base(op, device)
  { }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
  {
    if (this->m_is_identity) {
      return this->m_impl.coeffRef(index);

@@ -743,7 +743,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
  }

  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x)
  void writePacket(Index index, const PacketReturnType& x) const
  {
    if (this->m_is_identity) {
      this->m_impl.template writePacket<StoreMode>(index, x);

@@ -1085,7 +1085,7 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef Strides Dimensions;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
  {
    if (this->m_is_identity) {
      return this->m_impl.coeffRef(index);
@@ -895,7 +895,7 @@ static constexpr bool RunningOnGPU = false;
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_impl.bind(cgh);
    m_result.bind(cgh);
    if(m_result) m_result.bind(cgh);
  }
#endif
@@ -87,7 +87,7 @@ struct SecondStepFullReducer {
  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) {
  void operator()(cl::sycl::nd_item<1> itemID) const {
    // Our empirical research shows that the best performance is achieved
    // when there is only one element per thread to reduce in the second step;
    // the second-step reduction time is then almost negligible.

@@ -141,11 +141,11 @@ class FullReductionKernelFunctor {
                              Index rng_, OpType op_)
      : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
  void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); }

  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
      const cl::sycl::nd_item<1> &itemID) {
      const cl::sycl::nd_item<1> &itemID) const {
    auto output_ptr = final_output.get_pointer();
    Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index globalid = itemID.get_global_id(0);

@@ -184,7 +184,7 @@ class FullReductionKernelFunctor {

  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
      const cl::sycl::nd_item<1> &itemID) {
      const cl::sycl::nd_item<1> &itemID) const {
    auto output_ptr = final_output.get_pointer();
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);

@@ -228,14 +228,16 @@ class GenericNondeterministicReducer {
        range(range_),
        num_values_to_reduce(num_values_to_reduce_) {}

  void operator()(cl::sycl::nd_item<1> itemID) {
  void operator()(cl::sycl::nd_item<1> itemID) const {
    // This is to bypass the stateful behaviour of Eigen's meanReducer
    Op non_const_functor;
    std::memcpy(&non_const_functor, &functor, sizeof(Op));
    auto output_accessor_ptr = output_accessor.get_pointer();
    // A mutable copy (via memcpy) is a naive workaround for the qualifier-drop error
    Index globalid = static_cast<Index>(itemID.get_global_linear_id());
    if (globalid < range) {
      CoeffReturnType accum = functor.initialize();
      Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
          evaluator, evaluator.firstInput(globalid), functor, &accum);
          evaluator, evaluator.firstInput(globalid), non_const_functor, &accum);
      output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
    }
  }

@@ -281,7 +283,7 @@ struct PartialReductionKernel {
      num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
                                                                 CoeffReturnType &accumulator) {
                                                                 CoeffReturnType &accumulator) const {
    if (globalPId >= num_coeffs_to_preserve) {
      return;
    }

@@ -298,7 +300,7 @@ struct PartialReductionKernel {
      global_offset += per_thread_global_stride;
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    const Index linearLocalThreadId = itemID.get_local_id(0);
    Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId / PannelParameters::LocalThreadSizeR;

@@ -380,7 +382,7 @@ struct SecondStepPartialReduction {
      num_coeffs_to_preserve(num_coeffs_to_preserve_),
      num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    const Index globalId = itemID.get_global_id(0);

    if (globalId >= num_coeffs_to_preserve) return;
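The memcpy in GenericNondeterministicReducer is worth dwelling on: once operator() is const, a reducer that mutates its own state (a mean reducer counting processed elements, for instance) can no longer be invoked on the const-qualified member, so a mutable byte-wise copy is taken first. A hedged sketch of the conflict (hypothetical reducer, for exposition):

    // Why a const kernel body needs a local copy:
    struct MeanLikeReducer {
      int count = 0;
      void accum(float) { ++count; }  // non-const: mutates internal state
    };
    // Inside a const operator()(...) the captured member is const-qualified,
    // so calling accum() on it would drop the qualifier. Copying first
    // restores a mutable object:
    //   MeanLikeReducer local;
    //   std::memcpy(&local, &member_reducer, sizeof(MeanLikeReducer));
    //   local.accum(x);  // OK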
@@ -441,12 +441,12 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Dimensions& dimensions() const { return this->m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const {
    return this->m_impl.coeffRef(this->reverseIndex(index));
  }

  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x) {
  void writePacket(Index index, const PacketReturnType& x) const {
    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());

    // This code is pilfered from TensorMorphing.h
@@ -109,28 +109,28 @@ struct ScanKernelFunctor {
  template <scan_step sst = stp, typename Input>
  std::enable_if_t<sst == scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC
  EIGEN_STRONG_INLINE
  read(const Input &inpt, Index global_id) {
  read(const Input &inpt, Index global_id) const {
    return inpt.coeff(global_id);
  }

  template <scan_step sst = stp, typename Input>
  std::enable_if_t<sst != scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC
  EIGEN_STRONG_INLINE
  read(const Input &inpt, Index global_id) {
  read(const Input &inpt, Index global_id) const {
    return inpt[global_id];
  }

  template <scan_step sst = stp, typename InclusiveOp>
  std::enable_if_t<sst == scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  first_step_inclusive_Operation(InclusiveOp inclusive_op) {
  first_step_inclusive_Operation(InclusiveOp inclusive_op) const {
    inclusive_op();
  }

  template <scan_step sst = stp, typename InclusiveOp>
  std::enable_if_t<sst != scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  first_step_inclusive_Operation(InclusiveOp) {}
  first_step_inclusive_Operation(InclusiveOp) const {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    auto out_ptr = out_accessor.get_pointer();
    auto tmp_ptr = temp_accessor.get_pointer();
    auto scratch_ptr = scratch.get_pointer().get();

@@ -307,7 +307,7 @@ struct ScanAdjustmentKernelFunctor {
      scanParameters(scanParameters_),
      accumulator(accumulator_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    auto in_ptr = in_accessor.get_pointer();
    auto out_ptr = out_accessor.get_pointer();

@@ -473,7 +473,7 @@ struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  void operator()(Self &self, EvaluatorPointerType data) {
  void operator()(Self &self, EvaluatorPointerType data) const {
    const Index total_size = internal::array_prod(self.dimensions());
    const Index scan_size = self.size();
    const Index scan_stride = self.stride();
@@ -390,13 +390,13 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
    : Base(op, device)
  { }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
  {
    return this->m_impl.coeffRef(this->srcCoeff(index));
  }

  template <int StoreMode> EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x)
  void writePacket(Index index, const PacketReturnType& x) const
  {
    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
    internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
@@ -288,13 +288,13 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const
  {
    return this->m_impl.coeffRef(this->srcCoeff(index));
  }

  template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x)
  void writePacket(Index index, const PacketReturnType& x) const
  {
    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());
@@ -2,9 +2,8 @@ FILE(GLOB examples_SRCS "*.cpp")

set(EIGEN_SYCL ON)
list(APPEND CMAKE_EXE_LINKER_FLAGS -pthread)
if(EIGEN_SYCL_TRISYCL)
  set(CMAKE_CXX_STANDARD 17)
else(EIGEN_SYCL_TRISYCL)
set(CMAKE_CXX_STANDARD 17)
if(EIGEN_SYCL_ComputeCpp)
  if(MSVC)
    list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
  else()

@@ -22,7 +21,7 @@ else(EIGEN_SYCL_TRISYCL)
      -no-serial-memop
      -Xclang
      -cl-mad-enable)
endif(EIGEN_SYCL_TRISYCL)
endif(EIGEN_SYCL_ComputeCpp)

FOREACH(example_src ${examples_SRCS})
  GET_FILENAME_COMPONENT(example ${example_src} NAME_WE)
@@ -122,6 +122,7 @@ ei_add_test(special_packetmath "-DEIGEN_FAST_MATH=1")

if(EIGEN_TEST_SYCL)
  set(EIGEN_SYCL ON)
  set(CMAKE_CXX_STANDARD 17)
  # Forward CMake options as preprocessor definitions
  if(EIGEN_SYCL_USE_DEFAULT_SELECTOR)
    add_definitions(-DEIGEN_SYCL_USE_DEFAULT_SELECTOR=${EIGEN_SYCL_USE_DEFAULT_SELECTOR})

@@ -172,10 +173,7 @@ if(EIGEN_TEST_SYCL)
    add_definitions(-DEIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION=${EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION})
  endif()

  if(EIGEN_SYCL_TRISYCL)
    # triSYCL now requires c++17.
    set(CMAKE_CXX_STANDARD 17)
  else()
  if(EIGEN_SYCL_ComputeCpp)
    if(MSVC)
      list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
    else()

@@ -193,7 +191,7 @@ if(EIGEN_TEST_SYCL)
      -no-serial-memop
      -Xclang
      -cl-mad-enable)
  endif()
  endif(EIGEN_SYCL_ComputeCpp)

  ei_add_test(cxx11_tensor_sycl)
  ei_add_test(cxx11_tensor_image_op_sycl)

@@ -409,4 +407,3 @@ if (EIGEN_TEST_HIP)
  endif()

endif()
@@ -27,17 +27,64 @@ using Eigen::TensorMap;

// Functions used to compare the TensorMap implementation on the device with
// the equivalent on the host
namespace cl {
namespace sycl {
template <typename T> T abs(T x) { return cl::sycl::fabs(x); }
namespace SYCL {
template <typename T> T abs(T x) {
  return cl::sycl::abs(x);
}

template <> float abs(float x) {
  return cl::sycl::fabs(x);
}

template <> double abs(double x) {
  return cl::sycl::fabs(x);
}

template <typename T> T square(T x) { return x * x; }
template <typename T> T cube(T x) { return x * x * x; }
template <typename T> T inverse(T x) { return T(1) / x; }
template <typename T> T cwiseMax(T x, T y) { return cl::sycl::max(x, y); }
template <typename T> T cwiseMin(T x, T y) { return cl::sycl::min(x, y); }
template <typename T> T cwiseMax(T x, T y) {
  return cl::sycl::max(x, y);
}
template <typename T> T cwiseMin(T x, T y) {
  return cl::sycl::min(x, y);
}
}

#define DECLARE_UNARY_STRUCT_NON_SYCL(FUNC) \
  struct op_##FUNC { \
    template <typename T> \
    auto operator()(const T& x) { \
      return SYCL::FUNC(x); \
    } \
    template <typename T> \
    auto operator()(const TensorMap<T>& x) { \
      return x.FUNC(); \
    } \
  };

DECLARE_UNARY_STRUCT_NON_SYCL(abs)
DECLARE_UNARY_STRUCT_NON_SYCL(square)
DECLARE_UNARY_STRUCT_NON_SYCL(cube)
DECLARE_UNARY_STRUCT_NON_SYCL(inverse)

#define DECLARE_BINARY_STRUCT_NON_SYCL(FUNC) \
  struct op_##FUNC { \
    template <typename T1, typename T2> \
    auto operator()(const T1& x, const T2& y) { \
      return SYCL::FUNC(x, y); \
    } \
    template <typename T1, typename T2> \
    auto operator()(const TensorMap<T1>& x, const TensorMap<T2>& y) { \
      return x.FUNC(y); \
    } \
  };

DECLARE_BINARY_STRUCT_NON_SYCL(cwiseMax)
DECLARE_BINARY_STRUCT_NON_SYCL(cwiseMin)

struct EqualAssignment {
  template <typename Lhs, typename Rhs>
  void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; }
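The split of the old catch-all fabs helper into a generic cl::sycl::abs plus float/double specializations matters because, in SYCL's built-in library (as in OpenCL), abs and fabs are distinct overload sets: abs covers integer types and fabs covers floating point. A hedged illustration:

    // Illustrative only: picking the right SYCL built-in per element type.
    int   ai = cl::sycl::abs(-3);      // integer overload
    float af = cl::sycl::fabs(-3.5f);  // floating-point overload
    // A generic helper therefore dispatches on T, as the test's SYCL::abs does.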
@@ -119,12 +166,9 @@ void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
  } \
};

DECLARE_UNARY_STRUCT(abs)

DECLARE_UNARY_STRUCT(sqrt)
DECLARE_UNARY_STRUCT(rsqrt)
DECLARE_UNARY_STRUCT(square)
DECLARE_UNARY_STRUCT(cube)
DECLARE_UNARY_STRUCT(inverse)
DECLARE_UNARY_STRUCT(tanh)
DECLARE_UNARY_STRUCT(exp)
DECLARE_UNARY_STRUCT(expm1)

@@ -288,8 +332,6 @@ void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device,
  } \
};

DECLARE_BINARY_STRUCT(cwiseMax)
DECLARE_BINARY_STRUCT(cwiseMin)

#define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR) \
  struct op_##NAME { \
@@ -23,6 +23,13 @@
#include <stdint.h>
#include <iostream>

#ifdef SYCL_COMPILER_IS_DPCPP
template <typename T>
struct cl::sycl::is_device_copyable<
    const OffByOneScalar<T>,
    std::enable_if_t<!std::is_trivially_copyable<const OffByOneScalar<T>>::value>> : std::true_type {};
#endif

template <typename DataType, int DataLayout, typename IndexType>
void test_device_memory(const Eigen::SyclDevice &sycl_device) {
  IndexType sizeDim1 = 100;