mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-06-04 18:54:00 +08:00
[SYCL-2020 Support] Enabling Intel DPCPP Compiler support to Eigen
This commit is contained in:
parent
bae119bb7e
commit
b523120687
@ -505,14 +505,19 @@ endif()
|
|||||||
|
|
||||||
# add SYCL
|
# add SYCL
|
||||||
option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
|
option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
|
||||||
option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation (ComputeCPP by default)." OFF)
|
|
||||||
if(EIGEN_TEST_SYCL)
|
if(EIGEN_TEST_SYCL)
|
||||||
|
option(EIGEN_SYCL_DPCPP "Use the DPCPP Sycl implementation (DPCPP is default SYCL-Compiler)." ON)
|
||||||
|
option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation." OFF)
|
||||||
|
option(EIGEN_SYCL_ComputeCpp "Use the DPCPP Sycl implementation." OFF)
|
||||||
|
set(CMAKE_CXX_STANDARD 17)
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations -Wno-shorten-64-to-32 -Wno-cast-align")
|
||||||
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-copy-with-user-provided-copy -Wno-unused-variable")
|
||||||
set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}")
|
set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}")
|
||||||
find_package(Threads REQUIRED)
|
find_package(Threads REQUIRED)
|
||||||
if(EIGEN_SYCL_TRISYCL)
|
if(EIGEN_SYCL_TRISYCL)
|
||||||
message(STATUS "Using triSYCL")
|
message(STATUS "Using triSYCL")
|
||||||
include(FindTriSYCL)
|
include(FindTriSYCL)
|
||||||
else()
|
elseif(EIGEN_SYCL_ComputeCpp)
|
||||||
message(STATUS "Using ComputeCPP SYCL")
|
message(STATUS "Using ComputeCPP SYCL")
|
||||||
include(FindComputeCpp)
|
include(FindComputeCpp)
|
||||||
set(COMPUTECPP_DRIVER_DEFAULT_VALUE OFF)
|
set(COMPUTECPP_DRIVER_DEFAULT_VALUE OFF)
|
||||||
@ -523,8 +528,12 @@ if(EIGEN_TEST_SYCL)
|
|||||||
"Use ComputeCpp driver instead of a 2 steps compilation"
|
"Use ComputeCpp driver instead of a 2 steps compilation"
|
||||||
${COMPUTECPP_DRIVER_DEFAULT_VALUE}
|
${COMPUTECPP_DRIVER_DEFAULT_VALUE}
|
||||||
)
|
)
|
||||||
|
else() #Default SYCL compiler is DPCPP (EIGEN_SYCL_DPCPP)
|
||||||
|
set(DPCPP_SYCL_TARGET "spir64" CACHE STRING "Defualt target for Intel CPU/GPU")
|
||||||
|
message(STATUS "Using DPCPP")
|
||||||
|
find_package(DPCPP)
|
||||||
|
add_definitions(-DSYCL_COMPILER_IS_DPCPP)
|
||||||
endif(EIGEN_SYCL_TRISYCL)
|
endif(EIGEN_SYCL_TRISYCL)
|
||||||
option(EIGEN_DONT_VECTORIZE_SYCL "Don't use vectorisation in the SYCL tests." OFF)
|
|
||||||
if(EIGEN_DONT_VECTORIZE_SYCL)
|
if(EIGEN_DONT_VECTORIZE_SYCL)
|
||||||
message(STATUS "Disabling SYCL vectorization in tests/examples")
|
message(STATUS "Disabling SYCL vectorization in tests/examples")
|
||||||
# When disabling SYCL vectorization, also disable Eigen default vectorization
|
# When disabling SYCL vectorization, also disable Eigen default vectorization
|
||||||
|
@ -82,7 +82,9 @@
|
|||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
#ifndef __SYCL_DEVICE_ONLY__
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
#endif
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#ifndef EIGEN_NO_IO
|
#ifndef EIGEN_NO_IO
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
@ -21,193 +21,53 @@
|
|||||||
#ifndef EIGEN_PACKET_MATH_SYCL_H
|
#ifndef EIGEN_PACKET_MATH_SYCL_H
|
||||||
#define EIGEN_PACKET_MATH_SYCL_H
|
#define EIGEN_PACKET_MATH_SYCL_H
|
||||||
#include <type_traits>
|
#include <type_traits>
|
||||||
|
|
||||||
#include "../../InternalHeaderCheck.h"
|
#include "../../InternalHeaderCheck.h"
|
||||||
|
|
||||||
namespace Eigen {
|
namespace Eigen {
|
||||||
|
|
||||||
namespace internal {
|
namespace internal {
|
||||||
#ifdef SYCL_DEVICE_ONLY
|
#ifdef SYCL_DEVICE_ONLY
|
||||||
|
#define SYCL_PLOAD(packet_type, AlignedType) \
|
||||||
#define SYCL_PLOADT_RO(address_space_target) \
|
template <> \
|
||||||
template <typename packet_type, int Alignment> \
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt_ro( \
|
pload##AlignedType<packet_type>( \
|
||||||
typename cl::sycl::multi_ptr< \
|
const typename unpacket_traits<packet_type>::type* from) { \
|
||||||
const typename unpacket_traits<packet_type>::type, \
|
using scalar = typename unpacket_traits<packet_type>::type; \
|
||||||
cl::sycl::access::address_space::address_space_target>::pointer_t \
|
typedef cl::sycl::multi_ptr< \
|
||||||
from) { \
|
const scalar, cl::sycl::access::address_space::generic_space, \
|
||||||
typedef typename unpacket_traits<packet_type>::type scalar; \
|
cl::sycl::access::decorated::no> \
|
||||||
typedef cl::sycl::multi_ptr< \
|
multi_ptr; \
|
||||||
scalar, cl::sycl::access::address_space::address_space_target> \
|
packet_type res{}; \
|
||||||
multi_ptr; \
|
res.load(0, multi_ptr(from)); \
|
||||||
auto res = packet_type( \
|
return res; \
|
||||||
static_cast<typename unpacket_traits<packet_type>::type>(0)); \
|
|
||||||
res.load(0, multi_ptr(const_cast<typename multi_ptr::pointer_t>(from))); \
|
|
||||||
return res; \
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SYCL_PLOADT_RO(global_space)
|
SYCL_PLOAD(cl::sycl::cl_float4, u)
|
||||||
SYCL_PLOADT_RO(local_space)
|
SYCL_PLOAD(cl::sycl::cl_float4, )
|
||||||
#undef SYCL_PLOADT_RO
|
SYCL_PLOAD(cl::sycl::cl_double2, u)
|
||||||
#endif
|
SYCL_PLOAD(cl::sycl::cl_double2, )
|
||||||
|
|
||||||
template <typename packet_type, int Alignment, typename T>
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
|
|
||||||
ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess<
|
|
||||||
cl::sycl::access::mode::read_write, T>& from) {
|
|
||||||
return ploadt_ro<packet_type, Alignment>(from.get_pointer());
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef SYCL_DEVICE_ONLY
|
|
||||||
#define SYCL_PLOAD(address_space_target, Alignment, AlignedType) \
|
|
||||||
template <typename packet_type> \
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
|
|
||||||
typename cl::sycl::multi_ptr< \
|
|
||||||
const typename unpacket_traits<packet_type>::type, \
|
|
||||||
cl::sycl::access::address_space::address_space_target>::pointer_t \
|
|
||||||
from) { \
|
|
||||||
return ploadt_ro<packet_type, Alignment>(from); \
|
|
||||||
}
|
|
||||||
|
|
||||||
// global space
|
|
||||||
SYCL_PLOAD(global_space, Unaligned, u)
|
|
||||||
SYCL_PLOAD(global_space, Aligned, )
|
|
||||||
// local space
|
|
||||||
SYCL_PLOAD(local_space, Unaligned, u)
|
|
||||||
SYCL_PLOAD(local_space, Aligned, )
|
|
||||||
|
|
||||||
#undef SYCL_PLOAD
|
#undef SYCL_PLOAD
|
||||||
#endif
|
|
||||||
|
|
||||||
#define SYCL_PLOAD(Alignment, AlignedType) \
|
#define SYCL_PSTORE(scalar, packet_type, alignment) \
|
||||||
template <typename packet_type> \
|
template <> \
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
|
||||||
const Eigen::TensorSycl::internal::RangeAccess< \
|
scalar* to, const packet_type& from) { \
|
||||||
cl::sycl::access::mode::read_write, \
|
typedef cl::sycl::multi_ptr< \
|
||||||
typename unpacket_traits<packet_type>::type> \
|
scalar, cl::sycl::access::address_space::generic_space, \
|
||||||
from) { \
|
cl::sycl::access::decorated::no> \
|
||||||
return ploadt_ro<packet_type, Alignment>(from); \
|
multi_ptr; \
|
||||||
}
|
from.store(0, multi_ptr(to)); \
|
||||||
SYCL_PLOAD(Unaligned, u)
|
|
||||||
SYCL_PLOAD(Aligned, )
|
|
||||||
#undef SYCL_PLOAD
|
|
||||||
|
|
||||||
#ifdef SYCL_DEVICE_ONLY
|
|
||||||
/** \internal \returns a packet version of \a *from.
|
|
||||||
* The pointer \a from must be aligned on a \a Alignment bytes boundary. */
|
|
||||||
#define SYCL_PLOADT(address_space_target) \
|
|
||||||
template <typename packet_type, int Alignment> \
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt( \
|
|
||||||
typename cl::sycl::multi_ptr< \
|
|
||||||
const typename unpacket_traits<packet_type>::type, \
|
|
||||||
cl::sycl::access::address_space::address_space_target>::pointer_t \
|
|
||||||
from) { \
|
|
||||||
if (Alignment >= unpacket_traits<packet_type>::alignment) \
|
|
||||||
return pload<packet_type>(from); \
|
|
||||||
else \
|
|
||||||
return ploadu<packet_type>(from); \
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// global space
|
SYCL_PSTORE(float, cl::sycl::cl_float4, )
|
||||||
SYCL_PLOADT(global_space)
|
SYCL_PSTORE(float, cl::sycl::cl_float4, u)
|
||||||
// local space
|
SYCL_PSTORE(double, cl::sycl::cl_double2, )
|
||||||
SYCL_PLOADT(local_space)
|
SYCL_PSTORE(double, cl::sycl::cl_double2, u)
|
||||||
#undef SYCL_PLOADT
|
|
||||||
#endif
|
|
||||||
|
|
||||||
template <typename packet_type, int Alignment>
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
|
|
||||||
ploadt(const Eigen::TensorSycl::internal::RangeAccess<
|
|
||||||
cl::sycl::access::mode::read_write,
|
|
||||||
typename unpacket_traits<packet_type>::type>& from) {
|
|
||||||
return ploadt<packet_type, Alignment>(from.get_pointer());
|
|
||||||
}
|
|
||||||
#ifdef SYCL_DEVICE_ONLY
|
|
||||||
|
|
||||||
// private_space
|
|
||||||
#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment) \
|
|
||||||
template <> \
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \
|
|
||||||
ploadt_ro<packet_type, Alignment>( \
|
|
||||||
const typename unpacket_traits<packet_type>::type* from) { \
|
|
||||||
typedef typename unpacket_traits<packet_type>::type scalar; \
|
|
||||||
auto res = packet_type(static_cast<scalar>(0)); \
|
|
||||||
res.template load<cl::sycl::access::address_space::private_space>( \
|
|
||||||
0, const_cast<scalar*>(from)); \
|
|
||||||
return res; \
|
|
||||||
}
|
|
||||||
|
|
||||||
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned)
|
|
||||||
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned)
|
|
||||||
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned)
|
|
||||||
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned)
|
|
||||||
|
|
||||||
#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type) \
|
|
||||||
template <> \
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##alignment_type( \
|
|
||||||
const typename unpacket_traits<packet_type>::type* from) { \
|
|
||||||
typedef typename unpacket_traits<packet_type>::type scalar; \
|
|
||||||
auto res = packet_type(static_cast<scalar>(0)); \
|
|
||||||
res.template load<cl::sycl::access::address_space::private_space>( \
|
|
||||||
0, const_cast<scalar*>(from)); \
|
|
||||||
return res; \
|
|
||||||
}
|
|
||||||
SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, )
|
|
||||||
SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, )
|
|
||||||
SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u)
|
|
||||||
SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u)
|
|
||||||
|
|
||||||
#undef SYCL_PLOAD_SPECIAL
|
|
||||||
|
|
||||||
#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment) \
|
|
||||||
template <> \
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
|
|
||||||
typename cl::sycl::multi_ptr< \
|
|
||||||
scalar, \
|
|
||||||
cl::sycl::access::address_space::address_space_target>::pointer_t \
|
|
||||||
to, \
|
|
||||||
const packet_type& from) { \
|
|
||||||
typedef cl::sycl::multi_ptr< \
|
|
||||||
scalar, cl::sycl::access::address_space::address_space_target> \
|
|
||||||
multi_ptr; \
|
|
||||||
from.store(0, multi_ptr(to)); \
|
|
||||||
}
|
|
||||||
|
|
||||||
// global space
|
|
||||||
SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, )
|
|
||||||
SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u)
|
|
||||||
SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, )
|
|
||||||
SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u)
|
|
||||||
SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, )
|
|
||||||
SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u)
|
|
||||||
SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, )
|
|
||||||
SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u)
|
|
||||||
|
|
||||||
SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, )
|
|
||||||
SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u)
|
|
||||||
SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, )
|
|
||||||
SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u)
|
|
||||||
#undef SYCL_PSTORE
|
#undef SYCL_PSTORE
|
||||||
|
|
||||||
#define SYCL_PSTORE_T(address_space_target) \
|
|
||||||
template <typename scalar, typename packet_type, int Alignment> \
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret( \
|
|
||||||
typename cl::sycl::multi_ptr< \
|
|
||||||
scalar, \
|
|
||||||
cl::sycl::access::address_space::address_space_target>::pointer_t \
|
|
||||||
to, \
|
|
||||||
const packet_type& from) { \
|
|
||||||
if (Alignment) \
|
|
||||||
pstore(to, from); \
|
|
||||||
else \
|
|
||||||
pstoreu(to, from); \
|
|
||||||
}
|
|
||||||
|
|
||||||
SYCL_PSTORE_T(global_space)
|
|
||||||
|
|
||||||
SYCL_PSTORE_T(local_space)
|
|
||||||
|
|
||||||
#undef SYCL_PSTORE_T
|
|
||||||
|
|
||||||
#define SYCL_PSET1(packet_type) \
|
#define SYCL_PSET1(packet_type) \
|
||||||
template <> \
|
template <> \
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>( \
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>( \
|
||||||
@ -291,22 +151,6 @@ struct get_base_packet<cl::sycl::cl_double2> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
#define SYCL_PLOAD_DUP(address_space_target) \
|
|
||||||
template <typename packet_type> \
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup( \
|
|
||||||
typename cl::sycl::multi_ptr< \
|
|
||||||
const typename unpacket_traits<packet_type>::type, \
|
|
||||||
cl::sycl::access::address_space::address_space_target>::pointer_t \
|
|
||||||
from) { \
|
|
||||||
return get_base_packet<packet_type>::get_ploaddup(from); \
|
|
||||||
}
|
|
||||||
|
|
||||||
// global space
|
|
||||||
SYCL_PLOAD_DUP(global_space)
|
|
||||||
// local_space
|
|
||||||
SYCL_PLOAD_DUP(local_space)
|
|
||||||
#undef SYCL_PLOAD_DUP
|
|
||||||
|
|
||||||
#define SYCL_PLOAD_DUP_SPECILIZE(packet_type) \
|
#define SYCL_PLOAD_DUP_SPECILIZE(packet_type) \
|
||||||
template <> \
|
template <> \
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup<packet_type>( \
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup<packet_type>( \
|
||||||
@ -325,30 +169,11 @@ SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
|
|||||||
const typename unpacket_traits<packet_type>::type& a) { \
|
const typename unpacket_traits<packet_type>::type& a) { \
|
||||||
return get_base_packet<packet_type>::set_plset(a); \
|
return get_base_packet<packet_type>::set_plset(a); \
|
||||||
}
|
}
|
||||||
|
|
||||||
SYCL_PLSET(cl::sycl::cl_float4)
|
SYCL_PLSET(cl::sycl::cl_float4)
|
||||||
SYCL_PLSET(cl::sycl::cl_double2)
|
SYCL_PLSET(cl::sycl::cl_double2)
|
||||||
|
|
||||||
#undef SYCL_PLSET
|
#undef SYCL_PLSET
|
||||||
|
|
||||||
#define SYCL_PGATHER(address_space_target) \
|
|
||||||
template <typename Scalar, typename packet_type> \
|
|
||||||
EIGEN_DEVICE_FUNC inline packet_type pgather( \
|
|
||||||
typename cl::sycl::multi_ptr< \
|
|
||||||
const typename unpacket_traits<packet_type>::type, \
|
|
||||||
cl::sycl::access::address_space::address_space_target>::pointer_t \
|
|
||||||
from, \
|
|
||||||
Index stride) { \
|
|
||||||
return get_base_packet<packet_type>::get_pgather(from, stride); \
|
|
||||||
}
|
|
||||||
|
|
||||||
// global space
|
|
||||||
SYCL_PGATHER(global_space)
|
|
||||||
// local space
|
|
||||||
SYCL_PGATHER(local_space)
|
|
||||||
|
|
||||||
#undef SYCL_PGATHER
|
|
||||||
|
|
||||||
#define SYCL_PGATHER_SPECILIZE(scalar, packet_type) \
|
#define SYCL_PGATHER_SPECILIZE(scalar, packet_type) \
|
||||||
template <> \
|
template <> \
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
|
||||||
@ -362,24 +187,6 @@ SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
|
|||||||
|
|
||||||
#undef SYCL_PGATHER_SPECILIZE
|
#undef SYCL_PGATHER_SPECILIZE
|
||||||
|
|
||||||
#define SYCL_PSCATTER(address_space_target) \
|
|
||||||
template <typename Scalar, typename packet_type> \
|
|
||||||
EIGEN_DEVICE_FUNC inline void pscatter( \
|
|
||||||
typename cl::sycl::multi_ptr< \
|
|
||||||
typename unpacket_traits<packet_type>::type, \
|
|
||||||
cl::sycl::access::address_space::address_space_target>::pointer_t \
|
|
||||||
to, \
|
|
||||||
const packet_type& from, Index stride) { \
|
|
||||||
get_base_packet<packet_type>::set_pscatter(to, from, stride); \
|
|
||||||
}
|
|
||||||
|
|
||||||
// global space
|
|
||||||
SYCL_PSCATTER(global_space)
|
|
||||||
// local space
|
|
||||||
SYCL_PSCATTER(local_space)
|
|
||||||
|
|
||||||
#undef SYCL_PSCATTER
|
|
||||||
|
|
||||||
#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \
|
#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \
|
||||||
template <> \
|
template <> \
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<scalar, packet_type>( \
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<scalar, packet_type>( \
|
||||||
@ -563,6 +370,34 @@ inline cl::sycl::cl_double2 pblend(
|
|||||||
}
|
}
|
||||||
#endif // SYCL_DEVICE_ONLY
|
#endif // SYCL_DEVICE_ONLY
|
||||||
|
|
||||||
|
template <typename packet_type, int Alignment, typename T>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
|
||||||
|
ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess<
|
||||||
|
cl::sycl::access::mode::read_write, T>& from) {
|
||||||
|
return ploadt_ro<packet_type, Alignment>(from.get_pointer());
|
||||||
|
}
|
||||||
|
|
||||||
|
#define SYCL_PLOAD(Alignment, AlignedType) \
|
||||||
|
template <typename packet_type> \
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
|
||||||
|
const Eigen::TensorSycl::internal::RangeAccess< \
|
||||||
|
cl::sycl::access::mode::read_write, \
|
||||||
|
typename unpacket_traits<packet_type>::type> \
|
||||||
|
from) { \
|
||||||
|
return ploadt_ro<packet_type, Alignment>(from); \
|
||||||
|
}
|
||||||
|
SYCL_PLOAD(Unaligned, u)
|
||||||
|
SYCL_PLOAD(Aligned, )
|
||||||
|
#undef SYCL_PLOAD
|
||||||
|
|
||||||
|
template <typename packet_type, int Alignment>
|
||||||
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
|
||||||
|
ploadt(const Eigen::TensorSycl::internal::RangeAccess<
|
||||||
|
cl::sycl::access::mode::read_write,
|
||||||
|
typename unpacket_traits<packet_type>::type>& from) {
|
||||||
|
return ploadt<packet_type, Alignment>(from.get_pointer());
|
||||||
|
}
|
||||||
|
|
||||||
#define SYCL_PSTORE(alignment) \
|
#define SYCL_PSTORE(alignment) \
|
||||||
template <typename packet_type> \
|
template <typename packet_type> \
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
|
||||||
|
@ -223,7 +223,6 @@ class PointerMapper {
|
|||||||
m_pointerMap.clear();
|
m_pointerMap.clear();
|
||||||
EIGEN_THROW_X(
|
EIGEN_THROW_X(
|
||||||
std::out_of_range("The pointer is not registered in the map\n"));
|
std::out_of_range("The pointer is not registered in the map\n"));
|
||||||
|
|
||||||
}
|
}
|
||||||
--node;
|
--node;
|
||||||
}
|
}
|
||||||
@ -550,7 +549,7 @@ struct RangeAccess {
|
|||||||
static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
|
static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
|
||||||
typedef T scalar_t;
|
typedef T scalar_t;
|
||||||
typedef scalar_t &ref_t;
|
typedef scalar_t &ref_t;
|
||||||
typedef typename cl::sycl::global_ptr<scalar_t>::pointer_t ptr_t;
|
typedef scalar_t *ptr_t;
|
||||||
|
|
||||||
// the accessor type does not necessarily the same as T
|
// the accessor type does not necessarily the same as T
|
||||||
typedef cl::sycl::accessor<scalar_t, 1, AcMd, global_access, is_place_holder>
|
typedef cl::sycl::accessor<scalar_t, 1, AcMd, global_access, is_place_holder>
|
||||||
@ -570,7 +569,12 @@ struct RangeAccess {
|
|||||||
RangeAccess(std::nullptr_t) : RangeAccess() {}
|
RangeAccess(std::nullptr_t) : RangeAccess() {}
|
||||||
// This template parameter must be removed and scalar_t should be replaced
|
// This template parameter must be removed and scalar_t should be replaced
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t get_pointer() const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t get_pointer() const {
|
||||||
return (access_.get_pointer().get() + offset_);
|
typedef cl::sycl::multi_ptr<scalar_t,
|
||||||
|
cl::sycl::access::address_space::generic_space,
|
||||||
|
cl::sycl::access::decorated::no>
|
||||||
|
multi_ptr;
|
||||||
|
multi_ptr p(access_);
|
||||||
|
return (p + offset_).get_raw();
|
||||||
}
|
}
|
||||||
template <typename Index>
|
template <typename Index>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator+=(Index offset) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator+=(Index offset) {
|
||||||
|
@ -64,7 +64,7 @@ pcast<cl::sycl::cl_double2, cl::sycl::cl_float4>(
|
|||||||
cl::sycl::rounding_mode::automatic>();
|
cl::sycl::rounding_mode::automatic>();
|
||||||
auto b1 = b.template convert<cl::sycl::cl_float,
|
auto b1 = b.template convert<cl::sycl::cl_float,
|
||||||
cl::sycl::rounding_mode::automatic>();
|
cl::sycl::rounding_mode::automatic>();
|
||||||
return cl::sycl::float4(a1.x(), a1.y(), b1.x(), b1.y());
|
return cl::sycl::cl_float4(a1.x(), a1.y(), b1.x(), b1.y());
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
|
@ -16,5 +16,11 @@ following commands:
|
|||||||
1. export COMPUTECPP_PACKAGE_ROOT_DIR={PATH TO COMPUTECPP ROOT DIRECTORY}
|
1. export COMPUTECPP_PACKAGE_ROOT_DIR={PATH TO COMPUTECPP ROOT DIRECTORY}
|
||||||
2. bash eigen_sycl_bench.sh
|
2. bash eigen_sycl_bench.sh
|
||||||
|
|
||||||
|
To compile the floating point GPU benchmarks using Intel DPCPP compiler
|
||||||
|
/path/to/dpcpp/bin/clang+ -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -I ../../ -O3 -DNDEBUG -fsycl -fsycl-targets="supported backend in DPCPP. i.e. spir64 or nvptx64-nvidia-cuda" -std=c++17 tensor_benchmarks_sycl.cc benchmark_main.cc -lpthread -o eigen_dpcpp_sycl
|
||||||
|
|
||||||
Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
|
Last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
|
||||||
g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
|
g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu
|
||||||
|
|
||||||
|
To compile the contraction with DPCPP:
|
||||||
|
/path/to/dpcpp/bin/clang++ -DSYCL_COMPILER_IS_DPCPP -DNDEBUG -DEIGEN_MPL2_ONLY -DEIGEN_USE_SYCL=1 -I ../../ -O3 -DNDEBUG -fsycl -fsycl-targets="supported backend in DPCPP. i.e. spir64 or nvptx64-nvidia-cuda" -std=c++17 tensor_contract_sycl_bench.cc -lpthread -o eigen_dpcpp_contract
|
||||||
|
@ -15,7 +15,7 @@
|
|||||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||||
#define EIGEN_TEST_NO_COMPLEX
|
#define EIGEN_TEST_NO_COMPLEX
|
||||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
|
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
|
||||||
#include <SYCL/sycl.hpp>
|
#include <CL/sycl.hpp>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
@ -56,9 +56,9 @@ void contraction(const Device& device_, TensorIndex num_iters, TensorIndex m_, T
|
|||||||
|
|
||||||
// Initialize the content of the memory pools to prevent asan from
|
// Initialize the content of the memory pools to prevent asan from
|
||||||
// complaining.
|
// complaining.
|
||||||
device_.fill(a_, m_ * k_, T(12));
|
device_.fill(a_, a_ + (m_ * k_), T(12));
|
||||||
device_.fill(b_, k_ * n_, T(23));
|
device_.fill(b_, b_ + (k_ * n_), T(23));
|
||||||
device_.fill(c_, m_ * n_, T(31));
|
device_.fill(c_, c_ + (m_ * n_), T(31));
|
||||||
|
|
||||||
Eigen::array<TensorIndex, 2> sizeA;
|
Eigen::array<TensorIndex, 2> sizeA;
|
||||||
sizeA[0] = m_;
|
sizeA[0] = m_;
|
||||||
@ -110,9 +110,9 @@ void contractionRowMajor(const Device& device_, TensorIndex num_iters, TensorInd
|
|||||||
|
|
||||||
// Initialize the content of the memory pools to prevent asan from
|
// Initialize the content of the memory pools to prevent asan from
|
||||||
// complaining.
|
// complaining.
|
||||||
device_.memset(a_, 12, m_ * k_ * sizeof(T));
|
device_.memset(a_, T(12), T(m_ * k_ * sizeof(T)));
|
||||||
device_.memset(b_, 23, k_ * n_ * sizeof(T));
|
device_.memset(b_, T(23), T(k_ * n_ * sizeof(T)));
|
||||||
device_.memset(c_, 31, m_ * n_ * sizeof(T));
|
device_.memset(c_, T(31), T(m_ * n_ * sizeof(T)));
|
||||||
|
|
||||||
Eigen::array<TensorIndex, 2> sizeA;
|
Eigen::array<TensorIndex, 2> sizeA;
|
||||||
sizeA[0] = m_;
|
sizeA[0] = m_;
|
||||||
|
62
cmake/FindDPCPP.cmake
Normal file
62
cmake/FindDPCPP.cmake
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
include_guard()
|
||||||
|
|
||||||
|
include(CheckCXXCompilerFlag)
|
||||||
|
include(FindPackageHandleStandardArgs)
|
||||||
|
|
||||||
|
if("${DPCPP_SYCL_TARGET}" STREQUAL "amdgcn-amd-amdhsa" AND
|
||||||
|
"${DPCPP_SYCL_ARCH}" STREQUAL "")
|
||||||
|
message(FATAL_ERROR "Architecture required for AMD DPCPP builds,"
|
||||||
|
" please specify in DPCPP_SYCL_ARCH")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
set(DPCPP_USER_FLAGS "" CACHE STRING
|
||||||
|
"Additional user-specified compiler flags for DPC++")
|
||||||
|
|
||||||
|
get_filename_component(DPCPP_BIN_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
|
||||||
|
find_library(DPCPP_LIB_DIR NAMES sycl sycl6 PATHS "${DPCPP_BIN_DIR}/../lib")
|
||||||
|
|
||||||
|
add_library(DPCPP::DPCPP INTERFACE IMPORTED)
|
||||||
|
|
||||||
|
set(DPCPP_FLAGS "-fsycl;-fsycl-targets=${DPCPP_SYCL_TARGET};-fsycl-unnamed-lambda;${DPCPP_USER_FLAGS};-ftemplate-backtrace-limit=0")
|
||||||
|
if(NOT "${DPCPP_SYCL_ARCH}" STREQUAL "")
|
||||||
|
if("${DPCPP_SYCL_TARGET}" STREQUAL "amdgcn-amd-amdhsa")
|
||||||
|
list(APPEND DPCPP_FLAGS "-Xsycl-target-backend")
|
||||||
|
list(APPEND DPCPP_FLAGS "--offload-arch=${DPCPP_SYCL_ARCH}")
|
||||||
|
elseif("${DPCPP_SYCL_TARGET}" STREQUAL "nvptx64-nvidia-cuda")
|
||||||
|
list(APPEND DPCPP_FLAGS "-Xsycl-target-backend")
|
||||||
|
list(APPEND DPCPP_FLAGS "--cuda-gpu-arch=${DPCPP_SYCL_ARCH}")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if(UNIX)
|
||||||
|
set_target_properties(DPCPP::DPCPP PROPERTIES
|
||||||
|
INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS}"
|
||||||
|
INTERFACE_LINK_OPTIONS "${DPCPP_FLAGS}"
|
||||||
|
INTERFACE_LINK_LIBRARIES ${DPCPP_LIB_DIR}
|
||||||
|
INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl;${DPCPP_BIN_DIR}/../include")
|
||||||
|
message(STATUS ">>>>>>>>> DPCPP INCLUDE DIR: ${DPCPP_BIN_DIR}/../include/sycl")
|
||||||
|
else()
|
||||||
|
set_target_properties(DPCPP::DPCPP PROPERTIES
|
||||||
|
INTERFACE_COMPILE_OPTIONS "${DPCPP_FLAGS}"
|
||||||
|
INTERFACE_LINK_LIBRARIES ${DPCPP_LIB_DIR}
|
||||||
|
INTERFACE_INCLUDE_DIRECTORIES "${DPCPP_BIN_DIR}/../include/sycl")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
function(add_sycl_to_target)
|
||||||
|
set(options)
|
||||||
|
set(one_value_args TARGET)
|
||||||
|
set(multi_value_args SOURCES)
|
||||||
|
cmake_parse_arguments(SB_ADD_SYCL
|
||||||
|
"${options}"
|
||||||
|
"${one_value_args}"
|
||||||
|
"${multi_value_args}"
|
||||||
|
${ARGN}
|
||||||
|
)
|
||||||
|
target_compile_options(${SB_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS})
|
||||||
|
target_link_libraries(${SB_ADD_SYCL_TARGET} DPCPP::DPCPP)
|
||||||
|
target_compile_features(${SB_ADD_SYCL_TARGET} PRIVATE cxx_std_17)
|
||||||
|
get_target_property(target_type ${SB_ADD_SYCL_TARGET} TYPE)
|
||||||
|
if (NOT target_type STREQUAL "OBJECT_LIBRARY")
|
||||||
|
target_link_options(${SB_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS})
|
||||||
|
endif()
|
||||||
|
endfunction()
|
12
test/main.h
12
test/main.h
@ -18,6 +18,9 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
#include <typeinfo>
|
#include <typeinfo>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
|
#ifdef EIGEN_USE_SYCL
|
||||||
|
#include <CL/sycl.hpp>
|
||||||
|
#endif
|
||||||
|
|
||||||
// The following includes of STL headers have to be done _before_ the
|
// The following includes of STL headers have to be done _before_ the
|
||||||
// definition of macros min() and max(). The reason is that many STL
|
// definition of macros min() and max(). The reason is that many STL
|
||||||
@ -121,9 +124,7 @@ struct imag {};
|
|||||||
#define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes
|
#define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes
|
||||||
// B0 is defined in POSIX header termios.h
|
// B0 is defined in POSIX header termios.h
|
||||||
#define B0 FORBIDDEN_IDENTIFIER
|
#define B0 FORBIDDEN_IDENTIFIER
|
||||||
// `I` may be defined by complex.h:
|
|
||||||
#define I FORBIDDEN_IDENTIFIER
|
#define I FORBIDDEN_IDENTIFIER
|
||||||
|
|
||||||
// Unit tests calling Eigen's blas library must preserve the default blocking size
|
// Unit tests calling Eigen's blas library must preserve the default blocking size
|
||||||
// to avoid troubles.
|
// to avoid troubles.
|
||||||
#ifndef EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
|
#ifndef EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
|
||||||
@ -301,15 +302,16 @@ namespace Eigen
|
|||||||
}
|
}
|
||||||
#endif //EIGEN_EXCEPTIONS
|
#endif //EIGEN_EXCEPTIONS
|
||||||
|
|
||||||
#elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(SYCL_DEVICE_ONLY) // EIGEN_DEBUG_ASSERTS
|
#elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(__SYCL_DEVICE_ONLY__) // EIGEN_DEBUG_ASSERTS
|
||||||
#define eigen_assert(a) \
|
#define eigen_assert(a) \
|
||||||
if( (!(a)) && (!no_more_assert) ) \
|
if( (!(a)) && (!no_more_assert) ) \
|
||||||
{ \
|
{ \
|
||||||
Eigen::no_more_assert = true; \
|
Eigen::no_more_assert = true; \
|
||||||
if(report_on_cerr_on_assert_failure) \
|
if(report_on_cerr_on_assert_failure) { \
|
||||||
eigen_plain_assert(a); \
|
eigen_plain_assert(a); \
|
||||||
else \
|
} else { \
|
||||||
EIGEN_THROW_X(Eigen::eigen_assert_exception()); \
|
EIGEN_THROW_X(Eigen::eigen_assert_exception()); \
|
||||||
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef EIGEN_EXCEPTIONS
|
#ifdef EIGEN_EXCEPTIONS
|
||||||
|
@ -170,10 +170,10 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
|
|||||||
m_rightImpl.cleanup();
|
m_rightImpl.cleanup();
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const {
|
||||||
m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
|
m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
|
||||||
}
|
}
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const {
|
||||||
|
|
||||||
const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
|
const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
|
||||||
const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
|
const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
|
||||||
|
@ -1185,7 +1185,6 @@ class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
|
|||||||
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
|
||||||
return derived();
|
return derived();
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC
|
EIGEN_DEVICE_FUNC
|
||||||
EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); }
|
EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); }
|
||||||
EIGEN_DEVICE_FUNC
|
EIGEN_DEVICE_FUNC
|
||||||
@ -1195,3 +1194,4 @@ class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
|
|||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
|
||||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H
|
#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H
|
||||||
|
|
||||||
|
@ -438,13 +438,13 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
|
|||||||
: Base(op, device)
|
: Base(op, device)
|
||||||
{ }
|
{ }
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
|
||||||
{
|
{
|
||||||
return this->m_impl.coeffRef(this->srcCoeff(index));
|
return this->m_impl.coeffRef(this->srcCoeff(index));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
void writePacket(Index index, const PacketReturnType& x)
|
void writePacket(Index index, const PacketReturnType& x) const
|
||||||
{
|
{
|
||||||
if (this->isInnerChipping()) {
|
if (this->isInnerChipping()) {
|
||||||
// m_stride is equal to 1, so let's avoid the integer division.
|
// m_stride is equal to 1, so let's avoid the integer division.
|
||||||
|
@ -331,7 +331,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
|
|||||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
|
||||||
{
|
{
|
||||||
// Collect dimension-wise indices (subs).
|
// Collect dimension-wise indices (subs).
|
||||||
array<Index, Base::NumDims> subs;
|
array<Index, Base::NumDims> subs;
|
||||||
@ -360,7 +360,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
void writePacket(Index index, const PacketReturnType& x)
|
void writePacket(Index index, const PacketReturnType& x) const
|
||||||
{
|
{
|
||||||
const int packetSize = PacketType<CoeffReturnType, Device>::size;
|
const int packetSize = PacketType<CoeffReturnType, Device>::size;
|
||||||
EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||||
|
@ -362,10 +362,8 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp
|
|||||||
const OutputKernelType m_output_kernel;
|
const OutputKernelType m_output_kernel;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
template<typename Derived>
|
template<typename Derived>
|
||||||
struct TensorContractionEvaluatorBase : internal::no_assignment_operator
|
struct TensorContractionEvaluatorBase {
|
||||||
{
|
|
||||||
typedef typename internal::traits<Derived>::Indices Indices;
|
typedef typename internal::traits<Derived>::Indices Indices;
|
||||||
typedef typename internal::traits<Derived>::LeftArgType LeftArgType;
|
typedef typename internal::traits<Derived>::LeftArgType LeftArgType;
|
||||||
typedef typename internal::traits<Derived>::RightArgType RightArgType;
|
typedef typename internal::traits<Derived>::RightArgType RightArgType;
|
||||||
|
@ -597,7 +597,7 @@ class TensorContractionKernel {
|
|||||||
const TripleDim triple_dim_)
|
const TripleDim triple_dim_)
|
||||||
: TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
|
: TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
|
||||||
const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
|
const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
|
||||||
const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
|
const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
|
||||||
const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
|
const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
|
||||||
@ -636,7 +636,7 @@ class TensorContractionKernel {
|
|||||||
// privateRes memory of Each computation the compute block function is independent of local and no local concepts as
|
// privateRes memory of Each computation the compute block function is independent of local and no local concepts as
|
||||||
// it only compute the block on each thread's private memory space
|
// it only compute the block on each thread's private memory space
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
|
||||||
PacketReturnType *privateRes) {
|
PacketReturnType *privateRes) const {
|
||||||
StorageIndex idx = 0;
|
StorageIndex idx = 0;
|
||||||
EIGEN_CONSTEXPR StorageIndex lhs_stride =
|
EIGEN_CONSTEXPR StorageIndex lhs_stride =
|
||||||
contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
|
contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
|
||||||
@ -661,7 +661,7 @@ class TensorContractionKernel {
|
|||||||
// class.
|
// class.
|
||||||
template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr>
|
template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes,
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes,
|
||||||
StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) {
|
StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) const {
|
||||||
auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC {
|
auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC {
|
||||||
return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N);
|
return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N);
|
||||||
};
|
};
|
||||||
@ -713,7 +713,7 @@ class TensorContractionKernel {
|
|||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
std::enable_if_t<contract_tp == contraction_type::no_local>
|
std::enable_if_t<contract_tp == contraction_type::no_local>
|
||||||
extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
|
extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
|
||||||
const StorageIndex &ncOffset, const StorageIndex cOffset) {
|
const StorageIndex &ncOffset, const StorageIndex cOffset) const {
|
||||||
EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
|
EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
|
||||||
InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
|
InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
|
||||||
EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =
|
EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =
|
||||||
@ -833,7 +833,8 @@ class TensorContractionKernel {
|
|||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
|
||||||
ThreadProperties<StorageIndex> &thread_properties,
|
ThreadProperties<StorageIndex> &thread_properties,
|
||||||
TiledMemory &tiled_input_block,
|
TiledMemory &tiled_input_block,
|
||||||
PacketReturnType *privateRes, bool &db_offset) {
|
PacketReturnType *privateRes, bool &db_offset) const {
|
||||||
|
|
||||||
// Tiling the Rhs block from global to local memory
|
// Tiling the Rhs block from global to local memory
|
||||||
extract_block<RHSBlockProperties, is_internal_block>(
|
extract_block<RHSBlockProperties, is_internal_block>(
|
||||||
rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
|
rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
|
||||||
@ -871,7 +872,7 @@ class TensorContractionKernel {
|
|||||||
template <bool is_internal_block, typename OutPtr>
|
template <bool is_internal_block, typename OutPtr>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID,
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID,
|
||||||
ThreadProperties<StorageIndex> &thread_properties,
|
ThreadProperties<StorageIndex> &thread_properties,
|
||||||
OutPtr out_ptr) {
|
OutPtr out_ptr) const {
|
||||||
auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()};
|
auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()};
|
||||||
// Allocate register space
|
// Allocate register space
|
||||||
PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = {
|
PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = {
|
||||||
@ -897,7 +898,7 @@ class TensorContractionKernel {
|
|||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
std::enable_if_t<contract_tp == contraction_type::local>
|
std::enable_if_t<contract_tp == contraction_type::local>
|
||||||
extract_block(const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex>& local_index,
|
extract_block(const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex>& local_index,
|
||||||
const StorageIndex &ncOffset, const StorageIndex cOffset) {
|
const StorageIndex &ncOffset, const StorageIndex cOffset) const {
|
||||||
EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
|
EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
|
||||||
InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
|
InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
|
||||||
EIGEN_CONSTEXPR StorageIndex LoadPerThread =
|
EIGEN_CONSTEXPR StorageIndex LoadPerThread =
|
||||||
@ -1035,7 +1036,7 @@ struct GeneralVectorTensor {
|
|||||||
nonContractDim(nonContractDim_),
|
nonContractDim(nonContractDim_),
|
||||||
contractDim(contractDim_) {}
|
contractDim(contractDim_) {}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
|
||||||
auto scratch_ptr = scratch.get_pointer();
|
auto scratch_ptr = scratch.get_pointer();
|
||||||
const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
|
const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
|
||||||
StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC
|
StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC
|
||||||
@ -1252,7 +1253,8 @@ struct GeneralScalarContraction {
|
|||||||
const StorageIndex rng_)
|
const StorageIndex rng_)
|
||||||
: scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}
|
: scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) {
|
EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) const {
|
||||||
|
|
||||||
auto out_ptr = out_res.get_pointer();
|
auto out_ptr = out_res.get_pointer();
|
||||||
auto scratch_ptr = scratch.get_pointer().get();
|
auto scratch_ptr = scratch.get_pointer().get();
|
||||||
|
|
||||||
|
@ -57,10 +57,10 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
|
|||||||
input_range(input_range_) {}
|
input_range(input_range_) {}
|
||||||
|
|
||||||
template <typename BooleanDim2>
|
template <typename BooleanDim2>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) const {
|
||||||
return (boolean_check[0] && boolean_check[1]);
|
return (boolean_check[0] && boolean_check[1]);
|
||||||
}
|
}
|
||||||
void operator()(cl::sycl::nd_item<2> itemID) {
|
void operator()(cl::sycl::nd_item<2> itemID) const {
|
||||||
auto buffer_ptr = buffer_acc.get_pointer();
|
auto buffer_ptr = buffer_acc.get_pointer();
|
||||||
auto kernel_ptr = kernel_filter.get_pointer();
|
auto kernel_ptr = kernel_filter.get_pointer();
|
||||||
// the required row to be calculated for the for each plane in shered memory
|
// the required row to be calculated for the for each plane in shered memory
|
||||||
@ -123,11 +123,11 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
|
|||||||
kernel_size(kernel_size_),
|
kernel_size(kernel_size_),
|
||||||
input_range(input_range_) {}
|
input_range(input_range_) {}
|
||||||
template <typename BooleanDim3>
|
template <typename BooleanDim3>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
|
||||||
return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
|
return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
|
||||||
}
|
}
|
||||||
|
|
||||||
void operator()(cl::sycl::nd_item<3> itemID) {
|
void operator()(cl::sycl::nd_item<3> itemID) const {
|
||||||
auto buffer_ptr = buffer_acc.get_pointer();
|
auto buffer_ptr = buffer_acc.get_pointer();
|
||||||
auto kernel_ptr = kernel_filter.get_pointer();
|
auto kernel_ptr = kernel_filter.get_pointer();
|
||||||
// the required row to be calculated for the for each plane in shered memory
|
// the required row to be calculated for the for each plane in shered memory
|
||||||
@ -212,10 +212,10 @@ struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, Inp
|
|||||||
input_range(input_range_),
|
input_range(input_range_),
|
||||||
numP(numP_) {}
|
numP(numP_) {}
|
||||||
template <typename BooleanDim3>
|
template <typename BooleanDim3>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
|
||||||
return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
|
return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
|
||||||
}
|
}
|
||||||
void operator()(cl::sycl::nd_item<3> itemID) {
|
void operator()(cl::sycl::nd_item<3> itemID) const {
|
||||||
auto buffer_ptr = buffer_acc.get_pointer();
|
auto buffer_ptr = buffer_acc.get_pointer();
|
||||||
auto kernel_ptr = kernel_filter.get_pointer();
|
auto kernel_ptr = kernel_filter.get_pointer();
|
||||||
const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
|
const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
|
||||||
|
@ -31,8 +31,7 @@ struct SyclDeviceInfo {
|
|||||||
.template get_info<cl::sycl::info::device::local_mem_type>()),
|
.template get_info<cl::sycl::info::device::local_mem_type>()),
|
||||||
max_work_item_sizes(
|
max_work_item_sizes(
|
||||||
queue.get_device()
|
queue.get_device()
|
||||||
.template get_info<
|
.template get_info<cl::sycl::info::device::max_work_item_sizes<3>>()),
|
||||||
cl::sycl::info::device::max_work_item_sizes>()),
|
|
||||||
max_mem_alloc_size(
|
max_mem_alloc_size(
|
||||||
queue.get_device()
|
queue.get_device()
|
||||||
.template get_info<
|
.template get_info<
|
||||||
@ -356,7 +355,7 @@ class QueueInterface {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const ptrdiff_t count = end - begin;
|
const ptrdiff_t count = end - begin;
|
||||||
auto f = [&](cl::sycl::handler &cgh) {
|
auto f = [&](cl::sycl::handler &cgh) {
|
||||||
auto dst_acc = get_typed_range_accessor<write_mode, T>(cgh, begin, count);
|
auto dst_acc = get_typed_range_accessor<write_mode, T>(cgh, begin, count);
|
||||||
cgh.fill(dst_acc, value);
|
cgh.fill(dst_acc, value);
|
||||||
};
|
};
|
||||||
|
@ -159,10 +159,10 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const {
|
||||||
m_buffer[i] = m_impl.coeff(i);
|
m_buffer[i] = m_impl.coeff(i);
|
||||||
}
|
}
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const {
|
||||||
internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
|
internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -98,7 +98,7 @@ struct TensorEvaluator
|
|||||||
return m_data[index];
|
return m_data[index];
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const{
|
||||||
eigen_assert(m_data != NULL);
|
eigen_assert(m_data != NULL);
|
||||||
return m_data[index];
|
return m_data[index];
|
||||||
}
|
}
|
||||||
@ -122,7 +122,7 @@ struct TensorEvaluator
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
void writePacket(Index index, const PacketReturnType& x)
|
void writePacket(Index index, const PacketReturnType& x) const
|
||||||
{
|
{
|
||||||
return internal::pstoret<Scalar, PacketReturnType, StoreMode>(m_data + index, x);
|
return internal::pstoret<Scalar, PacketReturnType, StoreMode>(m_data + index, x);
|
||||||
}
|
}
|
||||||
@ -137,7 +137,7 @@ struct TensorEvaluator
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType&
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType&
|
||||||
coeffRef(const array<DenseIndex, NumCoords>& coords) {
|
coeffRef(const array<DenseIndex, NumCoords>& coords) const {
|
||||||
eigen_assert(m_data != NULL);
|
eigen_assert(m_data != NULL);
|
||||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||||
return m_data[m_dims.IndexOfColMajor(coords)];
|
return m_data[m_dims.IndexOfColMajor(coords)];
|
||||||
@ -978,7 +978,14 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
|
|||||||
TensorEvaluator<ElseArgType, Device> m_elseImpl;
|
TensorEvaluator<ElseArgType, Device> m_elseImpl;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
|
||||||
|
#if defined(EIGEN_USE_SYCL) && defined(SYCL_COMPILER_IS_DPCPP)
|
||||||
|
template <typename Derived, typename Device>
|
||||||
|
struct cl::sycl::is_device_copyable<
|
||||||
|
Eigen::TensorEvaluator<Derived, Device>,
|
||||||
|
std::enable_if_t<!std::is_trivially_copyable<
|
||||||
|
Eigen::TensorEvaluator<Derived, Device>>::value>> : std::true_type {};
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
|
#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
|
||||||
|
@ -688,12 +688,12 @@ struct ExecExprFunctorKernel {
|
|||||||
: evaluator(evaluator_), range(range_) {}
|
: evaluator(evaluator_), range(range_) {}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
|
||||||
cl::sycl::nd_item<1> itemID) {
|
cl::sycl::nd_item<1> itemID) const {
|
||||||
compute(itemID);
|
compute(itemID);
|
||||||
}
|
}
|
||||||
template <bool is_vec = Evaluator::PacketAccess>
|
template <bool is_vec = Evaluator::PacketAccess>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!is_vec>
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!is_vec>
|
||||||
compute(const cl::sycl::nd_item<1>& itemID) {
|
compute(const cl::sycl::nd_item<1>& itemID) const {
|
||||||
Index gId = static_cast<Index>(itemID.get_global_linear_id());
|
Index gId = static_cast<Index>(itemID.get_global_linear_id());
|
||||||
Index total_threads = itemID.get_global_range(0);
|
Index total_threads = itemID.get_global_range(0);
|
||||||
|
|
||||||
@ -703,7 +703,7 @@ struct ExecExprFunctorKernel {
|
|||||||
}
|
}
|
||||||
template <bool is_vec = Evaluator::PacketAccess>
|
template <bool is_vec = Evaluator::PacketAccess>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<is_vec>
|
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<is_vec>
|
||||||
compute(const cl::sycl::nd_item<1>& itemID) {
|
compute(const cl::sycl::nd_item<1>& itemID) const {
|
||||||
const Index vectorizedRange =
|
const Index vectorizedRange =
|
||||||
(range / Evaluator::PacketSize) * Evaluator::PacketSize;
|
(range / Evaluator::PacketSize) * Evaluator::PacketSize;
|
||||||
Index gId = static_cast<Index>(itemID.get_global_linear_id());
|
Index gId = static_cast<Index>(itemID.get_global_linear_id());
|
||||||
|
@ -202,12 +202,12 @@ template<typename ArgType, typename Device>
|
|||||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
|
||||||
{
|
{
|
||||||
return this->m_impl.coeffRef(index);
|
return this->m_impl.coeffRef(index);
|
||||||
}
|
}
|
||||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
void writePacket(Index index, const PacketReturnType& x)
|
void writePacket(Index index, const PacketReturnType& x) const
|
||||||
{
|
{
|
||||||
this->m_impl.template writePacket<StoreMode>(index, x);
|
this->m_impl.template writePacket<StoreMode>(index, x);
|
||||||
}
|
}
|
||||||
|
@ -267,13 +267,13 @@ template<typename NewDimensions, typename ArgType, typename Device>
|
|||||||
TensorBlockDesc;
|
TensorBlockDesc;
|
||||||
//===--------------------------------------------------------------------===//
|
//===--------------------------------------------------------------------===//
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
|
||||||
{
|
{
|
||||||
return this->m_impl.coeffRef(index);
|
return this->m_impl.coeffRef(index);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
void writePacket(Index index, const PacketReturnType& x)
|
void writePacket(Index index, const PacketReturnType& x) const
|
||||||
{
|
{
|
||||||
this->m_impl.template writePacket<StoreMode>(index, x);
|
this->m_impl.template writePacket<StoreMode>(index, x);
|
||||||
}
|
}
|
||||||
@ -733,7 +733,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
|
|||||||
: Base(op, device)
|
: Base(op, device)
|
||||||
{ }
|
{ }
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
|
||||||
{
|
{
|
||||||
if (this->m_is_identity) {
|
if (this->m_is_identity) {
|
||||||
return this->m_impl.coeffRef(index);
|
return this->m_impl.coeffRef(index);
|
||||||
@ -743,7 +743,7 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
void writePacket(Index index, const PacketReturnType& x)
|
void writePacket(Index index, const PacketReturnType& x) const
|
||||||
{
|
{
|
||||||
if (this->m_is_identity) {
|
if (this->m_is_identity) {
|
||||||
this->m_impl.template writePacket<StoreMode>(index, x);
|
this->m_impl.template writePacket<StoreMode>(index, x);
|
||||||
@ -1085,7 +1085,7 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
|
|||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
typedef Strides Dimensions;
|
typedef Strides Dimensions;
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
|
||||||
{
|
{
|
||||||
if (this->m_is_identity) {
|
if (this->m_is_identity) {
|
||||||
return this->m_impl.coeffRef(index);
|
return this->m_impl.coeffRef(index);
|
||||||
|
@ -895,7 +895,7 @@ static constexpr bool RunningOnGPU = false;
|
|||||||
// binding placeholder accessors to a command group handler for SYCL
|
// binding placeholder accessors to a command group handler for SYCL
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
|
||||||
m_impl.bind(cgh);
|
m_impl.bind(cgh);
|
||||||
m_result.bind(cgh);
|
if(m_result) m_result.bind(cgh);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -87,7 +87,7 @@ struct SecondStepFullReducer {
|
|||||||
SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
|
SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
|
||||||
: scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
|
: scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
|
||||||
|
|
||||||
void operator()(cl::sycl::nd_item<1> itemID) {
|
void operator()(cl::sycl::nd_item<1> itemID) const {
|
||||||
// Our empirical research shows that the best performance will be achieved
|
// Our empirical research shows that the best performance will be achieved
|
||||||
// when there is only one element per thread to reduce in the second step.
|
// when there is only one element per thread to reduce in the second step.
|
||||||
// in this step the second step reduction time is almost negligible.
|
// in this step the second step reduction time is almost negligible.
|
||||||
@ -141,11 +141,11 @@ class FullReductionKernelFunctor {
|
|||||||
Index rng_, OpType op_)
|
Index rng_, OpType op_)
|
||||||
: scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
|
: scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
|
||||||
|
|
||||||
void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
|
void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); }
|
||||||
|
|
||||||
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
|
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
|
||||||
const cl::sycl::nd_item<1> &itemID) {
|
const cl::sycl::nd_item<1> &itemID) const {
|
||||||
auto output_ptr = final_output.get_pointer();
|
auto output_ptr = final_output.get_pointer();
|
||||||
Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
|
Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
|
||||||
Index globalid = itemID.get_global_id(0);
|
Index globalid = itemID.get_global_id(0);
|
||||||
@ -184,7 +184,7 @@ class FullReductionKernelFunctor {
|
|||||||
|
|
||||||
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
|
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
|
||||||
const cl::sycl::nd_item<1> &itemID) {
|
const cl::sycl::nd_item<1> &itemID) const {
|
||||||
auto output_ptr = final_output.get_pointer();
|
auto output_ptr = final_output.get_pointer();
|
||||||
Index globalid = itemID.get_global_id(0);
|
Index globalid = itemID.get_global_id(0);
|
||||||
Index localid = itemID.get_local_id(0);
|
Index localid = itemID.get_local_id(0);
|
||||||
@ -228,14 +228,16 @@ class GenericNondeterministicReducer {
|
|||||||
range(range_),
|
range(range_),
|
||||||
num_values_to_reduce(num_values_to_reduce_) {}
|
num_values_to_reduce(num_values_to_reduce_) {}
|
||||||
|
|
||||||
void operator()(cl::sycl::nd_item<1> itemID) {
|
void operator()(cl::sycl::nd_item<1> itemID) const {
|
||||||
|
//This is to bypass the statefull condition in Eigen meanReducer
|
||||||
|
Op non_const_functor;
|
||||||
|
std::memcpy(&non_const_functor, &functor, sizeof (Op));
|
||||||
auto output_accessor_ptr = output_accessor.get_pointer();
|
auto output_accessor_ptr = output_accessor.get_pointer();
|
||||||
/// const cast added as a naive solution to solve the qualifier drop error
|
|
||||||
Index globalid = static_cast<Index>(itemID.get_global_linear_id());
|
Index globalid = static_cast<Index>(itemID.get_global_linear_id());
|
||||||
if (globalid < range) {
|
if (globalid < range) {
|
||||||
CoeffReturnType accum = functor.initialize();
|
CoeffReturnType accum = functor.initialize();
|
||||||
Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
|
Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
|
||||||
evaluator, evaluator.firstInput(globalid), functor, &accum);
|
evaluator, evaluator.firstInput(globalid), non_const_functor, &accum);
|
||||||
output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
|
output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -281,7 +283,7 @@ struct PartialReductionKernel {
|
|||||||
num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
|
num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
|
||||||
CoeffReturnType &accumulator) {
|
CoeffReturnType &accumulator) const {
|
||||||
if (globalPId >= num_coeffs_to_preserve) {
|
if (globalPId >= num_coeffs_to_preserve) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -298,7 +300,7 @@ struct PartialReductionKernel {
|
|||||||
global_offset += per_thread_global_stride;
|
global_offset += per_thread_global_stride;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
|
||||||
const Index linearLocalThreadId = itemID.get_local_id(0);
|
const Index linearLocalThreadId = itemID.get_local_id(0);
|
||||||
Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
|
Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
|
||||||
: linearLocalThreadId / PannelParameters::LocalThreadSizeR;
|
: linearLocalThreadId / PannelParameters::LocalThreadSizeR;
|
||||||
@ -380,7 +382,7 @@ struct SecondStepPartialReduction {
|
|||||||
num_coeffs_to_preserve(num_coeffs_to_preserve_),
|
num_coeffs_to_preserve(num_coeffs_to_preserve_),
|
||||||
num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
|
num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
|
||||||
const Index globalId = itemID.get_global_id(0);
|
const Index globalId = itemID.get_global_id(0);
|
||||||
|
|
||||||
if (globalId >= num_coeffs_to_preserve) return;
|
if (globalId >= num_coeffs_to_preserve) return;
|
||||||
|
@ -441,12 +441,12 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
|
|||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
const Dimensions& dimensions() const { return this->m_dimensions; }
|
const Dimensions& dimensions() const { return this->m_dimensions; }
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const {
|
||||||
return this->m_impl.coeffRef(this->reverseIndex(index));
|
return this->m_impl.coeffRef(this->reverseIndex(index));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
void writePacket(Index index, const PacketReturnType& x) {
|
void writePacket(Index index, const PacketReturnType& x) const {
|
||||||
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
|
||||||
|
|
||||||
// This code is pilfered from TensorMorphing.h
|
// This code is pilfered from TensorMorphing.h
|
||||||
|
@ -109,28 +109,28 @@ struct ScanKernelFunctor {
|
|||||||
template <scan_step sst = stp, typename Input>
|
template <scan_step sst = stp, typename Input>
|
||||||
std::enable_if_t<sst == scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC
|
std::enable_if_t<sst == scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC
|
||||||
EIGEN_STRONG_INLINE
|
EIGEN_STRONG_INLINE
|
||||||
read(const Input &inpt, Index global_id) {
|
read(const Input &inpt, Index global_id) const {
|
||||||
return inpt.coeff(global_id);
|
return inpt.coeff(global_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <scan_step sst = stp, typename Input>
|
template <scan_step sst = stp, typename Input>
|
||||||
std::enable_if_t<sst != scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC
|
std::enable_if_t<sst != scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC
|
||||||
EIGEN_STRONG_INLINE
|
EIGEN_STRONG_INLINE
|
||||||
read(const Input &inpt, Index global_id) {
|
read(const Input &inpt, Index global_id) const {
|
||||||
return inpt[global_id];
|
return inpt[global_id];
|
||||||
}
|
}
|
||||||
|
|
||||||
template <scan_step sst = stp, typename InclusiveOp>
|
template <scan_step sst = stp, typename InclusiveOp>
|
||||||
std::enable_if_t<sst == scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
std::enable_if_t<sst == scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
first_step_inclusive_Operation(InclusiveOp inclusive_op) {
|
first_step_inclusive_Operation(InclusiveOp inclusive_op) const {
|
||||||
inclusive_op();
|
inclusive_op();
|
||||||
}
|
}
|
||||||
|
|
||||||
template <scan_step sst = stp, typename InclusiveOp>
|
template <scan_step sst = stp, typename InclusiveOp>
|
||||||
std::enable_if_t<sst != scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
std::enable_if_t<sst != scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
first_step_inclusive_Operation(InclusiveOp) {}
|
first_step_inclusive_Operation(InclusiveOp) const {}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
|
||||||
auto out_ptr = out_accessor.get_pointer();
|
auto out_ptr = out_accessor.get_pointer();
|
||||||
auto tmp_ptr = temp_accessor.get_pointer();
|
auto tmp_ptr = temp_accessor.get_pointer();
|
||||||
auto scratch_ptr = scratch.get_pointer().get();
|
auto scratch_ptr = scratch.get_pointer().get();
|
||||||
@ -307,7 +307,7 @@ struct ScanAdjustmentKernelFunctor {
|
|||||||
scanParameters(scanParameters_),
|
scanParameters(scanParameters_),
|
||||||
accumulator(accumulator_) {}
|
accumulator(accumulator_) {}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
|
||||||
auto in_ptr = in_accessor.get_pointer();
|
auto in_ptr = in_accessor.get_pointer();
|
||||||
auto out_ptr = out_accessor.get_pointer();
|
auto out_ptr = out_accessor.get_pointer();
|
||||||
|
|
||||||
@ -473,7 +473,7 @@ struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
|
|||||||
typedef typename Self::CoeffReturnType CoeffReturnType;
|
typedef typename Self::CoeffReturnType CoeffReturnType;
|
||||||
typedef typename Self::Storage Storage;
|
typedef typename Self::Storage Storage;
|
||||||
typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
|
typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
|
||||||
void operator()(Self &self, EvaluatorPointerType data) {
|
void operator()(Self &self, EvaluatorPointerType data) const {
|
||||||
const Index total_size = internal::array_prod(self.dimensions());
|
const Index total_size = internal::array_prod(self.dimensions());
|
||||||
const Index scan_size = self.size();
|
const Index scan_size = self.size();
|
||||||
const Index scan_stride = self.stride();
|
const Index scan_stride = self.stride();
|
||||||
|
@ -390,13 +390,13 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
|
|||||||
: Base(op, device)
|
: Base(op, device)
|
||||||
{ }
|
{ }
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const
|
||||||
{
|
{
|
||||||
return this->m_impl.coeffRef(this->srcCoeff(index));
|
return this->m_impl.coeffRef(this->srcCoeff(index));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int StoreMode> EIGEN_STRONG_INLINE
|
template <int StoreMode> EIGEN_STRONG_INLINE
|
||||||
void writePacket(Index index, const PacketReturnType& x)
|
void writePacket(Index index, const PacketReturnType& x) const
|
||||||
{
|
{
|
||||||
EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
|
EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
|
||||||
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
|
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
|
||||||
|
@ -288,13 +288,13 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
|
|||||||
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
|
||||||
static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const
|
||||||
{
|
{
|
||||||
return this->m_impl.coeffRef(this->srcCoeff(index));
|
return this->m_impl.coeffRef(this->srcCoeff(index));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||||
void writePacket(Index index, const PacketReturnType& x)
|
void writePacket(Index index, const PacketReturnType& x) const
|
||||||
{
|
{
|
||||||
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||||
eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());
|
eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());
|
||||||
|
@ -2,9 +2,8 @@ FILE(GLOB examples_SRCS "*.cpp")
|
|||||||
|
|
||||||
set(EIGEN_SYCL ON)
|
set(EIGEN_SYCL ON)
|
||||||
list(APPEND CMAKE_EXE_LINKER_FLAGS -pthread)
|
list(APPEND CMAKE_EXE_LINKER_FLAGS -pthread)
|
||||||
if(EIGEN_SYCL_TRISYCL)
|
set(CMAKE_CXX_STANDARD 17)
|
||||||
set(CMAKE_CXX_STANDARD 17)
|
if(EIGEN_SYCL_ComputeCpp)
|
||||||
else(EIGEN_SYCL_TRISYCL)
|
|
||||||
if(MSVC)
|
if(MSVC)
|
||||||
list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
|
list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
|
||||||
else()
|
else()
|
||||||
@ -22,7 +21,7 @@ else(EIGEN_SYCL_TRISYCL)
|
|||||||
-no-serial-memop
|
-no-serial-memop
|
||||||
-Xclang
|
-Xclang
|
||||||
-cl-mad-enable)
|
-cl-mad-enable)
|
||||||
endif(EIGEN_SYCL_TRISYCL)
|
endif(EIGEN_SYCL_ComputeCpp)
|
||||||
|
|
||||||
FOREACH(example_src ${examples_SRCS})
|
FOREACH(example_src ${examples_SRCS})
|
||||||
GET_FILENAME_COMPONENT(example ${example_src} NAME_WE)
|
GET_FILENAME_COMPONENT(example ${example_src} NAME_WE)
|
||||||
|
@ -122,6 +122,7 @@ ei_add_test(special_packetmath "-DEIGEN_FAST_MATH=1")
|
|||||||
|
|
||||||
if(EIGEN_TEST_SYCL)
|
if(EIGEN_TEST_SYCL)
|
||||||
set(EIGEN_SYCL ON)
|
set(EIGEN_SYCL ON)
|
||||||
|
set(CMAKE_CXX_STANDARD 17)
|
||||||
# Forward CMake options as preprocessor definitions
|
# Forward CMake options as preprocessor definitions
|
||||||
if(EIGEN_SYCL_USE_DEFAULT_SELECTOR)
|
if(EIGEN_SYCL_USE_DEFAULT_SELECTOR)
|
||||||
add_definitions(-DEIGEN_SYCL_USE_DEFAULT_SELECTOR=${EIGEN_SYCL_USE_DEFAULT_SELECTOR})
|
add_definitions(-DEIGEN_SYCL_USE_DEFAULT_SELECTOR=${EIGEN_SYCL_USE_DEFAULT_SELECTOR})
|
||||||
@ -172,10 +173,7 @@ if(EIGEN_TEST_SYCL)
|
|||||||
add_definitions(-DEIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION=${EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION})
|
add_definitions(-DEIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION=${EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(EIGEN_SYCL_TRISYCL)
|
if(EIGEN_SYCL_ComputeCpp)
|
||||||
# triSYCL now requires c++17.
|
|
||||||
set(CMAKE_CXX_STANDARD 17)
|
|
||||||
else()
|
|
||||||
if(MSVC)
|
if(MSVC)
|
||||||
list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
|
list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
|
||||||
else()
|
else()
|
||||||
@ -193,7 +191,7 @@ if(EIGEN_TEST_SYCL)
|
|||||||
-no-serial-memop
|
-no-serial-memop
|
||||||
-Xclang
|
-Xclang
|
||||||
-cl-mad-enable)
|
-cl-mad-enable)
|
||||||
endif()
|
endif(EIGEN_SYCL_ComputeCpp)
|
||||||
|
|
||||||
ei_add_test(cxx11_tensor_sycl)
|
ei_add_test(cxx11_tensor_sycl)
|
||||||
ei_add_test(cxx11_tensor_image_op_sycl)
|
ei_add_test(cxx11_tensor_image_op_sycl)
|
||||||
@ -409,4 +407,3 @@ if (EIGEN_TEST_HIP)
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
@ -27,17 +27,64 @@ using Eigen::TensorMap;
|
|||||||
|
|
||||||
// Functions used to compare the TensorMap implementation on the device with
|
// Functions used to compare the TensorMap implementation on the device with
|
||||||
// the equivalent on the host
|
// the equivalent on the host
|
||||||
namespace cl {
|
namespace SYCL {
|
||||||
namespace sycl {
|
template <typename T> T abs(T x) {
|
||||||
template <typename T> T abs(T x) { return cl::sycl::fabs(x); }
|
return cl::sycl::abs(x);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <> float abs(float x) {
|
||||||
|
return cl::sycl::fabs(x);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <> double abs(double x) {
|
||||||
|
return cl::sycl::fabs(x);
|
||||||
|
}
|
||||||
|
|
||||||
template <typename T> T square(T x) { return x * x; }
|
template <typename T> T square(T x) { return x * x; }
|
||||||
template <typename T> T cube(T x) { return x * x * x; }
|
template <typename T> T cube(T x) { return x * x * x; }
|
||||||
template <typename T> T inverse(T x) { return T(1) / x; }
|
template <typename T> T inverse(T x) { return T(1) / x; }
|
||||||
template <typename T> T cwiseMax(T x, T y) { return cl::sycl::max(x, y); }
|
template <typename T> T cwiseMax(T x, T y) {
|
||||||
template <typename T> T cwiseMin(T x, T y) { return cl::sycl::min(x, y); }
|
return cl::sycl::max(x, y);
|
||||||
|
}
|
||||||
|
template <typename T> T cwiseMin(T x, T y) {
|
||||||
|
return cl::sycl::min(x, y);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define DECLARE_UNARY_STRUCT_NON_SYCL(FUNC) \
|
||||||
|
struct op_##FUNC { \
|
||||||
|
template <typename T> \
|
||||||
|
auto operator()(const T& x) { \
|
||||||
|
return SYCL::FUNC(x); \
|
||||||
|
} \
|
||||||
|
template <typename T> \
|
||||||
|
auto operator()(const TensorMap<T>& x) { \
|
||||||
|
return x.FUNC(); \
|
||||||
|
} \
|
||||||
|
};
|
||||||
|
|
||||||
|
DECLARE_UNARY_STRUCT_NON_SYCL(abs)
|
||||||
|
DECLARE_UNARY_STRUCT_NON_SYCL(square)
|
||||||
|
DECLARE_UNARY_STRUCT_NON_SYCL(cube)
|
||||||
|
DECLARE_UNARY_STRUCT_NON_SYCL(inverse)
|
||||||
|
|
||||||
|
#define DECLARE_BINARY_STRUCT_NON_SYCL(FUNC) \
|
||||||
|
struct op_##FUNC { \
|
||||||
|
template <typename T1, typename T2> \
|
||||||
|
auto operator()(const T1& x, const T2& y){ \
|
||||||
|
return SYCL::FUNC(x, y); \
|
||||||
|
} \
|
||||||
|
template <typename T1, typename T2> \
|
||||||
|
auto operator()(const TensorMap<T1>& x, const TensorMap<T2>& y) { \
|
||||||
|
return x.FUNC(y); \
|
||||||
|
} \
|
||||||
|
};
|
||||||
|
|
||||||
|
DECLARE_BINARY_STRUCT_NON_SYCL(cwiseMax)
|
||||||
|
DECLARE_BINARY_STRUCT_NON_SYCL(cwiseMin)
|
||||||
|
|
||||||
|
|
||||||
struct EqualAssignment {
|
struct EqualAssignment {
|
||||||
template <typename Lhs, typename Rhs>
|
template <typename Lhs, typename Rhs>
|
||||||
void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; }
|
void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; }
|
||||||
@ -119,12 +166,9 @@ void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device,
|
|||||||
} \
|
} \
|
||||||
};
|
};
|
||||||
|
|
||||||
DECLARE_UNARY_STRUCT(abs)
|
|
||||||
DECLARE_UNARY_STRUCT(sqrt)
|
DECLARE_UNARY_STRUCT(sqrt)
|
||||||
DECLARE_UNARY_STRUCT(rsqrt)
|
DECLARE_UNARY_STRUCT(rsqrt)
|
||||||
DECLARE_UNARY_STRUCT(square)
|
|
||||||
DECLARE_UNARY_STRUCT(cube)
|
|
||||||
DECLARE_UNARY_STRUCT(inverse)
|
|
||||||
DECLARE_UNARY_STRUCT(tanh)
|
DECLARE_UNARY_STRUCT(tanh)
|
||||||
DECLARE_UNARY_STRUCT(exp)
|
DECLARE_UNARY_STRUCT(exp)
|
||||||
DECLARE_UNARY_STRUCT(expm1)
|
DECLARE_UNARY_STRUCT(expm1)
|
||||||
@ -288,8 +332,6 @@ void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device,
|
|||||||
} \
|
} \
|
||||||
};
|
};
|
||||||
|
|
||||||
DECLARE_BINARY_STRUCT(cwiseMax)
|
|
||||||
DECLARE_BINARY_STRUCT(cwiseMin)
|
|
||||||
|
|
||||||
#define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR) \
|
#define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR) \
|
||||||
struct op_##NAME { \
|
struct op_##NAME { \
|
||||||
|
@ -23,6 +23,13 @@
|
|||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
|
#ifdef SYCL_COMPILER_IS_DPCPP
|
||||||
|
template <typename T>
|
||||||
|
struct cl::sycl::is_device_copyable<
|
||||||
|
const OffByOneScalar<T>,
|
||||||
|
std::enable_if_t<!std::is_trivially_copyable<const OffByOneScalar<T>>::value>> : std::true_type {};
|
||||||
|
#endif
|
||||||
|
|
||||||
template <typename DataType, int DataLayout, typename IndexType>
|
template <typename DataType, int DataLayout, typename IndexType>
|
||||||
void test_device_memory(const Eigen::SyclDevice &sycl_device) {
|
void test_device_memory(const Eigen::SyclDevice &sycl_device) {
|
||||||
IndexType sizeDim1 = 100;
|
IndexType sizeDim1 = 100;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user