diff --git a/CMakeLists.txt b/CMakeLists.txt index 8058e1ed2..8835ee928 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -431,6 +431,13 @@ else() add_subdirectory(lapack EXCLUDE_FROM_ALL) endif() +# add SYCL +option(EIGEN_TEST_SYCL "Add Sycl support." OFF) +if(EIGEN_TEST_SYCL) + set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}") + include(FindComputeCpp) +endif() + add_subdirectory(unsupported) add_subdirectory(demos EXCLUDE_FROM_ALL) diff --git a/Eigen/Core b/Eigen/Core index d4274770e..c82cddc5c 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -14,9 +14,19 @@ // first thing Eigen does: stop the compiler from committing suicide #include "src/Core/util/DisableStupidWarnings.h" -// Handle NVCC/CUDA -#ifdef __CUDACC__ - // Do not try asserts on CUDA! +/// This will no longer be needed after the next release of the computecppCE +#ifdef EIGEN_USE_SYCL +#undef min +#undef max +#undef isnan +#undef isinf +#undef isfinite +#include +#endif + +// Handle NVCC/CUDA/SYCL +#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__) + // Do not try asserts on CUDA and SYCL! #ifndef EIGEN_NO_DEBUG #define EIGEN_NO_DEBUG #endif @@ -25,7 +35,7 @@ #undef EIGEN_INTERNAL_DEBUGGING #endif - // Do not try to vectorize on CUDA! + // Do not try to vectorize on CUDA and SYCL! #ifndef EIGEN_DONT_VECTORIZE #define EIGEN_DONT_VECTORIZE #endif @@ -35,11 +45,14 @@ #endif // All functions callable from CUDA code must be qualified with __device__ - #define EIGEN_DEVICE_FUNC __host__ __device__ - - // We need math_functions.hpp to ensure that that EIGEN_USING_STD_MATH macro - // works properly on the device side - #include + #ifdef __CUDACC__ + #define EIGEN_DEVICE_FUNC __host__ __device__ + // We need math_functions.hpp to ensure that that EIGEN_USING_STD_MATH macro + // works properly on the device side + #include + #else + #define EIGEN_DEVICE_FUNC + #endif #else #define EIGEN_DEVICE_FUNC @@ -55,7 +68,7 @@ #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC; #endif -#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) +#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) #define EIGEN_EXCEPTIONS #endif diff --git a/bench/tensors/README b/bench/tensors/README index 803cb8ef8..3a5fdbe17 100644 --- a/bench/tensors/README +++ b/bench/tensors/README @@ -11,5 +11,11 @@ nvcc tensor_benchmarks_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBU We also provide a version of the generic GPU tensor benchmarks that uses half floats (aka fp16) instead of regular floats. To compile these benchmarks, simply call the command line below. You'll need a recent GPU that supports compute capability 5.3 or higher to run them and nvcc 7.5 or higher to compile the code. nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -DNDEBUG -use_fast_math -ftz=true -arch compute_53 -o benchmarks_fp16_gpu -last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call +last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. 
To compile these benchmarks, call g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu + +To compile the benchmark for SYCL, using ComputeCpp you currently need 2 passes (only for translation units containing device code): +1. The device compilation pass that generates the device code (SYCL kernels and referenced device functions) and glue code needed by the host compiler to reference the device code from host code. +{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc +2. The host compilation pass that generates the final host binary. +clang++-3.7 -include tensor_benchmarks_sycl.sycl benchmark_main.cc tensor_benchmarks_sycl.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++11 -o tensor_benchmark_sycl diff --git a/bench/tensors/tensor_benchmarks_sycl.cc b/bench/tensors/tensor_benchmarks_sycl.cc new file mode 100644 index 000000000..7eca4d966 --- /dev/null +++ b/bench/tensors/tensor_benchmarks_sycl.cc @@ -0,0 +1,37 @@ +#define EIGEN_USE_SYCL + +#include +#include + +#include "tensor_benchmarks.h" + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; +// Simple functions +template +cl::sycl::queue sycl_queue() { + return cl::sycl::queue(device_selector(), [=](cl::sycl::exception_list l) { + for (const auto& e : l) { + try { + std::rethrow_exception(e); + } catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + } + } + }); +} + +#define BM_FuncGPU(FUNC) \ + static void BM_##FUNC(int iters, int N) { \ + StopBenchmarkTiming(); \ + cl::sycl::queue q = sycl_queue(); \ + Eigen::SyclDevice device(q); \ + BenchmarkSuite suite(device, N); \ + suite.FUNC(iters); \ + } \ + BENCHMARK_RANGE(BM_##FUNC, 10, 5000); + +BM_FuncGPU(broadcasting); +BM_FuncGPU(coeffWiseOp); diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 57866d865..602ab5271 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -109,6 +109,103 @@ macro(ei_add_test_internal testname testname_with_suffix) endmacro(ei_add_test_internal) +# SYCL +macro(ei_add_test_internal_sycl testname testname_with_suffix) + include_directories( SYSTEM ${COMPUTECPP_PACKAGE_ROOT_DIR}/include) + set(targetname ${testname_with_suffix}) + + if(EIGEN_ADD_TEST_FILENAME_EXTENSION) + set(filename ${testname}.${EIGEN_ADD_TEST_FILENAME_EXTENSION}) + else() + set(filename ${testname}.cpp) + endif() + + set( include_file ${CMAKE_CURRENT_BINARY_DIR}/inc_${filename}) + set( bc_file ${CMAKE_CURRENT_BINARY_DIR}/${filename}) + set( host_file ${CMAKE_CURRENT_SOURCE_DIR}/${filename}) + + ADD_CUSTOM_COMMAND( + OUTPUT ${include_file} + COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${host_file}\\\"" > ${include_file} + COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${bc_file}.sycl\\\"" >> ${include_file} + DEPENDS ${filename} + COMMENT "Building ComputeCpp integration header file ${include_file}" + ) + # Add a custom target for the generated integration header + add_custom_target(${testname}_integration_header_woho DEPENDS ${include_file}) + + add_executable(${targetname} ${include_file}) + add_dependencies(${targetname} ${testname}_integration_header_woho) + add_sycl_to_target(${targetname} ${filename} 
${CMAKE_CURRENT_BINARY_DIR}) + + if (targetname MATCHES "^eigen2_") + add_dependencies(eigen2_buildtests ${targetname}) + else() + add_dependencies(buildtests ${targetname}) + endif() + + if(EIGEN_NO_ASSERTION_CHECKING) + ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_NO_ASSERTION_CHECKING=1") + else(EIGEN_NO_ASSERTION_CHECKING) + if(EIGEN_DEBUG_ASSERTS) + ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_DEBUG_ASSERTS=1") + endif(EIGEN_DEBUG_ASSERTS) + endif(EIGEN_NO_ASSERTION_CHECKING) + + ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}") + + ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_FUNC=${testname}") + + if(MSVC AND NOT EIGEN_SPLIT_LARGE_TESTS) + ei_add_target_property(${targetname} COMPILE_FLAGS "/bigobj") + endif() + + # let the user pass flags. + if(${ARGC} GREATER 2) + ei_add_target_property(${targetname} COMPILE_FLAGS "${ARGV2}") + endif(${ARGC} GREATER 2) + + if(EIGEN_TEST_CUSTOM_CXX_FLAGS) + ei_add_target_property(${targetname} COMPILE_FLAGS "${EIGEN_TEST_CUSTOM_CXX_FLAGS}") + endif() + + if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) + target_link_libraries(${targetname} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) + endif() + if(EXTERNAL_LIBS) + target_link_libraries(${targetname} ${EXTERNAL_LIBS}) + endif() + if(EIGEN_TEST_CUSTOM_LINKER_FLAGS) + target_link_libraries(${targetname} ${EIGEN_TEST_CUSTOM_LINKER_FLAGS}) + endif() + + if(${ARGC} GREATER 3) + set(libs_to_link ${ARGV3}) + # it could be that some cmake module provides a bad library string " " (just spaces), + # and that severely breaks target_link_libraries ("can't link to -l-lstdc++" errors). + # so we check for strings containing only spaces. + string(STRIP "${libs_to_link}" libs_to_link_stripped) + string(LENGTH "${libs_to_link_stripped}" libs_to_link_stripped_length) + if(${libs_to_link_stripped_length} GREATER 0) + # notice: no double quotes around ${libs_to_link} here. It may be a list. 
+ target_link_libraries(${targetname} ${libs_to_link}) + endif() + endif() + + add_test(${testname_with_suffix} "${targetname}") + + # Specify target and test labels according to EIGEN_CURRENT_SUBPROJECT + get_property(current_subproject GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT) + if ((current_subproject) AND (NOT (current_subproject STREQUAL ""))) + set_property(TARGET ${targetname} PROPERTY LABELS "Build${current_subproject}") + add_dependencies("Build${current_subproject}" ${targetname}) + set_property(TEST ${testname_with_suffix} PROPERTY LABELS "${current_subproject}") + endif() + + +endmacro(ei_add_test_internal_sycl) + + # Macro to add a test # # the unique mandatory parameter testname must correspond to a file @@ -185,6 +282,39 @@ macro(ei_add_test testname) endif(EIGEN_SPLIT_LARGE_TESTS AND suffixes) endmacro(ei_add_test) +macro(ei_add_test_sycl testname) + get_property(EIGEN_TESTS_LIST GLOBAL PROPERTY EIGEN_TESTS_LIST) + set(EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}${testname}\n") + set_property(GLOBAL PROPERTY EIGEN_TESTS_LIST "${EIGEN_TESTS_LIST}") + + if(EIGEN_ADD_TEST_FILENAME_EXTENSION) + set(filename ${testname}.${EIGEN_ADD_TEST_FILENAME_EXTENSION}) + else() + set(filename ${testname}.cpp) + endif() + + file(READ "${filename}" test_source) + set(parts 0) + string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+|EIGEN_SUFFIXES(;[0-9]+)+" + occurences "${test_source}") + string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurences}") + list(REMOVE_DUPLICATES suffixes) + if(EIGEN_SPLIT_LARGE_TESTS AND suffixes) + add_custom_target(${testname}) + foreach(suffix ${suffixes}) + ei_add_test_internal_sycl(${testname} ${testname}_${suffix} + "${ARGV1} -DEIGEN_TEST_PART_${suffix}=1" "${ARGV2}") + add_dependencies(${testname} ${testname}_${suffix}) + endforeach(suffix) + else(EIGEN_SPLIT_LARGE_TESTS AND suffixes) + set(symbols_to_enable_all_parts "") + foreach(suffix ${suffixes}) + set(symbols_to_enable_all_parts + "${symbols_to_enable_all_parts} -DEIGEN_TEST_PART_${suffix}=1") + endforeach(suffix) + ei_add_test_internal_sycl(${testname} ${testname} "${ARGV1} ${symbols_to_enable_all_parts}" "${ARGV2}") + endif(EIGEN_SPLIT_LARGE_TESTS AND suffixes) +endmacro(ei_add_test_sycl) # adds a failtest, i.e. a test that succeed if the program fails to compile # note that the test runner for these is CMake itself, when passed -DEIGEN_FAILTEST=ON @@ -330,6 +460,11 @@ macro(ei_testing_print_summary) message(STATUS "C++11: OFF") endif() + if(EIGEN_TEST_SYCL) + message(STATUS "SYCL: ON") + else() + message(STATUS "SYCL: OFF") + endif() if(EIGEN_TEST_CUDA) if(EIGEN_TEST_CUDA_CLANG) message(STATUS "CUDA: ON (using clang)") diff --git a/cmake/FindComputeCpp.cmake b/cmake/FindComputeCpp.cmake new file mode 100644 index 000000000..3aab5b833 --- /dev/null +++ b/cmake/FindComputeCpp.cmake @@ -0,0 +1,228 @@ +#.rst: +# FindComputeCpp +#--------------- + +######################### +# FindComputeCpp.cmake +######################### +# +# Tools for finding and building with ComputeCpp. +# + +# Require CMake version 3.2.2 or higher +cmake_minimum_required(VERSION 3.2.2) + +# Check that a supported host compiler can be found +if(CMAKE_COMPILER_IS_GNUCXX) + # Require at least gcc 4.8 + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8) + message(FATAL_ERROR + "host compiler - Not found! 
(gcc version must be at least 4.8)") + # Require the GCC dual ABI to be disabled for 5.1 or higher + elseif (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.1) + set(COMPUTECPP_DISABLE_GCC_DUAL_ABI "True") + message(STATUS + "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION} (note pre 5.1 gcc ABI enabled)") + else() + message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}") + endif() +elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + # Require at least clang 3.6 + if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.6) + message(FATAL_ERROR + "host compiler - Not found! (clang version must be at least 3.6)") + else() + set(COMPUTECPP_DISABLE_GCC_DUAL_ABI "True") + message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}") + endif() +else() + message(WARNING + "host compiler - Not found! (ComputeCpp supports GCC and Clang, see readme)") +endif() + +set(COMPUTECPP_64_BIT_DEFAULT ON) +option(COMPUTECPP_64_BIT_CODE "Compile device code in 64 bit mode" + ${COMPUTECPP_64_BIT_DEFAULT}) +mark_as_advanced(COMPUTECPP_64_BIT_CODE) + +# Find OpenCL package +find_package(OpenCL REQUIRED) + +# Find ComputeCpp package +if(EXISTS ${COMPUTECPP_PACKAGE_ROOT_DIR}) + message(STATUS "ComputeCpp package - Found (${COMPUTECPP_PACKAGE_ROOT_DIR})") +else() + message(FATAL_ERROR "ComputeCpp package - Not found! (please set COMPUTECPP_PACKAGE_ROOT_DIR) (${COMPUTECPP_PACKAGE_ROOT_DIR})") +endif() +option(COMPUTECPP_PACKAGE_ROOT_DIR "Path to the ComputeCpp Package") + +# Obtain the path to compute++ +find_program(COMPUTECPP_DEVICE_COMPILER compute++ PATHS + ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin) +if (EXISTS ${COMPUTECPP_DEVICE_COMPILER}) + mark_as_advanced(COMPUTECPP_DEVICE_COMPILER) + message(STATUS "compute++ - Found (${COMPUTECPP_PACKAGE_ROOT_DIR})") +else() + message(FATAL_ERROR "compute++ - Not found! (${COMPUTECPP_DEVICE_COMPILER}) (${COMPUTECPP_PACKAGE_ROOT_DIR})") +endif() + +# Obtain the path to computecpp_info +find_program(COMPUTECPP_INFO_TOOL computecpp_info PATHS + ${COMPUTECPP_PACKAGE_ROOT_DIR} PATH_SUFFIXES bin) +if (EXISTS ${COMPUTECPP_INFO_TOOL}) + mark_as_advanced(${COMPUTECPP_INFO_TOOL}) + message(STATUS "computecpp_info - Found (${COMPUTECPP_PACKAGE_ROOT_DIR})") +else() + message(FATAL_ERROR "computecpp_info - Not found! (${COMPUTECPP_INFO_TOOL}) (${COMPUTECPP_PACKAGE_ROOT_DIR})") +endif() + +# Obtain the path to the ComputeCpp runtime library +find_library(COMPUTECPP_RUNTIME_LIBRARY ComputeCpp PATHS ${COMPUTECPP_PACKAGE_ROOT_DIR} + HINTS ${COMPUTECPP_PACKAGE_ROOT_DIR}/lib PATH_SUFFIXES lib + DOC "ComputeCpp Runtime Library" NO_DEFAULT_PATH) + +if (EXISTS ${COMPUTECPP_RUNTIME_LIBRARY}) + mark_as_advanced(COMPUTECPP_RUNTIME_LIBRARY) + message(STATUS "libComputeCpp.so - Found") +else() + message(FATAL_ERROR "libComputeCpp.so - Not found! (${COMPUTECPP_PACKAGE_ROOT_DIR})") +endif() + +# Obtain the ComputeCpp include directory +set(COMPUTECPP_INCLUDE_DIRECTORY ${COMPUTECPP_PACKAGE_ROOT_DIR}/include/) +if (NOT EXISTS ${COMPUTECPP_INCLUDE_DIRECTORY}) + message(FATAL_ERROR "ComputeCpp includes - Not found! 
(${COMPUTECPP_PACKAGE_ROOT_DIR}/include/)") +else() + message(STATUS "ComputeCpp includes - Found (${COMPUTECPP_PACKAGE_ROOT_DIR})") +endif() + +# Obtain the package version +execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-version" + OUTPUT_VARIABLE COMPUTECPP_PACKAGE_VERSION + RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) +if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0") + message(FATAL_ERROR "Package version - Error obtaining version!") +else() + mark_as_advanced(COMPUTECPP_PACKAGE_VERSION) + message(STATUS "Package version - ${COMPUTECPP_PACKAGE_VERSION}") +endif() + +# Obtain the device compiler flags +execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-device-compiler-flags" + OUTPUT_VARIABLE COMPUTECPP_DEVICE_COMPILER_FLAGS + RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) +if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0") + message(FATAL_ERROR "compute++ flags - Error obtaining compute++ flags!") +else() + mark_as_advanced(COMPUTECPP_COMPILER_FLAGS) + message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}") +endif() + +set(COMPUTECPP_DEVICE_COMPILER_FLAGS ${COMPUTECPP_DEVICE_COMPILER_FLAGS} -sycl-compress-name -no-serial-memop -DEIGEN_NO_ASSERTION_CHECKING=1) + +# Check if the platform is supported +execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported" + OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED + RESULT_VARIABLE COMPUTECPP_INFO_TOOL_RESULT OUTPUT_STRIP_TRAILING_WHITESPACE) +if(NOT COMPUTECPP_INFO_TOOL_RESULT EQUAL "0") + message(FATAL_ERROR "platform - Error checking platform support!") +else() + mark_as_advanced(COMPUTECPP_PLATFORM_IS_SUPPORTED) + if (COMPUTECPP_PLATFORM_IS_SUPPORTED) + message(STATUS "platform - your system can support ComputeCpp") + else() + message(STATUS "platform - your system CANNOT support ComputeCpp") + endif() +endif() + +#################### +# __build_sycl +#################### +# +# Adds a custom target for running compute++ and adding a dependency for the +# resulting integration header. +# +# targetName : Name of the target. +# sourceFile : Source file to be compiled. +# binaryDir : Intermediate output directory for the integration header. +# +function(__build_spir targetName sourceFile binaryDir) + + # Retrieve source file name. + get_filename_component(sourceFileName ${sourceFile} NAME) + + # Set the path to the Sycl file. 
+ set(outputSyclFile ${binaryDir}/${sourceFileName}.sycl) + + # Add any user-defined include to the device compiler + get_property(includeDirectories DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY + INCLUDE_DIRECTORIES) + set(device_compiler_includes "") + foreach(directory ${includeDirectories}) + set(device_compiler_includes "-I${directory}" ${device_compiler_includes}) + endforeach() + if (CMAKE_INCLUDE_PATH) + foreach(directory ${CMAKE_INCLUDE_PATH}) + set(device_compiler_includes "-I${directory}" + ${device_compiler_includes}) + endforeach() + endif() + + # Convert argument list format + separate_arguments(COMPUTECPP_DEVICE_COMPILER_FLAGS) + + # Add custom command for running compute++ + add_custom_command( + OUTPUT ${outputSyclFile} + COMMAND ${COMPUTECPP_DEVICE_COMPILER} + ${COMPUTECPP_DEVICE_COMPILER_FLAGS} + -I${COMPUTECPP_INCLUDE_DIRECTORY} + ${COMPUTECPP_PLATFORM_SPECIFIC_ARGS} + ${device_compiler_includes} + -o ${outputSyclFile} + -c ${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile} + DEPENDS ${sourceFile} + COMMENT "Building ComputeCpp integration header file ${outputSyclFile}") + + # Add a custom target for the generated integration header + add_custom_target(${targetName}_integration_header DEPENDS ${outputSyclFile}) + + # Add a dependency on the integration header + add_dependencies(${targetName} ${targetName}_integration_header) + + # Force inclusion of the integration header for the host compiler + #set(compileFlags -include ${include_file} "-Wall") + target_compile_options(${targetName} PUBLIC ${compileFlags}) + + # Set the host compiler C++ standard to C++11 + set_property(TARGET ${targetName} PROPERTY CXX_STANDARD 11) + + # Disable GCC dual ABI on GCC 5.1 and higher + if(COMPUTECPP_DISABLE_GCC_DUAL_ABI) + set_property(TARGET ${targetName} APPEND PROPERTY COMPILE_DEFINITIONS + "_GLIBCXX_USE_CXX11_ABI=0") + endif() + +endfunction() + +####################### +# add_sycl_to_target +####################### +# +# Adds a SYCL compilation custom command associated with an existing +# target and sets a dependency on that new command. +# +# targetName : Name of the target to add a SYCL to. +# sourceFile : Source file to be compiled for SYCL. +# binaryDir : Intermediate output directory for the integration header. 
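+#
+# Example: ei_add_test_internal_sycl in cmake/EigenTesting.cmake invokes this as
+#   add_sycl_to_target(${targetname} ${filename} ${CMAKE_CURRENT_BINARY_DIR})
+# where ${filename} is the test's .cpp file and the binary dir receives the
+# generated ${filename}.sycl integration header.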
+# +function(add_sycl_to_target targetName sourceFile binaryDir) + + # Add custom target to run compute++ and generate the integration header + __build_spir(${targetName} ${sourceFile} ${binaryDir}) + + # Link with the ComputeCpp runtime library + target_link_libraries(${targetName} PUBLIC ${COMPUTECPP_RUNTIME_LIBRARY} + PUBLIC ${OpenCL_LIBRARIES}) + +endfunction(add_sycl_to_target) diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index 73aae3da8..388976d2e 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -69,6 +69,10 @@ typedef unsigned __int64 uint64_t; #endif #endif +#ifdef EIGEN_USE_SYCL +#include +#endif + #include "src/Tensor/TensorMacros.h" #include "src/Tensor/TensorForwardDeclarations.h" #include "src/Tensor/TensorMeta.h" @@ -77,6 +81,8 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorDeviceDefault.h" #include "src/Tensor/TensorDeviceThreadPool.h" #include "src/Tensor/TensorDeviceCuda.h" +#include "src/Tensor/TensorSycl.h" +#include "src/Tensor/TensorDeviceSycl.h" #include "src/Tensor/TensorIndexList.h" #include "src/Tensor/TensorDimensionList.h" #include "src/Tensor/TensorDimensions.h" diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index cb615c75b..166be200c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -163,6 +163,11 @@ struct TensorEvaluator, Device> TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); } + /// required by sycl in order to extract the accessor + const TensorEvaluator& left_impl() const { return m_leftImpl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& right_impl() const { return m_rightImpl; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); } private: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 7a45a5cf4..3c8710255 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -817,7 +817,7 @@ class TensorBase protected: template friend class Tensor; - template friend class TensorFixedSize; + template class MakePointer_> friend class TensorFixedSize; template friend class TensorBase; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } @@ -833,7 +833,7 @@ class TensorBase : public TensorBase { static const int NumDimensions = DerivedTraits::NumDimensions; template friend class Tensor; - template friend class TensorFixedSize; + template class MakePointer_> friend class TensorFixedSize; template friend class TensorBase; EIGEN_DEVICE_FUNC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 5d67f69f3..4cfe300eb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -113,7 +113,7 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) + : m_broadcast(op.broadcast()),m_impl(op.expression(), device) { // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar // and store the result in a scalar. 
Instead one should reshape the scalar into a a N-D @@ -374,7 +374,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + const TensorEvaluator& impl() const { return m_impl; } + + Broadcast functor() const { return m_broadcast; } + protected: + const Broadcast m_broadcast; Dimensions m_dimensions; array m_outputStrides; array m_inputStrides; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h new file mode 100644 index 000000000..bfd36f5aa --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -0,0 +1,122 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Cummins Chris PhD student at The University of Edinburgh. +// Contact: + +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H + +namespace Eigen { +/// \struct BufferT is used to specialise add_sycl_buffer function for +// two types of buffer we have. When the MapAllocator is true, we create the +// sycl buffer with MapAllocator. +/// We have to const_cast the input pointer in order to work around the fact +/// that sycl does not accept map allocator for const pointer. +template +struct BufferT { + using Type = cl::sycl::buffer>; + static inline void add_sycl_buffer( + const T *ptr, size_t num_bytes, + std::map> &buffer_map) { + buffer_map.insert(std::pair>( + ptr, std::shared_ptr(std::make_shared( + Type(const_cast(ptr), cl::sycl::range<1>(num_bytes)))))); + } +}; + +/// specialisation of the \ref BufferT when the MapAllocator is false. In this +/// case we only create the device-only buffer. +template +struct BufferT { + using Type = cl::sycl::buffer; + static inline void add_sycl_buffer( + const T *ptr, size_t num_bytes, + std::map> &buffer_map) { + buffer_map.insert(std::pair>( + ptr, std::shared_ptr( + std::make_shared(Type(cl::sycl::range<1>(num_bytes)))))); + } +}; + +struct SyclDevice { + /// class members + /// sycl queue + cl::sycl::queue &m_queue; + /// std::map is the container used to make sure that we create only one buffer + /// per pointer. The lifespan of the buffer + /// now depends on the lifespan of SyclDevice. If a non-read-only pointer is + /// needed to be accessed on the host we should manually deallocate it. + mutable std::map> buffer_map; + + SyclDevice(cl::sycl::queue &q) : m_queue(q) {} + // destructor + ~SyclDevice() { deallocate_all(); } + + template + void deallocate(const T *p) const { + auto it = buffer_map.find(p); + if (it != buffer_map.end()) { + buffer_map.erase(it); + } + } + void deallocate_all() const { buffer_map.clear(); } + + /// creation of sycl accessor for a buffer. This function first tries to find + /// the buffer in the buffer_map. + /// If found it gets the accessor from it, if not, the function then adds an + /// entry by creating a sycl buffer + /// for that particular pointer. 
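+ /// The buffer is cached in buffer_map, so later calls with the same pointer
+ /// reuse one cl::sycl::buffer for the lifetime of this SyclDevice (or until
+ /// deallocate(ptr) removes it). The returned accessor is what the SYCL
+ /// command group uses, via the handler cgh, to reach the data inside a kernel.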
+ template + inline cl::sycl::accessor + get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, + const T *ptr) const { + auto it = buffer_map.find(ptr); + if (it == buffer_map.end()) { + BufferT::add_sycl_buffer(ptr, num_bytes, buffer_map); + } + return ( + ((typename BufferT::Type *)(buffer_map.at(ptr).get())) + ->template get_access(cgh)); + } + + /// allocating memory on the cpu + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + + // some runtime conditions that can be applied here + bool isDeviceSuitable() const { return true; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void *buffer) const { + internal::aligned_free(buffer); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, + size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice( + void *dst, const void *src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost( + void *dst, const void *src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, + size_t n) const { + ::memset(buffer, c, n); + } +}; +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index a08dfa7c3..9d5cf1a8c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -20,8 +20,8 @@ namespace Eigen { * */ namespace internal { -template -struct traits > +template class MakePointer_> +struct traits > { // Type promotion to handle the case where the types of the lhs and the rhs are different. 
typedef typename XprType::Scalar Scalar; @@ -36,16 +36,20 @@ struct traits > enum { Flags = 0 }; + template + struct MakePointer { + typedef typename MakePointer_::Type Type; + }; }; -template -struct eval, Eigen::Dense> +template class MakePointer_> +struct eval, Eigen::Dense> { typedef const TensorEvalToOp& type; }; -template -struct nested, 1, typename eval >::type> +template class MakePointer_> +struct nested, 1, typename eval >::type> { typedef TensorEvalToOp type; }; @@ -55,37 +59,38 @@ struct nested, 1, typename eval -template -class TensorEvalToOp : public TensorBase, ReadOnlyAccessors> +template class MakePointer_> +class TensorEvalToOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename Eigen::NumTraits::Real RealScalar; typedef typename internal::remove_const::type CoeffReturnType; + typedef typename MakePointer_::Type PointerType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(CoeffReturnType* buffer, const XprType& expr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr) : m_xpr(expr), m_buffer(buffer) {} EIGEN_DEVICE_FUNC const typename internal::remove_all::type& expression() const { return m_xpr; } - EIGEN_DEVICE_FUNC CoeffReturnType* buffer() const { return m_buffer; } + EIGEN_DEVICE_FUNC PointerType buffer() const { return m_buffer; } protected: typename XprType::Nested m_xpr; - CoeffReturnType* m_buffer; + PointerType m_buffer; }; -template -struct TensorEvaluator, Device> +template class MakePointer_> +struct TensorEvaluator, Device> { - typedef TensorEvalToOp XprType; + typedef TensorEvalToOp XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator::Dimensions Dimensions; typedef typename XprType::Index Index; @@ -102,15 +107,22 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_device(device), m_buffer(op.buffer()) + : m_impl(op.expression(), device), m_device(device), + m_buffer(op.buffer()), m_op(op), m_expression(op.expression()) { } + // Used for accessor extraction in SYCL Managed TensorMap: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const { + return m_op; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { } + typedef typename internal::traits >::template MakePointer::Type DevicePointer; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* scalar) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(DevicePointer scalar) { EIGEN_UNUSED_VARIABLE(scalar); eigen_assert(scalar == NULL); return m_impl.evalSubExprsIfNeeded(m_buffer); @@ -145,12 +157,20 @@ struct TensorEvaluator, Device> TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_buffer; } + EIGEN_DEVICE_FUNC DevicePointer data() const { return m_buffer; } + ArgType expression() const { return m_expression; } + + /// required by sycl in order to extract the accessor + const TensorEvaluator& impl() const { return m_impl; } + /// added for sycl in order to construct the buffer from the sycl device + const Device& device() const{return 
m_device;} private: TensorEvaluator m_impl; const Device& m_device; - CoeffReturnType* m_buffer; + DevicePointer m_buffer; + const XprType& m_op; + const ArgType m_expression; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index f0383c6bf..834ce07df 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -46,9 +46,11 @@ struct TensorEvaluator }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(const_cast(m.data())), m_dims(m.dimensions()), m_device(device) + : m_data(const_cast::template MakePointer::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m) { } + // Used for accessor extraction in SYCL Managed TensorMap: + const Derived& derived() const { return m_impl; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) { @@ -106,12 +108,16 @@ struct TensorEvaluator internal::unpacket_traits::size); } - EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } + EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } + + /// required by sycl in order to construct sycl buffer from raw pointer + const Device& device() const{return m_device;} protected: - Scalar* m_data; + typename internal::traits::template MakePointer::Type m_data; Dimensions m_dims; const Device& m_device; + const Derived& m_impl; }; namespace { @@ -159,8 +165,11 @@ struct TensorEvaluator RawAccess = true }; + // Used for accessor extraction in SYCL Managed TensorMap: + const Derived& derived() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(m.data()), m_dims(m.dimensions()), m_device(device) + : m_data(m.data()), m_dims(m.dimensions()), m_device(device), m_impl(m) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } @@ -198,12 +207,16 @@ struct TensorEvaluator internal::unpacket_traits::size); } - EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; } + EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } + + /// added for sycl in order to construct the buffer from the sycl device + const Device& device() const{return m_device;} protected: - const Scalar* m_data; + typename internal::traits::template MakePointer::Type m_data; Dimensions m_dims; const Device& m_device; + const Derived& m_impl; }; @@ -260,6 +273,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& impl() const { return m_argImpl; } + /// required by sycl in order to extract the accessor + NullaryOp functor() const { return m_functor; } + + private: const NullaryOp m_functor; TensorEvaluator m_argImpl; @@ -324,6 +343,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + /// required by sycl in order to extract the accessor + const TensorEvaluator & impl() const { return m_argImpl; } + /// added for sycl in order to construct the buffer from sycl device + UnaryOp functor() const { return m_functor; } + + private: const UnaryOp m_functor; TensorEvaluator m_argImpl; @@ -397,6 +422,12 @@ struct 
TensorEvaluator& left_impl() const { return m_leftImpl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& right_impl() const { return m_rightImpl; } + /// required by sycl in order to extract the accessor + BinaryOp functor() const { return m_functor; } private: const BinaryOp m_functor; @@ -492,6 +523,13 @@ struct TensorEvaluator & arg1Impl() const { return m_arg1Impl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& arg2Impl() const { return m_arg2Impl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& arg3Impl() const { return m_arg3Impl; } + private: const TernaryOp m_functor; TensorEvaluator m_arg1Impl; @@ -576,6 +614,12 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; } + /// required by sycl in order to extract the accessor + const TensorEvaluator & cond_impl() const { return m_condImpl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& then_impl() const { return m_thenImpl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& else_impl() const { return m_elseImpl; } private: TensorEvaluator m_condImpl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 0cac7b179..f01d77c0a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -267,6 +267,20 @@ inline void TensorExecutor::run( #endif // __CUDACC__ #endif // EIGEN_USE_GPU +// SYCL Executor policy +#ifdef EIGEN_USE_SYCL + +template +class TensorExecutor { +public: + static inline void run(const Expression &expr, const SyclDevice &device) { + // call TensorSYCL module + TensorSycl::run(expr, device); + } +}; + +#endif + } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index fcee5f60d..415e459b9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -23,8 +23,8 @@ namespace Eigen { * Eigen::TensorFixedSize> t; */ -template -class TensorFixedSize : public TensorBase > +template class MakePointer_> +class TensorFixedSize : public TensorBase > { public: typedef TensorFixedSize Self; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index c23ecdbc4..9cf4a07e5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -19,9 +19,15 @@ namespace Eigen { * * */ +/// template class MakePointer_ is added to convert the host pointer to the device pointer. +/// It is added due to the fact that for our device compiler T* is not allowed. +/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer T. +/// This is done through our MakePointer_ class. By default the Type in the MakePointer_ is T* . +/// Therefore, by adding the default value, we managed to convert the type and it does not break any +/// existing code as its default value is T*. namespace internal { -template -struct traits > +template class MakePointer_> +struct traits > { // Type promotion to handle the case where the types of the lhs and the rhs are different. 
typedef typename XprType::Scalar Scalar; @@ -36,26 +42,30 @@ struct traits > enum { Flags = 0 }; + template + struct MakePointer { + typedef typename MakePointer_::Type Type; + }; }; -template -struct eval, Eigen::Dense> +template class MakePointer_> +struct eval, Eigen::Dense> { - typedef const TensorForcedEvalOp& type; + typedef const TensorForcedEvalOp& type; }; -template -struct nested, 1, typename eval >::type> +template class MakePointer_> +struct nested, 1, typename eval >::type> { - typedef TensorForcedEvalOp type; + typedef TensorForcedEvalOp type; }; } // end namespace internal -template -class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> +template class MakePointer_> +class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -77,10 +87,10 @@ class TensorForcedEvalOp : public TensorBase, ReadOn }; -template -struct TensorEvaluator, Device> +template class MakePointer_> +struct TensorEvaluator, Device> { - typedef TensorForcedEvalOp XprType; + typedef TensorForcedEvalOp XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator::Dimensions Dimensions; typedef typename XprType::Index Index; @@ -96,6 +106,7 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + /// op_ is used for sycl : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) { } @@ -110,10 +121,10 @@ struct TensorEvaluator, Device> new(m_buffer+i) CoeffReturnType(); } } - typedef TensorEvalToOp EvalTo; + typedef TensorEvalToOp< const typename internal::remove_const::type > EvalTo; EvalTo evalToTmp(m_buffer, m_op); const bool PacketAccess = internal::IsVectorizable::value; - internal::TensorExecutor::run(evalToTmp, m_device); + internal::TensorExecutor::type, PacketAccess>::run(evalToTmp, m_device); return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { @@ -136,13 +147,17 @@ struct TensorEvaluator, Device> return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC Scalar* data() const { return m_buffer; } + EIGEN_DEVICE_FUNC typename MakePointer::Type data() const { return m_buffer; } + /// required by sycl in order to extract the sycl accessor + const TensorEvaluator& impl() { return m_impl; } + /// used by sycl in order to build the sycl buffer + const Device& device() const{return m_device;} private: TensorEvaluator m_impl; const ArgType m_op; const Device& m_device; - CoeffReturnType* m_buffer; + typename MakePointer::Type m_buffer; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 490ddd8bd..83c690133 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -12,9 +12,19 @@ namespace Eigen { +// MakePointer class is used as a container of the adress space of the pointer +// on the host and on the device. From the host side it generates the T* pointer +// and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to +// T* m_data on the host. It is always called on the device. +// Specialisation of MakePointer class for creating the sycl buffer with +// map_allocator. 
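+// By default MakePointer<T>::Type is simply T*, so existing host-side code is
+// unchanged. For the device-side expression tree the SYCL machinery substitutes
+// MakeGlobalPointer (see TensorSycl.h and TensorSyclConvertToDeviceExpression.h),
+// which maps T to cl::sycl::global_ptr<T>::pointer_t obtained from a buffer accessor.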
+template struct MakePointer{ + typedef T* Type; +}; + +template class MakePointer_ = MakePointer> class TensorMap; template class Tensor; -template class TensorFixedSize; -template class TensorMap; +template class MakePointer_ = MakePointer> class TensorFixedSize; template class TensorRef; template class TensorBase; @@ -52,8 +62,8 @@ template class TensorScanOp; template class TensorCustomUnaryOp; template class TensorCustomBinaryOp; -template class TensorEvalToOp; -template class TensorForcedEvalOp; +template class MakePointer_ = MakePointer> class TensorEvalToOp; +template class MakePointer_ = MakePointer> class TensorForcedEvalOp; template class TensorDevice; template struct TensorEvaluator; @@ -61,6 +71,7 @@ template struct TensorEvaluator; struct DefaultDevice; struct ThreadPoolDevice; struct GpuDevice; +struct SyclDevice; enum FFTResultType { RealPart = 0, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 6fb4f4a31..a8e55757e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -18,11 +18,16 @@ namespace Eigen { * \brief A tensor expression mapping an existing array of data. * */ - -template class TensorMap : public TensorBase > +/// template class MakePointer_ is added to convert the host pointer to the device pointer. +/// It is added due to the fact that for our device compiler T* is not allowed. +/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer T. +/// This is done through our MakePointer_ class. By default the Type in the MakePointer_ is T* . +/// Therefore, by adding the default value, we managed to convert the type and it does not break any +/// existing code as its default value is T*. +template class MakePointer_> class TensorMap : public TensorBase > { public: - typedef TensorMap Self; + typedef TensorMap Self; typedef typename PlainObjectType::Base Base; typedef typename Eigen::internal::nested::type Nested; typedef typename internal::traits::StorageKind StorageKind; @@ -36,7 +41,7 @@ template class TensorMap : public Tensor Scalar *, const Scalar *>::type PointerType;*/ - typedef Scalar* PointerType; + typedef typename MakePointer_::Type PointerType; typedef PointerType PointerArgType; static const int Options = Options_; @@ -109,9 +114,9 @@ template class TensorMap : public Tensor EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar* data() { return m_data; } + EIGEN_STRONG_INLINE PointerType data() { return m_data; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar* data() const { return m_data; } + EIGEN_STRONG_INLINE const PointerType data() const { return m_data; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const @@ -307,7 +312,7 @@ template class TensorMap : public Tensor } private: - Scalar* m_data; + typename MakePointer_::Type m_data; Dimensions m_dimensions; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index a87777b22..d34ff98b0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -423,15 +423,15 @@ struct TensorEvaluator, Device> // Precompute output strides. 
if (NumOutputDims > 0) { if (static_cast(Layout) == static_cast(ColMajor)) { - m_outputStrides[0] = 1; - for (int i = 1; i < NumOutputDims; ++i) { - m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; - } + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } } else { - m_outputStrides.back() = 1; - for (int i = NumOutputDims - 2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; - } + m_outputStrides.back() = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } } } @@ -439,27 +439,27 @@ struct TensorEvaluator, Device> if (NumInputDims > 0) { array input_strides; if (static_cast(Layout) == static_cast(ColMajor)) { - input_strides[0] = 1; - for (int i = 1; i < NumInputDims; ++i) { - input_strides[i] = input_strides[i-1] * input_dims[i-1]; - } + input_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_strides[i] = input_strides[i-1] * input_dims[i-1]; + } } else { - input_strides.back() = 1; - for (int i = NumInputDims - 2; i >= 0; --i) { - input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; - } + input_strides.back() = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } } int outputIndex = 0; int reduceIndex = 0; for (int i = 0; i < NumInputDims; ++i) { - if (m_reduced[i]) { - m_reducedStrides[reduceIndex] = input_strides[i]; - ++reduceIndex; - } else { - m_preservedStrides[outputIndex] = input_strides[i]; - ++outputIndex; - } + if (m_reduced[i]) { + m_reducedStrides[reduceIndex] = input_strides[i]; + ++reduceIndex; + } else { + m_preservedStrides[outputIndex] = input_strides[i]; + ++outputIndex; + } } } @@ -578,7 +578,7 @@ struct TensorEvaluator, Device> Op reducer(m_reducer); if (ReducingInnerMostDims || RunningFullReduction) { const Index num_values_to_reduce = - (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; + (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; return internal::InnerMostDimReducer::reduce(*this, firstInput(index), num_values_to_reduce, reducer); } else { @@ -602,7 +602,7 @@ struct TensorEvaluator, Device> EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; if (ReducingInnerMostDims) { const Index num_values_to_reduce = - (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; + (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; const Index firstIndex = firstInput(index); for (Index i = 0; i < PacketSize; ++i) { Op reducer(m_reducer); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h new file mode 100644 index 000000000..da15f7942 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h @@ -0,0 +1,77 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: eigen@codeplay.com +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// General include header of SYCL target for Tensor Module +#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H +#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H + +#ifdef EIGEN_USE_SYCL + +// global pointer to set different attribute state for a class +template +struct MakeGlobalPointer { + typedef typename cl::sycl::global_ptr::pointer_t Type; +}; + +namespace Eigen { +namespace TensorSycl { +namespace internal { + +/// This struct is used for special expression nodes with no operations (for example assign and selectOP). + struct NoOP; + +template struct GetType{ + typedef const T Type; +}; +template struct GetType{ + typedef T Type; +}; + +} +} +} + +// tuple construction +#include "TensorSyclTuple.h" + +// This file contains the PlaceHolder that replaces the actual data +#include "TensorSyclPlaceHolder.h" + +#include "TensorSyclLeafCount.h" + +// The index PlaceHolder takes the actual expression and replaces the actual +// data on it with the place holder. It uses the same pre-order expression tree +// traverse as the leaf count in order to give the right access number to each +// node in the expression +#include "TensorSyclPlaceHolderExpr.h" + +// creation of an accessor tuple from a tuple of SYCL buffers +#include "TensorSyclExtractAccessor.h" + +// actual data extraction using accessors +//#include "GetDeviceData.h" + +// this is used to change the address space type in tensor map for GPU +#include "TensorSyclConvertToDeviceExpression.h" + +// this is used to extract the functors +#include "TensorSyclExtractFunctors.h" + +// this is used to create tensormap on the device +// this is used to construct the expression on the device +#include "TensorSyclExprConstructor.h" + +// kernel execution using fusion +#include "TensorSyclRun.h" + +#endif // end of EIGEN_USE_SYCL +#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h new file mode 100644 index 000000000..a94c30426 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -0,0 +1,109 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TensorSyclConvertToDeviceExpression.h + * + * \brief: + * Conversion from host pointer to device pointer + * inside leaf nodes of the expression. + * +*****************************************************************/ + +#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP +#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP + +namespace Eigen { +namespace TensorSycl { +namespace internal { + +/// \struct ConvertToDeviceExpression +/// \brief This struct is used to convert the MakePointer in the host expression +/// to the MakeGlobalPointer for the device expression. For the leafNodes +/// containing the pointer. 
This is due to the fact that the address space of +/// the pointer T* is different on the host and the device. +template +struct ConvertToDeviceExpression; + +template class NonOpCategory, bool IsConst, typename... Args> +struct NonOpConversion{ + typedef typename GetType::Type...> >::Type Type; +}; + + +template class > class NonOpCategory, bool IsConst, typename Args> +struct DeviceConvertor{ + typedef typename GetType::Type, MakeGlobalPointer> >::Type Type; +}; + +/// specialisation of the \ref ConvertToDeviceExpression struct when the node +/// type is TensorMap +#define TENSORMAPCONVERT(CVQual)\ +template class MakePointer_>\ +struct ConvertToDeviceExpression, Options2_, MakePointer_> > {\ + typedef CVQual TensorMap, Options2_, MakeGlobalPointer> Type;\ +}; + +TENSORMAPCONVERT(const) +TENSORMAPCONVERT() +#undef TENSORMAPCONVERT + +/// specialisation of the \ref ConvertToDeviceExpression struct when the node +/// type is TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, TensorBroadcastingOp +#define CATEGORYCONVERT(CVQual)\ +template class Category, typename OP, typename... subExprs>\ +struct ConvertToDeviceExpression > {\ + typedef CVQual Category::Type... > Type;\ +}; +CATEGORYCONVERT(const) +CATEGORYCONVERT() +#undef CATEGORYCONVERT + + +/// specialisation of the \ref ConvertToDeviceExpression struct when the node +/// type is TensorCwiseSelectOp +#define SELECTOPCONVERT(CVQual, Res)\ +template \ +struct ConvertToDeviceExpression >\ +: NonOpConversion {}; +SELECTOPCONVERT(const, true) +SELECTOPCONVERT(, false) +#undef SELECTOPCONVERT + +/// specialisation of the \ref ConvertToDeviceExpression struct when the node +/// type is const AssingOP +#define ASSIGNCONVERT(CVQual, Res)\ +template \ +struct ConvertToDeviceExpression >\ +: NonOpConversion{}; + +ASSIGNCONVERT(const, true) +ASSIGNCONVERT(, false) +#undef ASSIGNCONVERT + +/// specialisation of the \ref ConvertToDeviceExpression struct when the node +/// type is either TensorForcedEvalOp or TensorEvalToOp +#define KERNELBROKERCONVERT(CVQual, Res, ExprNode)\ +template \ +struct ConvertToDeviceExpression > \ +: DeviceConvertor{}; + +KERNELBROKERCONVERT(const, true, TensorForcedEvalOp) +KERNELBROKERCONVERT(, false, TensorForcedEvalOp) +KERNELBROKERCONVERT(const, true, TensorEvalToOp) +KERNELBROKERCONVERT(, false, TensorEvalToOp) +#undef KERNELBROKERCONVERT +} // namespace internal +} // namespace TensorSycl +} // namespace Eigen + +#endif // UNSUPPORTED_EIGEN_CXX1 diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h new file mode 100644 index 000000000..833d5e271 --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -0,0 +1,213 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TensorSyclExprConstructor.h + * + * \brief: + * This file re-create an expression on the SYCL device in order + * to use the original tensor evaluator. 
+ * +*****************************************************************/ + +#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP +#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP + +namespace Eigen { +namespace TensorSycl { +namespace internal { +/// this class is used by EvalToOp in order to create an lhs expression which is +/// a pointer from an accessor on device-only buffer +template +struct EvalToLHSConstructor { + PtrType expr; + EvalToLHSConstructor(const utility::tuple::Tuple &t): expr((&(*(utility::tuple::get(t).get_pointer())))) {} +}; + +/// \struct ExprConstructor is used to reconstruct the expression on the device +/// and +/// recreate the expression with MakeGlobalPointer containing the device address +/// space for the TensorMap pointers used in eval function. +/// It receives the original expression type, the functor of the node, the tuple +/// of accessors, and the device expression type to re-instantiate the +/// expression tree for the device +template +struct ExprConstructor; + +/// specialisation of the \ref ExprConstructor struct when the node type is +/// TensorMap +#define TENSORMAP(CVQual)\ +template class MakePointer_, size_t N, typename... Params>\ +struct ExprConstructor< CVQual TensorMap, Options2_, MakeGlobalPointer>,\ +CVQual Eigen::internal::PlaceHolder, Options3_, MakePointer_>, N>, Params...>{\ + typedef CVQual TensorMap, Options2_, MakeGlobalPointer> Type;\ + Type expr;\ + template \ + ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple &t)\ + : expr(Type((&(*(utility::tuple::get(t).get_pointer()))), fd.dimensions())) {}\ +}; + +TENSORMAP(const) +TENSORMAP() +#undef TENSORMAP + +#define UNARYCATEGORY(CVQual)\ +template class UnaryCategory, typename OP, typename OrigRHSExpr, typename RHSExpr, typename... Params>\ +struct ExprConstructor, CVQual UnaryCategory, Params...> {\ + typedef ExprConstructor my_type;\ + my_type rhsExpr;\ + typedef CVQual UnaryCategory Type;\ + Type expr;\ + template \ + ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ + : rhsExpr(funcD.rhsExpr, t), expr(rhsExpr.expr, funcD.func) {}\ +}; + +UNARYCATEGORY(const) +UNARYCATEGORY() +#undef UNARYCATEGORY + +/// specialisation of the \ref ExprConstructor struct when the node type is +/// TensorBinaryOp +#define BINARYCATEGORY(CVQual)\ +template class BinaryCategory, typename OP, typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr,\ +typename RHSExpr, typename... Params>\ +struct ExprConstructor, CVQual BinaryCategory, Params...> {\ + typedef ExprConstructor my_left_type;\ + typedef ExprConstructor my_right_type;\ + typedef CVQual BinaryCategory Type;\ + my_left_type lhsExpr;\ + my_right_type rhsExpr;\ + Type expr;\ + template \ + ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ + : lhsExpr(funcD.lhsExpr, t),rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr, funcD.func) {}\ +}; + +BINARYCATEGORY(const) +BINARYCATEGORY() +#undef BINARYCATEGORY + +/// specialisation of the \ref ExprConstructor struct when the node type is +/// TensorCwiseTernaryOp +#define TERNARYCATEGORY(CVQual)\ +template
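For orientation, here is a minimal end-to-end sketch of how the pieces introduced above (SyclDevice, the SYCL TensorExecutor specialisation, and the MakePointer-aware TensorMap) are intended to be driven from user code. It follows the pattern of tensor_benchmarks_sycl.cc; the SYCL header name and the read-back-via-deallocate step are assumptions based on the comments in TensorDeviceSycl.h, not something this patch spells out.

#define EIGEN_USE_SYCL

#include <iostream>
#include <CL/sycl.hpp>                     // assumed header name; ComputeCpp may expose it differently
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Queue with an async exception handler, as in tensor_benchmarks_sycl.cc.
  cl::sycl::queue q((cl::sycl::default_selector()),
                    [](cl::sycl::exception_list l) {
                      for (const auto& e : l) {
                        try { std::rethrow_exception(e); }
                        catch (const cl::sycl::exception& ex) { std::cout << ex.what() << std::endl; }
                      }
                    });
  Eigen::SyclDevice sycl_device(q);

  const int n = 1024;
  Eigen::Tensor<float, 1> a(n), b(n), out(n);
  a.setRandom();
  b.setRandom();

  // TensorMaps over host pointers; SyclDevice lazily wraps each pointer in a
  // map_allocator SYCL buffer the first time a kernel needs it.
  Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_a(a.data(), n);
  Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_b(b.data(), n);
  Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_out(out.data(), n);

  // Coefficient-wise expression; dispatches through
  // TensorExecutor<..., SyclDevice> and TensorSycl::run.
  gpu_out.device(sycl_device) = gpu_a + gpu_b;

  // Per the comment in TensorDeviceSycl.h, a written pointer must be manually
  // deallocated (dropping its buffer) before the host reads the result.
  sycl_device.deallocate(out.data());

  std::cout << out(0) << std::endl;
  return 0;
}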