Merging the CUDA and HIP implementations for the Tensor directory and the unit tests

Deven Desai 2018-06-20 16:44:58 -04:00
parent cfdabbcc8f
commit 1bb6fa99a3
22 changed files with 1208 additions and 804 deletions

View File

@ -20,7 +20,8 @@ macro(ei_add_test_internal testname testname_with_suffix)
if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu) if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu)
if(EIGEN_TEST_HIP) if(EIGEN_TEST_HIP)
hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP") hip_reset_flags()
hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP ${ARGV2}")
elseif(EIGEN_TEST_CUDA_CLANG) elseif(EIGEN_TEST_CUDA_CLANG)
set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX) set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)
if(CUDA_64_BIT_DEVICE_CODE) if(CUDA_64_BIT_DEVICE_CODE)

View File

@ -99,11 +99,7 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorCostModel.h" #include "src/Tensor/TensorCostModel.h"
#include "src/Tensor/TensorDeviceDefault.h" #include "src/Tensor/TensorDeviceDefault.h"
#include "src/Tensor/TensorDeviceThreadPool.h" #include "src/Tensor/TensorDeviceThreadPool.h"
#if defined(EIGEN_USE_HIP) #include "src/Tensor/TensorDeviceGpu.h"
#include "src/Tensor/TensorDeviceHip.h"
#else
#include "src/Tensor/TensorDeviceCuda.h"
#endif
#include "src/Tensor/TensorDeviceSycl.h" #include "src/Tensor/TensorDeviceSycl.h"
#include "src/Tensor/TensorIndexList.h" #include "src/Tensor/TensorIndexList.h"
#include "src/Tensor/TensorDimensionList.h" #include "src/Tensor/TensorDimensionList.h"
@ -120,28 +116,16 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h" #include "src/Tensor/TensorExpr.h"
#include "src/Tensor/TensorReduction.h" #include "src/Tensor/TensorReduction.h"
#if defined(EIGEN_USE_HIP) #include "src/Tensor/TensorReductionGpu.h"
#include "src/Tensor/TensorReductionHip.h"
#else
#include "src/Tensor/TensorReductionCuda.h"
#endif
#include "src/Tensor/TensorArgMax.h" #include "src/Tensor/TensorArgMax.h"
#include "src/Tensor/TensorConcatenation.h" #include "src/Tensor/TensorConcatenation.h"
#include "src/Tensor/TensorContractionMapper.h" #include "src/Tensor/TensorContractionMapper.h"
#include "src/Tensor/TensorContractionBlocking.h" #include "src/Tensor/TensorContractionBlocking.h"
#include "src/Tensor/TensorContraction.h" #include "src/Tensor/TensorContraction.h"
#include "src/Tensor/TensorContractionThreadPool.h" #include "src/Tensor/TensorContractionThreadPool.h"
#if defined(EIGEN_USE_HIP) #include "src/Tensor/TensorContractionGpu.h"
#include "src/Tensor/TensorContractionHip.h"
#else
#include "src/Tensor/TensorContractionCuda.h"
#endif
#include "src/Tensor/TensorConversion.h" #include "src/Tensor/TensorConversion.h"
#if defined(EIGEN_USE_HIP) #include "src/Tensor/TensorConvolution.h"
#include "src/Tensor/TensorConvolutionHip.h"
#else
#include "src/Tensor/TensorConvolution.h"
#endif
#include "src/Tensor/TensorFFT.h" #include "src/Tensor/TensorFFT.h"
#include "src/Tensor/TensorPatch.h" #include "src/Tensor/TensorPatch.h"
#include "src/Tensor/TensorImagePatch.h" #include "src/Tensor/TensorImagePatch.h"

View File

@ -9,10 +9,10 @@
// Public License v. 2.0. If a copy of the MPL was not distributed // Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
namespace Eigen { namespace Eigen {
@ -388,7 +388,7 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
// the sum across all big k blocks of the product of little k block of index (x, y) // the sum across all big k blocks of the product of little k block of index (x, y)
// with block of index (y, z). To compute the final output, we need to reduce // with block of index (y, z). To compute the final output, we need to reduce
// the 8 threads over y by summation. // the 8 threads over y by summation.
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
#else #else
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask) #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
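The reduction referred to in the comment above is a butterfly (XOR) pattern: each lane repeatedly adds the value held by the lane whose index differs in one bit, so after log2(8) = 3 steps every lane in a group of 8 holds the full sum. A minimal sketch of that pattern, assuming the CUDA 9+ __shfl_xor_sync intrinsic (the HIP and pre-9.0 CUDA branches above use the unsynchronized __shfl_xor form); the function and variable names here are illustrative, not taken from the Eigen kernel:

// Hedged sketch, not Eigen code: sum one float across a group of 8 consecutive lanes.
// After the loop, every lane in the group holds the same 8-way sum.
__device__ float butterfly_sum8(float val) {
  for (int mask = 4; mask > 0; mask >>= 1) {
    val += __shfl_xor_sync(0xFFFFFFFF, val, mask);  // add the partner lane's value
  }
  return val;
}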
@ -503,7 +503,11 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
template<typename Scalar, typename Index, typename LhsMapper, template<typename Scalar, typename Index, typename LhsMapper,
typename RhsMapper, typename OutputMapper> typename RhsMapper, typename OutputMapper>
__global__ void __global__ void
#if defined(EIGEN_HIPCC)
__launch_bounds__(512, 1)
#else
__launch_bounds__(512) __launch_bounds__(512)
#endif
EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
const OutputMapper output, const OutputMapper output,
const Index m_size, const Index n_size, const Index k_size) { const Index m_size, const Index n_size, const Index k_size) {
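For context on the launch-bounds change above: __launch_bounds__ takes the maximum number of threads per block as its first argument and, optionally, the minimum number of resident blocks per multiprocessor the compiler should plan for as its second; the HIP path passes an explicit 1 for that second argument. A hedged, standalone illustration of the attribute (the kernel itself is made up, not from Eigen):

// Illustrative kernel: capped at 512 threads per block, and the compiler only needs to
// guarantee one resident block per multiprocessor / compute unit.
__global__ void
__launch_bounds__(512, 1)
example_kernel(float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = 0.0f;
}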
@ -542,6 +546,44 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
results[i].x = results[i].y = results[i].z = results[i].w = 0; results[i].x = results[i].y = results[i].z = results[i].w = 0;
} }
#if defined(EIGEN_HIPCC)
#define prefetch_lhs(reg, row, col) \
if (!CHECK_LHS_BOUNDARY) { \
if (col < k_size) { \
reg.x =lhs(row + 0, col); \
reg.y =lhs(row + 1, col); \
reg.z =lhs(row + 2, col); \
reg.w =lhs(row + 3, col); \
} \
} else { \
if (col < k_size) { \
if (row + 3 < m_size) { \
reg.x =lhs(row + 0, col); \
reg.y =lhs(row + 1, col); \
reg.z =lhs(row + 2, col); \
reg.w =lhs(row + 3, col); \
} else if (row + 2 < m_size) { \
reg.x =lhs(row + 0, col); \
reg.y =lhs(row + 1, col); \
reg.z =lhs(row + 2, col); \
} else if (row + 1 < m_size) { \
reg.x =lhs(row + 0, col); \
reg.y =lhs(row + 1, col); \
} else if (row < m_size) { \
reg.x =lhs(row + 0, col); \
} \
} \
} \

#define prefetch_rhs_hipcc(reg, row, col) \
reg.x =rhs(row + 0, col); \
reg.y =rhs(row + 1, col); \
reg.z =rhs(row + 2, col); \
reg.w =rhs(row + 3, col); \

#else
#define prefetch_lhs(reg, row, col) \ #define prefetch_lhs(reg, row, col) \
if (!CHECK_LHS_BOUNDARY) { \ if (!CHECK_LHS_BOUNDARY) { \
@ -565,12 +607,19 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
} \ } \
} \ } \
#endif
Index lhs_vert = base_m+threadIdx.x*4; Index lhs_vert = base_m+threadIdx.x*4;
for (Index k = 0; k < k_size; k += 16) { for (Index k = 0; k < k_size; k += 16) {
#if defined(EIGEN_HIPCC)
lhs_pf0 = make_float4(0, 0, 0, 0);
rhs_pf0 = make_float4(0, 0, 0, 0);
#else
lhs_pf0 = internal::pset1<float4>(0); lhs_pf0 = internal::pset1<float4>(0);
rhs_pf0 = internal::pset1<float4>(0); rhs_pf0 = internal::pset1<float4>(0);
#endif
Index lhs_horiz = threadIdx.y+k; Index lhs_horiz = threadIdx.y+k;
prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
@ -581,7 +630,11 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
if (!CHECK_RHS_BOUNDARY) { if (!CHECK_RHS_BOUNDARY) {
if ((rhs_vert + 3) < k_size) { if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY // just CHECK_RHS_BOUNDARY
#if defined(EIGEN_HIPCC)
prefetch_rhs_hipcc(rhs_pf0, rhs_vert, rhs_horiz0)
#else
rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0); rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
#endif
} else if (rhs_vert + 2 < k_size) { } else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY // just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@ -596,7 +649,11 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
} else { } else {
if (rhs_horiz0 < n_size) { if (rhs_horiz0 < n_size) {
if ((rhs_vert + 3) < k_size) { if ((rhs_vert + 3) < k_size) {
#if defined(EIGEN_HIPCC)
prefetch_rhs_hipcc(rhs_pf0, rhs_vert, rhs_horiz0)
#else
rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0); rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
#endif
} else if ((rhs_vert + 2) < k_size) { } else if ((rhs_vert + 2) < k_size) {
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
@ -618,7 +675,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
x1 = rhs_pf0.x; x1 = rhs_pf0.x;
x2 = rhs_pf0.z; x2 = rhs_pf0.z;
} }
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
x1 = __shfl_xor(x1, 4); x1 = __shfl_xor(x1, 4);
x2 = __shfl_xor(x2, 4); x2 = __shfl_xor(x2, 4);
#else #else
@ -696,6 +753,10 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
#undef prefetch_lhs #undef prefetch_lhs
#undef add_vals #undef add_vals
#if defined(EIGEN_HIPCC)
#undef prefetch_rhs_hipcc
#endif
Index horiz_base = threadIdx.y*4+base_n; Index horiz_base = threadIdx.y*4+base_n;
if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
@ -784,9 +845,33 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
results[i].x = results[i].y = results[i].z = results[i].w = 0; results[i].x = results[i].y = results[i].z = results[i].w = 0;
} }
#if defined(EIGEN_HIPCC)
#define prefetch_lhs_hipcc(reg, row, col) \
reg.x =lhs(row + 0, col); \
reg.y =lhs(row + 1, col); \
reg.z =lhs(row + 2, col); \
reg.w =lhs(row + 3, col);
#define prefetch_rhs_hipcc(reg, row, col) \
reg.x =rhs(row + 0, col); \
reg.y =rhs(row + 1, col); \
reg.z =rhs(row + 2, col); \
reg.w =rhs(row + 3, col);
#endif
Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
for (Index k = 0; k < k_size; k += 32) { for (Index k = 0; k < k_size; k += 32) {
#if defined(EIGEN_HIPCC)
lhs_pf0 = make_float4(0, 0, 0, 0);
lhs_pf1 = make_float4(0, 0, 0, 0);
lhs_pf2 = make_float4(0, 0, 0, 0);
lhs_pf3 = make_float4(0, 0, 0, 0);
rhs_pf0 = make_float4(0, 0, 0, 0);
rhs_pf1 = make_float4(0, 0, 0, 0);
#else
lhs_pf0 = internal::pset1<float4>(0); lhs_pf0 = internal::pset1<float4>(0);
lhs_pf1 = internal::pset1<float4>(0); lhs_pf1 = internal::pset1<float4>(0);
lhs_pf2 = internal::pset1<float4>(0); lhs_pf2 = internal::pset1<float4>(0);
@ -794,40 +879,85 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
rhs_pf0 = internal::pset1<float4>(0); rhs_pf0 = internal::pset1<float4>(0);
rhs_pf1 = internal::pset1<float4>(0); rhs_pf1 = internal::pset1<float4>(0);
#endif
if (!CHECK_LHS_BOUNDARY) { if (!CHECK_LHS_BOUNDARY) {
if ((threadIdx.y/4+k+24) < k_size) { if ((threadIdx.y/4+k+24) < k_size) {
#if defined(EIGEN_HIPCC)
prefetch_lhs_hipcc(lhs_pf0, lhs_vert, (threadIdx.y/4+k))
prefetch_lhs_hipcc(lhs_pf1, lhs_vert, (threadIdx.y/4+k+8))
prefetch_lhs_hipcc(lhs_pf2, lhs_vert, (threadIdx.y/4+k+16))
prefetch_lhs_hipcc(lhs_pf3, lhs_vert, (threadIdx.y/4+k+24))
#else
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16)); lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24)); lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
#endif
} else if ((threadIdx.y/4+k+16) < k_size) { } else if ((threadIdx.y/4+k+16) < k_size) {
#if defined(EIGEN_HIPCC)
prefetch_lhs_hipcc(lhs_pf0, lhs_vert, (threadIdx.y/4+k))
prefetch_lhs_hipcc(lhs_pf1, lhs_vert, (threadIdx.y/4+k+8))
prefetch_lhs_hipcc(lhs_pf2, lhs_vert, (threadIdx.y/4+k+16))
#else
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16)); lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
#endif
} else if ((threadIdx.y/4+k+8) < k_size) { } else if ((threadIdx.y/4+k+8) < k_size) {
#if defined(EIGEN_HIPCC)
prefetch_lhs_hipcc(lhs_pf0, lhs_vert, (threadIdx.y/4+k))
prefetch_lhs_hipcc(lhs_pf1, lhs_vert, (threadIdx.y/4+k+8))
#else
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
#endif
} else if ((threadIdx.y/4+k) < k_size) { } else if ((threadIdx.y/4+k) < k_size) {
#if defined(EIGEN_HIPCC)
prefetch_lhs_hipcc(lhs_pf0, lhs_vert, (threadIdx.y/4+k))
#else
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
#endif
} }
} else { } else {
// just CHECK_LHS_BOUNDARY // just CHECK_LHS_BOUNDARY
if (lhs_vert + 3 < m_size) { if (lhs_vert + 3 < m_size) {
if ((threadIdx.y/4+k+24) < k_size) { if ((threadIdx.y/4+k+24) < k_size) {
#if defined(EIGEN_HIPCC)
prefetch_lhs_hipcc(lhs_pf0, lhs_vert, (threadIdx.y/4+k))
prefetch_lhs_hipcc(lhs_pf1, lhs_vert, (threadIdx.y/4+k+8))
prefetch_lhs_hipcc(lhs_pf2, lhs_vert, (threadIdx.y/4+k+16))
prefetch_lhs_hipcc(lhs_pf3, lhs_vert, (threadIdx.y/4+k+24))
#else
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16)); lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24)); lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
#endif
} else if ((threadIdx.y/4+k+16) < k_size) { } else if ((threadIdx.y/4+k+16) < k_size) {
#if defined(EIGEN_HIPCC)
prefetch_lhs_hipcc(lhs_pf0, lhs_vert, (threadIdx.y/4+k))
prefetch_lhs_hipcc(lhs_pf1, lhs_vert, (threadIdx.y/4+k+8))
prefetch_lhs_hipcc(lhs_pf2, lhs_vert, (threadIdx.y/4+k+16))
#else
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16)); lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
#endif
} else if ((threadIdx.y/4+k+8) < k_size) { } else if ((threadIdx.y/4+k+8) < k_size) {
#if defined(EIGEN_HIPCC)
prefetch_lhs_hipcc(lhs_pf0, lhs_vert, (threadIdx.y/4+k))
prefetch_lhs_hipcc(lhs_pf1, lhs_vert, (threadIdx.y/4+k+8))
#else
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
#endif
} else if ((threadIdx.y/4+k) < k_size) { } else if ((threadIdx.y/4+k) < k_size) {
#if defined(EIGEN_HIPCC)
prefetch_lhs_hipcc(lhs_pf0, lhs_vert, (threadIdx.y/4+k))
#else
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
#endif
} }
} else if (lhs_vert + 2 < m_size) { } else if (lhs_vert + 2 < m_size) {
if ((threadIdx.y/4+k+24) < k_size) { if ((threadIdx.y/4+k+24) < k_size) {
@ -916,8 +1046,13 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (!CHECK_RHS_BOUNDARY) { if (!CHECK_RHS_BOUNDARY) {
if ((rhs_vert + 3) < k_size) { if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY // just CHECK_RHS_BOUNDARY
#if defined(EIGEN_HIPCC)
prefetch_rhs_hipcc(rhs_pf0, rhs_vert, rhs_horiz0)
prefetch_rhs_hipcc(rhs_pf1, rhs_vert, rhs_horiz1)
#else
rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0); rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1); rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
#endif
} else if (rhs_vert + 2 < k_size) { } else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY // just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@ -939,8 +1074,13 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (rhs_horiz1 < n_size) { if (rhs_horiz1 < n_size) {
if ((rhs_vert + 3) < k_size) { if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY // just CHECK_RHS_BOUNDARY
#if defined(EIGEN_HIPCC)
prefetch_rhs_hipcc(rhs_pf0, rhs_vert, rhs_horiz0)
prefetch_rhs_hipcc(rhs_pf1, rhs_vert, rhs_horiz1)
#else
rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0); rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1); rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
#endif
} else if (rhs_vert + 2 < k_size) { } else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY // just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@ -961,7 +1101,11 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
} else if (rhs_horiz0 < n_size) { } else if (rhs_horiz0 < n_size) {
if ((rhs_vert + 3) < k_size) { if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY // just CHECK_RHS_BOUNDARY
#if defined(EIGEN_HIPCC)
prefetch_rhs_hipcc(rhs_pf0, rhs_vert, rhs_horiz0)
#else
rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0); rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
#endif
} else if ((rhs_vert + 2) < k_size) { } else if ((rhs_vert + 2) < k_size) {
// just CHECK_RHS_BOUNDARY // just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@ -1069,6 +1213,10 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
__syncthreads(); __syncthreads();
} // end loop over k } // end loop over k
#if defined(EIGEN_HIPCC)
#undef prefetch_lhs_hipcc
#undef prefetch_rhs_hipcc
#endif
__syncthreads(); __syncthreads();
Index horiz_base = (threadIdx.y/4)*8+base_n; Index horiz_base = (threadIdx.y/4)*8+base_n;
@ -1134,7 +1282,11 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
template<typename Index, typename LhsMapper, template<typename Index, typename LhsMapper,
typename RhsMapper, typename OutputMapper> typename RhsMapper, typename OutputMapper>
__global__ void __global__ void
#if defined(EIGEN_HIPCC)
__launch_bounds__(256, 1)
#else
__launch_bounds__(256) __launch_bounds__(256)
#endif
EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
const OutputMapper output, const OutputMapper output,
const Index m_size, const Index n_size, const Index k_size) { const Index m_size, const Index n_size, const Index k_size) {
@ -1177,7 +1329,11 @@ EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
template<typename Index, typename LhsMapper, template<typename Index, typename LhsMapper,
typename RhsMapper, typename OutputMapper> typename RhsMapper, typename OutputMapper>
__global__ void __global__ void
#if defined(EIGEN_HIPCC)
__launch_bounds__(256, 1)
#else
__launch_bounds__(256) __launch_bounds__(256)
#endif
EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
const OutputMapper output, const OutputMapper output,
const Index m_size, const Index n_size, const Index k_size) { const Index m_size, const Index n_size, const Index k_size) {
@ -1323,7 +1479,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
const Index n_blocks = (n + 63) / 64; const Index n_blocks = (n + 63) / 64;
const dim3 num_blocks(m_blocks, n_blocks, 1); const dim3 num_blocks(m_blocks, n_blocks, 1);
const dim3 block_size(8, 8, 8); const dim3 block_size(8, 8, 8);
LAUNCH_CUDA_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); LAUNCH_GPU_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
} }
}; };
@ -1334,13 +1490,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
const Index n_blocks = (n + 63) / 64; const Index n_blocks = (n + 63) / 64;
const dim3 num_blocks(m_blocks, n_blocks, 1); const dim3 num_blocks(m_blocks, n_blocks, 1);
const dim3 block_size(16, 16, 1); const dim3 block_size(16, 16, 1);
LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
} else { } else {
const Index m_blocks = (m + 127) / 128; const Index m_blocks = (m + 127) / 128;
const Index n_blocks = (n + 63) / 64; const Index n_blocks = (n + 63) / 64;
const dim3 num_blocks(m_blocks, n_blocks, 1); const dim3 num_blocks(m_blocks, n_blocks, 1);
const dim3 block_size(8, 32, 1); const dim3 block_size(8, 32, 1);
LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); LAUNCH_GPU_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
} }
} }
}; };
@ -1384,12 +1540,17 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
OutputMapper output(buffer, m); OutputMapper output(buffer, m);
setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte); #if defined(EIGEN_USE_HIP)
setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
#else
setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
#endif
LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k, this->m_device); LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k, this->m_device);
} }
}; };
} // end namespace Eigen } // end namespace Eigen
#endif // EIGEN_USE_GPU and EIGEN_CUDACC #endif // EIGEN_USE_GPU and EIGEN_GPUCC
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H #endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
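Note that the launches above now go through LAUNCH_GPU_KERNEL instead of LAUNCH_CUDA_KERNEL. The macro's definition is not part of this excerpt; purely as a hedged illustration of what such a dispatch macro could look like (the real one presumably lives alongside the shared gpu* defines), it would select between the HIP and CUDA launch mechanisms:

// Illustrative only, not the actual Eigen macro. Assumes hipLaunchKernelGGL on the HIP side
// and the <<<...>>> launch syntax on the CUDA side, followed by a launch-error check.
#if defined(EIGEN_HIPCC)
#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)                \
  hipLaunchKernelGGL(kernel, gridsize, blocksize, sharedmem, (device).stream(), __VA_ARGS__); \
  assert(hipGetLastError() == hipSuccess);
#else
#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)                \
  (kernel)<<<(gridsize), (blocksize), (sharedmem), (device).stream()>>>(__VA_ARGS__);         \
  assert(cudaGetLastError() == cudaSuccess);
#endif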

View File

@ -54,8 +54,8 @@ class IndexMapper {
} }
} }
array<Index, NumDims> cudaInputDimensions; array<Index, NumDims> gpuInputDimensions;
array<Index, NumDims> cudaOutputDimensions; array<Index, NumDims> gpuOutputDimensions;
array<Index, NumDims> tmp = dimensions; array<Index, NumDims> tmp = dimensions;
array<Index, NumDims> ordering; array<Index, NumDims> ordering;
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
@ -65,8 +65,8 @@ class IndexMapper {
const Index index = i + offset; const Index index = i + offset;
ordering[index] = indices[i]; ordering[index] = indices[i];
tmp[indices[i]] = -1; tmp[indices[i]] = -1;
cudaInputDimensions[index] = input_dims[indices[i]]; gpuInputDimensions[index] = input_dims[indices[i]];
cudaOutputDimensions[index] = dimensions[indices[i]]; gpuOutputDimensions[index] = dimensions[indices[i]];
} }
int written = static_cast<int>(Layout) == static_cast<int>(ColMajor) int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
@ -75,8 +75,8 @@ class IndexMapper {
for (int i = 0; i < NumDims; ++i) { for (int i = 0; i < NumDims; ++i) {
if (tmp[i] >= 0) { if (tmp[i] >= 0) {
ordering[written] = i; ordering[written] = i;
cudaInputDimensions[written] = input_dims[i]; gpuInputDimensions[written] = input_dims[i];
cudaOutputDimensions[written] = dimensions[i]; gpuOutputDimensions[written] = dimensions[i];
++written; ++written;
} }
} }
@ -89,37 +89,37 @@ class IndexMapper {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = 0; i < NumDims; ++i) { for (int i = 0; i < NumDims; ++i) {
if (i > NumKernelDims) { if (i > NumKernelDims) {
m_cudaInputStrides[i] = m_gpuInputStrides[i] =
m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1]; m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
m_cudaOutputStrides[i] = m_gpuOutputStrides[i] =
m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1]; m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
} else { } else {
m_cudaInputStrides[i] = 1; m_gpuInputStrides[i] = 1;
m_cudaOutputStrides[i] = 1; m_gpuOutputStrides[i] = 1;
} }
} }
} else { } else {
for (int i = NumDims - 1; i >= 0; --i) { for (int i = NumDims - 1; i >= 0; --i) {
if (static_cast<size_t>(i + 1) < offset) { if (static_cast<size_t>(i + 1) < offset) {
m_cudaInputStrides[i] = m_gpuInputStrides[i] =
m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1]; m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
m_cudaOutputStrides[i] = m_gpuOutputStrides[i] =
m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1]; m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
} else { } else {
m_cudaInputStrides[i] = 1; m_gpuInputStrides[i] = 1;
m_cudaOutputStrides[i] = 1; m_gpuOutputStrides[i] = 1;
} }
} }
} }
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
Index inputIndex = 0; Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int d = NumDims - 1; d > NumKernelDims; --d) { for (int d = NumDims - 1; d > NumKernelDims; --d) {
const Index idx = p / m_cudaInputStrides[d]; const Index idx = p / m_gpuInputStrides[d];
inputIndex += idx * m_inputStrides[d]; inputIndex += idx * m_inputStrides[d];
p -= idx * m_cudaInputStrides[d]; p -= idx * m_gpuInputStrides[d];
} }
inputIndex += p * m_inputStrides[NumKernelDims]; inputIndex += p * m_inputStrides[NumKernelDims];
} else { } else {
@ -128,22 +128,22 @@ class IndexMapper {
limit = NumDims - NumKernelDims - 1; limit = NumDims - NumKernelDims - 1;
} }
for (int d = 0; d < limit; ++d) { for (int d = 0; d < limit; ++d) {
const Index idx = p / m_cudaInputStrides[d]; const Index idx = p / m_gpuInputStrides[d];
inputIndex += idx * m_inputStrides[d]; inputIndex += idx * m_inputStrides[d];
p -= idx * m_cudaInputStrides[d]; p -= idx * m_gpuInputStrides[d];
} }
inputIndex += p * m_inputStrides[limit]; inputIndex += p * m_inputStrides[limit];
} }
return inputIndex; return inputIndex;
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
Index outputIndex = 0; Index outputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int d = NumDims - 1; d > NumKernelDims; --d) { for (int d = NumDims - 1; d > NumKernelDims; --d) {
const Index idx = p / m_cudaOutputStrides[d]; const Index idx = p / m_gpuOutputStrides[d];
outputIndex += idx * m_outputStrides[d]; outputIndex += idx * m_outputStrides[d];
p -= idx * m_cudaOutputStrides[d]; p -= idx * m_gpuOutputStrides[d];
} }
outputIndex += p * m_outputStrides[NumKernelDims]; outputIndex += p * m_outputStrides[NumKernelDims];
} else { } else {
@ -152,44 +152,44 @@ class IndexMapper {
limit = NumDims - NumKernelDims - 1; limit = NumDims - NumKernelDims - 1;
} }
for (int d = 0; d < limit; ++d) { for (int d = 0; d < limit; ++d) {
const Index idx = p / m_cudaOutputStrides[d]; const Index idx = p / m_gpuOutputStrides[d];
outputIndex += idx * m_outputStrides[d]; outputIndex += idx * m_outputStrides[d];
p -= idx * m_cudaOutputStrides[d]; p -= idx * m_gpuOutputStrides[d];
} }
outputIndex += p * m_outputStrides[limit]; outputIndex += p * m_outputStrides[limit];
} }
return outputIndex; return outputIndex;
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0 ? 0
: NumDims - NumKernelDims; : NumDims - NumKernelDims;
return i * m_inputStrides[offset]; return i * m_inputStrides[offset];
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0 ? 0
: NumDims - NumKernelDims; : NumDims - NumKernelDims;
return i * m_outputStrides[offset]; return i * m_outputStrides[offset];
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0 ? 0
: NumDims - NumKernelDims; : NumDims - NumKernelDims;
return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0 ? 0
: NumDims - NumKernelDims; : NumDims - NumKernelDims;
return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0 ? 0
: NumDims - NumKernelDims; : NumDims - NumKernelDims;
@ -197,7 +197,7 @@ class IndexMapper {
k * m_inputStrides[offset + 2]; k * m_inputStrides[offset + 2];
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0 ? 0
: NumDims - NumKernelDims; : NumDims - NumKernelDims;
@ -209,8 +209,8 @@ class IndexMapper {
static const int NumDims = internal::array_size<InputDims>::value; static const int NumDims = internal::array_size<InputDims>::value;
array<Index, NumDims> m_inputStrides; array<Index, NumDims> m_inputStrides;
array<Index, NumDims> m_outputStrides; array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_cudaInputStrides; array<Index, NumDims> m_gpuInputStrides;
array<Index, NumDims> m_cudaOutputStrides; array<Index, NumDims> m_gpuOutputStrides;
}; };
@ -553,7 +553,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
// Use an optimized implementation of the evaluation code for GPUs whenever possible. // Use an optimized implementation of the evaluation code for GPUs whenever possible.
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
template <int StaticKernelSize> template <int StaticKernelSize>
struct GetKernelSize { struct GetKernelSize {
@ -576,7 +576,11 @@ __global__ void EigenConvolutionKernel1D(
indexMapper, indexMapper,
const float* __restrict kernel, const int numPlanes, const int numX, const float* __restrict kernel, const int numPlanes, const int numX,
const int maxX, const int kernelSize, float* buffer) { const int maxX, const int kernelSize, float* buffer) {
#if defined(EIGEN_HIPCC)
HIP_DYNAMIC_SHARED(float, s)
#else
extern __shared__ float s[]; extern __shared__ float s[];
#endif
const int first_x = blockIdx.x * maxX; const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
@ -588,18 +592,18 @@ __global__ void EigenConvolutionKernel1D(
for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
// Load inputs to shared memory // Load inputs to shared memory
const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = threadIdx.y * num_x_input; const int plane_kernel_offset = threadIdx.y * num_x_input;
#pragma unroll #pragma unroll
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x); const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
s[i + plane_kernel_offset] = eval.coeff(tensor_index); s[i + plane_kernel_offset] = eval.coeff(tensor_index);
} }
__syncthreads(); __syncthreads();
// Compute the convolution // Compute the convolution
const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
#pragma unroll #pragma unroll
for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
@ -609,7 +613,7 @@ __global__ void EigenConvolutionKernel1D(
for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) { for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
result += s[k + kernel_offset] * kernel[k]; result += s[k + kernel_offset] * kernel[k];
} }
const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x); const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
buffer[tensor_index] = result; buffer[tensor_index] = result;
} }
__syncthreads(); __syncthreads();
@ -625,7 +629,11 @@ __global__ void EigenConvolutionKernel2D(
const float* __restrict kernel, const int numPlanes, const int numX, const float* __restrict kernel, const int numPlanes, const int numX,
const int maxX, const int numY, const int maxY, const int kernelSizeX, const int maxX, const int numY, const int maxY, const int kernelSizeX,
const int kernelSizeY, float* buffer) { const int kernelSizeY, float* buffer) {
#if defined(EIGEN_HIPCC)
HIP_DYNAMIC_SHARED(float, s)
#else
extern __shared__ float s[]; extern __shared__ float s[];
#endif
const int first_x = blockIdx.x * maxX; const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
@ -642,7 +650,7 @@ __global__ void EigenConvolutionKernel2D(
for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = threadIdx.z * num_y_input; const int plane_kernel_offset = threadIdx.z * num_y_input;
// Load inputs to shared memory // Load inputs to shared memory
@ -651,7 +659,7 @@ __global__ void EigenConvolutionKernel2D(
const int input_offset = num_x_input * (j + plane_kernel_offset); const int input_offset = num_x_input * (j + plane_kernel_offset);
#pragma unroll #pragma unroll
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y); const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
s[i + input_offset] = eval.coeff(tensor_index); s[i + input_offset] = eval.coeff(tensor_index);
} }
} }
@ -659,7 +667,7 @@ __global__ void EigenConvolutionKernel2D(
__syncthreads(); __syncthreads();
// Convolution // Convolution
const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
#pragma unroll #pragma unroll
for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
@ -675,7 +683,7 @@ __global__ void EigenConvolutionKernel2D(
result += s[k + input_offset] * kernel[k + kernel_offset]; result += s[k + input_offset] * kernel[k + kernel_offset];
} }
} }
const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y); const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
buffer[tensor_index] = result; buffer[tensor_index] = result;
} }
} }
@ -693,7 +701,11 @@ __global__ void EigenConvolutionKernel3D(
const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
const size_t kernelSizeZ, float* buffer) { const size_t kernelSizeZ, float* buffer) {
#if defined(EIGEN_HIPCC)
HIP_DYNAMIC_SHARED(float, s)
#else
extern __shared__ float s[]; extern __shared__ float s[];
#endif
// Load inputs to shared memory // Load inputs to shared memory
const int first_x = blockIdx.x * maxX; const int first_x = blockIdx.x * maxX;
@ -710,13 +722,13 @@ __global__ void EigenConvolutionKernel3D(
for (int p = 0; p < numPlanes; ++p) { for (int p = 0; p < numPlanes; ++p) {
const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = 0; const int plane_kernel_offset = 0;
for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
} }
} }
@ -728,7 +740,7 @@ __global__ void EigenConvolutionKernel3D(
const int num_z_output = last_z - first_z + 1; const int num_z_output = last_z - first_z + 1;
const int num_y_output = last_y - first_y + 1; const int num_y_output = last_y - first_y + 1;
const int num_x_output = last_x - first_x + 1; const int num_x_output = last_x - first_x + 1;
const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
@ -741,7 +753,7 @@ __global__ void EigenConvolutionKernel3D(
} }
} }
} }
const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
buffer[tensor_index] = result; buffer[tensor_index] = result;
} }
} }
@ -854,9 +866,9 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims; typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
const int maxSharedMem = m_device.sharedMemPerBlock(); const int maxSharedMem = m_device.sharedMemPerBlock();
const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock(); const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock; const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
const int numMultiProcessors = m_device.getNumCudaMultiProcessors(); const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
const int warpSize = 32; const int warpSize = 32;
switch (NumKernelDims) { switch (NumKernelDims) {
@ -908,15 +920,15 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
m_inputImpl.dimensions(), kernel_dims, indices); m_inputImpl.dimensions(), kernel_dims, indices);
switch(kernel_size) { switch(kernel_size) {
case 4: { case 4: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
break; break;
} }
case 7: { case 7: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
break; break;
} }
default: { default: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
} }
} }
break; break;
@ -969,11 +981,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
case 4: { case 4: {
switch (kernel_size_y) { switch (kernel_size_y) {
case 7: { case 7: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
break; break;
} }
default: { default: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
break; break;
} }
} }
@ -982,18 +994,18 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
case 7: { case 7: {
switch (kernel_size_y) { switch (kernel_size_y) {
case 4: { case 4: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
break; break;
} }
default: { default: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
break; break;
} }
} }
break; break;
} }
default: { default: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
break; break;
} }
} }
@ -1039,7 +1051,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper( internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
m_inputImpl.dimensions(), kernel_dims, indices); m_inputImpl.dimensions(), kernel_dims, indices);
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
break; break;
} }
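One pattern worth calling out from the convolution kernels above: when compiling with hipcc, the bare extern __shared__ declaration is replaced by the HIP_DYNAMIC_SHARED macro, but in both cases the buffer is dynamically sized shared memory whose size comes from the shared_mem launch parameter. A small self-contained sketch of that idiom in plain CUDA syntax (the kernel and its names are illustrative, not from Eigen):

// Hedged sketch: a kernel using dynamically sized shared memory. The buffer size is
// supplied as the third launch parameter, mirroring the shared_mem argument threaded
// through the convolution launches above.
__global__ void scale_rows(const float* in, float* out, int n) {
  extern __shared__ float tile[];  // sized at launch time, not at compile time
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) tile[threadIdx.x] = in[i];
  __syncthreads();
  if (i < n) out[i] = 2.0f * tile[threadIdx.x];
}
// Launch with block_size * sizeof(float) bytes of dynamic shared memory:
//   scale_rows<<<num_blocks, block_size, block_size * sizeof(float)>>>(in, out, n);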

View File

@ -7,21 +7,26 @@
// Public License v. 2.0. If a copy of the MPL was not distributed // Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H) #if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
// This header file contains the #defines for the gpu* macros, which will resolve to
// their equivalent hip* or cuda* versions depending on the compiler in use.
// A separate header (included at the end of this file) will undefine them all.
#include "TensorGpuHipCudaDefines.h"
namespace Eigen { namespace Eigen {
static const int kCudaScratchSize = 1024; static const int kGpuScratchSize = 1024;
// This defines an interface that GPUDevice can take to use // This defines an interface that GPUDevice can take to use
// CUDA streams underneath. // HIP / CUDA streams underneath.
class StreamInterface { class StreamInterface {
public: public:
virtual ~StreamInterface() {} virtual ~StreamInterface() {}
virtual const cudaStream_t& stream() const = 0; virtual const gpuStream_t& stream() const = 0;
virtual const cudaDeviceProp& deviceProperties() const = 0; virtual const gpuDeviceProp_t& deviceProperties() const = 0;
// Allocate memory on the actual device where the computation will run // Allocate memory on the actual device where the computation will run
virtual void* allocate(size_t num_bytes) const = 0; virtual void* allocate(size_t num_bytes) const = 0;
@ -37,7 +42,7 @@ class StreamInterface {
virtual unsigned int* semaphore() const = 0; virtual unsigned int* semaphore() const = 0;
}; };
static cudaDeviceProp* m_deviceProperties; static gpuDeviceProp_t* m_deviceProperties;
static bool m_devicePropInitialized = false; static bool m_devicePropInitialized = false;
static void initializeDeviceProp() { static void initializeDeviceProp() {
@ -58,23 +63,23 @@ static void initializeDeviceProp() {
#endif #endif
// We're the first thread to reach this point. // We're the first thread to reach this point.
int num_devices; int num_devices;
cudaError_t status = cudaGetDeviceCount(&num_devices); gpuError_t status = gpuGetDeviceCount(&num_devices);
if (status != cudaSuccess) { if (status != gpuSuccess) {
std::cerr << "Failed to get the number of CUDA devices: " std::cerr << "Failed to get the number of GPU devices: "
<< cudaGetErrorString(status) << gpuGetErrorString(status)
<< std::endl; << std::endl;
assert(status == cudaSuccess); assert(status == gpuSuccess);
} }
m_deviceProperties = new cudaDeviceProp[num_devices]; m_deviceProperties = new gpuDeviceProp_t[num_devices];
for (int i = 0; i < num_devices; ++i) { for (int i = 0; i < num_devices; ++i) {
status = cudaGetDeviceProperties(&m_deviceProperties[i], i); status = gpuGetDeviceProperties(&m_deviceProperties[i], i);
if (status != cudaSuccess) { if (status != gpuSuccess) {
std::cerr << "Failed to initialize CUDA device #" std::cerr << "Failed to initialize GPU device #"
<< i << i
<< ": " << ": "
<< cudaGetErrorString(status) << gpuGetErrorString(status)
<< std::endl; << std::endl;
assert(status == cudaSuccess); assert(status == gpuSuccess);
} }
} }
@ -94,87 +99,87 @@ static void initializeDeviceProp() {
} }
} }
static const cudaStream_t default_stream = cudaStreamDefault; static const gpuStream_t default_stream = gpuStreamDefault;
class CudaStreamDevice : public StreamInterface { class GpuStreamDevice : public StreamInterface {
public: public:
// Use the default stream on the current device // Use the default stream on the current device
CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
cudaGetDevice(&device_); gpuGetDevice(&device_);
initializeDeviceProp(); initializeDeviceProp();
} }
// Use the default stream on the specified device // Use the default stream on the specified device
CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
initializeDeviceProp(); initializeDeviceProp();
} }
// Use the specified stream. Note that it's the // Use the specified stream. Note that it's the
// caller responsibility to ensure that the stream can run on // caller responsibility to ensure that the stream can run on
// the specified device. If no device is specified the code // the specified device. If no device is specified the code
// assumes that the stream is associated to the current gpu device. // assumes that the stream is associated to the current gpu device.
CudaStreamDevice(const cudaStream_t* stream, int device = -1) GpuStreamDevice(const gpuStream_t* stream, int device = -1)
: stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
if (device < 0) { if (device < 0) {
cudaGetDevice(&device_); gpuGetDevice(&device_);
} else { } else {
int num_devices; int num_devices;
cudaError_t err = cudaGetDeviceCount(&num_devices); gpuError_t err = gpuGetDeviceCount(&num_devices);
EIGEN_UNUSED_VARIABLE(err) EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess); assert(err == gpuSuccess);
assert(device < num_devices); assert(device < num_devices);
device_ = device; device_ = device;
} }
initializeDeviceProp(); initializeDeviceProp();
} }
virtual ~CudaStreamDevice() { virtual ~GpuStreamDevice() {
if (scratch_) { if (scratch_) {
deallocate(scratch_); deallocate(scratch_);
} }
} }
const cudaStream_t& stream() const { return *stream_; } const gpuStream_t& stream() const { return *stream_; }
const cudaDeviceProp& deviceProperties() const { const gpuDeviceProp_t& deviceProperties() const {
return m_deviceProperties[device_]; return m_deviceProperties[device_];
} }
virtual void* allocate(size_t num_bytes) const { virtual void* allocate(size_t num_bytes) const {
cudaError_t err = cudaSetDevice(device_); gpuError_t err = gpuSetDevice(device_);
EIGEN_UNUSED_VARIABLE(err) EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess); assert(err == gpuSuccess);
void* result; void* result;
err = cudaMalloc(&result, num_bytes); err = gpuMalloc(&result, num_bytes);
assert(err == cudaSuccess); assert(err == gpuSuccess);
assert(result != NULL); assert(result != NULL);
return result; return result;
} }
virtual void deallocate(void* buffer) const { virtual void deallocate(void* buffer) const {
cudaError_t err = cudaSetDevice(device_); gpuError_t err = gpuSetDevice(device_);
EIGEN_UNUSED_VARIABLE(err) EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess); assert(err == gpuSuccess);
assert(buffer != NULL); assert(buffer != NULL);
err = cudaFree(buffer); err = gpuFree(buffer);
assert(err == cudaSuccess); assert(err == gpuSuccess);
} }
virtual void* scratchpad() const { virtual void* scratchpad() const {
if (scratch_ == NULL) { if (scratch_ == NULL) {
scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int)); scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
} }
return scratch_; return scratch_;
} }
virtual unsigned int* semaphore() const { virtual unsigned int* semaphore() const {
if (semaphore_ == NULL) { if (semaphore_ == NULL) {
char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize; char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
semaphore_ = reinterpret_cast<unsigned int*>(scratch); semaphore_ = reinterpret_cast<unsigned int*>(scratch);
cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
EIGEN_UNUSED_VARIABLE(err) EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess); assert(err == gpuSuccess);
} }
return semaphore_; return semaphore_;
} }
private: private:
const cudaStream_t* stream_; const gpuStream_t* stream_;
int device_; int device_;
mutable void* scratch_; mutable void* scratch_;
mutable unsigned int* semaphore_; mutable unsigned int* semaphore_;
@ -190,7 +195,7 @@ struct GpuDevice {
eigen_assert(stream); eigen_assert(stream);
} }
// TODO(bsteiner): This is an internal API, we should not expose it. // TODO(bsteiner): This is an internal API, we should not expose it.
EIGEN_STRONG_INLINE const cudaStream_t& stream() const { EIGEN_STRONG_INLINE const gpuStream_t& stream() const {
return stream_->stream(); return stream_->stream();
} }
@ -211,11 +216,11 @@ struct GpuDevice {
} }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef EIGEN_CUDA_ARCH #ifndef EIGEN_GPU_COMPILE_PHASE
cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice,
stream_->stream()); stream_->stream());
EIGEN_UNUSED_VARIABLE(err) EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess); assert(err == gpuSuccess);
#else #else
EIGEN_UNUSED_VARIABLE(dst); EIGEN_UNUSED_VARIABLE(dst);
EIGEN_UNUSED_VARIABLE(src); EIGEN_UNUSED_VARIABLE(src);
@ -225,24 +230,24 @@ struct GpuDevice {
} }
EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
cudaError_t err = gpuError_t err =
cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream()); gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
EIGEN_UNUSED_VARIABLE(err) EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess); assert(err == gpuSuccess);
} }
EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
cudaError_t err = gpuError_t err =
cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream()); gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
EIGEN_UNUSED_VARIABLE(err) EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess); assert(err == gpuSuccess);
} }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef EIGEN_CUDA_ARCH #ifndef EIGEN_GPU_COMPILE_PHASE
cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream()); gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
EIGEN_UNUSED_VARIABLE(err) EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess); assert(err == gpuSuccess);
#else #else
eigen_assert(false && "The default device should be used instead to generate kernel code"); eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif #endif
@ -260,31 +265,31 @@ struct GpuDevice {
EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
// We won't try to take advantage of the l2 cache for the time being, and // We won't try to take advantage of the l2 cache for the time being, and
// there is no l3 cache on cuda devices. // there is no l3 cache on hip/cuda devices.
return firstLevelCacheSize(); return firstLevelCacheSize();
} }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#if defined(EIGEN_CUDACC) && !defined(EIGEN_CUDA_ARCH) #if defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE)
cudaError_t err = cudaStreamSynchronize(stream_->stream()); gpuError_t err = gpuStreamSynchronize(stream_->stream());
if (err != cudaSuccess) { if (err != gpuSuccess) {
std::cerr << "Error detected in CUDA stream: " std::cerr << "Error detected in GPU stream: "
<< cudaGetErrorString(err) << gpuGetErrorString(err)
<< std::endl; << std::endl;
assert(err == cudaSuccess); assert(err == gpuSuccess);
} }
#else #else
assert(false && "The default device should be used instead to generate kernel code"); assert(false && "The default device should be used instead to generate kernel code");
#endif #endif
} }
EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const {
return stream_->deviceProperties().multiProcessorCount; return stream_->deviceProperties().multiProcessorCount;
} }
EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const {
return stream_->deviceProperties().maxThreadsPerBlock; return stream_->deviceProperties().maxThreadsPerBlock;
} }
EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
return stream_->deviceProperties().maxThreadsPerMultiProcessor; return stream_->deviceProperties().maxThreadsPerMultiProcessor;
} }
EIGEN_STRONG_INLINE int sharedMemPerBlock() const { EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
@ -301,12 +306,12 @@ struct GpuDevice {
return max_blocks_; return max_blocks_;
} }
// This function checks if the CUDA runtime recorded an error for the // This function checks if the GPU runtime recorded an error for the
// underlying stream device. // underlying stream device.
inline bool ok() const { inline bool ok() const {
#ifdef EIGEN_CUDACC #ifdef EIGEN_GPUCC
cudaError_t error = cudaStreamQuery(stream_->stream()); gpuError_t error = gpuStreamQuery(stream_->stream());
return (error == cudaSuccess) || (error == cudaErrorNotReady); return (error == gpuSuccess) || (error == gpuErrorNotReady);
#else #else
return false; return false;
#endif #endif
@ -317,18 +322,27 @@ struct GpuDevice {
int max_blocks_; int max_blocks_;
}; };
#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ #if defined(EIGEN_HIPCC)
#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
assert(hipGetLastError() == hipSuccess);
#else
#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
(kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
assert(cudaGetLastError() == cudaSuccess); assert(cudaGetLastError() == cudaSuccess);
#endif
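For reference, a call site written against this macro looks the same for both backends: the HIP branch expands to hipLaunchKernelGGL, the other branch to the usual triple-chevron launch. The sketch below is illustrative only; the kernel, the helper function and the pointer are hypothetical and not part of this commit.

// Hypothetical kernel and call site showing how LAUNCH_GPU_KERNEL is meant to be used.
template <typename T>
__global__ void FillKernel(T* data, int n, T value) {
  const int step = blockDim.x * gridDim.x;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += step) {
    data[i] = value;
  }
}

// Host-side helper (hypothetical): `device` is an Eigen::GpuDevice, `ptr` a device pointer.
void fill_on_device(const Eigen::GpuDevice& device, float* ptr, int n) {
  const int block_size = device.maxGpuThreadsPerBlock();
  const int num_blocks = (n + block_size - 1) / block_size;
  // The same source compiles with hipcc (EIGEN_HIPCC defined) and nvcc: the macro
  // picks the appropriate launch mechanism and asserts on the last runtime error.
  LAUNCH_GPU_KERNEL((FillKernel<float>), num_blocks, block_size, 0, device, ptr, n, 1.0f);
}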
// FIXME: Should be device and kernel specific. // FIXME: Should be device and kernel specific.
#ifdef EIGEN_CUDACC #ifdef EIGEN_GPUCC
static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
#ifndef EIGEN_CUDA_ARCH #ifndef EIGEN_GPU_COMPILE_PHASE
cudaError_t status = cudaDeviceSetSharedMemConfig(config); gpuError_t status = gpuDeviceSetSharedMemConfig(config);
EIGEN_UNUSED_VARIABLE(status) EIGEN_UNUSED_VARIABLE(status)
assert(status == cudaSuccess); assert(status == gpuSuccess);
#else #else
EIGEN_UNUSED_VARIABLE(config) EIGEN_UNUSED_VARIABLE(config)
#endif #endif
@ -337,4 +351,7 @@ static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig
} // end namespace Eigen } // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H // undefine all the gpu* macros we defined at the beginning of the file
#include "TensorGpuHipCudaUndefines.h"
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
View File
@ -250,28 +250,17 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) { if (needs_assign) {
#if defined(EIGEN_HIPCC)
const int block_size = device.maxHipThreadsPerBlock(); const int block_size = device.maxGpuThreadsPerBlock();
const int max_blocks = device.getNumHipMultiProcessors() * const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxHipThreadsPerMultiProcessor() / block_size; device.maxGpuThreadsPerMultiProcessor() / block_size;
const Index size = array_prod(evaluator.dimensions()); const Index size = array_prod(evaluator.dimensions());
// Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0. // Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1); const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>), LAUNCH_GPU_KERNEL(
dim3(num_blocks), dim3(block_size), 0, device.stream(), evaluator, size);
#else
const int block_size = device.maxCudaThreadsPerBlock();
const int max_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / block_size;
const Index size = array_prod(evaluator.dimensions());
// Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
LAUNCH_CUDA_KERNEL(
(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>), (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
num_blocks, block_size, 0, device, evaluator, size); num_blocks, block_size, 0, device, evaluator, size);
#endif
} }
evaluator.cleanup(); evaluator.cleanup();
} }
View File
@ -0,0 +1,86 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design
// There is code in the Tensorflow codebase that will define EIGEN_USE_GPU, but
// for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler
// When compiling such files, gcc will end up trying to pick up the CUDA headers by
// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU)
// This will obviously not work when trying to compile tensorflow on a system with no CUDA
// To work around this issue for HIP systems (and leave the default behaviour intact), the
// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and
// "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is
// defined. In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well
#if defined(EIGEN_USE_HIP)
#define gpuStream_t hipStream_t
#define gpuDeviceProp_t hipDeviceProp_t
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuErrorNotReady hipErrorNotReady
#define gpuGetDeviceCount hipGetDeviceCount
#define gpuGetErrorString hipGetErrorString
#define gpuGetDeviceProperties hipGetDeviceProperties
// FIXME : use hipStreamDefault instead of 0x00
#define gpuStreamDefault 0x00
#define gpuGetDevice hipGetDevice
#define gpuSetDevice hipSetDevice
#define gpuMalloc hipMalloc
#define gpuFree hipFree
#define gpuMemsetAsync hipMemsetAsync
#define gpuMemcpyAsync hipMemcpyAsync
#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuStreamQuery hipStreamQuery
#define gpuSharedMemConfig hipSharedMemConfig
#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig
#define gpuStreamSynchronize hipStreamSynchronize
#define gpuMemcpy hipMemcpy
#else
#define gpuStream_t cudaStream_t
#define gpuDeviceProp_t cudaDeviceProp
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuErrorNotReady cudaErrorNotReady
#define gpuGetDeviceCount cudaGetDeviceCount
#define gpuGetErrorString cudaGetErrorString
#define gpuGetDeviceProperties cudaGetDeviceProperties
#define gpuStreamDefault cudaStreamDefault
#define gpuGetDevice cudaGetDevice
#define gpuSetDevice cudaSetDevice
#define gpuMalloc cudaMalloc
#define gpuFree cudaFree
#define gpuMemsetAsync cudaMemsetAsync
#define gpuMemcpyAsync cudaMemcpyAsync
#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuStreamQuery cudaStreamQuery
#define gpuSharedMemConfig cudaSharedMemConfig
#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig
#define gpuStreamSynchronize cudaStreamSynchronize
#define gpuMemcpy cudaMemcpy
#endif
#if defined(EIGEN_HIP_DEVICE_COMPILE)
// HIPCC does not support the use of assert on the GPU side.
#undef assert
#define assert(COND)
#endif
#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
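To illustrate the intended usage pattern (a sketch, not code from this commit): a source file that wants to stay backend-agnostic defines EIGEN_USE_GPU (plus EIGEN_USE_HIP on a HIP build, typically via compile flags), includes this defines header, writes its runtime calls against the gpu* names, and includes the matching undefines header at the end of the translation unit so the short macro names do not leak. The helper functions below are hypothetical, and the path of the undefines header is assumed to mirror that of the defines header; the file must be compiled with hipcc or nvcc so the respective runtime headers are available.

// Hypothetical backend-agnostic helpers; gpu* resolves to hip*/cuda* depending
// on whether EIGEN_USE_HIP is defined when this translation unit is compiled.
#include <cassert>
#include <cstddef>
#define EIGEN_USE_GPU
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>

inline void* device_alloc(size_t num_bytes) {
  void* result = NULL;
  gpuError_t err = gpuMalloc(&result, num_bytes);
  assert(err == gpuSuccess);
  return result;
}

inline void device_free(void* buffer) {
  gpuError_t err = gpuFree(buffer);
  assert(err == gpuSuccess);
}

// Undefine the gpu* aliases at the end of the translation unit.
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h>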
View File
@ -0,0 +1,39 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
#undef gpuStream_t
#undef gpuDeviceProp_t
#undef gpuError_t
#undef gpuSuccess
#undef gpuErrorNotReady
#undef gpuGetDeviceCount
#undef gpuGetErrorString
#undef gpuGetDeviceProperties
#undef gpuStreamDefault
#undef gpuGetDevice
#undef gpuSetDevice
#undef gpuMalloc
#undef gpuFree
#undef gpuMemsetAsync
#undef gpuMemcpyAsync
#undef gpuMemcpyDeviceToDevice
#undef gpuMemcpyDeviceToHost
#undef gpuMemcpyHostToDevice
#undef gpuStreamQuery
#undef gpuSharedMemConfig
#undef gpuDeviceSetSharedMemConfig
#undef gpuStreamSynchronize
#undef gpuMemcpy
#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
View File
@ -7,23 +7,23 @@
// Public License v. 2.0. If a copy of the MPL was not distributed // Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
namespace Eigen { namespace Eigen {
namespace internal { namespace internal {
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
// Full reducers for GPU, don't vectorize for now // Full reducers for GPU, don't vectorize for now
// Reducer function that enables multiple cuda threads to safely accumulate at the same // Reducer function that enables multiple gpu threads to safely accumulate at the same
// output address. It basically reads the current value of the output variable, and // output address. It basically reads the current value of the output variable, and
// attempts to update it with the new value. If in the meantime another cuda thread // attempts to update it with the new value. If in the meantime another gpu thread
// updated the content of the output address it will try again. // updated the content of the output address it will try again.
template <typename T, typename R> template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if EIGEN_CUDA_ARCH >= 300 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
if (sizeof(T) == 4) if (sizeof(T) == 4)
{ {
unsigned int oldval = *reinterpret_cast<unsigned int*>(output); unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
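The remainder of the retry loop falls outside this hunk; what follows is a minimal sketch of the compare-and-swap pattern the comment above describes, assuming a 4-byte T and a reducer exposing reduce(t, T*). It is an illustration, not the committed implementation.

// Sketch of the CAS retry loop: read the output, fold in the local accumulator,
// and retry whenever another thread changed the output in the meantime.
template <typename T, typename R>
__device__ void atomicReduceSketch(T* output, T accum, R& reducer) {
  unsigned int* addr = reinterpret_cast<unsigned int*>(output);
  unsigned int oldval = *addr;
  for (;;) {
    T newval_as_T = *reinterpret_cast<T*>(&oldval);      // current output value
    reducer.reduce(accum, &newval_as_T);                  // fold in our contribution
    unsigned int newval = *reinterpret_cast<unsigned int*>(&newval_as_T);
    if (newval == oldval) return;                         // nothing to update
    unsigned int readback = atomicCAS(addr, oldval, newval);
    if (readback == oldval) return;                       // our update won the race
    oldval = readback;                                    // someone else won: retry
  }
}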
@ -79,7 +79,7 @@ __device__ inline double atomicExchCustom(double* address, double val) {
return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
} }
#ifdef EIGEN_HAS_CUDA_FP16 #ifdef EIGEN_HAS_GPU_FP16
template <template <typename T> class R> template <template <typename T> class R>
__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) { __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
unsigned int oldval = *reinterpret_cast<unsigned int*>(output); unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
@ -98,11 +98,11 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer
} }
} }
} }
#endif // EIGEN_HAS_CUDA_FP16 #endif // EIGEN_HAS_GPU_FP16
template <> template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) { __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if EIGEN_CUDA_ARCH >= 300 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
atomicAdd(output, accum); atomicAdd(output, accum);
#else // EIGEN_CUDA_ARCH >= 300 #else // EIGEN_CUDA_ARCH >= 300
assert(0 && "Shouldn't be called on unsupported device"); assert(0 && "Shouldn't be called on unsupported device");
@ -124,7 +124,7 @@ template <int BlockSize, int NumPerThread, typename Self,
typename Reducer, typename Index> typename Reducer, typename Index>
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
typename Self::CoeffReturnType* output, unsigned int* semaphore) { typename Self::CoeffReturnType* output, unsigned int* semaphore) {
#if EIGEN_CUDA_ARCH >= 300 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
// Initialize the output value // Initialize the output value
const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
if (gridDim.x == 1) { if (gridDim.x == 1) {
@ -168,7 +168,14 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
#pragma unroll #pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) { for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 #if defined(EIGEN_HIPCC)
// XXX use std::is_floating_point to determine the type of accum
if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
} else {
reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
}
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
reducer.reduce(__shfl_down(accum, offset, warpSize), &accum); reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
#else #else
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum); reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
@ -182,6 +189,9 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
if (gridDim.x > 1 && threadIdx.x == 0) { if (gridDim.x > 1 && threadIdx.x == 0) {
// Let the last block reset the semaphore // Let the last block reset the semaphore
atomicInc(semaphore, gridDim.x + 1); atomicInc(semaphore, gridDim.x + 1);
#if defined(EIGEN_HIPCC)
__threadfence_system();
#endif
} }
#else // EIGEN_CUDA_ARCH >= 300 #else // EIGEN_CUDA_ARCH >= 300
assert(0 && "Shouldn't be called on unsupported device"); assert(0 && "Shouldn't be called on unsupported device");
@ -189,7 +199,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
} }
#ifdef EIGEN_HAS_CUDA_FP16 #ifdef EIGEN_HAS_GPU_FP16
template <typename Self, template <typename Self,
typename Reducer, typename Index> typename Reducer, typename Index>
__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) { __global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
@ -227,6 +237,21 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x; const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
// Initialize the output value if it wasn't initialized by the ReductionInitKernel // Initialize the output value if it wasn't initialized by the ReductionInitKernel
#if defined(EIGEN_HIPCC)
if (gridDim.x == 1 && first_index == 0) {
if (num_coeffs % 2 != 0) {
half last = input.m_impl.coeff(num_coeffs-1);
*scratch = __halves2half2(last, reducer.initialize());
} else {
*scratch = reducer.template initializePacket<half2>();
}
__syncthreads();
}
#else
if (gridDim.x == 1) { if (gridDim.x == 1) {
if (first_index == 0) { if (first_index == 0) {
if (num_coeffs % 2 != 0) { if (num_coeffs % 2 != 0) {
@ -239,6 +264,8 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
__syncthreads(); __syncthreads();
} }
#endif
half2 accum = reducer.template initializePacket<half2>(); half2 accum = reducer.template initializePacket<half2>();
const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2); const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
for (Index i = 0; i < max_iter; i += BlockSize) { for (Index i = 0; i < max_iter; i += BlockSize) {
@ -250,7 +277,13 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
#pragma unroll #pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) { for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 #if defined(EIGEN_HIPCC)
// FIXME : remove this workaround once we have native half/half2 support for __shfl_down
union { int i; half2 h; } wka_in, wka_out;
wka_in.h = accum;
wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
reducer.reducePacket(wka_out.h, &accum);
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum); reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
#else #else
int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize); int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize);
@ -262,6 +295,17 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
atomicReduce(scratch, accum, reducer); atomicReduce(scratch, accum, reducer);
} }
#if defined(EIGEN_HIPCC)
__syncthreads();
if (gridDim.x == 1 && first_index == 0) {
half tmp = __low2half(*scratch);
reducer.reduce(__high2half(*scratch), &tmp);
*output = tmp;
}
#else
if (gridDim.x == 1) { if (gridDim.x == 1) {
__syncthreads(); __syncthreads();
if (first_index == 0) { if (first_index == 0) {
@ -270,6 +314,8 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
*output = tmp; *output = tmp;
} }
} }
#endif
} }
template <typename Op> template <typename Op>
@ -280,7 +326,7 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2
*output = tmp; *output = tmp;
} }
#endif // EIGEN_HAS_CUDA_FP16 #endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher { struct FullReductionLauncher {
@ -298,6 +344,7 @@ struct FullReductionLauncher<
internal::is_same<double, OutputType>::value, internal::is_same<double, OutputType>::value,
void>::type> { void>::type> {
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) { static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
typedef typename Self::Index Index; typedef typename Self::Index Index;
const int block_size = 256; const int block_size = 256;
const int num_per_thread = 128; const int num_per_thread = 128;
@ -308,12 +355,12 @@ struct FullReductionLauncher<
semaphore = device.semaphore(); semaphore = device.semaphore();
} }
LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>), LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore); num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
} }
}; };
#ifdef EIGEN_HAS_CUDA_FP16 #ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op> template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> { struct FullReductionLauncher<Self, Op, Eigen::half, false> {
static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) { static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
@ -334,20 +381,20 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> {
if (num_blocks > 1) { if (num_blocks > 1) {
// We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
// won't be race conditions between multiple thread blocks. // won't be race conditions between multiple thread blocks.
LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>), LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
1, 1, 0, device, reducer, self, num_coeffs, scratch); 1, 1, 0, device, reducer, self, num_coeffs, scratch);
} }
LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>), LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch); num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
if (num_blocks > 1) { if (num_blocks > 1) {
LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>), LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
1, 1, 0, device, reducer, output, scratch); 1, 1, 0, device, reducer, output, scratch);
} }
} }
}; };
#endif // EIGEN_HAS_CUDA_FP16 #endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, bool Vectorizable> template <typename Self, typename Op, bool Vectorizable>
@ -355,16 +402,16 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
// Unfortunately nvidia doesn't support exotic types such as complex well, // Unfortunately nvidia doesn't support exotic types such as complex well,
// so reduce the scope of the optimized version of the code to the simple cases // so reduce the scope of the optimized version of the code to the simple cases
// of doubles, floats and half floats // of doubles, floats and half floats
#ifdef EIGEN_HAS_CUDA_FP16 #ifdef EIGEN_HAS_GPU_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful && static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value || (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value || internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_CUDA_FP16 #else // EIGEN_HAS_GPU_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful && static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value || (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value); internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_CUDA_FP16 #endif // EIGEN_HAS_GPU_FP16
template <typename OutputType> template <typename OutputType>
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
@ -384,7 +431,7 @@ template <int NumPerThread, typename Self,
typename Reducer, typename Index> typename Reducer, typename Index>
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
typename Self::CoeffReturnType* output) { typename Self::CoeffReturnType* output) {
#if EIGEN_CUDA_ARCH >= 300 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
typedef typename Self::CoeffReturnType Type; typedef typename Self::CoeffReturnType Type;
eigen_assert(blockDim.y == 1); eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1); eigen_assert(blockDim.z == 1);
@ -437,7 +484,14 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
#pragma unroll #pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) { for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 #if defined(EIGEN_HIPCC)
// XXX use std::is_floating_point to determine the type of reduced_val
if (std::is_floating_point<Type>::value) {
reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
} else {
reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
}
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val); reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
#else #else
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val); reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
@ -454,7 +508,7 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
#endif // EIGEN_CUDA_ARCH >= 300 #endif // EIGEN_CUDA_ARCH >= 300
} }
#ifdef EIGEN_HAS_CUDA_FP16 #ifdef EIGEN_HAS_GPU_FP16
template <int NumPerThread, typename Self, template <int NumPerThread, typename Self,
typename Reducer, typename Index> typename Reducer, typename Index>
@ -531,7 +585,18 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
#pragma unroll #pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) { for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 #if defined(EIGEN_HIPCC)
// FIXME : remove this workaround once we have native half/half2 support for __shfl_down
union { int i; half2 h; } wka_in, wka_out;
wka_in.h = reduced_val1;
wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
reducer.reducePacket(wka_out.h, &reduced_val1);
wka_in.h = reduced_val2;
wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
reducer.reducePacket(wka_out.h, &reduced_val2);
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1); reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2); reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
#else #else
@ -556,7 +621,7 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
} }
} }
#endif // EIGEN_HAS_CUDA_FP16 #endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher { struct InnerReductionLauncher {
@ -581,30 +646,30 @@ struct InnerReductionLauncher<
const int block_size = 256; const int block_size = 256;
const int num_per_thread = 128; const int num_per_thread = 128;
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
const int max_blocks = device.getNumCudaMultiProcessors() * const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / block_size; device.maxGpuThreadsPerMultiProcessor() / block_size;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
if (num_blocks > 1) { if (num_blocks > 1) {
// We initialize the outputs outside the reduction kernel when we can't be sure that there // We initialize the outputs outside the reduction kernel when we can't be sure that there
// won't be race conditions between multiple thread blocks. // won't be race conditions between multiple thread blocks.
const int dyn_blocks = divup<int>(num_preserved_vals, 1024); const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
const int max_blocks = device.getNumCudaMultiProcessors() * const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / 1024; device.maxGpuThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>), LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
num_blocks, 1024, 0, device, reducer.initialize(), num_blocks, 1024, 0, device, reducer.initialize(),
num_preserved_vals, output); num_preserved_vals, output);
} }
LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>), LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
return false; return false;
} }
}; };
#ifdef EIGEN_HAS_CUDA_FP16 #ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op> template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> { struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) { static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
@ -627,28 +692,28 @@ struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
const int block_size = /*256*/128; const int block_size = /*256*/128;
const int num_per_thread = /*128*/64; const int num_per_thread = /*128*/64;
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
const int max_blocks = device.getNumCudaMultiProcessors() * const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / block_size; device.maxGpuThreadsPerMultiProcessor() / block_size;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
if (num_blocks > 1) { if (num_blocks > 1) {
// We initialize the outputs outside the reduction kernel when we can't be sure that there // We initialize the outputs outside the reduction kernel when we can't be sure that there
// won't be race conditions between multiple thread blocks. // won't be race conditions between multiple thread blocks.
const int dyn_blocks = divup<int>(num_preserved_vals, 1024); const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
const int max_blocks = device.getNumCudaMultiProcessors() * const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / 1024; device.maxGpuThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>), LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
1, 1, 0, device, reducer, self, num_preserved_vals, output); 1, 1, 0, device, reducer, self, num_preserved_vals, output);
} }
LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>), LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
return false; return false;
} }
}; };
#endif // EIGEN_HAS_CUDA_FP16 #endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op> template <typename Self, typename Op>
@ -656,16 +721,16 @@ struct InnerReducer<Self, Op, GpuDevice> {
// Unfortunately nvidia doesn't support exotic types such as complex well, // Unfortunately nvidia doesn't support exotic types such as complex well,
// so reduce the scope of the optimized version of the code to the simple case // so reduce the scope of the optimized version of the code to the simple case
// of floats and half floats. // of floats and half floats.
#ifdef EIGEN_HAS_CUDA_FP16 #ifdef EIGEN_HAS_GPU_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful && static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value || (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value || internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_CUDA_FP16 #else // EIGEN_HAS_GPU_FP16
static const bool HasOptimizedImplementation = !Op::IsStateful && static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value || (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value); internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_CUDA_FP16 #endif // EIGEN_HAS_GPU_FP16
template <typename OutputType> template <typename OutputType>
static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
@ -723,7 +788,20 @@ struct OuterReducer<Self, Op, GpuDevice> {
(internal::is_same<typename Self::CoeffReturnType, float>::value || (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value); internal::is_same<typename Self::CoeffReturnType, double>::value);
template <typename Device, typename OutputType> template <typename Device, typename OutputType>
static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { static
#if !defined(EIGEN_HIPCC)
// FIXME : leaving this EIGEN_DEVICE_FUNC in results in the following runtime error
// (in the cxx11_tensor_reduction_gpu test)
//
// terminate called after throwing an instance of 'std::runtime_error'
// what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
//
// don't know why this happens (and why it is a runtime error instead of a compile-time error)
//
// this will be fixed by HIP PR#457
EIGEN_DEVICE_FUNC
#endif
bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
assert(false && "Should only be called to reduce doubles or floats on a gpu device"); assert(false && "Should only be called to reduce doubles or floats on a gpu device");
return true; return true;
} }
@ -740,33 +818,37 @@ struct OuterReducer<Self, Op, GpuDevice> {
const int block_size = 256; const int block_size = 256;
const int num_per_thread = 16; const int num_per_thread = 16;
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
const int max_blocks = device.getNumCudaMultiProcessors() * const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / block_size; device.maxGpuThreadsPerMultiProcessor() / block_size;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
if (num_blocks > 1) { if (num_blocks > 1) {
// We initialize the outputs in the reduction kernel itself when we don't have to worry // We initialize the outputs in the reduction kernel itself when we don't have to worry
// about race conditions between multiple thread blocks. // about race conditions between multiple thread blocks.
const int dyn_blocks = divup<int>(num_preserved_vals, 1024); const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
const int max_blocks = device.getNumCudaMultiProcessors() * const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / 1024; device.maxGpuThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>), LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
num_blocks, 1024, 0, device, reducer.initialize(), num_blocks, 1024, 0, device, reducer.initialize(),
num_preserved_vals, output); num_preserved_vals, output);
} }
LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>), LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
return false; return false;
} }
}; };
#endif // defined(EIGEN_USE_GPU) && defined(__CUDACC__) #endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
} // end namespace internal } // end namespace internal
} // end namespace Eigen } // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H #if defined(EIGEN_HIPCC)
#undef warpSize
#endif
#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
View File
@ -278,12 +278,8 @@ struct ScanLauncher<Self, Reducer, GpuDevice> {
Index total_size = internal::array_prod(self.dimensions()); Index total_size = internal::array_prod(self.dimensions());
Index num_blocks = (total_size / self.size() + 63) / 64; Index num_blocks = (total_size / self.size() + 63) / 64;
Index block_size = 64; Index block_size = 64;
#if defined(EIGEN_HIPCC)
hipLaunchKernelGGL(HIP_KERNEL_NAME(ScanKernel<Self, Reducer>), dim3(num_blocks), LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
dim3(block_size), 0, self.device().stream(), self, total_size, data);
#else
LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
#endif
} }
}; };
#endif // EIGEN_USE_GPU && (EIGEN_GPUCC) #endif // EIGEN_USE_GPU && (EIGEN_GPUCC)
View File
@ -274,24 +274,24 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
ei_add_test(cxx11_tensor_complex_cuda) ei_add_test(cxx11_tensor_complex_gpu)
ei_add_test(cxx11_tensor_complex_cwise_ops_cuda) ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
ei_add_test(cxx11_tensor_reduction_cuda) ei_add_test(cxx11_tensor_reduction_gpu)
ei_add_test(cxx11_tensor_argmax_cuda) ei_add_test(cxx11_tensor_argmax_gpu)
ei_add_test(cxx11_tensor_cast_float16_cuda) ei_add_test(cxx11_tensor_cast_float16_gpu)
ei_add_test(cxx11_tensor_scan_cuda) ei_add_test(cxx11_tensor_scan_gpu)
# Contractions require arch 3.0 or higher # Contractions require arch 3.0 or higher
if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 29) if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 29)
ei_add_test(cxx11_tensor_device) ei_add_test(cxx11_tensor_device)
ei_add_test(cxx11_tensor_cuda) ei_add_test(cxx11_tensor_gpu)
ei_add_test(cxx11_tensor_contract_cuda) ei_add_test(cxx11_tensor_contract_gpu)
ei_add_test(cxx11_tensor_of_float16_cuda) ei_add_test(cxx11_tensor_of_float16_gpu)
endif() endif()
# The random number generation code requires arch 3.5 or greater. # The random number generation code requires arch 3.5 or greater.
if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34) if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34)
ei_add_test(cxx11_tensor_random_cuda) ei_add_test(cxx11_tensor_random_gpu)
endif() endif()
@ -318,18 +318,23 @@ if (EIGEN_TEST_HIP)
include_directories(${HIP_PATH}/include) include_directories(${HIP_PATH}/include)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
#
# complex datatype is not yet supported by HIP
# so leaving out those tests for now
#
# ei_add_test(cxx11_tensor_complex_gpu)
# ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
#
ei_add_test(cxx11_tensor_reduction_gpu)
ei_add_test(cxx11_tensor_argmax_gpu)
ei_add_test(cxx11_tensor_cast_float16_gpu)
ei_add_test(cxx11_tensor_scan_gpu)
ei_add_test(cxx11_tensor_device)
# ei_add_test(cxx11_tensor_complex_hip) ei_add_test(cxx11_tensor_gpu)
# ei_add_test(cxx11_tensor_complex_cwise_ops_hip) ei_add_test(cxx11_tensor_contract_gpu)
ei_add_test(cxx11_tensor_reduction_hip) ei_add_test(cxx11_tensor_of_float16_gpu)
ei_add_test(cxx11_tensor_argmax_hip) ei_add_test(cxx11_tensor_random_gpu)
ei_add_test(cxx11_tensor_cast_float16_hip)
ei_add_test(cxx11_tensor_scan_hip)
ei_add_test(cxx11_tensor_device_hip)
ei_add_test(cxx11_tensor_hip)
ei_add_test(cxx11_tensor_contract_hip)
ei_add_test(cxx11_tensor_of_float16_hip)
ei_add_test(cxx11_tensor_random_hip)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
View File
@ -9,16 +9,18 @@
#define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_FUNC cxx11_tensor_cuda #define EIGEN_TEST_FUNC cxx11_tensor_gpu
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "main.h" #include "main.h"
#include <unsupported/Eigen/CXX11/Tensor> #include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::Tensor; using Eigen::Tensor;
template <int Layout> template <int Layout>
void test_cuda_simple_argmax() void test_gpu_simple_argmax()
{ {
Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97)); Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1)); Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
@ -34,13 +36,13 @@ void test_cuda_simple_argmax()
double* d_in; double* d_in;
DenseIndex* d_out_max; DenseIndex* d_out_max;
DenseIndex* d_out_min; DenseIndex* d_out_min;
cudaMalloc((void**)(&d_in), in_bytes); gpuMalloc((void**)(&d_in), in_bytes);
cudaMalloc((void**)(&d_out_max), out_bytes); gpuMalloc((void**)(&d_out_max), out_bytes);
cudaMalloc((void**)(&d_out_min), out_bytes); gpuMalloc((void**)(&d_out_min), out_bytes);
cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice);
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97)); Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
@ -50,20 +52,20 @@ void test_cuda_simple_argmax()
gpu_out_max.device(gpu_device) = gpu_in.argmax(); gpu_out_max.device(gpu_device) = gpu_in.argmax();
gpu_out_min.device(gpu_device) = gpu_in.argmin(); gpu_out_min.device(gpu_device) = gpu_in.argmin();
assert(cudaMemcpyAsync(out_max.data(), d_out_max, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(gpuMemcpyAsync(out_max.data(), d_out_max, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
assert(cudaMemcpyAsync(out_min.data(), d_out_min, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(gpuMemcpyAsync(out_min.data(), d_out_min, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1); VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0); VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
cudaFree(d_in); gpuFree(d_in);
cudaFree(d_out_max); gpuFree(d_out_max);
cudaFree(d_out_min); gpuFree(d_out_min);
} }
template <int DataLayout> template <int DataLayout>
void test_cuda_argmax_dim() void test_gpu_argmax_dim()
{ {
Tensor<float, 4, DataLayout> tensor(2,3,5,7); Tensor<float, 4, DataLayout> tensor(2,3,5,7);
std::vector<int> dims; std::vector<int> dims;
@ -97,12 +99,12 @@ void test_cuda_argmax_dim()
float* d_in; float* d_in;
DenseIndex* d_out; DenseIndex* d_out;
cudaMalloc((void**)(&d_in), in_bytes); gpuMalloc((void**)(&d_in), in_bytes);
cudaMalloc((void**)(&d_out), out_bytes); gpuMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7)); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
@ -110,8 +112,8 @@ void test_cuda_argmax_dim()
gpu_out.device(gpu_device) = gpu_in.argmax(dim); gpu_out.device(gpu_device) = gpu_in.argmax(dim);
assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
VERIFY_IS_EQUAL(tensor_arg.size(), VERIFY_IS_EQUAL(tensor_arg.size(),
size_t(2*3*5*7 / tensor.dimension(dim))); size_t(2*3*5*7 / tensor.dimension(dim)));
@ -134,25 +136,25 @@ void test_cuda_argmax_dim()
} }
} }
cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
gpu_out.device(gpu_device) = gpu_in.argmax(dim); gpu_out.device(gpu_device) = gpu_in.argmax(dim);
assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
// Expect max to be in the last index of the reduced dimension // Expect max to be in the last index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
} }
cudaFree(d_in); gpuFree(d_in);
cudaFree(d_out); gpuFree(d_out);
} }
} }
template <int DataLayout> template <int DataLayout>
void test_cuda_argmin_dim() void test_gpu_argmin_dim()
{ {
Tensor<float, 4, DataLayout> tensor(2,3,5,7); Tensor<float, 4, DataLayout> tensor(2,3,5,7);
std::vector<int> dims; std::vector<int> dims;
@ -186,12 +188,12 @@ void test_cuda_argmin_dim()
float* d_in; float* d_in;
DenseIndex* d_out; DenseIndex* d_out;
cudaMalloc((void**)(&d_in), in_bytes); gpuMalloc((void**)(&d_in), in_bytes);
cudaMalloc((void**)(&d_out), out_bytes); gpuMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7)); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
@ -199,8 +201,8 @@ void test_cuda_argmin_dim()
gpu_out.device(gpu_device) = gpu_in.argmin(dim); gpu_out.device(gpu_device) = gpu_in.argmin(dim);
assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
VERIFY_IS_EQUAL(tensor_arg.size(), VERIFY_IS_EQUAL(tensor_arg.size(),
2*3*5*7 / tensor.dimension(dim)); 2*3*5*7 / tensor.dimension(dim));
@ -223,29 +225,29 @@ void test_cuda_argmin_dim()
} }
} }
cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
gpu_out.device(gpu_device) = gpu_in.argmin(dim); gpu_out.device(gpu_device) = gpu_in.argmin(dim);
assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
// Expect max to be in the last index of the reduced dimension // Expect max to be in the last index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
} }
cudaFree(d_in); gpuFree(d_in);
cudaFree(d_out); gpuFree(d_out);
} }
} }
void test_cxx11_tensor_cuda() void test_cxx11_tensor_gpu()
{ {
CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>()); CALL_SUBTEST_1(test_gpu_simple_argmax<RowMajor>());
CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>()); CALL_SUBTEST_1(test_gpu_simple_argmax<ColMajor>());
CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>()); CALL_SUBTEST_2(test_gpu_argmax_dim<RowMajor>());
CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>()); CALL_SUBTEST_2(test_gpu_argmax_dim<ColMajor>());
CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>()); CALL_SUBTEST_3(test_gpu_argmin_dim<RowMajor>());
CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>()); CALL_SUBTEST_3(test_gpu_argmin_dim<ColMajor>());
} }
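The pattern in this file repeats throughout the commit: every direct cuda* runtime call becomes a gpu*-prefixed wrapper and Eigen::CudaStreamDevice becomes Eigen::GpuStreamDevice, so a single test source compiles for either back end. A minimal sketch of how such a dispatch header can work (the exact macro list in TensorGpuHipCudaDefines.h may differ; the names below are illustrative):

// Hypothetical excerpt of a CUDA/HIP dispatch header; illustrative only.
#if defined(EIGEN_USE_HIP)
  #include <hip/hip_runtime.h>
  #define gpuSuccess            hipSuccess
  #define gpuMalloc             hipMalloc
  #define gpuFree               hipFree
  #define gpuMemcpy             hipMemcpy
  #define gpuMemcpyAsync        hipMemcpyAsync
  #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
  #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
  #define gpuStreamSynchronize  hipStreamSynchronize
#else
  #include <cuda_runtime.h>
  #define gpuSuccess            cudaSuccess
  #define gpuMalloc             cudaMalloc
  #define gpuFree               cudaFree
  #define gpuMemcpy             cudaMemcpy
  #define gpuMemcpyAsync        cudaMemcpyAsync
  #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
  #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
  #define gpuStreamSynchronize  cudaStreamSynchronize
#endif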

View File

@ -9,7 +9,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_cuda #define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_gpu
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
@ -18,8 +18,8 @@
using Eigen::Tensor; using Eigen::Tensor;
void test_cuda_conversion() { void test_gpu_conversion() {
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101; int num_elem = 101;
@ -72,8 +72,8 @@ void test_fallback_conversion() {
} }
void test_cxx11_tensor_cast_float16_cuda() void test_cxx11_tensor_cast_float16_gpu()
{ {
CALL_SUBTEST(test_cuda_conversion()); CALL_SUBTEST(test_gpu_conversion());
CALL_SUBTEST(test_fallback_conversion()); CALL_SUBTEST(test_fallback_conversion());
} }

View File

@ -28,7 +28,7 @@ void test_cuda_complex_cwise_ops() {
cudaMalloc((void**)(&d_in2), complex_bytes); cudaMalloc((void**)(&d_in2), complex_bytes);
cudaMalloc((void**)(&d_out), complex_bytes); cudaMalloc((void**)(&d_out), complex_bytes);
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1( Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1(

View File

@ -34,7 +34,7 @@ void test_cuda_nullary() {
cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice);
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1( Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1(
@ -70,7 +70,7 @@ void test_cuda_nullary() {
static void test_cuda_sum_reductions() { static void test_cuda_sum_reductions() {
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
const int num_rows = internal::random<int>(1024, 5*1024); const int num_rows = internal::random<int>(1024, 5*1024);
@ -106,7 +106,7 @@ static void test_cuda_sum_reductions() {
static void test_cuda_mean_reductions() { static void test_cuda_mean_reductions() {
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
const int num_rows = internal::random<int>(1024, 5*1024); const int num_rows = internal::random<int>(1024, 5*1024);
@ -142,7 +142,7 @@ static void test_cuda_mean_reductions() {
static void test_cuda_product_reductions() { static void test_cuda_product_reductions() {
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
const int num_rows = internal::random<int>(1024, 5*1024); const int num_rows = internal::random<int>(1024, 5*1024);

View File

@ -10,19 +10,20 @@
#define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_cuda #define EIGEN_TEST_FUNC cxx11_tensor_gpu
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "main.h" #include "main.h"
#include <unsupported/Eigen/CXX11/Tensor> #include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::Tensor; using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair; typedef Tensor<float, 1>::DimensionPair DimPair;
template<int DataLayout> template<int DataLayout>
void test_cuda_contraction(int m_size, int k_size, int n_size) void test_gpu_contraction(int m_size, int k_size, int n_size)
{ {
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
// with these dimensions, the output has 300 * 140 elements, which is // with these dimensions, the output has 300 * 140 elements, which is
@ -45,14 +46,14 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
float* d_t_right; float* d_t_right;
float* d_t_result; float* d_t_result;
cudaMalloc((void**)(&d_t_left), t_left_bytes); gpuMalloc((void**)(&d_t_left), t_left_bytes);
cudaMalloc((void**)(&d_t_right), t_right_bytes); gpuMalloc((void**)(&d_t_right), t_right_bytes);
cudaMalloc((void**)(&d_t_result), t_result_bytes); gpuMalloc((void**)(&d_t_result), t_result_bytes);
cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
@ -66,7 +67,7 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
t_result = t_left.contract(t_right, dims); t_result = t_left.contract(t_right, dims);
cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
for (DenseIndex i = 0; i < t_result.size(); i++) { for (DenseIndex i = 0; i < t_result.size(); i++) {
if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
continue; continue;
@ -79,9 +80,9 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
assert(false); assert(false);
} }
cudaFree((void*)d_t_left); gpuFree((void*)d_t_left);
cudaFree((void*)d_t_right); gpuFree((void*)d_t_right);
cudaFree((void*)d_t_result); gpuFree((void*)d_t_result);
} }
@ -109,14 +110,14 @@ void test_scalar(int m_size, int k_size, int n_size)
float* d_t_right; float* d_t_right;
float* d_t_result; float* d_t_result;
cudaMalloc((void**)(&d_t_left), t_left_bytes); gpuMalloc((void**)(&d_t_left), t_left_bytes);
cudaMalloc((void**)(&d_t_right), t_right_bytes); gpuMalloc((void**)(&d_t_right), t_right_bytes);
cudaMalloc((void**)(&d_t_result), t_result_bytes); gpuMalloc((void**)(&d_t_result), t_result_bytes);
cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
@ -129,7 +130,7 @@ void test_scalar(int m_size, int k_size, int n_size)
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
t_result = t_left.contract(t_right, dims); t_result = t_left.contract(t_right, dims);
cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
if (fabs(t_result() - t_result_gpu()) > 1e-4f && if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
!Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) { !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
std::cout << "mismatch detected: " << t_result() std::cout << "mismatch detected: " << t_result()
@ -137,39 +138,39 @@ void test_scalar(int m_size, int k_size, int n_size)
assert(false); assert(false);
} }
cudaFree((void*)d_t_left); gpuFree((void*)d_t_left);
cudaFree((void*)d_t_right); gpuFree((void*)d_t_right);
cudaFree((void*)d_t_result); gpuFree((void*)d_t_result);
} }
template<int DataLayout> template<int DataLayout>
void test_cuda_contraction_m() { void test_gpu_contraction_m() {
for (int k = 32; k < 256; k++) { for (int k = 32; k < 256; k++) {
test_cuda_contraction<ColMajor>(k, 128, 128); test_gpu_contraction<ColMajor>(k, 128, 128);
test_cuda_contraction<RowMajor>(k, 128, 128); test_gpu_contraction<RowMajor>(k, 128, 128);
} }
} }
template<int DataLayout> template<int DataLayout>
void test_cuda_contraction_k() { void test_gpu_contraction_k() {
for (int k = 32; k < 256; k++) { for (int k = 32; k < 256; k++) {
test_cuda_contraction<ColMajor>(128, k, 128); test_gpu_contraction<ColMajor>(128, k, 128);
test_cuda_contraction<RowMajor>(128, k, 128); test_gpu_contraction<RowMajor>(128, k, 128);
} }
} }
template<int DataLayout> template<int DataLayout>
void test_cuda_contraction_n() { void test_gpu_contraction_n() {
for (int k = 32; k < 256; k++) { for (int k = 32; k < 256; k++) {
test_cuda_contraction<ColMajor>(128, 128, k); test_gpu_contraction<ColMajor>(128, 128, k);
test_cuda_contraction<RowMajor>(128, 128, k); test_gpu_contraction<RowMajor>(128, 128, k);
} }
} }
template<int DataLayout> template<int DataLayout>
void test_cuda_contraction_sizes() { void test_gpu_contraction_sizes() {
int m_sizes[] = { 31, 39, 63, 64, 65, int m_sizes[] = { 31, 39, 63, 64, 65,
127, 129, 255, 257 , 511, 127, 129, 255, 257 , 511,
512, 513, 1023, 1024, 1025}; 512, 513, 1023, 1024, 1025};
@ -186,29 +187,32 @@ void test_cuda_contraction_sizes() {
for (int i = 0; i < 15; i++) { for (int i = 0; i < 15; i++) {
for (int j = 0; j < 15; j++) { for (int j = 0; j < 15; j++) {
for (int k = 0; k < 17; k++) { for (int k = 0; k < 17; k++) {
test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]); test_gpu_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
} }
} }
} }
} }
void test_cxx11_tensor_cuda() void test_cxx11_tensor_gpu()
{ {
CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128)); CALL_SUBTEST_1(test_gpu_contraction<ColMajor>(128, 128, 128));
CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128)); CALL_SUBTEST_1(test_gpu_contraction<RowMajor>(128, 128, 128));
CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128)); CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128));
CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128)); CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128));
CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>()); CALL_SUBTEST_2(test_gpu_contraction_m<ColMajor>());
CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>()); CALL_SUBTEST_3(test_gpu_contraction_m<RowMajor>());
CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>()); CALL_SUBTEST_4(test_gpu_contraction_k<ColMajor>());
CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>()); CALL_SUBTEST_5(test_gpu_contraction_k<RowMajor>());
CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>()); CALL_SUBTEST_6(test_gpu_contraction_n<ColMajor>());
CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>()); CALL_SUBTEST_7(test_gpu_contraction_n<RowMajor>());
CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>()); #if !defined(EIGEN_USE_HIP)
CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>()); // disable these subtests for HIP
CALL_SUBTEST_8(test_gpu_contraction_sizes<ColMajor>());
CALL_SUBTEST_9(test_gpu_contraction_sizes<RowMajor>());
#endif
} }
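Each contraction subtest above follows the same scaffolding: copy the operands to device memory, wrap the raw pointers in TensorMap views, evaluate the contraction on an Eigen::GpuDevice, then copy the result back and compare it with the host evaluation. A stripped-down sketch of that flow, assuming the gpu* wrappers from the defines header are in scope (error handling and the element-wise comparison are elided):

#include <unsupported/Eigen/CXX11/Tensor>

// Sketch of the shared GPU-contraction test scaffolding (assumes EIGEN_USE_GPU
// and the gpu* wrapper macros are visible).
static void gpu_contraction_sketch()
{
  typedef Eigen::Tensor<float, 1>::DimensionPair DimPair;

  Eigen::Tensor<float, 2> t_left(128, 127), t_right(127, 129), t_result(128, 129);
  t_left.setRandom();
  t_right.setRandom();

  float *d_left, *d_right, *d_result;
  gpuMalloc((void**)&d_left,   t_left.size()   * sizeof(float));
  gpuMalloc((void**)&d_right,  t_right.size()  * sizeof(float));
  gpuMalloc((void**)&d_result, t_result.size() * sizeof(float));
  gpuMemcpy(d_left,  t_left.data(),  t_left.size()  * sizeof(float), gpuMemcpyHostToDevice);
  gpuMemcpy(d_right, t_right.data(), t_right.size() * sizeof(float), gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_left(d_left, 128, 127);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_right(d_right, 127, 129);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_result(d_result, 128, 129);

  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
  gpu_result.device(gpu_device) = gpu_left.contract(gpu_right, dims);  // device evaluation

  Eigen::Tensor<float, 2> result_from_gpu(128, 129);
  gpuMemcpy(result_from_gpu.data(), d_result,
            t_result.size() * sizeof(float), gpuMemcpyDeviceToHost);
  t_result = t_left.contract(t_right, dims);                           // host reference
  // ... compare result_from_gpu against t_result within a 1e-4 tolerance ...

  gpuFree(d_left);
  gpuFree(d_right);
  gpuFree(d_result);
}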

View File

@ -16,6 +16,7 @@
#include "main.h" #include "main.h"
#include <unsupported/Eigen/CXX11/Tensor> #include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::Tensor; using Eigen::Tensor;
using Eigen::RowMajor; using Eigen::RowMajor;
@ -66,22 +67,22 @@ struct CPUContext {
// Context for evaluation on GPU // Context for evaluation on GPU
struct GPUContext { struct GPUContext {
GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) { GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess); assert(gpuMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == gpuSuccess);
float kernel_1d_val[] = {3.14f, 2.7f}; float kernel_1d_val[] = {3.14f, 2.7f};
assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); assert(gpuMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess); assert(gpuMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == gpuSuccess);
float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f}; float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); assert(gpuMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess); assert(gpuMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == gpuSuccess);
float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f}; float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); assert(gpuMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess);
} }
~GPUContext() { ~GPUContext() {
assert(cudaFree(kernel_1d_) == cudaSuccess); assert(gpuFree(kernel_1d_) == gpuSuccess);
assert(cudaFree(kernel_2d_) == cudaSuccess); assert(gpuFree(kernel_2d_) == gpuSuccess);
assert(cudaFree(kernel_3d_) == cudaSuccess); assert(gpuFree(kernel_3d_) == gpuSuccess);
} }
const Eigen::GpuDevice& device() const { return gpu_device_; } const Eigen::GpuDevice& device() const { return gpu_device_; }
@ -102,7 +103,7 @@ struct GPUContext {
float* kernel_2d_; float* kernel_2d_;
float* kernel_3d_; float* kernel_3d_;
Eigen::CudaStreamDevice stream_; Eigen::GpuStreamDevice stream_;
Eigen::GpuDevice gpu_device_; Eigen::GpuDevice gpu_device_;
}; };
@ -281,12 +282,12 @@ void test_gpu() {
float* d_in1; float* d_in1;
float* d_in2; float* d_in2;
float* d_out; float* d_out;
cudaMalloc((void**)(&d_in1), in1_bytes); gpuMalloc((void**)(&d_in1), in1_bytes);
cudaMalloc((void**)(&d_in2), in2_bytes); gpuMalloc((void**)(&d_in2), in2_bytes);
cudaMalloc((void**)(&d_out), out_bytes); gpuMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70); Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70); Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
@ -294,7 +295,7 @@ void test_gpu() {
GPUContext context(gpu_in1, gpu_in2, gpu_out); GPUContext context(gpu_in1, gpu_in2, gpu_out);
test_contextual_eval(&context); test_contextual_eval(&context);
assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
for (int i = 0; i < 40; ++i) { for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) { for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) { for (int k = 0; k < 70; ++k) {
@ -304,7 +305,7 @@ void test_gpu() {
} }
test_forced_contextual_eval(&context); test_forced_contextual_eval(&context);
assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
for (int i = 0; i < 40; ++i) { for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) { for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) { for (int k = 0; k < 70; ++k) {
@ -314,7 +315,7 @@ void test_gpu() {
} }
test_compound_assignment(&context); test_compound_assignment(&context);
assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
for (int i = 0; i < 40; ++i) { for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) { for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) { for (int k = 0; k < 70; ++k) {
@ -324,7 +325,7 @@ void test_gpu() {
} }
test_contraction(&context); test_contraction(&context);
assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess);
for (int i = 0; i < 40; ++i) { for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 40; ++j) { for (int j = 0; j < 40; ++j) {
const float result = out(i,j,0); const float result = out(i,j,0);
@ -339,8 +340,8 @@ void test_gpu() {
} }
test_1d_convolution(&context); test_1d_convolution(&context);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
for (int i = 0; i < 40; ++i) { for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 49; ++j) { for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 70; ++k) { for (int k = 0; k < 70; ++k) {
@ -350,8 +351,8 @@ void test_gpu() {
} }
test_2d_convolution(&context); test_2d_convolution(&context);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
for (int i = 0; i < 40; ++i) { for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 49; ++j) { for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 69; ++k) { for (int k = 0; k < 69; ++k) {
@ -363,9 +364,13 @@ void test_gpu() {
} }
} }
#if !defined(EIGEN_USE_HIP)
// disable this test on the HIP platform
// 3D tensor convolutions seem to hang on the HIP platform
test_3d_convolution(&context); test_3d_convolution(&context);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess);
assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess);
for (int i = 0; i < 39; ++i) { for (int i = 0; i < 39; ++i) {
for (int j = 0; j < 49; ++j) { for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 69; ++k) { for (int k = 0; k < 69; ++k) {
@ -378,6 +383,9 @@ void test_gpu() {
} }
} }
} }
#endif
} }
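Where a kernel path is known to misbehave under HIP, as with the 3D convolution above, the port does not fork the test file; it simply fences the affected subtest with the platform macro, as the hunk shows:

#if !defined(EIGEN_USE_HIP)
  // 3D tensor convolutions currently hang on the HIP platform, so this
  // subtest stays CUDA-only until that is resolved.
  test_3d_convolution(&context);
#endif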

File diff suppressed because it is too large

View File

@ -9,7 +9,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda #define EIGEN_TEST_FUNC cxx11_tensor_of_float16_gpu
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
@ -20,8 +20,8 @@
using Eigen::Tensor; using Eigen::Tensor;
template<typename> template<typename>
void test_cuda_numext() { void test_gpu_numext() {
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101; int num_elem = 101;
@ -57,11 +57,11 @@ void test_cuda_numext() {
} }
#ifdef EIGEN_HAS_CUDA_FP16 #ifdef EIGEN_HAS_GPU_FP16
template<typename> template<typename>
void test_cuda_conversion() { void test_gpu_conversion() {
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101; int num_elem = 101;
@ -95,8 +95,8 @@ void test_cuda_conversion() {
} }
template<typename> template<typename>
void test_cuda_unary() { void test_gpu_unary() {
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101; int num_elem = 101;
@ -132,8 +132,8 @@ void test_cuda_unary() {
} }
template<typename> template<typename>
void test_cuda_elementwise() { void test_gpu_elementwise() {
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101; int num_elem = 101;
@ -174,8 +174,8 @@ void test_cuda_elementwise() {
} }
template<typename> template<typename>
void test_cuda_trancendental() { void test_gpu_trancendental() {
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101; int num_elem = 101;
@ -268,8 +268,8 @@ void test_cuda_trancendental() {
} }
template<typename> template<typename>
void test_cuda_contractions() { void test_gpu_contractions() {
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
int rows = 23; int rows = 23;
int cols = 23; int cols = 23;
@ -319,12 +319,12 @@ void test_cuda_contractions() {
} }
template<typename> template<typename>
void test_cuda_reductions(int size1, int size2, int redux) { void test_gpu_reductions(int size1, int size2, int redux) {
std::cout << "Reducing " << size1 << " by " << size2 std::cout << "Reducing " << size1 << " by " << size2
<< " tensor along dim " << redux << std::endl; << " tensor along dim " << redux << std::endl;
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
int num_elem = size1*size2; int num_elem = size1*size2;
int result_size = (redux == 1 ? size1 : size2); int result_size = (redux == 1 ? size1 : size2);
@ -368,20 +368,20 @@ void test_cuda_reductions(int size1, int size2, int redux) {
} }
template<typename> template<typename>
void test_cuda_reductions() { void test_gpu_reductions() {
test_cuda_reductions<void>(13, 13, 0); test_gpu_reductions<void>(13, 13, 0);
test_cuda_reductions<void>(13, 13, 1); test_gpu_reductions<void>(13, 13, 1);
test_cuda_reductions<void>(35, 36, 0); test_gpu_reductions<void>(35, 36, 0);
test_cuda_reductions<void>(35, 36, 1); test_gpu_reductions<void>(35, 36, 1);
test_cuda_reductions<void>(36, 35, 0); test_gpu_reductions<void>(36, 35, 0);
test_cuda_reductions<void>(36, 35, 1); test_gpu_reductions<void>(36, 35, 1);
} }
template<typename> template<typename>
void test_cuda_full_reductions() { void test_gpu_full_reductions() {
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
int size = 13; int size = 13;
int num_elem = size*size; int num_elem = size*size;
@ -429,9 +429,9 @@ void test_cuda_full_reductions() {
} }
template<typename> template<typename>
void test_cuda_forced_evals() { void test_gpu_forced_evals() {
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101; int num_elem = 101;
@ -479,20 +479,20 @@ void test_cuda_forced_evals() {
#endif #endif
void test_cxx11_tensor_of_float16_cuda() void test_cxx11_tensor_of_float16_gpu()
{ {
CALL_SUBTEST_1(test_cuda_numext<void>()); CALL_SUBTEST_1(test_gpu_numext<void>());
#ifdef EIGEN_HAS_CUDA_FP16 #ifdef EIGEN_HAS_GPU_FP16
CALL_SUBTEST_1(test_cuda_conversion<void>()); CALL_SUBTEST_1(test_gpu_conversion<void>());
CALL_SUBTEST_1(test_cuda_unary<void>()); CALL_SUBTEST_1(test_gpu_unary<void>());
CALL_SUBTEST_1(test_cuda_elementwise<void>()); CALL_SUBTEST_1(test_gpu_elementwise<void>());
CALL_SUBTEST_1(test_cuda_trancendental<void>()); CALL_SUBTEST_1(test_gpu_trancendental<void>());
CALL_SUBTEST_2(test_cuda_contractions<void>()); CALL_SUBTEST_2(test_gpu_contractions<void>());
CALL_SUBTEST_3(test_cuda_reductions<void>()); CALL_SUBTEST_3(test_gpu_reductions<void>());
CALL_SUBTEST_4(test_cuda_full_reductions<void>()); CALL_SUBTEST_4(test_gpu_full_reductions<void>());
CALL_SUBTEST_5(test_cuda_forced_evals<void>()); CALL_SUBTEST_5(test_gpu_forced_evals<void>());
#else #else
std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl; std::cout << "Half floats are not supported by this version of gpu: skipping the test" << std::endl;
#endif #endif
} }
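The half-precision guard is generalized the same way: EIGEN_HAS_CUDA_FP16 becomes EIGEN_HAS_GPU_FP16, so the fp16 subtests compile whenever the active back end reports half support. A guarded usage sketch (assuming the macro is provided by the Tensor headers):

#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

static void fp16_guard_sketch()
{
#ifdef EIGEN_HAS_GPU_FP16
  // Half-precision tensors are exercised on whichever GPU back end is active.
  Eigen::Tensor<Eigen::half, 1> halves(101);
  halves.setConstant(Eigen::half(1.0f));
#else
  std::cout << "Half floats are not supported by this build: skipping the test" << std::endl;
#endif
}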

View File

@ -9,15 +9,16 @@
#define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_random_cuda #define EIGEN_TEST_FUNC cxx11_tensor_random_gpu
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "main.h" #include "main.h"
#include <Eigen/CXX11/Tensor> #include <Eigen/CXX11/Tensor>
#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
void test_cuda_random_uniform() void test_gpu_random_uniform()
{ {
Tensor<float, 2> out(72,97); Tensor<float, 2> out(72,97);
out.setZero(); out.setZero();
@ -25,24 +26,24 @@ void test_cuda_random_uniform()
std::size_t out_bytes = out.size() * sizeof(float); std::size_t out_bytes = out.size() * sizeof(float);
float* d_out; float* d_out;
cudaMalloc((void**)(&d_out), out_bytes); gpuMalloc((void**)(&d_out), out_bytes);
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
gpu_out.device(gpu_device) = gpu_out.random(); gpu_out.device(gpu_device) = gpu_out.random();
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
// For now we just check this code doesn't crash. // For now we just check this code doesn't crash.
// TODO: come up with a valid test of randomness // TODO: come up with a valid test of randomness
} }
void test_cuda_random_normal() void test_gpu_random_normal()
{ {
Tensor<float, 2> out(72,97); Tensor<float, 2> out(72,97);
out.setZero(); out.setZero();
@ -50,9 +51,9 @@ void test_cuda_random_normal()
std::size_t out_bytes = out.size() * sizeof(float); std::size_t out_bytes = out.size() * sizeof(float);
float* d_out; float* d_out;
cudaMalloc((void**)(&d_out), out_bytes); gpuMalloc((void**)(&d_out), out_bytes);
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
@ -60,8 +61,8 @@ void test_cuda_random_normal()
Eigen::internal::NormalRandomGenerator<float> gen(true); Eigen::internal::NormalRandomGenerator<float> gen(true);
gpu_out.device(gpu_device) = gpu_out.random(gen); gpu_out.device(gpu_device) = gpu_out.random(gen);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
} }
static void test_complex() static void test_complex()
@ -77,9 +78,9 @@ static void test_complex()
} }
void test_cxx11_tensor_random_cuda() void test_cxx11_tensor_random_gpu()
{ {
CALL_SUBTEST(test_cuda_random_uniform()); CALL_SUBTEST(test_gpu_random_uniform());
CALL_SUBTEST(test_cuda_random_normal()); CALL_SUBTEST(test_gpu_random_normal());
CALL_SUBTEST(test_complex()); CALL_SUBTEST(test_complex());
} }
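Note the read-back idiom used here and in the other tests: the result produced on the device's stream is copied back with gpuMemcpyAsync on that same stream, followed by an explicit gpuStreamSynchronize, rather than a plain blocking copy. Enqueueing the copy on the stream orders it after the kernel that filled d_out, and the synchronize makes the data visible to the host before it is inspected:

// Recurring read-back pattern for results computed on gpu_device's stream.
assert(gpuMemcpyAsync(out.data(), d_out, out_bytes,
                      gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);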

View File

@ -9,7 +9,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda #define EIGEN_TEST_FUNC cxx11_tensor_reduction_gpu
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "main.h" #include "main.h"
@ -19,7 +19,7 @@
template<typename Type, int DataLayout> template<typename Type, int DataLayout>
static void test_full_reductions() { static void test_full_reductions() {
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
const int num_rows = internal::random<int>(1024, 5*1024); const int num_rows = internal::random<int>(1024, 5*1024);
@ -67,7 +67,7 @@ static void test_first_dim_reductions() {
Tensor<Type, 2, DataLayout> redux = in.sum(red_axis); Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
// Create device // Create device
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice dev(&stream); Eigen::GpuDevice dev(&stream);
// Create data(T) // Create data(T)
@ -107,7 +107,7 @@ static void test_last_dim_reductions() {
Tensor<Type, 2, DataLayout> redux = in.sum(red_axis); Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
// Create device // Create device
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice dev(&stream); Eigen::GpuDevice dev(&stream);
// Create data // Create data
@ -134,7 +134,7 @@ static void test_last_dim_reductions() {
} }
void test_cxx11_tensor_reduction_cuda() { void test_cxx11_tensor_reduction_gpu() {
CALL_SUBTEST_1((test_full_reductions<float, ColMajor>())); CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
CALL_SUBTEST_1((test_full_reductions<double, ColMajor>())); CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
CALL_SUBTEST_2((test_full_reductions<float, RowMajor>())); CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));

View File

@ -9,19 +9,20 @@
#define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_scan_cuda #define EIGEN_TEST_FUNC cxx11_tensor_scan_gpu
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "main.h" #include "main.h"
#include <unsupported/Eigen/CXX11/Tensor> #include <unsupported/Eigen/CXX11/Tensor>
#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::Tensor; using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair; typedef Tensor<float, 1>::DimensionPair DimPair;
template<int DataLayout> template<int DataLayout>
void test_cuda_cumsum(int m_size, int k_size, int n_size) void test_gpu_cumsum(int m_size, int k_size, int n_size)
{ {
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size); Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
@ -36,12 +37,12 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
float* d_t_input; float* d_t_input;
float* d_t_result; float* d_t_result;
cudaMalloc((void**)(&d_t_input), t_input_bytes); gpuMalloc((void**)(&d_t_input), t_input_bytes);
cudaMalloc((void**)(&d_t_result), t_result_bytes); gpuMalloc((void**)(&d_t_result), t_result_bytes);
cudaMemcpy(d_t_input, t_input.data(), t_input_bytes, cudaMemcpyHostToDevice); gpuMemcpy(d_t_input, t_input.data(), t_input_bytes, gpuMemcpyHostToDevice);
Eigen::CudaStreamDevice stream; Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream); Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
@ -52,7 +53,7 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1); gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
t_result = t_input.cumsum(1); t_result = t_input.cumsum(1);
cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
for (DenseIndex i = 0; i < t_result.size(); i++) { for (DenseIndex i = 0; i < t_result.size(); i++) {
if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
continue; continue;
@ -65,13 +66,13 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size)
assert(false); assert(false);
} }
cudaFree((void*)d_t_input); gpuFree((void*)d_t_input);
cudaFree((void*)d_t_result); gpuFree((void*)d_t_result);
} }
void test_cxx11_tensor_scan_cuda() void test_cxx11_tensor_scan_gpu()
{ {
CALL_SUBTEST_1(test_cuda_cumsum<ColMajor>(128, 128, 128)); CALL_SUBTEST_1(test_gpu_cumsum<ColMajor>(128, 128, 128));
CALL_SUBTEST_2(test_cuda_cumsum<RowMajor>(128, 128, 128)); CALL_SUBTEST_2(test_gpu_cumsum<RowMajor>(128, 128, 128));
} }