Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-07-22 21:04:28 +08:00)
Introduce gpu_assert for assertions in device code, and disable them with clang-cuda.
parent 5fd03ddbfb
commit 06eb24cf4d
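In short, the patch stops redefining the standard assert macro for device builds and instead routes every device-side check through a dedicated gpu_assert. A minimal sketch of the pattern, taken from the defines hunk further down (the one guarded by EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H): under HIPCC, and under clang-cuda (where EIGEN_CUDACC_VER is 0), device-side assert is unavailable, so gpu_assert expands to nothing; everywhere else it forwards to assert.

#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDACC) && (EIGEN_CUDACC_VER==0))
#define gpu_assert(COND)
#else
#define gpu_assert(COND) assert(COND)
#endif

Call sites throughout the Tensor GPU and SYCL code then switch from assert(...) to gpu_assert(...), as the hunks below show.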
@@ -903,7 +903,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 }

 const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
-assert(shared_mem <= maxSharedMem);
+gpu_assert(shared_mem <= maxSharedMem);

 const int num_x_blocks = ceil(numX, maxX);
 const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
@@ -960,7 +960,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);

 const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
-assert(shared_mem <= maxSharedMem);
+gpu_assert(shared_mem <= maxSharedMem);

 const int num_x_blocks = ceil(numX, maxX);
 const int num_y_blocks = ceil(numY, maxY);
@@ -1040,7 +1040,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));

 const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
-assert(shared_mem <= maxSharedMem);
+gpu_assert(shared_mem <= maxSharedMem);

 //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
 const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
@@ -352,7 +352,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y;
 m_device.parallel_for_setup(numX, numP, tileSize_x,tileSize_y,range_x,range_y, GRange_x, GRange_y );
 const size_t shared_mem =(tileSize_x +kernel_size -1)*(tileSize_y);
-assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
+gpu_assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
 auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range
 auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range
 InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
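For intuition about the bound that gpu_assert checks in this hunk, a tiny worked example; the tile and kernel sizes below are invented for illustration and do not come from the commit:

// Illustrative numbers only: a 64-wide tile with a kernel of size 5 and 8 rows
// per work-group needs (64 + 5 - 1) * 8 = 544 local-memory slots, which the code
// above compares against m_device.sharedMemPerBlock() via gpu_assert.
const size_t tileSize_x = 64, tileSize_y = 8, kernel_size = 5;
const size_t shared_mem = (tileSize_x + kernel_size - 1) * tileSize_y;  // 544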
@@ -377,7 +377,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
 m_device.parallel_for_setup(numX, numY, numP, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
 const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * tileSize_z;
-assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
+gpu_assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
 auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range
 auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range
 InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
@@ -408,7 +408,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
 size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
 m_device.parallel_for_setup(numX, numY, numZ, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
 const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * (tileSize_z +kernel_size_y -1);
-assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
+gpu_assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
 auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range
 auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range
 InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
@@ -68,7 +68,7 @@ static void initializeDeviceProp() {
 std::cerr << "Failed to get the number of GPU devices: "
 << gpuGetErrorString(status)
 << std::endl;
-assert(status == gpuSuccess);
+gpu_assert(status == gpuSuccess);
 }
 m_deviceProperties = new gpuDeviceProp_t[num_devices];
 for (int i = 0; i < num_devices; ++i) {
@@ -79,7 +79,7 @@ static void initializeDeviceProp() {
 << ": "
 << gpuGetErrorString(status)
 << std::endl;
-assert(status == gpuSuccess);
+gpu_assert(status == gpuSuccess);
 }
 }

@@ -124,8 +124,8 @@ class GpuStreamDevice : public StreamInterface {
 int num_devices;
 gpuError_t err = gpuGetDeviceCount(&num_devices);
 EIGEN_UNUSED_VARIABLE(err)
-assert(err == gpuSuccess);
-assert(device < num_devices);
+gpu_assert(err == gpuSuccess);
+gpu_assert(device < num_devices);
 device_ = device;
 }
 initializeDeviceProp();
@@ -144,20 +144,20 @@ class GpuStreamDevice : public StreamInterface {
 virtual void* allocate(size_t num_bytes) const {
 gpuError_t err = gpuSetDevice(device_);
 EIGEN_UNUSED_VARIABLE(err)
-assert(err == gpuSuccess);
+gpu_assert(err == gpuSuccess);
 void* result;
 err = gpuMalloc(&result, num_bytes);
-assert(err == gpuSuccess);
-assert(result != NULL);
+gpu_assert(err == gpuSuccess);
+gpu_assert(result != NULL);
 return result;
 }
 virtual void deallocate(void* buffer) const {
 gpuError_t err = gpuSetDevice(device_);
 EIGEN_UNUSED_VARIABLE(err)
-assert(err == gpuSuccess);
-assert(buffer != NULL);
+gpu_assert(err == gpuSuccess);
+gpu_assert(buffer != NULL);
 err = gpuFree(buffer);
-assert(err == gpuSuccess);
+gpu_assert(err == gpuSuccess);
 }

 virtual void* scratchpad() const {
@@ -173,7 +173,7 @@ class GpuStreamDevice : public StreamInterface {
 semaphore_ = reinterpret_cast<unsigned int*>(scratch);
 gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
 EIGEN_UNUSED_VARIABLE(err)
-assert(err == gpuSuccess);
+gpu_assert(err == gpuSuccess);
 }
 return semaphore_;
 }
@@ -220,7 +220,7 @@ struct GpuDevice {
 gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice,
 stream_->stream());
 EIGEN_UNUSED_VARIABLE(err)
-assert(err == gpuSuccess);
+gpu_assert(err == gpuSuccess);
 #else
 EIGEN_UNUSED_VARIABLE(dst);
 EIGEN_UNUSED_VARIABLE(src);
@@ -233,21 +233,21 @@ struct GpuDevice {
 gpuError_t err =
 gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
 EIGEN_UNUSED_VARIABLE(err)
-assert(err == gpuSuccess);
+gpu_assert(err == gpuSuccess);
 }

 EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
 gpuError_t err =
 gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
 EIGEN_UNUSED_VARIABLE(err)
-assert(err == gpuSuccess);
+gpu_assert(err == gpuSuccess);
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
 #ifndef EIGEN_GPU_COMPILE_PHASE
 gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
 EIGEN_UNUSED_VARIABLE(err)
-assert(err == gpuSuccess);
+gpu_assert(err == gpuSuccess);
 #else
 eigen_assert(false && "The default device should be used instead to generate kernel code");
 #endif
@@ -276,10 +276,10 @@ struct GpuDevice {
 std::cerr << "Error detected in GPU stream: "
 << gpuGetErrorString(err)
 << std::endl;
-assert(err == gpuSuccess);
+gpu_assert(err == gpuSuccess);
 }
 #else
-assert(false && "The default device should be used instead to generate kernel code");
+gpu_assert(false && "The default device should be used instead to generate kernel code");
 #endif
 }

@@ -326,13 +326,13 @@ struct GpuDevice {

 #define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
 hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
-assert(hipGetLastError() == hipSuccess);
+gpu_assert(hipGetLastError() == hipSuccess);

 #else

 #define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
 (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
-assert(cudaGetLastError() == cudaSuccess);
+gpu_assert(cudaGetLastError() == cudaSuccess);

 #endif

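The launch macro above is the other host-side place that changes: the last-error check appended to every launch now goes through gpu_assert. A hypothetical call site, just to show the shape of an invocation; the kernel name and argument list below are invented for illustration and do not appear in this commit:

// Expands to hipLaunchKernelGGL(...) on HIP or to a <<<grid, block, shmem, stream>>>
// launch on CUDA, followed by gpu_assert on hipGetLastError()/cudaGetLastError().
LAUNCH_GPU_KERNEL((ExampleKernel<float, Index>), num_blocks, block_size, shared_mem, m_device,
                  input_ptr, output_ptr, num_coeffs);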
@@ -342,7 +342,7 @@ static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig co
 #ifndef EIGEN_GPU_COMPILE_PHASE
 gpuError_t status = gpuDeviceSetSharedMemConfig(config);
 EIGEN_UNUSED_VARIABLE(status)
-assert(status == gpuSuccess);
+gpu_assert(status == gpuSuccess);
 #else
 EIGEN_UNUSED_VARIABLE(config)
 #endif
@@ -78,10 +78,11 @@

 #endif

-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-// HIPCC does not support the use of assert on the GPU side.
-#undef assert
-#define assert(COND)
+#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDACC) && (EIGEN_CUDACC_VER==0))
+// clang-cuda and HIPCC do not support the use of assert on the GPU side.
+#define gpu_assert(COND)
+#else
+#define gpu_assert(COND) assert(COND)
 #endif

 #endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
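To make the effect of the definition above concrete, here is a small standalone sketch, not part of the commit, of a kernel using gpu_assert; the preprocessor guards are simplified stand-ins for Eigen's EIGEN_HIP_DEVICE_COMPILE / EIGEN_CUDACC_VER checks, and the kernel itself is invented for illustration:

#include <cassert>

// Simplified stand-in for the guards shown above: compile the check away where
// device-side assert is unsupported, otherwise forward to assert().
#if defined(__HIPCC__) || (defined(__clang__) && defined(__CUDA__))
#define gpu_assert(COND)
#else
#define gpu_assert(COND) assert(COND)
#endif

__global__ void scale(float* data, int n, float factor) {
  // Active under nvcc; expands to nothing under HIPCC and clang-cuda.
  gpu_assert(blockDim.y == 1 && blockDim.z == 1);
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}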
@@ -19,7 +19,7 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
 #if defined(EIGEN_GPU_COMPILE_PHASE)
 // We don't support 3d kernels since we currently only use 1 and
 // 2d kernels.
-assert(threadIdx.z == 0);
+gpu_assert(threadIdx.z == 0);
 return clock64() +
 blockIdx.x * blockDim.x + threadIdx.x +
 gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
@@ -60,10 +60,10 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
 }
 }
 else {
-assert(0 && "Wordsize not supported");
+gpu_assert(0 && "Wordsize not supported");
 }
 #else // EIGEN_CUDA_ARCH >= 300
-assert(0 && "Shouldn't be called on unsupported device");
+gpu_assert(0 && "Shouldn't be called on unsupported device");
 #endif // EIGEN_CUDA_ARCH >= 300
 }

@@ -105,7 +105,7 @@ __device__ inline void atomicReduce(float* output, float accum, SumReducer<float
 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
 atomicAdd(output, accum);
 #else // EIGEN_CUDA_ARCH >= 300
-assert(0 && "Shouldn't be called on unsupported device");
+gpu_assert(0 && "Shouldn't be called on unsupported device");
 #endif // EIGEN_CUDA_ARCH >= 300
 }

@@ -196,7 +196,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
 #endif
 }
 #else // EIGEN_CUDA_ARCH >= 300
-assert(0 && "Shouldn't be called on unsupported device");
+gpu_assert(0 && "Shouldn't be called on unsupported device");
 #endif // EIGEN_CUDA_ARCH >= 300
 }

@@ -304,7 +304,7 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2
 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
 struct FullReductionLauncher {
 static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
-assert(false && "Should only be called on doubles, floats and half floats");
+gpu_assert(false && "Should only be called on doubles, floats and half floats");
 }
 };

@@ -337,7 +337,7 @@ struct FullReductionLauncher<
 template <typename Self, typename Op>
 struct FullReductionLauncher<Self, Op, Eigen::half, false> {
 static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
-assert(false && "Should not be called since there is no packet accessor");
+gpu_assert(false && "Should not be called since there is no packet accessor");
 }
 };

@@ -388,7 +388,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {

 template <typename OutputType>
 static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
-assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
 const Index num_coeffs = array_prod(self.m_impl.dimensions());
 // Don't crash when we're called with an input tensor of size 0.
 if (num_coeffs == 0) {
@@ -479,7 +479,7 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
 }
 }
 #else // EIGEN_CUDA_ARCH >= 300
-assert(0 && "Shouldn't be called on unsupported device");
+gpu_assert(0 && "Shouldn't be called on unsupported device");
 #endif // EIGEN_CUDA_ARCH >= 300
 }

@@ -601,7 +601,7 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
 struct InnerReductionLauncher {
 static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
-assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
+gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
 return true;
 }
 };
@@ -648,7 +648,7 @@ struct InnerReductionLauncher<
 template <typename Self, typename Op>
 struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
 static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
-assert(false && "Should not be called since there is no packet accessor");
+gpu_assert(false && "Should not be called since there is no packet accessor");
 return true;
 }
 };
@@ -709,7 +709,7 @@ struct InnerReducer<Self, Op, GpuDevice> {

 template <typename OutputType>
 static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
 const Index num_coeffs = array_prod(self.m_impl.dimensions());
 // Don't crash when we're called with an input tensor of size 0.
 if (num_coeffs == 0) {
@@ -777,7 +777,7 @@ struct OuterReducer<Self, Op, GpuDevice> {
 EIGEN_DEVICE_FUNC
 #endif
 bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
-assert(false && "Should only be called to reduce doubles or floats on a gpu device");
+gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
 return true;
 }
