Fix GPU conv3d out-of-resources failure.

This commit is contained in:
Antonio Sánchez 2023-02-28 21:25:00 +00:00 committed by Rasmus Munk Larsen
parent 62d5cfe835
commit ba7417f146

View File

@@ -722,26 +722,26 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
#endif
// Load inputs to shared memory
const size_t first_x = blockIdx.x * maxX;
const size_t last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
const size_t num_x_input = last_x - first_x + kernelSizeX;
const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
const int num_x_input = last_x - first_x + kernelSizeX;
const size_t first_y = blockIdx.y * maxY;
const size_t last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
const size_t num_y_input = last_y - first_y + kernelSizeY;
const int first_y = blockIdx.y * maxY;
const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
const int num_y_input = last_y - first_y + kernelSizeY;
const size_t first_z = blockIdx.z * maxZ;
const size_t last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
const size_t num_z_input = last_z - first_z + kernelSizeZ;
const int first_z = blockIdx.z * maxZ;
const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
const int num_z_input = last_z - first_z + kernelSizeZ;
for (int p = 0; p < numPlanes; ++p) {
const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = 0;
for (size_t k = threadIdx.z; k < num_z_input; k += blockDim.z) {
for (size_t j = threadIdx.y; j < num_y_input; j += blockDim.y) {
for (size_t i = threadIdx.x; i < num_x_input; i += blockDim.x) {
for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
}
@@ -751,18 +751,18 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
__syncthreads();
// Convolution
const size_t num_z_output = last_z - first_z + 1;
const size_t num_y_output = last_y - first_y + 1;
const size_t num_x_output = last_x - first_x + 1;
const int num_z_output = last_z - first_z + 1;
const int num_y_output = last_y - first_y + 1;
const int num_x_output = last_x - first_x + 1;
const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
for (size_t k = threadIdx.z; k < num_z_output; k += blockDim.z) {
for (size_t j = threadIdx.y; j < num_y_output; j += blockDim.y) {
for (size_t i = threadIdx.x; i < num_x_output; i += blockDim.x) {
for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
float result = 0.0f;
for (size_t n = 0; n < kernelSizeZ; ++n) {
for (size_t m = 0; m < kernelSizeY; ++m) {
for (size_t l = 0; l < kernelSizeX; ++l) {
for (int n = 0; n < kernelSizeZ; ++n) {
for (int m = 0; m < kernelSizeY; ++m) {
for (int l = 0; l < kernelSizeX; ++l) {
result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
}
}