diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 158d250f0..e6e586b7b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -722,26 +722,26 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D( #endif // Load inputs to shared memory - const size_t first_x = blockIdx.x * maxX; - const size_t last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; - const size_t num_x_input = last_x - first_x + kernelSizeX; + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + kernelSizeX; - const size_t first_y = blockIdx.y * maxY; - const size_t last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; - const size_t num_y_input = last_y - first_y + kernelSizeY; + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + kernelSizeY; - const size_t first_z = blockIdx.z * maxZ; - const size_t last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1; - const size_t num_z_input = last_z - first_z + kernelSizeZ; + const int first_z = blockIdx.z * maxZ; + const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1; + const int num_z_input = last_z - first_z + kernelSizeZ; for (int p = 0; p < numPlanes; ++p) { const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); const int plane_kernel_offset = 0; - for (size_t k = threadIdx.z; k < num_z_input; k += blockDim.z) { - for (size_t j = threadIdx.y; j < num_y_input; j += blockDim.y) { - for (size_t i = threadIdx.x; i < num_x_input; i += blockDim.x) { + for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); } @@ -751,18 +751,18 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D( __syncthreads(); // Convolution - const size_t num_z_output = last_z - first_z + 1; - const size_t num_y_output = last_y - first_y + 1; - const size_t num_x_output = last_x - first_x + 1; + const int num_z_output = last_z - first_z + 1; + const int num_y_output = last_y - first_y + 1; + const int num_x_output = last_x - first_x + 1; const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); - for (size_t k = threadIdx.z; k < num_z_output; k += blockDim.z) { - for (size_t j = threadIdx.y; j < num_y_output; j += blockDim.y) { - for (size_t i = threadIdx.x; i < num_x_output; i += blockDim.x) { + for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { float result = 0.0f; - for (size_t n = 0; n < kernelSizeZ; ++n) { - for (size_t m = 0; m < kernelSizeY; ++m) { - for (size_t l = 0; l < kernelSizeX; ++l) { + for (int n = 0; n < kernelSizeZ; ++n) { + for (int m = 0; m < kernelSizeY; ++m) { + for (int l = 0; l < kernelSizeX; ++l) { result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; } }