From cfdabbcc8f708c06da2bfa4e924edc25619f013a Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Wed, 20 Jun 2018 12:57:02 -0400 Subject: [PATCH] removing the *Hip files from the unsupported/Eigen/CXX11/src/Tensor and unsupported/test directories --- .../CXX11/src/Tensor/TensorContractionHip.h | 1521 ----------------- .../CXX11/src/Tensor/TensorConvolutionHip.h | 1119 ------------ .../Eigen/CXX11/src/Tensor/TensorDeviceHip.h | 352 ---- .../CXX11/src/Tensor/TensorReductionHip.h | 815 --------- unsupported/test/cxx11_tensor_argmax_hip.cu | 251 --- .../test/cxx11_tensor_cast_float16_hip.cu | 79 - unsupported/test/cxx11_tensor_contract_hip.cu | 215 --- unsupported/test/cxx11_tensor_device_hip.cu | 389 ----- unsupported/test/cxx11_tensor_hip.cu | 1295 -------------- .../test/cxx11_tensor_of_float16_hip.cu | 498 ------ unsupported/test/cxx11_tensor_random_hip.cu | 85 - .../test/cxx11_tensor_reduction_hip.cu | 154 -- unsupported/test/cxx11_tensor_scan_hip.cu | 76 - 13 files changed, 6849 deletions(-) delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h delete mode 100644 unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h delete mode 100644 unsupported/test/cxx11_tensor_argmax_hip.cu delete mode 100644 unsupported/test/cxx11_tensor_cast_float16_hip.cu delete mode 100644 unsupported/test/cxx11_tensor_contract_hip.cu delete mode 100644 unsupported/test/cxx11_tensor_device_hip.cu delete mode 100644 unsupported/test/cxx11_tensor_hip.cu delete mode 100644 unsupported/test/cxx11_tensor_of_float16_hip.cu delete mode 100644 unsupported/test/cxx11_tensor_random_hip.cu delete mode 100644 unsupported/test/cxx11_tensor_reduction_hip.cu delete mode 100644 unsupported/test/cxx11_tensor_scan_hip.cu diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h deleted file mode 100644 index 7561846a3..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionHip.h +++ /dev/null @@ -1,1521 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014-2015 Benoit Steiner -// Copyright (C) 2015 Navdeep Jaitly -// Copyright (C) 2014 Eric Martin -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H - -#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC) - -namespace Eigen { - -template -__device__ EIGEN_STRONG_INLINE void -EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem, - const Index m_size, const Index n_size, const Index k_size) { - - const Index m_block_idx = hipBlockIdx_x; - const Index n_block_idx = hipBlockIdx_y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - // declare and initialize 64 registers for output 8x8 block - - // prefetch registers - Scalar lhs_pf0; - Scalar lhs_pf1; - Scalar lhs_pf2; - Scalar lhs_pf3; - Scalar lhs_pf4; - Scalar lhs_pf5; - Scalar lhs_pf6; - Scalar lhs_pf7; - - Scalar rhs_pf0; - Scalar rhs_pf1; - Scalar rhs_pf2; - Scalar rhs_pf3; - Scalar rhs_pf4; - Scalar rhs_pf5; - Scalar rhs_pf6; - Scalar rhs_pf7; - - // shared memory is formatted - // (contract idx in block, nocontract idx in block, block idx) - // where block idx is column major. This transposition limits the number of - // bank conflicts when reading the LHS. The core idea is that since the contracting - // index is shared by both sides, then the contracting index should be in hipThreadIdx_x. - - // On the LHS, we pad each row inside of each block with an extra element. This makes - // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts - // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. - - // On the RHS we just add 8 padding elements to the end of each block. This gives no bank - // conflicts on writes and also none on reads. - - // storage indices - const Index lhs_store_idx_base = hipThreadIdx_y * 72 + hipThreadIdx_x * 9 + hipThreadIdx_z; - const Index rhs_store_idx_base = hipThreadIdx_y * 72 + hipThreadIdx_z * 8 + hipThreadIdx_x; - - const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; - const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; - const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; - const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; - const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; - const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; - const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; - const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; - - const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; - const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; - const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; - const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; - const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; - const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; - const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; - const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; - - // in the loading code, the following variables are important: - // hipThreadIdx_x: the vertical position in an 8x8 block - // hipThreadIdx_y: the vertical index of the 8x8 block in the grid - // hipThreadIdx_z: the horizontal position in an 8x8 block - // k: the horizontal index of the 8x8 block in the grid - // - // The k parameter is implicit (it was the loop counter for a loop that went - // from 0 to <8, but now that loop is unrolled in the below code. 
- - const Index load_idx_vert = hipThreadIdx_x + 8 * hipThreadIdx_y; - const Index lhs_vert = base_m + load_idx_vert; - -#define prefetchIntoRegisters(base_k) \ - { \ - lhs_pf0 = conv(0); \ - lhs_pf1 = conv(0); \ - lhs_pf2 = conv(0); \ - lhs_pf3 = conv(0); \ - lhs_pf4 = conv(0); \ - lhs_pf5 = conv(0); \ - lhs_pf6 = conv(0); \ - lhs_pf7 = conv(0); \ - \ - rhs_pf0 = conv(0); \ - rhs_pf1 = conv(0); \ - rhs_pf2 = conv(0); \ - rhs_pf3 = conv(0); \ - rhs_pf4 = conv(0); \ - rhs_pf5 = conv(0); \ - rhs_pf6 = conv(0); \ - rhs_pf7 = conv(0); \ - \ - if (!needs_edge_check || lhs_vert < m_size) { \ - const Index lhs_horiz_0 = base_k + hipThreadIdx_z + 0 * 8; \ - const Index lhs_horiz_1 = base_k + hipThreadIdx_z + 1 * 8; \ - const Index lhs_horiz_2 = base_k + hipThreadIdx_z + 2 * 8; \ - const Index lhs_horiz_3 = base_k + hipThreadIdx_z + 3 * 8; \ - const Index lhs_horiz_4 = base_k + hipThreadIdx_z + 4 * 8; \ - const Index lhs_horiz_5 = base_k + hipThreadIdx_z + 5 * 8; \ - const Index lhs_horiz_6 = base_k + hipThreadIdx_z + 6 * 8; \ - const Index lhs_horiz_7 = base_k + hipThreadIdx_z + 7 * 8; \ - \ - if (!needs_edge_check || lhs_horiz_7 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ - lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ - } else if (lhs_horiz_6 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ - } else if (lhs_horiz_5 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - } else if (lhs_horiz_4 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - } else if (lhs_horiz_3 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - } else if (lhs_horiz_2 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - } else if (lhs_horiz_1 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - } else if (lhs_horiz_0 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - } \ - } \ - \ - const Index rhs_vert = base_k + load_idx_vert; \ - if (!needs_edge_check || rhs_vert < k_size) { \ - const Index rhs_horiz_0 = base_n + hipThreadIdx_z + 0 * 8; \ - const Index rhs_horiz_1 = base_n + hipThreadIdx_z + 1 * 8; \ - const Index rhs_horiz_2 = base_n + hipThreadIdx_z + 2 * 8; \ - const Index rhs_horiz_3 = base_n + hipThreadIdx_z + 3 * 8; \ - const Index rhs_horiz_4 = base_n + hipThreadIdx_z + 4 * 8; \ - const Index rhs_horiz_5 = base_n + hipThreadIdx_z + 5 * 8; \ - const Index rhs_horiz_6 = base_n + hipThreadIdx_z + 6 * 8; \ - const Index rhs_horiz_7 = base_n + hipThreadIdx_z + 7 * 
8; \ - \ - if (rhs_horiz_7 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ - rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ - } else if (rhs_horiz_6 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ - } else if (rhs_horiz_5 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - } else if (rhs_horiz_4 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - } else if (rhs_horiz_3 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - } else if (rhs_horiz_2 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - } else if (rhs_horiz_1 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - } else if (rhs_horiz_0 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - } \ - } \ - } \ - -#define writeRegToShmem(_) \ - lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ - rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ - \ - lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ - rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ - \ - lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ - rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ - \ - lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ - rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ - \ - lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ - rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ - \ - lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ - rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ - \ - lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ - rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ - \ - lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ - rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ - - // declare and initialize result array -#define res(i, j) _res_##i##j -#define initResultRow(i) \ - Scalar res(i, 0) = conv(0); \ - Scalar res(i, 1) = conv(0); \ - Scalar res(i, 2) = conv(0); \ - Scalar res(i, 3) = conv(0); \ - Scalar res(i, 4) = conv(0); \ - Scalar res(i, 5) = conv(0); \ - Scalar res(i, 6) = conv(0); \ - Scalar res(i, 7) = conv(0); \ - - internal::scalar_cast_op conv; - initResultRow(0); - initResultRow(1); - initResultRow(2); - initResultRow(3); - initResultRow(4); - initResultRow(5); - initResultRow(6); - initResultRow(7); -#undef initResultRow - - for (Index base_k = 0; base_k < k_size; base_k += 64) { - // wait for previous iteration to finish with shmem. 
Despite common sense, - // the code is a bit faster with this here then at bottom of loop - __syncthreads(); - - prefetchIntoRegisters(base_k); - writeRegToShmem(); - - #undef prefetchIntoRegisters - #undef writeRegToShmem - - // wait for shared mem packing to be done before starting computation - __syncthreads(); - - // compute 8x8 matrix product by outer product. This involves packing one column - // of LHS and one row of RHS into registers (takes 16 registers). - -#define lcol(i) _lcol##i - Scalar lcol(0); - Scalar lcol(1); - Scalar lcol(2); - Scalar lcol(3); - Scalar lcol(4); - Scalar lcol(5); - Scalar lcol(6); - Scalar lcol(7); - -#define rrow(j) _rrow##j - Scalar rrow(0); - Scalar rrow(1); - Scalar rrow(2); - Scalar rrow(3); - Scalar rrow(4); - Scalar rrow(5); - Scalar rrow(6); - Scalar rrow(7); - - // Now x corresponds to k, y to m, and z to n - const Scalar* lhs_block = &lhs_shmem[hipThreadIdx_x + 9 * hipThreadIdx_y]; - const Scalar* rhs_block = &rhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_z]; - -#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] -#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] - -#define loadData(i, j) \ - lcol(0) = lhs_element(0, j); \ - rrow(0) = rhs_element(i, 0); \ - lcol(1) = lhs_element(1, j); \ - rrow(1) = rhs_element(i, 1); \ - lcol(2) = lhs_element(2, j); \ - rrow(2) = rhs_element(i, 2); \ - lcol(3) = lhs_element(3, j); \ - rrow(3) = rhs_element(i, 3); \ - lcol(4) = lhs_element(4, j); \ - rrow(4) = rhs_element(i, 4); \ - lcol(5) = lhs_element(5, j); \ - rrow(5) = rhs_element(i, 5); \ - lcol(6) = lhs_element(6, j); \ - rrow(6) = rhs_element(i, 6); \ - lcol(7) = lhs_element(7, j); \ - rrow(7) = rhs_element(i, 7); \ - -#define computeCol(j) \ - res(0, j) += lcol(0) * rrow(j); \ - res(1, j) += lcol(1) * rrow(j); \ - res(2, j) += lcol(2) * rrow(j); \ - res(3, j) += lcol(3) * rrow(j); \ - res(4, j) += lcol(4) * rrow(j); \ - res(5, j) += lcol(5) * rrow(j); \ - res(6, j) += lcol(6) * rrow(j); \ - res(7, j) += lcol(7) * rrow(j); \ - -#define computePass(i) \ - loadData(i, i); \ - \ - computeCol(0); \ - computeCol(1); \ - computeCol(2); \ - computeCol(3); \ - computeCol(4); \ - computeCol(5); \ - computeCol(6); \ - computeCol(7); \ - - computePass(0); - computePass(1); - computePass(2); - computePass(3); - computePass(4); - computePass(5); - computePass(6); - computePass(7); - -#undef lcol -#undef rrow -#undef lhs_element -#undef rhs_element -#undef loadData -#undef computeCol -#undef computePass - } // end loop over k - - // we've now iterated over all of the large (ie width 64) k blocks and - // accumulated results in registers. At this point thread (x, y, z) contains - // the sum across all big k blocks of the product of little k block of index (x, y) - // with block of index (y, z). To compute the final output, we need to reduce - // the 8 threads over y by summation. 
-#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) - -#define reduceRow(i, mask) \ - shuffleInc(i, 0, mask); \ - shuffleInc(i, 1, mask); \ - shuffleInc(i, 2, mask); \ - shuffleInc(i, 3, mask); \ - shuffleInc(i, 4, mask); \ - shuffleInc(i, 5, mask); \ - shuffleInc(i, 6, mask); \ - shuffleInc(i, 7, mask); \ - -#define reduceMatrix(mask) \ - reduceRow(0, mask); \ - reduceRow(1, mask); \ - reduceRow(2, mask); \ - reduceRow(3, mask); \ - reduceRow(4, mask); \ - reduceRow(5, mask); \ - reduceRow(6, mask); \ - reduceRow(7, mask); \ - - // actually perform the reduction, now each thread of index (_, y, z) - // contains the correct values in its registers that belong in the output - // block - reduceMatrix(1); - reduceMatrix(2); - reduceMatrix(4); - -#undef shuffleInc -#undef reduceRow -#undef reduceMatrix - - // now we need to copy the 64 values into main memory. We can't split work - // among threads because all variables are in registers. There's 2 ways - // to do this: - // (1) have 1 thread do 64 writes from registers into global memory - // (2) have 1 thread do 64 writes into shared memory, and then 8 threads - // each do 8 writes into global memory. We can just overwrite the shared - // memory from the problem we just solved. - // (2) is slightly faster than (1) due to less branching and more ILP - - // TODO: won't yield much gain, but could just use currently unused shared mem - // and then we won't have to sync - // wait for shared mem to be out of use - __syncthreads(); - -#define writeResultShmem(i, j) \ - lhs_shmem[i + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * j] = res(i, j); \ - -#define writeRow(i) \ - writeResultShmem(i, 0); \ - writeResultShmem(i, 1); \ - writeResultShmem(i, 2); \ - writeResultShmem(i, 3); \ - writeResultShmem(i, 4); \ - writeResultShmem(i, 5); \ - writeResultShmem(i, 6); \ - writeResultShmem(i, 7); \ - - if (hipThreadIdx_x == 0) { - writeRow(0); - writeRow(1); - writeRow(2); - writeRow(3); - writeRow(4); - writeRow(5); - writeRow(6); - writeRow(7); - } -#undef writeResultShmem -#undef writeRow - - const int max_i_write = numext::mini((int)((m_size - base_m - hipThreadIdx_y + 7) / 8), 8); - const int max_j_write = numext::mini((int)((n_size - base_n - hipThreadIdx_z + 7) / 8), 8); - - if (hipThreadIdx_x < max_i_write) { - if (max_j_write == 8) { - // TODO: can i trade bank conflicts for coalesced writes? 
- Scalar val0 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 0]; - Scalar val1 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 1]; - Scalar val2 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 2]; - Scalar val3 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 3]; - Scalar val4 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 4]; - Scalar val5 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 5]; - Scalar val6 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 6]; - Scalar val7 = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * 7]; - - output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 0) = val0; - output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 1) = val1; - output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 2) = val2; - output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 3) = val3; - output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 4) = val4; - output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 5) = val5; - output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 6) = val6; - output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * 7) = val7; - } else { -#pragma unroll 7 - for (int j = 0; j < max_j_write; j++) { - Scalar val = lhs_shmem[hipThreadIdx_x + 8 * hipThreadIdx_y + 64 * hipThreadIdx_z + 512 * j]; - output(base_m + hipThreadIdx_y + 8 * hipThreadIdx_x, base_n + hipThreadIdx_z + 8 * j) = val; - } - } - } -#undef res -} - - -template -__global__ void -__launch_bounds__(512, 1) -EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ Scalar lhs_shmem[72 * 64]; - __shared__ Scalar rhs_shmem[72 * 64]; - - const Index m_block_idx = hipBlockIdx_x; - const Index n_block_idx = hipBlockIdx_y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - if (base_m + 63 < m_size && base_n + 63 < n_size) { - EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); - } else { - EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); - } -} - - -template -__device__ EIGEN_STRONG_INLINE void -EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, float2 lhs_shmem2[][16], - float2 rhs_shmem2[][8], const Index m_size, - const Index n_size, const Index k_size, - const Index base_m, const Index base_n) { - - // prefetch registers - float4 lhs_pf0, rhs_pf0; - - float4 results[4]; - for (int i=0; i < 4; i++) { - results[i].x = results[i].y = results[i].z = results[i].w = 0; - } - - -#define prefetch_lhs(reg, row, col) \ - if (!CHECK_LHS_BOUNDARY) { \ - if (col < k_size) { \ - /*reg = lhs.template loadPacket(row, col);*/ \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - reg.z =lhs(row + 2, col); \ - reg.w =lhs(row + 3, col); \ - } \ - } else { \ - if (col < k_size) { \ - if (row + 3 < m_size) { \ - /*reg =lhs.template loadPacket(row, col);*/ \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - reg.z =lhs(row + 2, col); \ - reg.w =lhs(row 
+ 3, col); \ - } else if (row + 2 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - reg.z =lhs(row + 2, col); \ - } else if (row + 1 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - } else if (row < m_size) { \ - reg.x =lhs(row + 0, col); \ - } \ - } \ - } \ - - - Index lhs_vert = base_m+hipThreadIdx_x*4; - - for (Index k = 0; k < k_size; k += 16) { - //lhs_pf0 = internal::pset1(0); - //rhs_pf0 = internal::pset1(0); - lhs_pf0 = make_float4(0, 0, 0, 0); - rhs_pf0 = make_float4(0, 0, 0, 0); - - Index lhs_horiz = hipThreadIdx_y+k; - prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) - - Index rhs_vert = k+(hipThreadIdx_x%4)*4; - Index rhs_horiz0 = (hipThreadIdx_x>>2)+hipThreadIdx_y*4+base_n; - - if (!CHECK_RHS_BOUNDARY) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - //rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if (rhs_vert + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } else { - if (rhs_horiz0 < n_size) { - if ((rhs_vert + 3) < k_size) { - //rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); - } else if ((rhs_vert + 2) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if ((rhs_vert + 1) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } - } - float x1, x2 ; - // the following can be a bitwise operation..... some day. - if((hipThreadIdx_x%8) < 4) { - x1 = rhs_pf0.y; - x2 = rhs_pf0.w; - } else { - x1 = rhs_pf0.x; - x2 = rhs_pf0.z; - } - x1 = __shfl_xor(x1, 4); - x2 = __shfl_xor(x2, 4); - if((hipThreadIdx_x%8) < 4) { - rhs_pf0.y = x1; - rhs_pf0.w = x2; - } else { - rhs_pf0.x = x1; - rhs_pf0.z = x2; - } - - // We have 64 features. - // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. - // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. - // ... - // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 - // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 - // ... - rhs_shmem2[(hipThreadIdx_x>>3)+ hipThreadIdx_y*2][hipThreadIdx_x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); - rhs_shmem2[(hipThreadIdx_x>>3)+ hipThreadIdx_y*2+32][hipThreadIdx_x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); - - // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // ... - // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) - // ... 
- - lhs_shmem2[hipThreadIdx_y][hipThreadIdx_x] = make_float2(lhs_pf0.x, lhs_pf0.y); - lhs_shmem2[hipThreadIdx_y+16][hipThreadIdx_x] = make_float2(lhs_pf0.z, lhs_pf0.w); - - -#define add_vals(fl1, fl2, fr1, fr2)\ - results[0].x += fl1.x * fr1.x;\ - results[0].y += fl1.y * fr1.x;\ - results[0].z += fl2.x * fr1.x;\ - results[0].w += fl2.y * fr1.x;\ -\ - results[1].x += fl1.x * fr1.y;\ - results[1].y += fl1.y * fr1.y;\ - results[1].z += fl2.x * fr1.y;\ - results[1].w += fl2.y * fr1.y;\ -\ - results[2].x += fl1.x * fr2.x;\ - results[2].y += fl1.y * fr2.x;\ - results[2].z += fl2.x * fr2.x;\ - results[2].w += fl2.y * fr2.x;\ -\ - results[3].x += fl1.x * fr2.y;\ - results[3].y += fl1.y * fr2.y;\ - results[3].z += fl2.x * fr2.y;\ - results[3].w += fl2.y * fr2.y;\ - - __syncthreads(); - - // Do the multiplies. - #pragma unroll - for (int koff = 0; koff < 16; koff ++) { - // 32 x threads. - float2 fl1 = lhs_shmem2[koff][hipThreadIdx_x]; - float2 fl2 = lhs_shmem2[koff + 16][hipThreadIdx_x]; - - int start_feature = hipThreadIdx_y * 4; - float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; - float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; - - add_vals(fl1, fl2, fr1, fr2) - } - __syncthreads(); - } - -#undef prefetch_lhs -#undef add_vals - - Index horiz_base = hipThreadIdx_y*4+base_n; - if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (!CHECK_RHS_BOUNDARY) { - // CHECK LHS - if (lhs_vert + 3 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (lhs_vert + 2 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - } - } else if (lhs_vert + 1 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - } - } else if (lhs_vert < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - } - } - } else if (!CHECK_LHS_BOUNDARY) { - // CHECK RHS - /* - int ncols_rem = fminf(n_size- horiz_base, 4); - for (int i = 0; i < ncols_rem; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - }*/ - for (int i = 0; i < 4; i++) { - if (horiz_base+i < n_size) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } else { - // CHECK both boundaries. 
- for (int i = 0; i < 4; i++) { - if (horiz_base+i < n_size) { - if (lhs_vert < m_size) - output(lhs_vert, horiz_base + i) = results[i].x; - if (lhs_vert + 1 < m_size) - output(lhs_vert + 1, horiz_base + i) = results[i].y; - if (lhs_vert + 2 < m_size) - output(lhs_vert + 2, horiz_base + i) = results[i].z; - if (lhs_vert + 3 < m_size) - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } -} - - -template -__device__ EIGEN_STRONG_INLINE void -EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, float2 lhs_shmem2[][32], - float2 rhs_shmem2[][8], const Index m_size, - const Index n_size, const Index k_size, - const Index base_m, const Index base_n) { - - // prefetch registers - float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; - float4 rhs_pf0, rhs_pf1; - - float4 results[8]; - for (int i=0; i < 8; i++) { - results[i].x = results[i].y = results[i].z = results[i].w = 0; - } - - - Index lhs_vert = base_m+hipThreadIdx_x*4+(hipThreadIdx_y%4)*32; - for (Index k = 0; k < k_size; k += 32) { - /*lhs_pf0 = internal::pset1(0); - lhs_pf1 = internal::pset1(0); - lhs_pf2 = internal::pset1(0); - lhs_pf3 = internal::pset1(0); - - rhs_pf0 = internal::pset1(0); - rhs_pf1 = internal::pset1(0);*/ - - - lhs_pf0 = make_float4(0, 0, 0, 0); - lhs_pf1 = make_float4(0, 0, 0, 0); - lhs_pf2 = make_float4(0, 0, 0, 0); - lhs_pf3 = make_float4(0, 0, 0, 0); - - rhs_pf0 = make_float4(0, 0, 0, 0); - rhs_pf1 = make_float4(0, 0, 0, 0); - - if (!CHECK_LHS_BOUNDARY) { - if ((hipThreadIdx_y/4+k+24) < k_size) { - //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); - //lhs_pf1 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+8)); - //lhs_pf2 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+16)); - //lhs_pf3 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+24)); - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24)); - lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); - lhs_pf3.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); - } else if ((hipThreadIdx_y/4+k+16) < k_size) { - //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); - //lhs_pf1 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+8)); - //lhs_pf2 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+16)); - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, 
(hipThreadIdx_y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - } else if ((hipThreadIdx_y/4+k+8) < k_size) { - //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); - //lhs_pf1 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+8)); - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - } else if ((hipThreadIdx_y/4+k) < k_size) { - //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - } - } else { - // just CHECK_LHS_BOUNDARY - if (lhs_vert + 3 < m_size) { - if ((hipThreadIdx_y/4+k+24) < k_size) { - //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); - //lhs_pf1 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+8)); - //lhs_pf2 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+16)); - //lhs_pf3 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+24)); - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24)); - lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); - lhs_pf3.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); - } else if ((hipThreadIdx_y/4+k+16) < k_size) { - //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); - //lhs_pf1 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+8)); - //lhs_pf2 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+16)); - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - lhs_pf2.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - } else if ((hipThreadIdx_y/4+k+8) < k_size) { - //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); - //lhs_pf1 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k+8)); - lhs_pf0.x =lhs(lhs_vert + 
0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf1.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - } else if ((hipThreadIdx_y/4+k) < k_size) { - //lhs_pf0 =lhs.template loadPacket(lhs_vert, (hipThreadIdx_y/4+k)); - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf0.w =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - } - } else if (lhs_vert + 2 < m_size) { - if ((hipThreadIdx_y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24)); - lhs_pf3.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+24)); - } else if ((hipThreadIdx_y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+16)); - } else if ((hipThreadIdx_y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k+8)); - } else if ((hipThreadIdx_y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (hipThreadIdx_y/4+k)); - } - } else if (lhs_vert + 1 < m_size) { - if ((hipThreadIdx_y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+24)); - } else if ((hipThreadIdx_y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+16)); - } 
else if ((hipThreadIdx_y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k+8)); - } else if ((hipThreadIdx_y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (hipThreadIdx_y/4+k)); - } - } else if (lhs_vert < m_size) { - if ((hipThreadIdx_y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+24)); - } else if ((hipThreadIdx_y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+16)); - } else if ((hipThreadIdx_y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k+8)); - } else if ((hipThreadIdx_y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (hipThreadIdx_y/4+k)); - } - } - } - __syncthreads(); - Index rhs_vert = k+hipThreadIdx_x*4; - Index rhs_horiz0 = hipThreadIdx_y*2+base_n; - Index rhs_horiz1 = hipThreadIdx_y*2+1+base_n; - if (!CHECK_RHS_BOUNDARY) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - //rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); - //rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); - rhs_pf1.w = rhs(rhs_vert + 3, rhs_horiz1); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); - } else if (rhs_vert + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - } - } else { - if (rhs_horiz1 < n_size) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - //rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); - //rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); - rhs_pf1.w = rhs(rhs_vert + 3, rhs_horiz1); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); - } else if (k+hipThreadIdx_x*4 + 1 < k_size) { - rhs_pf0.x = 
rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - } else if (k+hipThreadIdx_x*4 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - } - } else if (rhs_horiz0 < n_size) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - //rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf0.w = rhs(rhs_vert + 3, rhs_horiz0); - } else if ((rhs_vert + 2) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if ((rhs_vert + 1) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } - } - __syncthreads(); - // Loaded. Do computation - // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. - // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. - // .. - // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 - rhs_shmem2[hipThreadIdx_y][hipThreadIdx_x] = make_float2(rhs_pf0.x, rhs_pf1.x); - // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. - // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. - // .. - rhs_shmem2[hipThreadIdx_y+32][hipThreadIdx_x] = make_float2(rhs_pf0.y, rhs_pf1.y); - // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. - // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. - rhs_shmem2[hipThreadIdx_y+64][hipThreadIdx_x] = make_float2(rhs_pf0.z, rhs_pf1.z); - // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. - // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. - rhs_shmem2[hipThreadIdx_y+96][hipThreadIdx_x] = make_float2(rhs_pf0.w, rhs_pf1.w); - - // LHS. - // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) - // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) - // ... - // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) - // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. 
(126, 127) - - -#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ - results[0].x += a_feat1.x * f1.x;\ - results[1].x += a_feat1.x * f1.y;\ - results[2].x += a_feat1.x * f2.x;\ - results[3].x += a_feat1.x * f2.y;\ - results[4].x += a_feat1.x * f3.x;\ - results[5].x += a_feat1.x * f3.y;\ - results[6].x += a_feat1.x * f4.x;\ - results[7].x += a_feat1.x * f4.y;\ -\ - results[0].y += a_feat1.y * f1.x;\ - results[1].y += a_feat1.y * f1.y;\ - results[2].y += a_feat1.y * f2.x;\ - results[3].y += a_feat1.y * f2.y;\ - results[4].y += a_feat1.y * f3.x;\ - results[5].y += a_feat1.y * f3.y;\ - results[6].y += a_feat1.y * f4.x;\ - results[7].y += a_feat1.y * f4.y;\ -\ - results[0].z += a_feat2.x * f1.x;\ - results[1].z += a_feat2.x * f1.y;\ - results[2].z += a_feat2.x * f2.x;\ - results[3].z += a_feat2.x * f2.y;\ - results[4].z += a_feat2.x * f3.x;\ - results[5].z += a_feat2.x * f3.y;\ - results[6].z += a_feat2.x * f4.x;\ - results[7].z += a_feat2.x * f4.y;\ -\ - results[0].w += a_feat2.y * f1.x;\ - results[1].w += a_feat2.y * f1.y;\ - results[2].w += a_feat2.y * f2.x;\ - results[3].w += a_feat2.y * f2.y;\ - results[4].w += a_feat2.y * f3.x;\ - results[5].w += a_feat2.y * f3.y;\ - results[6].w += a_feat2.y * f4.x;\ - results[7].w += a_feat2.y * f4.y;\ - - lhs_shmem2[hipThreadIdx_y/4][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); - lhs_shmem2[hipThreadIdx_y/4+8][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); - lhs_shmem2[hipThreadIdx_y/4+16][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); - lhs_shmem2[hipThreadIdx_y/4+24][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); - - lhs_shmem2[hipThreadIdx_y/4 + 32][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); - lhs_shmem2[hipThreadIdx_y/4 + 40][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); - lhs_shmem2[hipThreadIdx_y/4 + 48][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); - lhs_shmem2[hipThreadIdx_y/4 + 56][hipThreadIdx_x+(hipThreadIdx_y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); - - __syncthreads(); - - // Do the multiplies. - #pragma unroll - for (int koff = 0; koff < 32; koff ++) { - float2 a3 = lhs_shmem2[koff][hipThreadIdx_x + (hipThreadIdx_y % 4) * 8]; - float2 a4 = lhs_shmem2[koff + 32][hipThreadIdx_x + (hipThreadIdx_y % 4) * 8]; - - // first feature is at (hipThreadIdx_y/4) * 8 last is at start + 8. 
- int start_feature = (hipThreadIdx_y / 4) * 8; - - float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; - float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; - float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; - float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; - - add_vals(a3, a4, br1, br2, br3, br4) - } - __syncthreads(); - } // end loop over k - - - __syncthreads(); - Index horiz_base = (hipThreadIdx_y/4)*8+base_n; - if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (!CHECK_RHS_BOUNDARY) { - if (lhs_vert + 3 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (lhs_vert + 2 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - } - } else if (lhs_vert + 1 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - } - } else if (lhs_vert < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - } - } - } else if (!CHECK_LHS_BOUNDARY) { - // CHECK BOUNDARY_B - for (int i = 0; i < 8; i++) { - if (horiz_base + i < n_size) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } else { - // CHECK both boundaries. 
- for (int i = 0; i < 8; i++) { - if (horiz_base + i < n_size) { - if (lhs_vert < m_size) - output(lhs_vert, horiz_base + i) = results[i].x; - if (lhs_vert + 1 < m_size) - output(lhs_vert + 1, horiz_base + i) = results[i].y; - if (lhs_vert + 2 < m_size) - output(lhs_vert + 2, horiz_base + i) = results[i].z; - if (lhs_vert + 3 < m_size) - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } -} - - -template -__global__ void -__launch_bounds__(256, 1) -EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ float2 lhs_shmem[64*32]; - __shared__ float2 rhs_shmem[128*8]; - - typedef float2 LHS_MEM[64][32]; - typedef float2 RHS_MEM[128][8]; - - typedef float2 LHS_MEM16x16[32][16]; - typedef float2 RHS_MEM16x16[64][8]; - - const Index m_block_idx = hipBlockIdx_x; - const Index n_block_idx = hipBlockIdx_y; - - const Index base_m = 128 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - bool check_rhs = (base_n + 63) >= n_size; - bool check_lhs128 = (base_m + 127) >= m_size; - - if (!check_rhs) { - if (!check_lhs128) { - // >= 128 rows left - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } - } else { - if (!check_lhs128) { - // >= 128 rows left - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } - } -} - -template -__global__ void -__launch_bounds__(256, 1) -EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ float2 lhs_shmem[32][16]; - __shared__ float2 rhs_shmem[64][8]; - - const Index m_block_idx = hipBlockIdx_x; - const Index n_block_idx = hipBlockIdx_y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - if (base_m + 63 < m_size) { - if (base_n + 63 < n_size) { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } - } else { - if (base_n + 63 < n_size) { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } - } -} - - -template -struct TensorEvaluator, GpuDevice> : - public TensorContractionEvaluatorBase, GpuDevice> > { - - typedef GpuDevice Device; - - typedef TensorEvaluator, Device> Self; - typedef TensorContractionEvaluatorBase Base; - - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - enum { - Layout = TensorEvaluator::Layout, - }; - - // Most of the 
code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size::Dimensions>::value; - static const int RDims = - internal::array_size::Dimensions>::value; - static const int ContractDims = internal::array_size::value; - - typedef array left_dim_mapper_t; - typedef array right_dim_mapper_t; - - typedef array contract_t; - typedef array left_nocontract_t; - typedef array right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef DSizes Dimensions; - - // typedefs needed in evalTo - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - - typedef typename LeftEvaluator::Dimensions LeftDimensions; - typedef typename RightEvaluator::Dimensions RightDimensions; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {} - - // We need to redefine this method to make hipcc happy - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - this->m_leftImpl.evalSubExprsIfNeeded(NULL); - this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { - evalTo(data); - return false; - } else { - this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); - evalTo(this->m_result); - return true; - } - } - - void evalTo(Scalar* buffer) const { - if (this->m_lhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - } - else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - } - } - - template struct LaunchKernels { - static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 8, 8); - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenContractionKernel), - dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k); - } - }; - - template struct LaunchKernels { - static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { - if (m < 768 || n < 768) { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(16, 16, 1); - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenFloatContractionKernel16x16), - 
dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k); - } else { - const Index m_blocks = (m + 127) / 128; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 32, 1); - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenFloatContractionKernel), - dim3(num_blocks), dim3(block_size), 0, device.stream(), lhs, rhs, output, m, n, k); - } - } - }; - - template - void evalTyped(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - EIGEN_UNUSED_VARIABLE(k) - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - typedef internal::blas_data_mapper OutputMapper; - - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - setHipSharedMemConfig(hipSharedMemBankSizeEightByte); - LaunchKernels::Run(lhs, rhs, output, m, n, k, this->m_device); - } -}; - -} // end namespace Eigen - -#endif // EIGEN_USE_GPU and EIGEN_HIPCC -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_HIP_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h deleted file mode 100644 index ba9971050..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionHip.h +++ /dev/null @@ -1,1119 +0,0 @@ -//#include "hip/hip_runtime.h" -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H - -namespace Eigen { - -/** \class TensorConvolution - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor convolution class. 
- * - * - */ -namespace internal { - -template -class IndexMapper { - public: - IndexMapper(const InputDims& input_dims, const array& kernel_dims, - const array& indices) { - - array dimensions = input_dims; - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = indices[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - dimensions[index] = result_dim; - } - - array inputStrides; - array outputStrides; - if (static_cast(Layout) == static_cast(ColMajor)) { - inputStrides[0] = 1; - outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; - outputStrides[i] = outputStrides[i-1] * dimensions[i-1]; - } - } else { - inputStrides[NumDims - 1] = 1; - outputStrides[NumDims - 1] = 1; - for (int i = static_cast(NumDims) - 2; i >= 0; --i) { - inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; - outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1]; - } - } - - array hipInputDimensions; - array hipOutputDimensions; - array tmp = dimensions; - array ordering; - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumDims - NumKernelDims; - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = i + offset; - ordering[index] = indices[i]; - tmp[indices[i]] = -1; - hipInputDimensions[index] = input_dims[indices[i]]; - hipOutputDimensions[index] = dimensions[indices[i]]; - } - - int written = static_cast(Layout) == static_cast(ColMajor) - ? NumKernelDims - : 0; - for (int i = 0; i < NumDims; ++i) { - if (tmp[i] >= 0) { - ordering[written] = i; - hipInputDimensions[written] = input_dims[i]; - hipOutputDimensions[written] = dimensions[i]; - ++written; - } - } - - for (int i = 0; i < NumDims; ++i) { - m_inputStrides[i] = inputStrides[ordering[i]]; - m_outputStrides[i] = outputStrides[ordering[i]]; - } - - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = 0; i < NumDims; ++i) { - if (i > NumKernelDims) { - m_hipInputStrides[i] = - m_hipInputStrides[i - 1] * hipInputDimensions[i - 1]; - m_hipOutputStrides[i] = - m_hipOutputStrides[i - 1] * hipOutputDimensions[i - 1]; - } else { - m_hipInputStrides[i] = 1; - m_hipOutputStrides[i] = 1; - } - } - } else { - for (int i = NumDims - 1; i >= 0; --i) { - if (i + 1 < offset) { - m_hipInputStrides[i] = - m_hipInputStrides[i + 1] * hipInputDimensions[i + 1]; - m_hipOutputStrides[i] = - m_hipOutputStrides[i + 1] * hipOutputDimensions[i + 1]; - } else { - m_hipInputStrides[i] = 1; - m_hipOutputStrides[i] = 1; - } - } - } - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputPlaneToTensorInputOffset(Index p) const { - Index inputIndex = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_hipInputStrides[d]; - inputIndex += idx * m_inputStrides[d]; - p -= idx * m_hipInputStrides[d]; - } - inputIndex += p * m_inputStrides[NumKernelDims]; - } else { - std::ptrdiff_t limit = 0; - if (NumKernelDims < NumDims) { - limit = NumDims - NumKernelDims - 1; - } - for (int d = 0; d < limit; ++d) { - const Index idx = p / m_hipInputStrides[d]; - inputIndex += idx * m_inputStrides[d]; - p -= idx * m_hipInputStrides[d]; - } - inputIndex += p * m_inputStrides[limit]; - } - return inputIndex; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputPlaneToTensorOutputOffset(Index p) const { - Index outputIndex = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { 
- for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_hipOutputStrides[d]; - outputIndex += idx * m_outputStrides[d]; - p -= idx * m_hipOutputStrides[d]; - } - outputIndex += p * m_outputStrides[NumKernelDims]; - } else { - std::ptrdiff_t limit = 0; - if (NumKernelDims < NumDims) { - limit = NumDims - NumKernelDims - 1; - } - for (int d = 0; d < limit; ++d) { - const Index idx = p / m_hipOutputStrides[d]; - outputIndex += idx * m_outputStrides[d]; - p -= idx * m_hipOutputStrides[d]; - } - outputIndex += p * m_outputStrides[limit]; - } - return outputIndex; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i) const { - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_inputStrides[offset]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i) const { - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_outputStrides[offset]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i, Index j) const { - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i, Index j) const { - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipInputKernelToTensorInputOffset(Index i, Index j, Index k) const { - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + - k * m_inputStrides[offset + 2]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapHipOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + - k * m_outputStrides[offset + 2]; - } - - private: - static const int NumDims = internal::array_size::value; - array m_inputStrides; - array m_outputStrides; - array m_hipInputStrides; - array m_hipOutputStrides; -}; - - - -template -struct traits > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. 
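For reference, the ColMajor branch of mapHipInputPlaneToTensorInputOffset in the IndexMapper above boils down to a stride decomposition over the non-convolved dimensions. A standalone sketch, with simplified names, std::array in place of Eigen's internal array type, and the template parameters spelled out (a hedged rendering, not the deleted code verbatim):

#include <array>
#include <cstddef>

// p is a flattened index over the non-convolved ("plane") dimensions, expressed in the
// reordered hip strides; the result is the matching offset in the input tensor's strides.
template <int NumDims, int NumKernelDims>
std::ptrdiff_t planeToInputOffset(std::ptrdiff_t p,
                                  const std::array<std::ptrdiff_t, NumDims>& hipInputStrides,
                                  const std::array<std::ptrdiff_t, NumDims>& inputStrides) {
  std::ptrdiff_t inputIndex = 0;
  for (int d = NumDims - 1; d > NumKernelDims; --d) {
    const std::ptrdiff_t idx = p / hipInputStrides[d];  // coordinate along reordered dim d
    inputIndex += idx * inputStrides[d];
    p -= idx * hipInputStrides[d];
  }
  inputIndex += p * inputStrides[NumKernelDims];        // remaining innermost plane coordinate
  return inputIndex;
}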
- typedef typename promote_storage_type::ret Scalar; - typedef typename promote_storage_type::StorageKind, - typename traits::StorageKind>::ret StorageKind; - typedef typename promote_index_type::Index, - typename traits::Index>::type Index; - typedef typename InputXprType::Nested LhsNested; - typedef typename KernelXprType::Nested RhsNested; - typedef typename remove_reference::type _LhsNested; - typedef typename remove_reference::type _RhsNested; - static const int NumDimensions = traits::NumDimensions; - static const int Layout = traits::Layout; - - enum { - Flags = 0 - }; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorConvolutionOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorConvolutionOp type; -}; - -} // end namespace internal - - - -template -class TensorConvolutionOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::promote_storage_type::ret CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims) - : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Indices& indices() const { return m_indices; } - - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const typename internal::remove_all::type& - inputExpression() const { return m_input_xpr; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const typename internal::remove_all::type& - kernelExpression() const { return m_kernel_xpr; } - - protected: - typename InputXprType::Nested m_input_xpr; - typename KernelXprType::Nested m_kernel_xpr; - const Indices m_indices; -}; - - -template -struct TensorEvaluator, Device> -{ - typedef TensorConvolutionOp XprType; - - static const int NumDims = internal::array_size::Dimensions>::value; - static const int NumKernelDims = internal::array_size::value; - typedef typename XprType::Index Index; - typedef DSizes Dimensions; - - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - - const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); - - if (static_cast(Layout) == static_cast(ColMajor)) { - m_inputStride[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStride[i] = 
m_inputStride[i - 1] * input_dims[i - 1]; - } - } else { - m_inputStride[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; - } - } - - m_dimensions = m_inputImpl.dimensions(); - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - if (i > 0) { - m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; - } else { - m_kernelStride[0] = 1; - } - m_indexStride[i] = m_inputStride[index]; - } - - m_outputStride[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; - } - } else { - for (int i = NumKernelDims - 1; i >= 0; --i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - if (i < NumKernelDims - 1) { - m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; - } else { - m_kernelStride[NumKernelDims - 1] = 1; - } - m_indexStride[i] = m_inputStride[index]; - } - - m_outputStride[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - m_inputImpl.evalSubExprsIfNeeded(NULL); - preloadKernel(); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_inputImpl.cleanup(); - if (m_local_kernel) { - m_device.deallocate((void*)m_kernel); - m_local_kernel = false; - } - m_kernel = NULL; - } - - void evalTo(typename XprType::Scalar* buffer) { - evalSubExprsIfNeeded(NULL); - for (int i = 0; i < dimensions().TotalSize(); ++i) { - buffer[i] += coeff(i); - } - cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - CoeffReturnType result = CoeffReturnType(0); - convolve(firstInput(index), 0, NumKernelDims-1, result); - return result; - } - - template - EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const - { - Index indices[2] = {index, index+PacketSize-1}; - Index startInputs[2] = {0, 0}; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_outputStride[i]; - const Index idx1 = indices[1] / m_outputStride[i]; - startInputs[0] += idx0 * m_inputStride[i]; - startInputs[1] += idx1 * m_inputStride[i]; - indices[0] -= idx0 * m_outputStride[i]; - indices[1] -= idx1 * m_outputStride[i]; - } - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx0 = indices[0] / m_outputStride[i]; - const Index idx1 = indices[1] / m_outputStride[i]; - startInputs[0] += idx0 * m_inputStride[i]; - startInputs[1] += idx1 * m_inputStride[i]; - indices[0] -= idx0 * m_outputStride[i]; - indices[1] -= idx1 * m_outputStride[i]; - } - } - startInputs[0] += indices[0]; - startInputs[1] += indices[1]; - - if (startInputs[1]-startInputs[0] == PacketSize-1) { - PacketReturnType result = internal::pset1(0); - convolvePacket(startInputs[0], 0, NumKernelDims-1, result); - 
return result; - } else { - EIGEN_ALIGN_MAX Scalar data[PacketSize]; - data[0] = Scalar(0); - convolve(startInputs[0], 0, NumKernelDims-1, data[0]); - for (int i = 1; i < PacketSize-1; ++i) { - data[i] = Scalar(0); - convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]); - } - data[PacketSize-1] = Scalar(0); - convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]); - return internal::pload(data); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double kernel_size = m_kernelImpl.dimensions().TotalSize(); - // We ignore the use of fused multiply-add. - const double convolve_compute_cost = - TensorOpCost::AddCost() + TensorOpCost::MulCost(); - const double firstIndex_compute_cost = - NumDims * - (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + - TensorOpCost::DivCost()); - return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + - kernel_size * (m_inputImpl.costPerCoeff(vectorized) + - m_kernelImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, convolve_compute_cost, vectorized, - PacketSize)); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - private: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { - Index startInput = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStride[i]; - startInput += idx * m_inputStride[i]; - index -= idx * m_outputStride[i]; - } - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStride[i]; - startInput += idx * m_inputStride[i]; - index -= idx * m_outputStride[i]; - } - } - startInput += index; - return startInput; - } - - EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { - for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { - const Index input = firstIndex + j * m_indexStride[DimIndex]; - const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; - if (DimIndex > 0) { - convolve(input, kernel, DimIndex-1, accum); - } else { - accum += m_inputImpl.coeff(input) * m_kernel[kernel]; - } - } - } - - template - EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { - for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { - const Index input = firstIndex + j * m_indexStride[DimIndex]; - const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; - if (DimIndex > 0) { - convolvePacket(input, kernel, DimIndex-1, accum); - } else { - accum = internal::pmadd(m_inputImpl.template packet(input), internal::pset1(m_kernel[kernel]), accum); - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { - // Don't make a local copy of the kernel unless we have to (i.e. 
it's an - // expression that needs to be evaluated) - const Scalar* in_place = m_kernelImpl.data(); - if (in_place) { - m_kernel = in_place; - m_local_kernel = false; - } else { - size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); - Scalar* local = (Scalar*)m_device.allocate(kernel_sz); - typedef TensorEvalToOp EvalTo; - EvalTo evalToTmp(local, m_kernelArg); - const bool PacketAccess = internal::IsVectorizable::value; - internal::TensorExecutor::run(evalToTmp, m_device); - - m_kernel = local; - m_local_kernel = true; - } - } - - array m_inputStride; - array m_outputStride; - - array m_indexStride; - array m_kernelStride; - TensorEvaluator m_inputImpl; - TensorEvaluator m_kernelImpl; - Dimensions m_dimensions; - - KernelArgType m_kernelArg; - const Scalar* m_kernel; - bool m_local_kernel; - const Device& m_device; -}; - - - - -// Use an optimized implementation of the evaluation code for GPUs whenever possible. -#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC) - -template -struct GetKernelSize { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const { - return StaticKernelSize; - } -}; -template <> -struct GetKernelSize { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const { - return kernelSize; - } -}; - -template -__global__ void EigenConvolutionKernel1D( - InputEvaluator eval, - const internal::IndexMapper - indexMapper, - const float* __restrict kernel, const int numPlanes, const int numX, - const int maxX, const int kernelSize, float* buffer) { - HIP_DYNAMIC_SHARED( float, s) - - const int first_x = hipBlockIdx_x * maxX; - const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; - const int num_x_input = last_x - first_x + GetKernelSize()(kernelSize); - const int num_x_output = last_x - first_x + 1; - - const int first_plane = hipBlockIdx_y * hipBlockDim_y; - const int plane_stride = hipBlockDim_y * hipGridDim_y; - - for (int p = first_plane + hipThreadIdx_y; p < numPlanes; p += plane_stride) { - // Load inputs to shared memory - const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p); - const int plane_kernel_offset = hipThreadIdx_y * num_x_input; - #pragma unroll - for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) { - const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x); - s[i + plane_kernel_offset] = eval.coeff(tensor_index); - } - - __syncthreads(); - - // Compute the convolution - const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p); - - #pragma unroll - for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) { - const int kernel_offset = plane_kernel_offset + i; - float result = 0.0f; - #pragma unroll - for (int k = 0; k < GetKernelSize()(kernelSize); ++k) { - result += s[k + kernel_offset] * kernel[k]; - } - const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x); - buffer[tensor_index] = result; - } - __syncthreads(); - } -}; - -template -__global__ void EigenConvolutionKernel2D( - InputEvaluator eval, - const internal::IndexMapper - indexMapper, - const float* __restrict kernel, const int numPlanes, const int numX, - const int maxX, const int numY, const int maxY, const int kernelSizeX, - const int kernelSizeY, float* buffer) { - HIP_DYNAMIC_SHARED( float, s) - - const int first_x = hipBlockIdx_x * maxX; - const int last_x = (first_x + maxX < numX ? 
first_x + maxX : numX) - 1; - const int num_x_input = last_x - first_x + GetKernelSize()(kernelSizeX); - const int num_x_output = last_x - first_x + 1; - - const int first_y = hipBlockIdx_y * maxY; - const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; - const int num_y_input = last_y - first_y + GetKernelSize()(kernelSizeY); - const int num_y_output = last_y - first_y + 1; - - const int first_plane = hipBlockIdx_z * hipBlockDim_z; - const int plane_stride = hipBlockDim_z * hipGridDim_z; - - for (int p = first_plane + hipThreadIdx_z; p < numPlanes; p += plane_stride) { - - const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p); - const int plane_kernel_offset = hipThreadIdx_z * num_y_input; - - // Load inputs to shared memory - #pragma unroll - for (int j = hipThreadIdx_y; j < num_y_input; j += hipBlockDim_y) { - const int input_offset = num_x_input * (j + plane_kernel_offset); - #pragma unroll - for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) { - const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x, j+first_y); - s[i + input_offset] = eval.coeff(tensor_index); - } - } - - __syncthreads(); - - // Convolution - const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p); - - #pragma unroll - for (int j = hipThreadIdx_y; j < num_y_output; j += hipBlockDim_y) { - #pragma unroll - for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) { - float result = 0.0f; - #pragma unroll - for (int l = 0; l < GetKernelSize()(kernelSizeY); ++l) { - const int kernel_offset = kernelSizeX * l; - const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); - #pragma unroll - for (int k = 0; k < GetKernelSize()(kernelSizeX); ++k) { - result += s[k + input_offset] * kernel[k + kernel_offset]; - } - } - const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x, j+first_y); - buffer[tensor_index] = result; - } - } - - __syncthreads(); - } -}; - -template -__global__ void EigenConvolutionKernel3D( - InputEvaluator eval, - const internal::IndexMapper - indexMapper, - const float* __restrict kernel, const size_t numPlanes, const size_t numX, - const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, - const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, - const size_t kernelSizeZ, float* buffer) { - HIP_DYNAMIC_SHARED( float, s) - - // Load inputs to shared memory - const int first_x = hipBlockIdx_x * maxX; - const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; - const int num_x_input = last_x - first_x + kernelSizeX; - - const int first_y = hipBlockIdx_y * maxY; - const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; - const int num_y_input = last_y - first_y + kernelSizeY; - - const int first_z = hipBlockIdx_z * maxZ; - const int last_z = (first_z + maxZ < numZ ? 
first_z + maxZ : numZ) - 1; - const int num_z_input = last_z - first_z + kernelSizeZ; - - for (int p = 0; p < numPlanes; ++p) { - - const int plane_input_offset = indexMapper.mapHipInputPlaneToTensorInputOffset(p); - const int plane_kernel_offset = 0; - - for (int k = hipThreadIdx_z; k < num_z_input; k += hipBlockDim_z) { - for (int j = hipThreadIdx_y; j < num_y_input; j += hipBlockDim_y) { - for (int i = hipThreadIdx_x; i < num_x_input; i += hipBlockDim_x) { - const int tensor_index = plane_input_offset + indexMapper.mapHipInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); - s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); - } - } - } - - __syncthreads(); - - // Convolution - const int num_z_output = last_z - first_z + 1; - const int num_y_output = last_y - first_y + 1; - const int num_x_output = last_x - first_x + 1; - const int plane_output_offset = indexMapper.mapHipOutputPlaneToTensorOutputOffset(p); - - for (int k = hipThreadIdx_z; k < num_z_output; k += hipBlockDim_z) { - for (int j = hipThreadIdx_y; j < num_y_output; j += hipBlockDim_y) { - for (int i = hipThreadIdx_x; i < num_x_output; i += hipBlockDim_x) { - float result = 0.0f; - for (int n = 0; n < kernelSizeZ; ++n) { - for (int m = 0; m < kernelSizeY; ++m) { - for (int l = 0; l < kernelSizeX; ++l) { - result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; - } - } - } - const int tensor_index = plane_output_offset + indexMapper.mapHipOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); - buffer[tensor_index] = result; - } - } - } - __syncthreads(); - } -}; - - - -template -struct TensorEvaluator, GpuDevice> -{ - typedef TensorConvolutionOp XprType; - - static const int NumDims = internal::array_size::Dimensions>::value; - static const int NumKernelDims = internal::array_size::value; - typedef typename XprType::Index Index; - typedef DSizes Dimensions; - typedef typename TensorEvaluator::Dimensions KernelDimensions; - - enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) - : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - - const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); - - m_dimensions = m_inputImpl.dimensions(); - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {} - - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef typename InputArgType::Scalar Scalar; - static const int PacketSize = internal::unpacket_traits::size; - - EIGEN_DEVICE_FUNC const Dimensions& 
dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - preloadKernel(); - m_inputImpl.evalSubExprsIfNeeded(NULL); - if (data) { - executeEval(data); - return false; - } else { - m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); - executeEval(m_buf); - return true; - } - } - - EIGEN_STRONG_INLINE void cleanup() { - m_inputImpl.cleanup(); - if (m_buf) { - m_device.deallocate(m_buf); - m_buf = NULL; - } - if (m_local_kernel) { - m_device.deallocate((void*)m_kernel); - m_local_kernel = false; - } - m_kernel = NULL; - } - - EIGEN_STRONG_INLINE void preloadKernel() { - // Don't make a local copy of the kernel unless we have to (i.e. it's an - // expression that needs to be evaluated) - const Scalar* in_place = m_kernelImpl.data(); - if (in_place) { - m_kernel = in_place; - m_local_kernel = false; - } else { - size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); - Scalar* local = (Scalar*)m_device.allocate(kernel_sz); - typedef TensorEvalToOp EvalTo; - EvalTo evalToTmp(local, m_kernelArg); - const bool PacketAccess = internal::IsVectorizable::value; - internal::TensorExecutor::run(evalToTmp, m_device); - - m_kernel = local; - m_local_kernel = true; - } - } - - static unsigned int ceil(unsigned int num, unsigned int denom) { - const unsigned int rounded_toward_zero = num / denom; - if (num > rounded_toward_zero * denom) { - return rounded_toward_zero + 1; - } - return rounded_toward_zero; - } - - void executeEval(Scalar* data) const { - typedef typename TensorEvaluator::Dimensions InputDims; - - const int maxSharedMem = m_device.sharedMemPerBlock(); - const int maxThreadsPerBlock = m_device.maxHipThreadsPerBlock(); - const int maxBlocksPerProcessor = m_device.maxHipThreadsPerMultiProcessor() / maxThreadsPerBlock; - const int numMultiProcessors = m_device.getNumHipMultiProcessors(); - const int hipWarpSize = 32; - - switch (NumKernelDims) { - case 1: { - const int kernel_size = m_kernelImpl.dimensions().TotalSize(); - - const int numX = dimensions()[m_indices[0]]; - const int numP = dimensions().TotalSize() / numX; - int maxX; - dim3 block_size; - - const int single_stride_dim = - static_cast(Layout) == static_cast(ColMajor) - ? 
0 - : m_inputImpl.dimensions().rank() - 1; - if (m_indices[0] == single_stride_dim) { - // Maximum the reuse - const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; - maxX = numext::mini(inner_dim, numX); - const int maxP = numext::mini(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); - block_size.x = numext::mini(maxThreadsPerBlock, maxX); - block_size.y = numext::mini(maxThreadsPerBlock / block_size.x, maxP); - } - else { - // Read as much as possible alongside the inner most dimension, that is the plane - const int inner_dim = maxSharedMem / ((hipWarpSize + kernel_size) * sizeof(Scalar)); - const int maxP = numext::mini(inner_dim, numP); - maxX = numext::mini(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); - - block_size.x = numext::mini(hipWarpSize, maxX); - block_size.y = numext::mini(maxThreadsPerBlock/block_size.x, maxP); - } - - const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); - assert(shared_mem <= maxSharedMem); - - const int num_x_blocks = ceil(numX, maxX); - const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); - const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); - - dim3 num_blocks(num_x_blocks, numext::mini(num_y_blocks, ceil(numP, block_size.y))); - - - //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; - - const array indices(m_indices[0]); - const array kernel_dims(m_kernelImpl.dimensions()[0]); - internal::IndexMapper indexMapper( - m_inputImpl.dimensions(), kernel_dims, indices); - switch(kernel_size) { - case 4: { - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D, Index, InputDims, 4>), - dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); - break; - } - case 7: { - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D, Index, InputDims, 7>), - dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); - break; - } - default: { - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel1D, Index, InputDims, Dynamic>), - dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); - } - } - break; - } - - case 2: { - const int idxX = - static_cast(Layout) == static_cast(ColMajor) ? 0 : 1; - const int idxY = - static_cast(Layout) == static_cast(ColMajor) ? 
1 : 0; - const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; - const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; - - const int numX = dimensions()[m_indices[idxX]]; - const int numY = dimensions()[m_indices[idxY]]; - const int numP = dimensions().TotalSize() / (numX*numY); - - const float scaling_factor = sqrtf(static_cast(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); - - // Snap maxX to warp size - int inner_dim = ((static_cast(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; - const int maxX = numext::mini(inner_dim, numX); - const int maxY = numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); - const int maxP = numext::mini(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); - - dim3 block_size; - block_size.x = numext::mini(1024, maxX); - block_size.y = numext::mini(1024/block_size.x, maxY); - block_size.z = numext::mini(1024/(block_size.x*block_size.y), maxP); - - const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); - assert(shared_mem <= maxSharedMem); - - const int num_x_blocks = ceil(numX, maxX); - const int num_y_blocks = ceil(numY, maxY); - const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); - const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); - - dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini(num_z_blocks, ceil(numP, block_size.z))); - - - //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; - - const array indices(m_indices[idxX], m_indices[idxY]); - const array kernel_dims(m_kernelImpl.dimensions()[idxX], - m_kernelImpl.dimensions()[idxY]); - internal::IndexMapper indexMapper( - m_inputImpl.dimensions(), kernel_dims, indices); - switch (kernel_size_x) { - case 4: { - switch (kernel_size_y) { - case 7: { - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D, Index, InputDims, 4, 7>), - dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); - break; - } - default: { - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), - dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); - break; - } - } - break; - } - case 7: { - switch (kernel_size_y) { - case 4: { - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D, Index, InputDims, 7, 4>), - dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); - break; - } - default: { - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), - dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); - break; - } - } - break; - } - default: { - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel2D, Index, InputDims, Dynamic, 
Dynamic>), - dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); - break; - } - } - break; - } - - case 3: { - const int idxX = - static_cast(Layout) == static_cast(ColMajor) ? 0 : 2; - const int idxY = - static_cast(Layout) == static_cast(ColMajor) ? 1 : 1; - const int idxZ = - static_cast(Layout) == static_cast(ColMajor) ? 2 : 0; - - const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; - const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; - const int kernel_size_z = m_kernelImpl.dimensions()[idxZ]; - - const int numX = dimensions()[m_indices[idxX]]; - const int numY = dimensions()[m_indices[idxY]]; - const int numZ = dimensions()[m_indices[idxZ]]; - const int numP = dimensions().TotalSize() / (numX*numY*numZ); - - const int maxX = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); - const int maxY = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY)); - const int maxZ = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ)); - - dim3 block_size; - block_size.x = numext::mini(32, maxX); - block_size.y = numext::mini(32, maxY); - block_size.z = numext::mini(1024/(block_size.x*block_size.y), maxZ); - dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); - - const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); - assert(shared_mem <= maxSharedMem); - - //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; - const array indices(m_indices[idxX], m_indices[idxY], - m_indices[idxZ]); - const array kernel_dims(m_kernelImpl.dimensions()[idxX], - m_kernelImpl.dimensions()[idxY], - m_kernelImpl.dimensions()[idxZ]); - internal::IndexMapper indexMapper( - m_inputImpl.dimensions(), kernel_dims, indices); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenConvolutionKernel3D, Index, InputDims>), - dim3(num_blocks), dim3(block_size), shared_mem, m_device.stream(), m_inputImpl, indexMapper, m_kernel, - numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); - break; - } - - default: { - EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - eigen_assert(m_buf); - eigen_assert(index < m_dimensions.TotalSize()); - return m_buf[index]; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const - { - eigen_assert(m_buf); - eigen_assert(index < m_dimensions.TotalSize()); - return internal::ploadt(m_buf+index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost - // model. - const double kernel_size = m_kernelImpl.dimensions().TotalSize(); - // We ignore the use of fused multiply-add. 
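Aside on the kernel-size dispatch used by executeEval above: the launcher specializes the convolution kernels for the common widths 4 and 7 and falls back to a runtime size otherwise. A hedged sketch of that idiom with the compile-time parameter spelled out (launch<>() is a hypothetical stand-in for the hipLaunchKernelGGL calls):

#include <Eigen/Core>

// GetKernelSize<N> folds the kernel width into a compile-time constant for the common
// sizes so the innermost loop bound can be unrolled; Eigen::Dynamic selects runtime size.
template <int StaticKernelSize>
struct GetKernelSize {
  __device__ int operator()(int /*runtimeSize*/) const { return StaticKernelSize; }
};
template <>
struct GetKernelSize<Eigen::Dynamic> {
  __device__ int operator()(int runtimeSize) const { return runtimeSize; }
};

// Host-side dispatch, schematically:
//   switch (kernel_size) {
//     case 4:  launch<4>(/*...*/);                           break;
//     case 7:  launch<7>(/*...*/);                           break;
//     default: launch<Eigen::Dynamic>(/*...*/, kernel_size); break;
//   }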
- const double convolve_compute_cost = - TensorOpCost::AddCost() + TensorOpCost::MulCost(); - const double firstIndex_compute_cost = - NumDims * - (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + - TensorOpCost::DivCost()); - return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + - kernel_size * (m_inputImpl.costPerCoeff(vectorized) + - m_kernelImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, convolve_compute_cost, vectorized, - PacketSize)); - } - - private: - // No assignment (copies are needed by the kernels) - TensorEvaluator& operator = (const TensorEvaluator&); - - TensorEvaluator m_inputImpl; - TensorEvaluator m_kernelImpl; - KernelArgType m_kernelArg; - Indices m_indices; - Dimensions m_dimensions; - Scalar* m_buf; - const Scalar* m_kernel; - bool m_local_kernel; - - const GpuDevice& m_device; -}; -#endif - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h deleted file mode 100644 index c0e110987..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceHip.h +++ /dev/null @@ -1,352 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H) -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H - -#if defined(EIGEN_HIPCC) -#include "hip/hip_runtime.h" -#include "hip/hip_runtime_api.h" -#endif -#include //for sleep function - -namespace Eigen { - -static const int kHipScratchSize = 1024; - -// This defines an interface that GPUDevice can take to use -// HIP streams underneath. -class StreamInterface { - public: - virtual ~StreamInterface() {} - - virtual const hipStream_t& stream() const = 0; - virtual const hipDeviceProp_t& deviceProperties() const = 0; - - // Allocate memory on the actual device where the computation will run - virtual void* allocate(size_t num_bytes) const = 0; - virtual void deallocate(void* buffer) const = 0; - - // Return a scratchpad buffer of size 1k - virtual void* scratchpad() const = 0; - - // Return a semaphore. The semaphore is initially initialized to 0, and - // each kernel using it is responsible for resetting to 0 upon completion - // to maintain the invariant that the semaphore is always equal to 0 upon - // each kernel start. - virtual unsigned int* semaphore() const = 0; -}; - -static hipDeviceProp_t* m_deviceProperties; -static bool m_devicePropInitialized = false; - -static void initializeDeviceProp() { - if (!m_devicePropInitialized) { - // Attempts to ensure proper behavior in the case of multiple threads - // calling this function simultaneously. This would be trivial to - // implement if we could use std::mutex, but unfortunately mutex don't - // compile with nvcc, so we resort to atomics and thread fences instead. - // Note that if the caller uses a compiler that doesn't support c++11 we - // can't ensure that the initialization is thread safe. -#if 0 && __cplusplus >= 201103L - static std::atomic first(true); - if (first.exchange(false)) { -#else - static bool first = true; - if (first) { - first = false; -#endif - // We're the first thread to reach this point. 
- int num_devices; - hipError_t status = hipGetDeviceCount(&num_devices); - if (status != hipSuccess) { - std::cerr << "Failed to get the number of HIP devices: " - << hipGetErrorString(status) - << std::endl; - assert(status == hipSuccess); - } - m_deviceProperties = new hipDeviceProp_t[num_devices]; - for (int i = 0; i < num_devices; ++i) { - status = hipGetDeviceProperties(&m_deviceProperties[i], i); - if (status != hipSuccess) { - std::cerr << "Failed to initialize HIP device #" - << i - << ": " - << hipGetErrorString(status) - << std::endl; - assert(status == hipSuccess); - } - } - -#if 0 && __cplusplus >= 201103L - std::atomic_thread_fence(std::memory_order_release); -#endif - m_devicePropInitialized = true; - } else { - // Wait for the other thread to inititialize the properties. - while (!m_devicePropInitialized) { -#if 0 && __cplusplus >= 201103L - std::atomic_thread_fence(std::memory_order_acquire); -#endif - sleep(1); - } - } - } -} - -static const hipStream_t default_stream = 0x00;//TODO: Use hipStreamDefault instead of 0x00; - -class HipStreamDevice : public StreamInterface { - public: - // Use the default stream on the current device - HipStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { - hipGetDevice(&device_); - initializeDeviceProp(); - } - // Use the default stream on the specified device - HipStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { - initializeDeviceProp(); - } - // Use the specified stream. Note that it's the - // caller responsibility to ensure that the stream can run on - // the specified device. If no device is specified the code - // assumes that the stream is associated to the current gpu device. - HipStreamDevice(const hipStream_t* stream, int device = -1) - : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { - if (device < 0) { - hipGetDevice(&device_); - } else { - int num_devices; - hipError_t err = hipGetDeviceCount(&num_devices); - EIGEN_UNUSED_VARIABLE(err) - assert(err == hipSuccess); - assert(device < num_devices); - device_ = device; - } - initializeDeviceProp(); - } - - virtual ~HipStreamDevice() { - if (scratch_) { - deallocate(scratch_); - } - } - - const hipStream_t& stream() const { return *stream_; } - const hipDeviceProp_t& deviceProperties() const { - return m_deviceProperties[device_]; - } - virtual void* allocate(size_t num_bytes) const { - hipError_t err = hipSetDevice(device_); - EIGEN_UNUSED_VARIABLE(err) - assert(err == hipSuccess); - void* result; - err = hipMalloc(&result, num_bytes); - assert(err == hipSuccess); - assert(result != NULL); - return result; - } - virtual void deallocate(void* buffer) const { - hipError_t err = hipSetDevice(device_); - EIGEN_UNUSED_VARIABLE(err) - assert(err == hipSuccess); - assert(buffer != NULL); - err = hipFree(buffer); - assert(err == hipSuccess); - } - - virtual void* scratchpad() const { - if (scratch_ == NULL) { - scratch_ = allocate(kHipScratchSize + sizeof(unsigned int)); - } - return scratch_; - } - - virtual unsigned int* semaphore() const { - if (semaphore_ == NULL) { - char* scratch = static_cast(scratchpad()) + kHipScratchSize; - semaphore_ = reinterpret_cast(scratch); - //hipError_t err = hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); - hipError_t err = hipMemset(semaphore_, 0, sizeof(unsigned int)); - EIGEN_UNUSED_VARIABLE(err) - assert(err == hipSuccess); - } - return semaphore_; - } - - private: - const hipStream_t* stream_; - int device_; - mutable void* 
scratch_; - mutable unsigned int* semaphore_; -}; - -struct GpuDevice { - // The StreamInterface is not owned: the caller is - // responsible for its initialization and eventual destruction. - explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { - eigen_assert(stream); - } - explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { - eigen_assert(stream); - } - // TODO(bsteiner): This is an internal API, we should not expose it. - EIGEN_STRONG_INLINE const hipStream_t& stream() const { - return stream_->stream(); - } - - EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return stream_->allocate(num_bytes); - } - - EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - stream_->deallocate(buffer); - } - - EIGEN_STRONG_INLINE void* scratchpad() const { - return stream_->scratchpad(); - } - - EIGEN_STRONG_INLINE unsigned int* semaphore() const { - return stream_->semaphore(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { -#if !defined(EIGEN_HIP_DEVICE_COMPILE) - hipError_t err = hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToDevice, - stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == hipSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { - hipError_t err = - hipMemcpyAsync(dst, src, n, hipMemcpyHostToDevice, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == hipSuccess); - } - - EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { - hipError_t err = - hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToHost, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == hipSuccess); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { -#if !defined(EIGEN_HIP_DEVICE_COMPILE) - //TODO:hipError_t err = hipMemsetAsync(buffer, c, n, stream_->stream()); - hipError_t err = hipMemset(buffer, c, n); - EIGEN_UNUSED_VARIABLE(err) - assert(err == hipSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE size_t numThreads() const { - // FIXME - return 32; - } - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { - // FIXME - return 48*1024; - } - - EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { - // We won't try to take advantage of the l2 cache for the time being, and - // there is no l3 cache on hip devices. 
- return firstLevelCacheSize(); - } - -// FIXME - this will move into HIP -#if defined(EIGEN_HIP_DEVICE_COMPILE) -#undef assert -#define assert(COND) -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { -#if defined(EIGEN_HIPCC) && \ - !defined(EIGEN_HIP_DEVICE_COMPILE) - hipError_t err = hipStreamSynchronize(stream_->stream()); - if (err != hipSuccess) { - std::cerr << "Error detected in HIP stream: " - << hipGetErrorString(err) - << std::endl; - assert(err == hipSuccess); - } -#else - assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE int getNumHipMultiProcessors() const { - return stream_->deviceProperties().multiProcessorCount; - } - EIGEN_STRONG_INLINE int maxHipThreadsPerBlock() const { - return stream_->deviceProperties().maxThreadsPerBlock; - } - EIGEN_STRONG_INLINE int maxHipThreadsPerMultiProcessor() const { - return stream_->deviceProperties().maxThreadsPerMultiProcessor; - } - EIGEN_STRONG_INLINE int sharedMemPerBlock() const { - return stream_->deviceProperties().sharedMemPerBlock; - } - EIGEN_STRONG_INLINE int majorDeviceVersion() const { - return stream_->deviceProperties().major; - } - EIGEN_STRONG_INLINE int minorDeviceVersion() const { - return stream_->deviceProperties().minor; - } - - EIGEN_STRONG_INLINE int maxBlocks() const { - return max_blocks_; - } - - // This function checks if the HIP runtime recorded an error for the - // underlying stream device. - inline bool ok() const { -#if defined(EIGEN_HIPCC) - hipError_t error = hipStreamQuery(stream_->stream()); - return (error == hipSuccess) || (error == hipErrorNotReady); -#else - return false; -#endif - } - - private: - const StreamInterface* stream_; - int max_blocks_; -}; - -#define LAUNCH_HIP_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ - hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), (__VA_ARGS__)); \ - assert(hipGetLastError() == hipSuccess); - - -// FIXME: Should be device and kernel specific. -#if defined(EIGEN_HIPCC) -static EIGEN_DEVICE_FUNC inline void setHipSharedMemConfig(hipSharedMemConfig config) { -#if !defined(EIGEN_HIP_DEVICE_COMPILE) - hipError_t status = hipDeviceSetSharedMemConfig(config); - EIGEN_UNUSED_VARIABLE(status) - assert(status == hipSuccess); -#else - EIGEN_UNUSED_VARIABLE(config) -#endif -} -#endif - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h deleted file mode 100644 index 5304a22c5..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionHip.h +++ /dev/null @@ -1,815 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
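Note on the device wrapper removed in TensorDeviceHip.h above: it is only exercised through the Tensor expression API, so a small usage sketch helps show how the pieces fit together. This assumes the removed header were still available and the translation unit is compiled with hipcc; buffer names and sizes are illustrative.

#define EIGEN_USE_GPU
#include <unsupported/Eigen/CXX11/Tensor>
#include <cstddef>

void scale_on_gpu(const float* host_in, float* host_out, int n) {
  // Wrap the default HIP stream and hand it to Eigen's GpuDevice.
  Eigen::HipStreamDevice stream;      // removed class: borrows a hipStream_t
  Eigen::GpuDevice device(&stream);   // removed class: allocation + launch helpers

  const std::size_t bytes = static_cast<std::size_t>(n) * sizeof(float);
  float* d_in  = static_cast<float*>(device.allocate(bytes));
  float* d_out = static_cast<float*>(device.allocate(bytes));
  device.memcpyHostToDevice(d_in, host_in, bytes);

  // Evaluate a tensor expression on the device's stream.
  Eigen::TensorMap<Eigen::Tensor<float, 1>> in(d_in, n);
  Eigen::TensorMap<Eigen::Tensor<float, 1>> out(d_out, n);
  out.device(device) = in * 2.0f;

  device.memcpyDeviceToHost(host_out, d_out, bytes);
  device.synchronize();               // wraps hipStreamSynchronize on the same stream
  device.deallocate(d_in);
  device.deallocate(d_out);
}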
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H -#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H - -#if defined(EIGEN_HIP_DEVICE_COMPILE) -#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h" -#endif - -#if defined(EIGEN_HIPCC) -#define HIP_WARP_SIZE 64 -#endif - -namespace Eigen { -namespace internal { - - -#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC) -// Full reducers for GPU, don't vectorize for now - -// Reducer function that enables multiple hip thread to safely accumulate at the same -// output address. It basically reads the current value of the output variable, and -// attempts to update it with the new value. If in the meantime another hip thread -// updated the content of the output address it will try again. -template -__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__) - if (sizeof(T) == 4) - { - unsigned int oldval = *reinterpret_cast(output); - unsigned int newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - unsigned int readback; - while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { - oldval = readback; - newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - } - } - else if (sizeof(T) == 8) { - unsigned long long oldval = *reinterpret_cast(output); - unsigned long long newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - unsigned long long readback; - while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { - oldval = readback; - newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - } - } - else { - assert(0 && "Wordsize not supported"); - } -#else - assert(0 && "Shouldn't be called on unsupported device"); -#endif -} - -// We extend atomicExch to support extra data types -template -__device__ inline Type atomicExchCustom(Type* address, Type val) { - return atomicExch(address, val); -} - -template <> -__device__ inline double atomicExchCustom(double* address, double val) { - unsigned long long int* address_as_ull = reinterpret_cast(address); - return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); -} - -#if defined(EIGEN_HAS_HIP_FP16) -template
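The atomicReduce removed just above is the heart of the GPU reduction path: each thread folds its partial result into the output with a compare-and-swap retry loop instead of a lock. A minimal hedged sketch of the same pattern, specialized to float summation with illustrative names:

// Lock-free accumulation into *output via a CAS retry loop (float-only sketch).
__device__ void atomicAccumulate(float* output, float value) {
  unsigned int* addr = reinterpret_cast<unsigned int*>(output);
  unsigned int assumed = *addr;                                  // snapshot current bits
  while (true) {
    const float updated = __uint_as_float(assumed) + value;      // apply the reducer
    const unsigned int desired = __float_as_uint(updated);
    const unsigned int observed = atomicCAS(addr, assumed, desired);
    if (observed == assumed) break;                              // our update landed
    assumed = observed;                                          // lost the race: retry on the winner's value
  }
}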