From 8a7f360ec3160acdb545d05712435d8ca0334357 Mon Sep 17 00:00:00 2001 From: Everton Constantino Date: Tue, 19 May 2020 19:24:11 +0000 Subject: [PATCH] - Vectorizing MMA packing. - Optimizing MMA kernel. - Adding PacketBlock store to blas_data_mapper. --- Eigen/Core | 4 + Eigen/src/Core/arch/AltiVec/MatrixProduct.h | 311 ++++++++++++++++++++ Eigen/src/Core/util/BlasUtil.h | 54 ++++ test/CMakeLists.txt | 1 + test/blasutil.cpp | 196 ++++++++++++ 5 files changed, 566 insertions(+) create mode 100644 Eigen/src/Core/arch/AltiVec/MatrixProduct.h create mode 100644 test/blasutil.cpp diff --git a/Eigen/Core b/Eigen/Core index 426f9cbc0..2594a7dd9 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -330,6 +330,10 @@ using std::ptrdiff_t; #include "src/Core/CoreIterators.h" #include "src/Core/ConditionEstimator.h" +#if defined(EIGEN_ARCH_PPC) +#include "src/Core/arch/AltiVec/MatrixProduct.h" +#endif + #include "src/Core/BooleanRedux.h" #include "src/Core/Select.h" #include "src/Core/VectorwiseOp.h" diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h new file mode 100644 index 000000000..3bfbfdc87 --- /dev/null +++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h @@ -0,0 +1,311 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H +#define EIGEN_MATRIX_PRODUCT_ALTIVEC_H + +#ifdef __MMA__ + +namespace Eigen { + +namespace internal { + +const int accRows = 4; +const int accCols = 4; +const int accCount = 4; +const int floatVectorSize = 4; + +typedef struct +{ + __vector float v0; + __vector float v1; + __vector float v2; + __vector float v3; +} Packet4fx4; + +union PacketQuad +{ + __struct_quad sc; + Packet4fx4 sf; +}; + +template +struct gemm_pack_lhs +{ + void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_lhs + ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + int ri = 0, j; + for(j = 0; j + floatVectorSize < rows; j+=floatVectorSize) + { + int i; + for(i = 0; i + floatVectorSize < depth; i+=floatVectorSize) + { + PacketBlock block; + block.packet[0] = lhs.template loadPacket(j, i + 0); + block.packet[1] = lhs.template loadPacket(j, i + 1); + block.packet[2] = lhs.template loadPacket(j, i + 2); + block.packet[3] = lhs.template loadPacket(j, i + 3); + + pstore((float *)(blockA + ri ), block.packet[0]); + pstore((float *)(blockA + ri + 4), block.packet[1]); + pstore((float *)(blockA + ri + 8), block.packet[2]); + pstore((float *)(blockA + ri + 12), block.packet[3]); + ri += 4*floatVectorSize; + } + for(; i < depth; i++) + { + Packet4f lhsV = lhs.template loadPacket(j, i); + pstore((float *)(blockA + ri), lhsV); + ri += floatVectorSize; + } + } + for(int i = 0; i < depth; i++) + { + int k = j; + for(; k < rows; k++) + { + blockA[ri] = lhs(k, i); + ri += 1; + } + } +} + +template +struct gemm_pack_rhs +{ + void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template +void gemm_pack_rhs + ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) +{ + int ri = 0, j; + for(j = 0; j + floatVectorSize < cols; j+=floatVectorSize) + { + int i; + for(i = 0; i + floatVectorSize < depth; i+=floatVectorSize) + { + PacketBlock block; + block.packet[0] = rhs.template loadPacket(i, j + 0); + block.packet[1] = rhs.template loadPacket(i, j + 1); + block.packet[2] = rhs.template loadPacket(i, j + 2); + block.packet[3] = rhs.template loadPacket(i, j + 3); + + ptranspose(block); + + pstore((float *)(blockB + ri ), block.packet[0]); + pstore((float *)(blockB + ri + 4), block.packet[1]); + pstore((float *)(blockB + ri + 8), block.packet[2]); + pstore((float *)(blockB + ri + 12), block.packet[3]); + + ri += 4*floatVectorSize; + } + for(; i < depth; i++) + { + blockB[ri+0] = rhs(i, j+0); + blockB[ri+1] = rhs(i, j+1); + blockB[ri+2] = rhs(i, j+2); + blockB[ri+3] = rhs(i, j+3); + ri += floatVectorSize; + } + } + for(int i = 0; i < depth; i++) + { + int k = j; + for(; k < cols; k++) + { + blockB[ri] = rhs(i, k); + ri += 1; + } + } +} + +template +EIGEN_STRONG_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, Scalar alpha, __vector_quad *acc) +{ + //[TODO] + // + //Packet4fx4 r; + // + //__builtin_mma_disassemble_acc((void *)&r, *acc); + // + PacketQuad result; + result.sc = __builtin_mma_disassemble_acc(*acc); + + Packet4f pAlpha = pset1(alpha); + + PacketBlock block; + block.packet[0] = pAlpha*result.sf.v3; + block.packet[1] = pAlpha*result.sf.v2; + block.packet[2] = pAlpha*result.sf.v1; + block.packet[3] = pAlpha*result.sf.v0; + + data.template storePacketBlock(i, j, block); +} + +template +struct gebp_kernel +{ + void operator()(const DataMapper& res, const float* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, float alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); +}; + +template +void gebp_kernel + ::operator()(const DataMapper& res, const float* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, float alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB) + { + const int remaining_rows = rows % accRows; + const int remaining_cols = cols % accCols; + const int remaining_depth = depth % floatVectorSize; + const int timesRows = (rows / accRows); + const int timesCols = (cols / accCols); + + int row; + for(row = 0; row + accRows <= rows; row += accRows) + { + const float *rhs_base = blockB; + const float *lhs_base = blockA + (row/accRows)*depth*floatVectorSize; + + int col; + for(col = 0; col + accCount*accCols <= cols; col += accCount*accCols){ + const float *rhs_ptr = rhs_base + (col/accCols)*depth*floatVectorSize; + const float *rhs_ptr2 = rhs_base + ((col/accCols) + 1)*depth*floatVectorSize; + const float *rhs_ptr3 = rhs_base + ((col/accCols) + 2)*depth*floatVectorSize; + const float *rhs_ptr4 = rhs_base + ((col/accCols) + 3)*depth*floatVectorSize; + const float *lhs_ptr = lhs_base; + + __vector_quad acc, acc2, acc3, acc4; + __builtin_mma_xxsetaccz(&acc); + __builtin_mma_xxsetaccz(&acc2); + __builtin_mma_xxsetaccz(&acc3); + __builtin_mma_xxsetaccz(&acc4); + + for(int k = 0; k < depth; k++) + { + __vector float lhsV = *((__vector float *)lhs_ptr ); + __vector float rhsV = *((__vector float *)rhs_ptr ); + __vector float rhs2V = *((__vector float *)rhs_ptr2); + __vector float rhs3V = *((__vector float *)rhs_ptr3); + __vector float rhs4V = *((__vector float *)rhs_ptr4); + + __builtin_mma_xvf32gerpp(&acc, (__vector unsigned char) rhsV, (__vector unsigned char) lhsV); + __builtin_mma_xvf32gerpp(&acc2, (__vector unsigned char) rhs2V, (__vector unsigned char) lhsV); + __builtin_mma_xvf32gerpp(&acc3, (__vector unsigned char) rhs3V, (__vector unsigned char) lhsV); + __builtin_mma_xvf32gerpp(&acc4, (__vector unsigned char) rhs4V, (__vector unsigned char) lhsV); + + lhs_ptr += floatVectorSize; + rhs_ptr += floatVectorSize; + rhs_ptr2 += floatVectorSize; + rhs_ptr3 += floatVectorSize; + rhs_ptr4 += floatVectorSize; + } + + storeAccumulator(row, col , res, alpha, &acc ); + storeAccumulator(row, col + 1*accCols, res, alpha, &acc2); + storeAccumulator(row, col + 2*accCols, res, alpha, &acc3); + storeAccumulator(row, col + 3*accCols, res, alpha, &acc4); + } + for(; col + accCols <= cols; col += accCols){ + const float *rhs_ptr = rhs_base + (col/accCols)*depth*floatVectorSize; + const float *lhs_ptr = lhs_base; + + __vector_quad acc; + __builtin_mma_xxsetaccz(&acc); + for(int k = 0; k < depth; k++) + { + __vector float lhsV = *((__vector float *)lhs_ptr); + __vector float rhsV = *((__vector float *)rhs_ptr); + + __builtin_mma_xvf32gerpp(&acc, (__vector unsigned char) rhsV, (__vector unsigned char) lhsV); + + lhs_ptr += floatVectorSize; + rhs_ptr += floatVectorSize; + } + + storeAccumulator(row, col, res, alpha, &acc); + } + + if(remaining_cols > 0) + { + const float *rhs_ptr = rhs_base + (col/accCols)*depth*floatVectorSize; + const float *lhs_ptr = lhs_base; + for(int k = 0; k < depth; k++) + { + for(int arow = 0; arow < accRows; arow++) + { + for(int acol = 0; acol < remaining_cols; acol++ ) + { + res(row + arow, col + acol) += lhs_ptr[arow]*rhs_ptr[acol]; + } + } + rhs_ptr += remaining_cols; + lhs_ptr += floatVectorSize; + } + } + } + + if(remaining_rows > 0) + { + const float *rhs_base = blockB; + const float *lhs_base = blockA + (row/accRows)*depth*floatVectorSize; + + int col; + for(col = 0; col + accCols <= cols; col += accCols) + { + const float *rhs_ptr = rhs_base + (col/accCols)*depth*floatVectorSize; + const float *lhs_ptr = lhs_base; + for(int k = 0; k < depth; k++) + { + for(int arow = 0; arow < remaining_rows; arow++) + { + for(int acol = 0; acol < accCols; acol++ ) + { + res(row + arow, col + acol) += lhs_ptr[arow]*rhs_ptr[acol]; + } + } + rhs_ptr += floatVectorSize; + lhs_ptr += remaining_rows; + } + } + + if(remaining_cols > 0) + { + const float *rhs_ptr = rhs_base + (col/accCols)*depth*floatVectorSize; + const float *lhs_ptr = lhs_base; + for(int k = 0; k < depth; k++) + { + for(int arow = 0; arow < remaining_rows; arow++) + { + for(int acol = 0; acol < remaining_cols; acol++ ) + { + res(row + arow, col + acol) += lhs_ptr[arow]*rhs_ptr[acol]; + } + } + rhs_ptr += remaining_cols; + lhs_ptr += remaining_rows; + } + } + } + } + +} // end namespace internal + +} // end namespace Eigen + +#endif // __MMA__ +#endif // EIGEN_MATRIX_PRODUCT_ALTIVEC_H diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 643558cba..01e647f17 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -195,6 +195,55 @@ protected: template class blas_data_mapper; +// TMP to help PacketBlock store implementation. +// There's currently no known use case for PacketBlock load. +// The default implementation assumes ColMajor order. +// It always store each packet sequentially one `stride` apart. +template +struct PacketBlockManagement +{ + PacketBlockManagement pbm; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock &block) const { + pbm.store(to, stride, i, j, block); + pstoreu(to + i + (j + idx)*stride, block.packet[idx]); + } +}; + +// PacketBlockManagement specialization to take care of RowMajor order without ifs. +template +struct PacketBlockManagement +{ + PacketBlockManagement pbm; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock &block) const { + pbm.store(to, stride, i, j, block); + pstoreu(to + j + (i + idx)*stride, block.packet[idx]); + } +}; + +template +struct PacketBlockManagement +{ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock &block) const { + EIGEN_UNUSED_VARIABLE(to); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j); + EIGEN_UNUSED_VARIABLE(block); + } +}; + +template +struct PacketBlockManagement +{ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar *to, const Index stride, Index i, Index j, const PacketBlock &block) const { + EIGEN_UNUSED_VARIABLE(to); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j); + EIGEN_UNUSED_VARIABLE(block); + } +}; + template class blas_data_mapper { @@ -258,6 +307,11 @@ public: return internal::first_default_aligned(m_data, size); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketBlock(Index i, Index j, const PacketBlock &block) const { + PacketBlockManagement pbm; + pbm.store(m_data, m_stride, i, j, block); + } protected: Scalar* EIGEN_RESTRICT m_data; const Index m_stride; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d6ef17a70..06c7144ee 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -289,6 +289,7 @@ ei_add_test(half_float) ei_add_test(array_of_string) ei_add_test(num_dimensions) ei_add_test(stl_iterators) +ei_add_test(blasutil) if(EIGEN_TEST_CXX11) ei_add_test(initializer_list_construction) ei_add_test(diagonal_matrix_variadic_ctor) diff --git a/test/blasutil.cpp b/test/blasutil.cpp new file mode 100644 index 000000000..cd8716351 --- /dev/null +++ b/test/blasutil.cpp @@ -0,0 +1,196 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2020 Everton Constantino +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/ + +#include "main.h" + +#include + +using namespace Eigen; + +#define GET(i,j) (StorageOrder == RowMajor ? (i)*stride + (j) : (i) + (j)*stride) +#define SCATTER(i,j,k) (StorageOrder == RowMajor ? ((i)+(k))*stride + (j) : (i) + ((j)+(k))*stride) + +template +void compare(const Packet& a, const Packet& b) +{ + int pktsz = internal::packet_traits::size; + Scalar *buffA = new Scalar[pktsz]; + Scalar *buffB = new Scalar[pktsz]; + + internal::pstoreu(buffA, a); + internal::pstoreu(buffB, b); + + for(int i = 0; i < pktsz; i++) + { + VERIFY_IS_EQUAL(buffA[i], buffB[i]); + } + + delete[] buffA; + delete[] buffB; +} + +template +struct PacketBlockSet +{ + typedef typename internal::packet_traits::type Packet; + + void setPacketBlock(internal::PacketBlock& block, Scalar value) + { + for(int idx = 0; idx < n; idx++) + { + block.packet[idx] = internal::pset1(value); + } + } + + void comparePacketBlock(Scalar *data, int i, int j, int stride, internal::PacketBlock& block) + { + for(int idx = 0; idx < n; idx++) + { + Packet line = internal::ploadu(data + SCATTER(i,j,idx)); + compare(block.packet[idx], line); + } + } +}; + +template +void run_bdmp_spec_1() +{ + typedef internal::blas_data_mapper BlasDataMapper; + int packetSize = internal::packet_traits::size; + int minSize = std::max(packetSize, BlockSize); + typedef typename internal::packet_traits::type Packet; + + int szm = internal::random(minSize,500), szn = internal::random(minSize,500); + int stride = StorageOrder == RowMajor ? szn : szm; + Scalar *d = new Scalar[szn*szm]; + + // Initializing with random entries + for(int i = 0; i < szm*szn; i++) + { + d[i] = internal::random(static_cast(3), static_cast(10)); + } + + BlasDataMapper bdm(d, stride); + + // Testing operator() + for(int i = 0; i < szm; i++) + { + for(int j = 0; j < szn; j++) + { + VERIFY_IS_EQUAL(d[GET(i,j)], bdm(i,j)); + } + } + + // Testing getSubMapper and getLinearMapper + int i0 = internal::random(0,szm-2); + int j0 = internal::random(0,szn-2); + for(int i = i0; i < szm; i++) + { + for(int j = j0; j < szn; j++) + { + const BlasDataMapper& bdmSM = bdm.getSubMapper(i0,j0); + const internal::BlasLinearMapper& bdmLM = bdm.getLinearMapper(i0,j0); + + Scalar v = bdmSM(i - i0, j - j0); + Scalar vd = d[GET(i,j)]; + VERIFY_IS_EQUAL(vd, v); + VERIFY_IS_EQUAL(vd, bdmLM(GET(i-i0, j-j0))); + } + } + + // Testing loadPacket + for(int i = 0; i < szm - minSize; i++) + { + for(int j = 0; j < szn - minSize; j++) + { + Packet pktBDM = bdm.template loadPacket(i,j); + Packet pktD = internal::ploadu(d + GET(i,j)); + + compare(pktBDM, pktD); + } + } + + // Testing gatherPacket + Scalar *buff = new Scalar[packetSize]; + for(int i = 0; i < szm - minSize; i++) + { + for(int j = 0; j < szn - minSize; j++) + { + Packet p = bdm.template gatherPacket(i,j); + internal::pstoreu(buff, p); + + for(int k = 0; k < packetSize; k++) + { + VERIFY_IS_EQUAL(d[SCATTER(i,j,k)], buff[k]); + } + + } + } + delete[] buff; + + // Testing scatterPacket + for(int i = 0; i < szm - minSize; i++) + { + for(int j = 0; j < szn - minSize; j++) + { + Packet p = internal::pset1(static_cast(1)); + bdm.template scatterPacket(i,j,p); + for(int k = 0; k < packetSize; k++) + { + VERIFY_IS_EQUAL(d[SCATTER(i,j,k)], static_cast(1)); + } + } + } + + //Testing storePacketBlock + internal::PacketBlock block; + + PacketBlockSet pbs; + pbs.setPacketBlock(block, static_cast(2)); + + for(int i = 0; i < szm - minSize; i++) + { + for(int j = 0; j < szn - minSize; j++) + { + bdm.template storePacketBlock(i, j, block); + + pbs.comparePacketBlock(d, i, j, stride, block); + } + } + + delete[] d; +} + +template +void run_test() +{ + run_bdmp_spec_1(); + run_bdmp_spec_1(); + run_bdmp_spec_1(); + run_bdmp_spec_1(); + run_bdmp_spec_1(); + run_bdmp_spec_1(); + run_bdmp_spec_1(); + run_bdmp_spec_1(); + run_bdmp_spec_1(); + run_bdmp_spec_1(); +} + +EIGEN_DECLARE_TEST(blasutil) +{ + for(int i = 0; i < g_repeat; i++) + { + CALL_SUBTEST_1(run_test()); + CALL_SUBTEST_2(run_test()); + CALL_SUBTEST_3(run_test()); + CALL_SUBTEST_4(run_test()); + CALL_SUBTEST_5(run_test()); + CALL_SUBTEST_6(run_test()); + } +} \ No newline at end of file