From 31aa17e4efafa25a5f9e27a3ba02b5ca030ad3f5 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Mon, 1 Mar 2010 11:10:30 +0100 Subject: [PATCH] GEMM: move the first packing of A' before the packing of B' --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 51 ++++++++++--------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 418ed720f..da700f8b7 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -107,50 +107,55 @@ static void run(int rows, int cols, int depth, // (==GEMM_VAR1) for(int k=0; k rows of B', and cols of the A' + + // In order to reduce the chance that a thread has to wait for the other, + // let's start by packing A'. + #ifndef USEGOTOROUTINES + pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc); + #else + sgemm_itcopy(actual_kc, mc, &lhs(0,k), lhsStride, blockA); + #endif + + + // Pack B_k to B' in parallel fashion: // each thread packs the sub block B_k,j to B'_j where j is the thread id. - // Before copying to B'_j, we have to make sure that no other thread is still using it, + // However, before copying to B'_j, we have to make sure that no other thread is still using it, // i.e., we test that info[tid].users equals 0. // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. while(!info[tid].users.testAndSetOrdered(0,threads)) {} - const int actual_kc = std::min(k+kc,depth)-k; // => rows of B', and cols of the A' - #ifndef USEGOTOROUTINES pack_rhs(blockB+info[tid].rhs_start*kc, &rhs(k,info[tid].rhs_start), rhsStride, alpha, actual_kc, info[tid].rhs_length); #else sgemm_oncopy(actual_kc, info[tid].rhs_length, &rhs(k,info[tid].rhs_start), rhsStride, blockB+info[tid].rhs_start*kc); #endif - // mark that the parts B'_j is uptodate and can be used. + // Notify the other threads that the part B'_j is ready to go. info[tid].sync.fetchAndStoreOrdered(k); - // this is an attempt to implement a smarter strategy as suggested by Aron - // the layout is good, but there is no synchronization yet + // Computes C_i += A' * B' per B'_j + for(int shift=0; shift0) while(!info[j].sync.testAndSetOrdered(k,k)) {} - sgemm_kernel(actual_mc, info[j].rhs_length, actual_kc, alpha, blockA, blockB+info[j].rhs_start*kc, res+info[j].rhs_start*resStride, resStride); - } + #ifndef USEGOTOROUTINES + gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*kc, mc, actual_kc, info[j].rhs_length, -1,-1,0,0, w); + #else + sgemm_kernel(mc, info[j].rhs_length, actual_kc, alpha, blockA, blockB+info[j].rhs_start*kc, res+info[j].rhs_start*resStride, resStride); + #endif + } - // then keep going as usual with the remaining A' + // Then keep going as usual with the remaining A' for(int i=mc; i