mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-06-04 18:54:00 +08:00
* split CacheFriendlyProduct into multiple smaller files
* add an efficient selfadjoint * vector implementation (= blas symv) perf are inbetween MKL and GOTO => the interface is still missing (have to be rethougth)
This commit is contained in:
parent
3d86dcf473
commit
de014efdaf
@ -145,7 +145,9 @@ namespace Eigen {
|
|||||||
#include "src/Core/Swap.h"
|
#include "src/Core/Swap.h"
|
||||||
#include "src/Core/CommaInitializer.h"
|
#include "src/Core/CommaInitializer.h"
|
||||||
#include "src/Core/Part.h"
|
#include "src/Core/Part.h"
|
||||||
#include "src/Core/CacheFriendlyProduct.h"
|
#include "src/Core/products/GeneralMatrixMatrix.h"
|
||||||
|
#include "src/Core/products/GeneralMatrixVector.h"
|
||||||
|
#include "src/Core/products/SelfadjointMatrixVector.h"
|
||||||
|
|
||||||
} // namespace Eigen
|
} // namespace Eigen
|
||||||
|
|
||||||
|
409
Eigen/src/Core/products/GeneralMatrixMatrix.h
Normal file
409
Eigen/src/Core/products/GeneralMatrixMatrix.h
Normal file
@ -0,0 +1,409 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra. Eigen itself is part of the KDE project.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
|
||||||
|
//
|
||||||
|
// Eigen is free software; you can redistribute it and/or
|
||||||
|
// modify it under the terms of the GNU Lesser General Public
|
||||||
|
// License as published by the Free Software Foundation; either
|
||||||
|
// version 3 of the License, or (at your option) any later version.
|
||||||
|
//
|
||||||
|
// Alternatively, you can redistribute it and/or
|
||||||
|
// modify it under the terms of the GNU General Public License as
|
||||||
|
// published by the Free Software Foundation; either version 2 of
|
||||||
|
// the License, or (at your option) any later version.
|
||||||
|
//
|
||||||
|
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||||
|
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||||
|
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU Lesser General Public
|
||||||
|
// License and a copy of the GNU General Public License along with
|
||||||
|
// Eigen. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
|
||||||
|
#define EIGEN_GENERAL_MATRIX_MATRIX_H
|
||||||
|
|
||||||
|
template <int L2MemorySize,typename Scalar>
|
||||||
|
struct ei_L2_block_traits {
|
||||||
|
enum {width = 8 * ei_meta_sqrt<L2MemorySize/(64*sizeof(Scalar))>::ret };
|
||||||
|
};
|
||||||
|
|
||||||
|
#ifndef EIGEN_EXTERN_INSTANTIATIONS
|
||||||
|
|
||||||
|
template<typename Scalar>
|
||||||
|
static void ei_cache_friendly_product(
|
||||||
|
int _rows, int _cols, int depth,
|
||||||
|
bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride,
|
||||||
|
bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride,
|
||||||
|
bool resRowMajor, Scalar* res, int resStride)
|
||||||
|
{
|
||||||
|
const Scalar* EIGEN_RESTRICT lhs;
|
||||||
|
const Scalar* EIGEN_RESTRICT rhs;
|
||||||
|
int lhsStride, rhsStride, rows, cols;
|
||||||
|
bool lhsRowMajor;
|
||||||
|
|
||||||
|
if (resRowMajor)
|
||||||
|
{
|
||||||
|
lhs = _rhs;
|
||||||
|
rhs = _lhs;
|
||||||
|
lhsStride = _rhsStride;
|
||||||
|
rhsStride = _lhsStride;
|
||||||
|
cols = _rows;
|
||||||
|
rows = _cols;
|
||||||
|
lhsRowMajor = !_rhsRowMajor;
|
||||||
|
ei_assert(_lhsRowMajor);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
lhs = _lhs;
|
||||||
|
rhs = _rhs;
|
||||||
|
lhsStride = _lhsStride;
|
||||||
|
rhsStride = _rhsStride;
|
||||||
|
rows = _rows;
|
||||||
|
cols = _cols;
|
||||||
|
lhsRowMajor = _lhsRowMajor;
|
||||||
|
ei_assert(!_rhsRowMajor);
|
||||||
|
}
|
||||||
|
|
||||||
|
typedef typename ei_packet_traits<Scalar>::type PacketType;
|
||||||
|
|
||||||
|
enum {
|
||||||
|
PacketSize = sizeof(PacketType)/sizeof(Scalar),
|
||||||
|
#if (defined __i386__)
|
||||||
|
// i386 architecture provides only 8 xmm registers,
|
||||||
|
// so let's reduce the max number of rows processed at once.
|
||||||
|
MaxBlockRows = 4,
|
||||||
|
MaxBlockRows_ClampingMask = 0xFFFFFC,
|
||||||
|
#else
|
||||||
|
MaxBlockRows = 8,
|
||||||
|
MaxBlockRows_ClampingMask = 0xFFFFF8,
|
||||||
|
#endif
|
||||||
|
// maximal size of the blocks fitted in L2 cache
|
||||||
|
MaxL2BlockSize = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width
|
||||||
|
};
|
||||||
|
|
||||||
|
const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0));
|
||||||
|
|
||||||
|
const int remainingSize = depth % PacketSize;
|
||||||
|
const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries
|
||||||
|
const int l2BlockRows = MaxL2BlockSize > rows ? rows : MaxL2BlockSize;
|
||||||
|
const int l2BlockCols = MaxL2BlockSize > cols ? cols : MaxL2BlockSize;
|
||||||
|
const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize;
|
||||||
|
const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize;
|
||||||
|
const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0));
|
||||||
|
Scalar* EIGEN_RESTRICT block = 0;
|
||||||
|
const int allocBlockSize = l2BlockRows*size;
|
||||||
|
block = ei_aligned_stack_new(Scalar, allocBlockSize);
|
||||||
|
Scalar* EIGEN_RESTRICT rhsCopy
|
||||||
|
= ei_aligned_stack_new(Scalar, l2BlockSizeAligned*l2BlockSizeAligned);
|
||||||
|
|
||||||
|
#ifndef EIGEN_USE_NEW_PRODUCT
|
||||||
|
// loops on each L2 cache friendly blocks of the result
|
||||||
|
for(int l2i=0; l2i<rows; l2i+=l2BlockRows)
|
||||||
|
{
|
||||||
|
const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows);
|
||||||
|
const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask; // end of the rows aligned to bw
|
||||||
|
const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW; // number of remaining rows
|
||||||
|
//const int l2blockRowEndBWPlusOne = l2blockRowEndBW + (l2blockRemainingRows?0:MaxBlockRows);
|
||||||
|
|
||||||
|
// build a cache friendly blocky matrix
|
||||||
|
int count = 0;
|
||||||
|
|
||||||
|
// copy l2blocksize rows of m_lhs to blocks of ps x bw
|
||||||
|
for(int l2k=0; l2k<size; l2k+=l2BlockSize)
|
||||||
|
{
|
||||||
|
const int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
|
||||||
|
|
||||||
|
for (int i = l2i; i<l2blockRowEndBW/*PlusOne*/; i+=MaxBlockRows)
|
||||||
|
{
|
||||||
|
// TODO merge the "if l2blockRemainingRows" using something like:
|
||||||
|
// const int blockRows = std::min(i+MaxBlockRows, rows) - i;
|
||||||
|
|
||||||
|
for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
|
||||||
|
{
|
||||||
|
// TODO write these loops using meta unrolling
|
||||||
|
// negligible for large matrices but useful for small ones
|
||||||
|
if (lhsRowMajor)
|
||||||
|
{
|
||||||
|
for (int w=0; w<MaxBlockRows; ++w)
|
||||||
|
for (int s=0; s<PacketSize; ++s)
|
||||||
|
block[count++] = lhs[(i+w)*lhsStride + (k+s)];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (int w=0; w<MaxBlockRows; ++w)
|
||||||
|
for (int s=0; s<PacketSize; ++s)
|
||||||
|
block[count++] = lhs[(i+w) + (k+s)*lhsStride];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (l2blockRemainingRows>0)
|
||||||
|
{
|
||||||
|
for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
|
||||||
|
{
|
||||||
|
if (lhsRowMajor)
|
||||||
|
{
|
||||||
|
for (int w=0; w<l2blockRemainingRows; ++w)
|
||||||
|
for (int s=0; s<PacketSize; ++s)
|
||||||
|
block[count++] = lhs[(l2blockRowEndBW+w)*lhsStride + (k+s)];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (int w=0; w<l2blockRemainingRows; ++w)
|
||||||
|
for (int s=0; s<PacketSize; ++s)
|
||||||
|
block[count++] = lhs[(l2blockRowEndBW+w) + (k+s)*lhsStride];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
|
||||||
|
{
|
||||||
|
int l2blockColEnd = std::min(l2j+l2BlockCols, cols);
|
||||||
|
|
||||||
|
for(int l2k=0; l2k<size; l2k+=l2BlockSize)
|
||||||
|
{
|
||||||
|
// acumulate bw rows of lhs time a single column of rhs to a bw x 1 block of res
|
||||||
|
int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
|
||||||
|
|
||||||
|
// if not aligned, copy the rhs block
|
||||||
|
if (needRhsCopy)
|
||||||
|
for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
|
||||||
|
{
|
||||||
|
ei_internal_assert(l2BlockSizeAligned*(l1j-l2j)+(l2blockSizeEnd-l2k) < l2BlockSizeAligned*l2BlockSizeAligned);
|
||||||
|
memcpy(rhsCopy+l2BlockSizeAligned*(l1j-l2j),&(rhs[l1j*rhsStride+l2k]),(l2blockSizeEnd-l2k)*sizeof(Scalar));
|
||||||
|
}
|
||||||
|
|
||||||
|
// for each bw x 1 result's block
|
||||||
|
for(int l1i=l2i; l1i<l2blockRowEndBW; l1i+=MaxBlockRows)
|
||||||
|
{
|
||||||
|
int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*MaxBlockRows;
|
||||||
|
const Scalar* EIGEN_RESTRICT localB = &block[offsetblock];
|
||||||
|
|
||||||
|
for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
|
||||||
|
{
|
||||||
|
const Scalar* EIGEN_RESTRICT rhsColumn;
|
||||||
|
if (needRhsCopy)
|
||||||
|
rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]);
|
||||||
|
else
|
||||||
|
rhsColumn = &(rhs[l1j*rhsStride]);
|
||||||
|
|
||||||
|
PacketType dst[MaxBlockRows];
|
||||||
|
dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.));
|
||||||
|
if (MaxBlockRows==8)
|
||||||
|
dst[7] = dst[6] = dst[5] = dst[4] = dst[0];
|
||||||
|
|
||||||
|
PacketType tmp;
|
||||||
|
|
||||||
|
for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
|
||||||
|
{
|
||||||
|
tmp = ei_ploadu(&rhsColumn[k]);
|
||||||
|
PacketType A0, A1, A2, A3, A4, A5;
|
||||||
|
A0 = ei_pload(localB + k*MaxBlockRows);
|
||||||
|
A1 = ei_pload(localB + k*MaxBlockRows+1*PacketSize);
|
||||||
|
A2 = ei_pload(localB + k*MaxBlockRows+2*PacketSize);
|
||||||
|
A3 = ei_pload(localB + k*MaxBlockRows+3*PacketSize);
|
||||||
|
if (MaxBlockRows==8) A4 = ei_pload(localB + k*MaxBlockRows+4*PacketSize);
|
||||||
|
if (MaxBlockRows==8) A5 = ei_pload(localB + k*MaxBlockRows+5*PacketSize);
|
||||||
|
dst[0] = ei_pmadd(tmp, A0, dst[0]);
|
||||||
|
if (MaxBlockRows==8) A0 = ei_pload(localB + k*MaxBlockRows+6*PacketSize);
|
||||||
|
dst[1] = ei_pmadd(tmp, A1, dst[1]);
|
||||||
|
if (MaxBlockRows==8) A1 = ei_pload(localB + k*MaxBlockRows+7*PacketSize);
|
||||||
|
dst[2] = ei_pmadd(tmp, A2, dst[2]);
|
||||||
|
dst[3] = ei_pmadd(tmp, A3, dst[3]);
|
||||||
|
if (MaxBlockRows==8)
|
||||||
|
{
|
||||||
|
dst[4] = ei_pmadd(tmp, A4, dst[4]);
|
||||||
|
dst[5] = ei_pmadd(tmp, A5, dst[5]);
|
||||||
|
dst[6] = ei_pmadd(tmp, A0, dst[6]);
|
||||||
|
dst[7] = ei_pmadd(tmp, A1, dst[7]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Scalar* EIGEN_RESTRICT localRes = &(res[l1i + l1j*resStride]);
|
||||||
|
|
||||||
|
if (PacketSize>1 && resIsAligned)
|
||||||
|
{
|
||||||
|
// the result is aligned: let's do packet reduction
|
||||||
|
ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(&dst[0])));
|
||||||
|
if (PacketSize==2)
|
||||||
|
ei_pstore(&(localRes[2]), ei_padd(ei_pload(&(localRes[2])), ei_preduxp(&(dst[2]))));
|
||||||
|
if (MaxBlockRows==8)
|
||||||
|
{
|
||||||
|
ei_pstore(&(localRes[4]), ei_padd(ei_pload(&(localRes[4])), ei_preduxp(&(dst[4]))));
|
||||||
|
if (PacketSize==2)
|
||||||
|
ei_pstore(&(localRes[6]), ei_padd(ei_pload(&(localRes[6])), ei_preduxp(&(dst[6]))));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// not aligned => per coeff packet reduction
|
||||||
|
localRes[0] += ei_predux(dst[0]);
|
||||||
|
localRes[1] += ei_predux(dst[1]);
|
||||||
|
localRes[2] += ei_predux(dst[2]);
|
||||||
|
localRes[3] += ei_predux(dst[3]);
|
||||||
|
if (MaxBlockRows==8)
|
||||||
|
{
|
||||||
|
localRes[4] += ei_predux(dst[4]);
|
||||||
|
localRes[5] += ei_predux(dst[5]);
|
||||||
|
localRes[6] += ei_predux(dst[6]);
|
||||||
|
localRes[7] += ei_predux(dst[7]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (l2blockRemainingRows>0)
|
||||||
|
{
|
||||||
|
int offsetblock = l2k * (l2blockRowEnd-l2i) + (l2blockRowEndBW-l2i)*(l2blockSizeEnd-l2k) - l2k*l2blockRemainingRows;
|
||||||
|
const Scalar* localB = &block[offsetblock];
|
||||||
|
|
||||||
|
for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
|
||||||
|
{
|
||||||
|
const Scalar* EIGEN_RESTRICT rhsColumn;
|
||||||
|
if (needRhsCopy)
|
||||||
|
rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]);
|
||||||
|
else
|
||||||
|
rhsColumn = &(rhs[l1j*rhsStride]);
|
||||||
|
|
||||||
|
PacketType dst[MaxBlockRows];
|
||||||
|
dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.));
|
||||||
|
if (MaxBlockRows==8)
|
||||||
|
dst[7] = dst[6] = dst[5] = dst[4] = dst[0];
|
||||||
|
|
||||||
|
// let's declare a few other temporary registers
|
||||||
|
PacketType tmp;
|
||||||
|
|
||||||
|
for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
|
||||||
|
{
|
||||||
|
tmp = ei_pload(&rhsColumn[k]);
|
||||||
|
|
||||||
|
dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows ])), dst[0]);
|
||||||
|
if (l2blockRemainingRows>=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+ PacketSize])), dst[1]);
|
||||||
|
if (l2blockRemainingRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+2*PacketSize])), dst[2]);
|
||||||
|
if (l2blockRemainingRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+3*PacketSize])), dst[3]);
|
||||||
|
if (MaxBlockRows==8)
|
||||||
|
{
|
||||||
|
if (l2blockRemainingRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+4*PacketSize])), dst[4]);
|
||||||
|
if (l2blockRemainingRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+5*PacketSize])), dst[5]);
|
||||||
|
if (l2blockRemainingRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+6*PacketSize])), dst[6]);
|
||||||
|
if (l2blockRemainingRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+7*PacketSize])), dst[7]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Scalar* EIGEN_RESTRICT localRes = &(res[l2blockRowEndBW + l1j*resStride]);
|
||||||
|
|
||||||
|
// process the remaining rows once at a time
|
||||||
|
localRes[0] += ei_predux(dst[0]);
|
||||||
|
if (l2blockRemainingRows>=2) localRes[1] += ei_predux(dst[1]);
|
||||||
|
if (l2blockRemainingRows>=3) localRes[2] += ei_predux(dst[2]);
|
||||||
|
if (l2blockRemainingRows>=4) localRes[3] += ei_predux(dst[3]);
|
||||||
|
if (MaxBlockRows==8)
|
||||||
|
{
|
||||||
|
if (l2blockRemainingRows>=5) localRes[4] += ei_predux(dst[4]);
|
||||||
|
if (l2blockRemainingRows>=6) localRes[5] += ei_predux(dst[5]);
|
||||||
|
if (l2blockRemainingRows>=7) localRes[6] += ei_predux(dst[6]);
|
||||||
|
if (l2blockRemainingRows>=8) localRes[7] += ei_predux(dst[7]);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (PacketSize>1 && remainingSize)
|
||||||
|
{
|
||||||
|
if (lhsRowMajor)
|
||||||
|
{
|
||||||
|
for (int j=0; j<cols; ++j)
|
||||||
|
for (int i=0; i<rows; ++i)
|
||||||
|
{
|
||||||
|
Scalar tmp = lhs[i*lhsStride+size] * rhs[j*rhsStride+size];
|
||||||
|
// FIXME this loop get vectorized by the compiler !
|
||||||
|
for (int k=1; k<remainingSize; ++k)
|
||||||
|
tmp += lhs[i*lhsStride+size+k] * rhs[j*rhsStride+size+k];
|
||||||
|
res[i+j*resStride] += tmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (int j=0; j<cols; ++j)
|
||||||
|
for (int i=0; i<rows; ++i)
|
||||||
|
{
|
||||||
|
Scalar tmp = lhs[i+size*lhsStride] * rhs[j*rhsStride+size];
|
||||||
|
for (int k=1; k<remainingSize; ++k)
|
||||||
|
tmp += lhs[i+(size+k)*lhsStride] * rhs[j*rhsStride+size+k];
|
||||||
|
res[i+j*resStride] += tmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
// loops on each L2 cache friendly blocks of the result
|
||||||
|
|
||||||
|
for(int l2i=0; l2i<rows; l2i+=l2BlockRows)
|
||||||
|
{
|
||||||
|
for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
|
||||||
|
{
|
||||||
|
// We have selected a block of lhs
|
||||||
|
// Packs this block into 'block'
|
||||||
|
for(int j=0; j<l2BlockCols; ++j)
|
||||||
|
{
|
||||||
|
int count = 0;
|
||||||
|
for(int i=0; i<l2BlockRows; ++i)
|
||||||
|
{
|
||||||
|
block[ (j*l2BlockCols) + i] = lhs[(j+l2j)*rows+l2i+count++];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// loops on each L2 cache firendly block of the result/rhs
|
||||||
|
for(int l2k=0; l2k<cols; l2k+=l2BlockCols)
|
||||||
|
{
|
||||||
|
for(int j=0; j<l2BlockCols; ++j)
|
||||||
|
{
|
||||||
|
for(int i=0; i<l2BlockRows; i+=PacketSize)
|
||||||
|
{
|
||||||
|
PacketType A0, A1, A2, A3, A4, A5;
|
||||||
|
|
||||||
|
// Load the packets from rhs and reorder them
|
||||||
|
|
||||||
|
// Here we need some vector reordering
|
||||||
|
// Right now its hardcoded to packets of 4 elements
|
||||||
|
A0 = ei_pset1(rhs[(j+l2k)*rows+(i+l2j)]);
|
||||||
|
A1 = ei_pset1(rhs[(j+l2k)*rows+(i+l2j)+1]);
|
||||||
|
A2 = ei_pset1(rhs[(j+l2k)*rows+(i+l2j)+2]);
|
||||||
|
A3 = ei_pset1(rhs[(j+l2k)*rows+(i+l2j)+3]);
|
||||||
|
|
||||||
|
for(int k=0; k<l2BlockRows; k+=PacketSize)
|
||||||
|
{
|
||||||
|
PacketType L0, L1, L2, L3;
|
||||||
|
|
||||||
|
// We perform "cross products" of vectors to avoid
|
||||||
|
// reductions (horizontal ops) afterwards
|
||||||
|
A4 = ei_pload(&res[(j+l2k)*rows+l2i+k]);
|
||||||
|
L0 = ei_pload(&block[ k + (i + 0)*l2BlockRows ]);
|
||||||
|
L1 = ei_pload(&block[ k + (i + 1)*l2BlockRows ]);
|
||||||
|
A4 = ei_pmadd(L0, A0, A4);
|
||||||
|
L2 = ei_pload(&block[ k + (i + 2)*l2BlockRows ]);
|
||||||
|
A4 = ei_pmadd(L1, A1, A4);
|
||||||
|
L3 = ei_pload(&block[ k + (i + 3)*l2BlockRows ]);
|
||||||
|
A4 = ei_pmadd(L2, A2, A4);
|
||||||
|
A4 = ei_pmadd(L3, A3, A4);
|
||||||
|
|
||||||
|
ei_pstore(&res[(j+l2k)*rows+l2i+k], A4);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ei_aligned_stack_delete(Scalar, block, allocBlockSize);
|
||||||
|
ei_aligned_stack_delete(Scalar, rhsCopy, l2BlockSizeAligned*l2BlockSizeAligned);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // EIGEN_EXTERN_INSTANTIATIONS
|
||||||
|
|
||||||
|
#endif // EIGEN_GENERAL_MATRIX_MATRIX_H
|
@ -22,389 +22,8 @@
|
|||||||
// License and a copy of the GNU General Public License along with
|
// License and a copy of the GNU General Public License along with
|
||||||
// Eigen. If not, see <http://www.gnu.org/licenses/>.
|
// Eigen. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
#ifndef EIGEN_CACHE_FRIENDLY_PRODUCT_H
|
#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
|
||||||
#define EIGEN_CACHE_FRIENDLY_PRODUCT_H
|
#define EIGEN_GENERAL_MATRIX_VECTOR_H
|
||||||
|
|
||||||
template <int L2MemorySize,typename Scalar>
|
|
||||||
struct ei_L2_block_traits {
|
|
||||||
enum {width = 8 * ei_meta_sqrt<L2MemorySize/(64*sizeof(Scalar))>::ret };
|
|
||||||
};
|
|
||||||
|
|
||||||
#ifndef EIGEN_EXTERN_INSTANTIATIONS
|
|
||||||
|
|
||||||
template<typename Scalar>
|
|
||||||
static void ei_cache_friendly_product(
|
|
||||||
int _rows, int _cols, int depth,
|
|
||||||
bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride,
|
|
||||||
bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride,
|
|
||||||
bool resRowMajor, Scalar* res, int resStride)
|
|
||||||
{
|
|
||||||
const Scalar* EIGEN_RESTRICT lhs;
|
|
||||||
const Scalar* EIGEN_RESTRICT rhs;
|
|
||||||
int lhsStride, rhsStride, rows, cols;
|
|
||||||
bool lhsRowMajor;
|
|
||||||
|
|
||||||
if (resRowMajor)
|
|
||||||
{
|
|
||||||
lhs = _rhs;
|
|
||||||
rhs = _lhs;
|
|
||||||
lhsStride = _rhsStride;
|
|
||||||
rhsStride = _lhsStride;
|
|
||||||
cols = _rows;
|
|
||||||
rows = _cols;
|
|
||||||
lhsRowMajor = !_rhsRowMajor;
|
|
||||||
ei_assert(_lhsRowMajor);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
lhs = _lhs;
|
|
||||||
rhs = _rhs;
|
|
||||||
lhsStride = _lhsStride;
|
|
||||||
rhsStride = _rhsStride;
|
|
||||||
rows = _rows;
|
|
||||||
cols = _cols;
|
|
||||||
lhsRowMajor = _lhsRowMajor;
|
|
||||||
ei_assert(!_rhsRowMajor);
|
|
||||||
}
|
|
||||||
|
|
||||||
typedef typename ei_packet_traits<Scalar>::type PacketType;
|
|
||||||
|
|
||||||
enum {
|
|
||||||
PacketSize = sizeof(PacketType)/sizeof(Scalar),
|
|
||||||
#if (defined __i386__)
|
|
||||||
// i386 architecture provides only 8 xmm registers,
|
|
||||||
// so let's reduce the max number of rows processed at once.
|
|
||||||
MaxBlockRows = 4,
|
|
||||||
MaxBlockRows_ClampingMask = 0xFFFFFC,
|
|
||||||
#else
|
|
||||||
MaxBlockRows = 8,
|
|
||||||
MaxBlockRows_ClampingMask = 0xFFFFF8,
|
|
||||||
#endif
|
|
||||||
// maximal size of the blocks fitted in L2 cache
|
|
||||||
MaxL2BlockSize = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width
|
|
||||||
};
|
|
||||||
|
|
||||||
const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0));
|
|
||||||
|
|
||||||
const int remainingSize = depth % PacketSize;
|
|
||||||
const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries
|
|
||||||
const int l2BlockRows = MaxL2BlockSize > rows ? rows : MaxL2BlockSize;
|
|
||||||
const int l2BlockCols = MaxL2BlockSize > cols ? cols : MaxL2BlockSize;
|
|
||||||
const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize;
|
|
||||||
const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize;
|
|
||||||
const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0));
|
|
||||||
Scalar* EIGEN_RESTRICT block = 0;
|
|
||||||
const int allocBlockSize = l2BlockRows*size;
|
|
||||||
block = ei_aligned_stack_new(Scalar, allocBlockSize);
|
|
||||||
Scalar* EIGEN_RESTRICT rhsCopy
|
|
||||||
= ei_aligned_stack_new(Scalar, l2BlockSizeAligned*l2BlockSizeAligned);
|
|
||||||
|
|
||||||
#ifndef EIGEN_USE_NEW_PRODUCT
|
|
||||||
// loops on each L2 cache friendly blocks of the result
|
|
||||||
for(int l2i=0; l2i<rows; l2i+=l2BlockRows)
|
|
||||||
{
|
|
||||||
const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows);
|
|
||||||
const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask; // end of the rows aligned to bw
|
|
||||||
const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW; // number of remaining rows
|
|
||||||
//const int l2blockRowEndBWPlusOne = l2blockRowEndBW + (l2blockRemainingRows?0:MaxBlockRows);
|
|
||||||
|
|
||||||
// build a cache friendly blocky matrix
|
|
||||||
int count = 0;
|
|
||||||
|
|
||||||
// copy l2blocksize rows of m_lhs to blocks of ps x bw
|
|
||||||
for(int l2k=0; l2k<size; l2k+=l2BlockSize)
|
|
||||||
{
|
|
||||||
const int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
|
|
||||||
|
|
||||||
for (int i = l2i; i<l2blockRowEndBW/*PlusOne*/; i+=MaxBlockRows)
|
|
||||||
{
|
|
||||||
// TODO merge the "if l2blockRemainingRows" using something like:
|
|
||||||
// const int blockRows = std::min(i+MaxBlockRows, rows) - i;
|
|
||||||
|
|
||||||
for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
|
|
||||||
{
|
|
||||||
// TODO write these loops using meta unrolling
|
|
||||||
// negligible for large matrices but useful for small ones
|
|
||||||
if (lhsRowMajor)
|
|
||||||
{
|
|
||||||
for (int w=0; w<MaxBlockRows; ++w)
|
|
||||||
for (int s=0; s<PacketSize; ++s)
|
|
||||||
block[count++] = lhs[(i+w)*lhsStride + (k+s)];
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
for (int w=0; w<MaxBlockRows; ++w)
|
|
||||||
for (int s=0; s<PacketSize; ++s)
|
|
||||||
block[count++] = lhs[(i+w) + (k+s)*lhsStride];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (l2blockRemainingRows>0)
|
|
||||||
{
|
|
||||||
for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
|
|
||||||
{
|
|
||||||
if (lhsRowMajor)
|
|
||||||
{
|
|
||||||
for (int w=0; w<l2blockRemainingRows; ++w)
|
|
||||||
for (int s=0; s<PacketSize; ++s)
|
|
||||||
block[count++] = lhs[(l2blockRowEndBW+w)*lhsStride + (k+s)];
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
for (int w=0; w<l2blockRemainingRows; ++w)
|
|
||||||
for (int s=0; s<PacketSize; ++s)
|
|
||||||
block[count++] = lhs[(l2blockRowEndBW+w) + (k+s)*lhsStride];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
|
|
||||||
{
|
|
||||||
int l2blockColEnd = std::min(l2j+l2BlockCols, cols);
|
|
||||||
|
|
||||||
for(int l2k=0; l2k<size; l2k+=l2BlockSize)
|
|
||||||
{
|
|
||||||
// acumulate bw rows of lhs time a single column of rhs to a bw x 1 block of res
|
|
||||||
int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
|
|
||||||
|
|
||||||
// if not aligned, copy the rhs block
|
|
||||||
if (needRhsCopy)
|
|
||||||
for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
|
|
||||||
{
|
|
||||||
ei_internal_assert(l2BlockSizeAligned*(l1j-l2j)+(l2blockSizeEnd-l2k) < l2BlockSizeAligned*l2BlockSizeAligned);
|
|
||||||
memcpy(rhsCopy+l2BlockSizeAligned*(l1j-l2j),&(rhs[l1j*rhsStride+l2k]),(l2blockSizeEnd-l2k)*sizeof(Scalar));
|
|
||||||
}
|
|
||||||
|
|
||||||
// for each bw x 1 result's block
|
|
||||||
for(int l1i=l2i; l1i<l2blockRowEndBW; l1i+=MaxBlockRows)
|
|
||||||
{
|
|
||||||
int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*MaxBlockRows;
|
|
||||||
const Scalar* EIGEN_RESTRICT localB = &block[offsetblock];
|
|
||||||
|
|
||||||
for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
|
|
||||||
{
|
|
||||||
const Scalar* EIGEN_RESTRICT rhsColumn;
|
|
||||||
if (needRhsCopy)
|
|
||||||
rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]);
|
|
||||||
else
|
|
||||||
rhsColumn = &(rhs[l1j*rhsStride]);
|
|
||||||
|
|
||||||
PacketType dst[MaxBlockRows];
|
|
||||||
dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.));
|
|
||||||
if (MaxBlockRows==8)
|
|
||||||
dst[7] = dst[6] = dst[5] = dst[4] = dst[0];
|
|
||||||
|
|
||||||
PacketType tmp;
|
|
||||||
|
|
||||||
for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
|
|
||||||
{
|
|
||||||
tmp = ei_ploadu(&rhsColumn[k]);
|
|
||||||
PacketType A0, A1, A2, A3, A4, A5;
|
|
||||||
A0 = ei_pload(localB + k*MaxBlockRows);
|
|
||||||
A1 = ei_pload(localB + k*MaxBlockRows+1*PacketSize);
|
|
||||||
A2 = ei_pload(localB + k*MaxBlockRows+2*PacketSize);
|
|
||||||
A3 = ei_pload(localB + k*MaxBlockRows+3*PacketSize);
|
|
||||||
if (MaxBlockRows==8) A4 = ei_pload(localB + k*MaxBlockRows+4*PacketSize);
|
|
||||||
if (MaxBlockRows==8) A5 = ei_pload(localB + k*MaxBlockRows+5*PacketSize);
|
|
||||||
dst[0] = ei_pmadd(tmp, A0, dst[0]);
|
|
||||||
if (MaxBlockRows==8) A0 = ei_pload(localB + k*MaxBlockRows+6*PacketSize);
|
|
||||||
dst[1] = ei_pmadd(tmp, A1, dst[1]);
|
|
||||||
if (MaxBlockRows==8) A1 = ei_pload(localB + k*MaxBlockRows+7*PacketSize);
|
|
||||||
dst[2] = ei_pmadd(tmp, A2, dst[2]);
|
|
||||||
dst[3] = ei_pmadd(tmp, A3, dst[3]);
|
|
||||||
if (MaxBlockRows==8)
|
|
||||||
{
|
|
||||||
dst[4] = ei_pmadd(tmp, A4, dst[4]);
|
|
||||||
dst[5] = ei_pmadd(tmp, A5, dst[5]);
|
|
||||||
dst[6] = ei_pmadd(tmp, A0, dst[6]);
|
|
||||||
dst[7] = ei_pmadd(tmp, A1, dst[7]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Scalar* EIGEN_RESTRICT localRes = &(res[l1i + l1j*resStride]);
|
|
||||||
|
|
||||||
if (PacketSize>1 && resIsAligned)
|
|
||||||
{
|
|
||||||
// the result is aligned: let's do packet reduction
|
|
||||||
ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(&dst[0])));
|
|
||||||
if (PacketSize==2)
|
|
||||||
ei_pstore(&(localRes[2]), ei_padd(ei_pload(&(localRes[2])), ei_preduxp(&(dst[2]))));
|
|
||||||
if (MaxBlockRows==8)
|
|
||||||
{
|
|
||||||
ei_pstore(&(localRes[4]), ei_padd(ei_pload(&(localRes[4])), ei_preduxp(&(dst[4]))));
|
|
||||||
if (PacketSize==2)
|
|
||||||
ei_pstore(&(localRes[6]), ei_padd(ei_pload(&(localRes[6])), ei_preduxp(&(dst[6]))));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// not aligned => per coeff packet reduction
|
|
||||||
localRes[0] += ei_predux(dst[0]);
|
|
||||||
localRes[1] += ei_predux(dst[1]);
|
|
||||||
localRes[2] += ei_predux(dst[2]);
|
|
||||||
localRes[3] += ei_predux(dst[3]);
|
|
||||||
if (MaxBlockRows==8)
|
|
||||||
{
|
|
||||||
localRes[4] += ei_predux(dst[4]);
|
|
||||||
localRes[5] += ei_predux(dst[5]);
|
|
||||||
localRes[6] += ei_predux(dst[6]);
|
|
||||||
localRes[7] += ei_predux(dst[7]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (l2blockRemainingRows>0)
|
|
||||||
{
|
|
||||||
int offsetblock = l2k * (l2blockRowEnd-l2i) + (l2blockRowEndBW-l2i)*(l2blockSizeEnd-l2k) - l2k*l2blockRemainingRows;
|
|
||||||
const Scalar* localB = &block[offsetblock];
|
|
||||||
|
|
||||||
for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
|
|
||||||
{
|
|
||||||
const Scalar* EIGEN_RESTRICT rhsColumn;
|
|
||||||
if (needRhsCopy)
|
|
||||||
rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]);
|
|
||||||
else
|
|
||||||
rhsColumn = &(rhs[l1j*rhsStride]);
|
|
||||||
|
|
||||||
PacketType dst[MaxBlockRows];
|
|
||||||
dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.));
|
|
||||||
if (MaxBlockRows==8)
|
|
||||||
dst[7] = dst[6] = dst[5] = dst[4] = dst[0];
|
|
||||||
|
|
||||||
// let's declare a few other temporary registers
|
|
||||||
PacketType tmp;
|
|
||||||
|
|
||||||
for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
|
|
||||||
{
|
|
||||||
tmp = ei_pload(&rhsColumn[k]);
|
|
||||||
|
|
||||||
dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows ])), dst[0]);
|
|
||||||
if (l2blockRemainingRows>=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+ PacketSize])), dst[1]);
|
|
||||||
if (l2blockRemainingRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+2*PacketSize])), dst[2]);
|
|
||||||
if (l2blockRemainingRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+3*PacketSize])), dst[3]);
|
|
||||||
if (MaxBlockRows==8)
|
|
||||||
{
|
|
||||||
if (l2blockRemainingRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+4*PacketSize])), dst[4]);
|
|
||||||
if (l2blockRemainingRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+5*PacketSize])), dst[5]);
|
|
||||||
if (l2blockRemainingRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+6*PacketSize])), dst[6]);
|
|
||||||
if (l2blockRemainingRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+7*PacketSize])), dst[7]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Scalar* EIGEN_RESTRICT localRes = &(res[l2blockRowEndBW + l1j*resStride]);
|
|
||||||
|
|
||||||
// process the remaining rows once at a time
|
|
||||||
localRes[0] += ei_predux(dst[0]);
|
|
||||||
if (l2blockRemainingRows>=2) localRes[1] += ei_predux(dst[1]);
|
|
||||||
if (l2blockRemainingRows>=3) localRes[2] += ei_predux(dst[2]);
|
|
||||||
if (l2blockRemainingRows>=4) localRes[3] += ei_predux(dst[3]);
|
|
||||||
if (MaxBlockRows==8)
|
|
||||||
{
|
|
||||||
if (l2blockRemainingRows>=5) localRes[4] += ei_predux(dst[4]);
|
|
||||||
if (l2blockRemainingRows>=6) localRes[5] += ei_predux(dst[5]);
|
|
||||||
if (l2blockRemainingRows>=7) localRes[6] += ei_predux(dst[6]);
|
|
||||||
if (l2blockRemainingRows>=8) localRes[7] += ei_predux(dst[7]);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (PacketSize>1 && remainingSize)
|
|
||||||
{
|
|
||||||
if (lhsRowMajor)
|
|
||||||
{
|
|
||||||
for (int j=0; j<cols; ++j)
|
|
||||||
for (int i=0; i<rows; ++i)
|
|
||||||
{
|
|
||||||
Scalar tmp = lhs[i*lhsStride+size] * rhs[j*rhsStride+size];
|
|
||||||
// FIXME this loop get vectorized by the compiler !
|
|
||||||
for (int k=1; k<remainingSize; ++k)
|
|
||||||
tmp += lhs[i*lhsStride+size+k] * rhs[j*rhsStride+size+k];
|
|
||||||
res[i+j*resStride] += tmp;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
for (int j=0; j<cols; ++j)
|
|
||||||
for (int i=0; i<rows; ++i)
|
|
||||||
{
|
|
||||||
Scalar tmp = lhs[i+size*lhsStride] * rhs[j*rhsStride+size];
|
|
||||||
for (int k=1; k<remainingSize; ++k)
|
|
||||||
tmp += lhs[i+(size+k)*lhsStride] * rhs[j*rhsStride+size+k];
|
|
||||||
res[i+j*resStride] += tmp;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
// loops on each L2 cache friendly blocks of the result
|
|
||||||
|
|
||||||
for(int l2i=0; l2i<rows; l2i+=l2BlockRows)
|
|
||||||
{
|
|
||||||
for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
|
|
||||||
{
|
|
||||||
// We have selected a block of lhs
|
|
||||||
// Packs this block into 'block'
|
|
||||||
for(int j=0; j<l2BlockCols; ++j)
|
|
||||||
{
|
|
||||||
int count = 0;
|
|
||||||
for(int i=0; i<l2BlockRows; ++i)
|
|
||||||
{
|
|
||||||
block[ (j*l2BlockCols) + i] = lhs[(j+l2j)*rows+l2i+count++];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// loops on each L2 cache firendly block of the result/rhs
|
|
||||||
for(int l2k=0; l2k<cols; l2k+=l2BlockCols)
|
|
||||||
{
|
|
||||||
for(int j=0; j<l2BlockCols; ++j)
|
|
||||||
{
|
|
||||||
for(int i=0; i<l2BlockRows; i+=PacketSize)
|
|
||||||
{
|
|
||||||
PacketType A0, A1, A2, A3, A4, A5;
|
|
||||||
|
|
||||||
// Load the packets from rhs and reorder them
|
|
||||||
|
|
||||||
// Here we need some vector reordering
|
|
||||||
// Right now its hardcoded to packets of 4 elements
|
|
||||||
A0 = ei_pset1(rhs[(j+l2k)*rows+(i+l2j)]);
|
|
||||||
A1 = ei_pset1(rhs[(j+l2k)*rows+(i+l2j)+1]);
|
|
||||||
A2 = ei_pset1(rhs[(j+l2k)*rows+(i+l2j)+2]);
|
|
||||||
A3 = ei_pset1(rhs[(j+l2k)*rows+(i+l2j)+3]);
|
|
||||||
|
|
||||||
for(int k=0; k<l2BlockRows; k+=PacketSize)
|
|
||||||
{
|
|
||||||
PacketType L0, L1, L2, L3;
|
|
||||||
|
|
||||||
// We perform "cross products" of vectors to avoid
|
|
||||||
// reductions (horizontal ops) afterwards
|
|
||||||
A4 = ei_pload(&res[(j+l2k)*rows+l2i+k]);
|
|
||||||
L0 = ei_pload(&block[ k + (i + 0)*l2BlockRows ]);
|
|
||||||
L1 = ei_pload(&block[ k + (i + 1)*l2BlockRows ]);
|
|
||||||
A4 = ei_pmadd(L0, A0, A4);
|
|
||||||
L2 = ei_pload(&block[ k + (i + 2)*l2BlockRows ]);
|
|
||||||
A4 = ei_pmadd(L1, A1, A4);
|
|
||||||
L3 = ei_pload(&block[ k + (i + 3)*l2BlockRows ]);
|
|
||||||
A4 = ei_pmadd(L2, A2, A4);
|
|
||||||
A4 = ei_pmadd(L3, A3, A4);
|
|
||||||
|
|
||||||
ei_pstore(&res[(j+l2k)*rows+l2i+k], A4);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ei_aligned_stack_delete(Scalar, block, allocBlockSize);
|
|
||||||
ei_aligned_stack_delete(Scalar, rhsCopy, l2BlockSizeAligned*l2BlockSizeAligned);
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // EIGEN_EXTERN_INSTANTIATIONS
|
|
||||||
|
|
||||||
/* Optimized col-major matrix * vector product:
|
/* Optimized col-major matrix * vector product:
|
||||||
* This algorithm processes 4 columns at onces that allows to both reduce
|
* This algorithm processes 4 columns at onces that allows to both reduce
|
||||||
@ -813,4 +432,4 @@ static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
|
|||||||
#undef _EIGEN_ACCUMULATE_PACKETS
|
#undef _EIGEN_ACCUMULATE_PACKETS
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // EIGEN_CACHE_FRIENDLY_PRODUCT_H
|
#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
|
177
Eigen/src/Core/products/SelfadjointMatrixVector.h
Normal file
177
Eigen/src/Core/products/SelfadjointMatrixVector.h
Normal file
@ -0,0 +1,177 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra. Eigen itself is part of the KDE project.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
|
||||||
|
//
|
||||||
|
// Eigen is free software; you can redistribute it and/or
|
||||||
|
// modify it under the terms of the GNU Lesser General Public
|
||||||
|
// License as published by the Free Software Foundation; either
|
||||||
|
// version 3 of the License, or (at your option) any later version.
|
||||||
|
//
|
||||||
|
// Alternatively, you can redistribute it and/or
|
||||||
|
// modify it under the terms of the GNU General Public License as
|
||||||
|
// published by the Free Software Foundation; either version 2 of
|
||||||
|
// the License, or (at your option) any later version.
|
||||||
|
//
|
||||||
|
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||||
|
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||||
|
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU Lesser General Public
|
||||||
|
// License and a copy of the GNU General Public License along with
|
||||||
|
// Eigen. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H
|
||||||
|
#define EIGEN_SELFADJOINT_MATRIX_VECTOR_H
|
||||||
|
|
||||||
|
template<bool Conjugate> struct ei_conj_if {
|
||||||
|
template<typename Scalar> Scalar operator() (const Scalar& x) const { return ei_conj(x); }
|
||||||
|
};
|
||||||
|
|
||||||
|
template<> struct ei_conj_if<false> {
|
||||||
|
template<typename Scalar> Scalar& operator() (Scalar& x) const { return x; }
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Optimized col-major selfadjoint matrix * vector product:
|
||||||
|
* This algorithm processes 2 columns at onces that allows to both reduce
|
||||||
|
* the number of load/stores of the result by a factor 2 and to reduce
|
||||||
|
* the instruction dependency.
|
||||||
|
*/
|
||||||
|
template<typename Scalar, int StorageOrder, int UpLo>
|
||||||
|
static EIGEN_DONT_INLINE void ei_product_selfadjoint_vector(
|
||||||
|
int size,
|
||||||
|
const Scalar* lhs, int lhsStride,
|
||||||
|
const Scalar* rhs, //int rhsIncr,
|
||||||
|
Scalar* res)
|
||||||
|
{
|
||||||
|
typedef typename ei_packet_traits<Scalar>::type Packet;
|
||||||
|
const int PacketSize = sizeof(Packet)/sizeof(Scalar);
|
||||||
|
|
||||||
|
enum {
|
||||||
|
IsRowMajor = StorageOrder==RowMajorBit ? 1 : 0,
|
||||||
|
IsLower = UpLo == LowerTriangularBit ? 1 : 0,
|
||||||
|
FirstTriangular = IsRowMajor == IsLower
|
||||||
|
};
|
||||||
|
|
||||||
|
ei_conj_if<NumTraits<Scalar>::IsComplex && IsRowMajor> conj0;
|
||||||
|
ei_conj_if<NumTraits<Scalar>::IsComplex && !IsRowMajor> conj1;
|
||||||
|
|
||||||
|
for (int i=0;i<size;i++)
|
||||||
|
res[i] = 0;
|
||||||
|
|
||||||
|
int bound = std::max(0,size-8) & 0xfffffffE;
|
||||||
|
if (FirstTriangular)
|
||||||
|
bound = size - bound;
|
||||||
|
|
||||||
|
for (int j=FirstTriangular ? bound : 0;
|
||||||
|
j<(FirstTriangular ? size : bound);j+=2)
|
||||||
|
{
|
||||||
|
register const Scalar* __restrict__ A0 = lhs + j*lhsStride;
|
||||||
|
register const Scalar* __restrict__ A1 = lhs + (j+1)*lhsStride;
|
||||||
|
|
||||||
|
Scalar t0 = rhs[j];
|
||||||
|
Packet ptmp0 = ei_pset1(t0);
|
||||||
|
Scalar t1 = rhs[j+1];
|
||||||
|
Packet ptmp1 = ei_pset1(t1);
|
||||||
|
|
||||||
|
Scalar t2 = 0;
|
||||||
|
Packet ptmp2 = ei_pset1(t2);
|
||||||
|
Scalar t3 = 0;
|
||||||
|
Packet ptmp3 = ei_pset1(t3);
|
||||||
|
|
||||||
|
size_t starti = FirstTriangular ? 0 : j+2;
|
||||||
|
size_t endi = FirstTriangular ? j : size;
|
||||||
|
size_t alignedEnd = starti;
|
||||||
|
size_t alignedStart = (starti) + ei_alignmentOffset(&res[starti], endi-starti);
|
||||||
|
alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
|
||||||
|
|
||||||
|
res[j] += t0 * conj0(A0[j]);
|
||||||
|
if(FirstTriangular)
|
||||||
|
{
|
||||||
|
res[j+1] += t1 * conj0(A1[j+1]);
|
||||||
|
res[j] += t1 * conj0(A1[j]);
|
||||||
|
t3 += conj1(A1[j]) * rhs[j];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
res[j+1] += t0 * conj0(A0[j+1]) + t1 * conj0(A1[j+1]);
|
||||||
|
t2 += conj1(A0[j+1]) * rhs[j+1];
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i=starti; i<alignedStart; ++i)
|
||||||
|
{
|
||||||
|
res[i] += t0 * A0[i] + t1 * A1[i];
|
||||||
|
t2 += ei_conj(A0[i]) * rhs[i];
|
||||||
|
t3 += ei_conj(A1[i]) * rhs[i];
|
||||||
|
}
|
||||||
|
for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize)
|
||||||
|
{
|
||||||
|
Packet A0i = ei_ploadu(&A0[i]);
|
||||||
|
Packet A1i = ei_ploadu(&A1[i]);
|
||||||
|
Packet Bi = ei_ploadu(&rhs[i]); // FIXME should be aligned in most cases
|
||||||
|
Packet Xi = ei_pload(&res[i]);
|
||||||
|
|
||||||
|
Xi = ei_padd(ei_padd(Xi, ei_pmul(ptmp0, conj0(A0i))), ei_pmul(ptmp1, conj0(A1i)));
|
||||||
|
ptmp2 = ei_padd(ptmp2, ei_pmul(conj1(A0i), Bi));
|
||||||
|
ptmp3 = ei_padd(ptmp3, ei_pmul(conj1(A1i), Bi));
|
||||||
|
ei_pstore(&res[i],Xi);
|
||||||
|
}
|
||||||
|
for (size_t i=alignedEnd; i<endi; i++)
|
||||||
|
{
|
||||||
|
res[i] += t0 * conj0(A0[i]) + t1 * conj0(A1[i]);
|
||||||
|
t2 += conj1(A0[i]) * rhs[i];
|
||||||
|
t3 += conj1(A1[i]) * rhs[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
res[j] += t2 + ei_predux(ptmp2);
|
||||||
|
res[j+1] += t3 + ei_predux(ptmp3);
|
||||||
|
}
|
||||||
|
for (int j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++)
|
||||||
|
{
|
||||||
|
register const Scalar* __restrict__ A0 = lhs + j*lhsStride;
|
||||||
|
|
||||||
|
Scalar t1 = rhs[j];
|
||||||
|
Scalar t2 = 0;
|
||||||
|
res[j] += t1 * conj0(A0[j]);
|
||||||
|
for (int i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++) {
|
||||||
|
res[i] += t1 * conj0(A0[i]);
|
||||||
|
t2 += conj1(A0[i]) * rhs[i];
|
||||||
|
}
|
||||||
|
res[j] += t2;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
// colmajor - upper
|
||||||
|
for (int j=0;j<size;j++)
|
||||||
|
{
|
||||||
|
register const Scalar* __restrict__ A0 = lhs + j*lhsStride;
|
||||||
|
|
||||||
|
Scalar t1 = rhs[j];
|
||||||
|
Scalar t2 = 0;
|
||||||
|
for (int i=0; i<j; i+=PacketSize) {
|
||||||
|
res[i] += t1 * A0[i];
|
||||||
|
t2 += A0[i] * rhs[i];
|
||||||
|
}
|
||||||
|
res[j] += t1 * A0[j];
|
||||||
|
res[j] += t2;
|
||||||
|
}
|
||||||
|
|
||||||
|
// rowmajor - lower
|
||||||
|
for (int j=0;j<size;j++)
|
||||||
|
{
|
||||||
|
register const Scalar* __restrict__ A0 = lhs + j*lhsStride;
|
||||||
|
|
||||||
|
Scalar t1 = rhs[j];
|
||||||
|
Scalar t2 = 0;
|
||||||
|
for (int i=0; i<j; i+=PacketSize) {
|
||||||
|
res[i] += t1 * A0[i];
|
||||||
|
t2 += A0[i] * rhs[i];
|
||||||
|
}
|
||||||
|
res[j] += t1 * A0[j];
|
||||||
|
res[j] += t2;
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H
|
@ -114,6 +114,7 @@ ei_add_test(cwiseop)
|
|||||||
ei_add_test(redux)
|
ei_add_test(redux)
|
||||||
ei_add_test(product_small)
|
ei_add_test(product_small)
|
||||||
ei_add_test(product_large ${EI_OFLAG})
|
ei_add_test(product_large ${EI_OFLAG})
|
||||||
|
ei_add_test(product_selfadjoint)
|
||||||
ei_add_test(adjoint)
|
ei_add_test(adjoint)
|
||||||
ei_add_test(submatrices)
|
ei_add_test(submatrices)
|
||||||
ei_add_test(miscmatrices)
|
ei_add_test(miscmatrices)
|
||||||
|
70
test/product_selfadjoint.cpp
Normal file
70
test/product_selfadjoint.cpp
Normal file
@ -0,0 +1,70 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra. Eigen itself is part of the KDE project.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@gmail.com>
|
||||||
|
//
|
||||||
|
// Eigen is free software; you can redistribute it and/or
|
||||||
|
// modify it under the terms of the GNU Lesser General Public
|
||||||
|
// License as published by the Free Software Foundation; either
|
||||||
|
// version 3 of the License, or (at your option) any later version.
|
||||||
|
//
|
||||||
|
// Alternatively, you can redistribute it and/or
|
||||||
|
// modify it under the terms of the GNU General Public License as
|
||||||
|
// published by the Free Software Foundation; either version 2 of
|
||||||
|
// the License, or (at your option) any later version.
|
||||||
|
//
|
||||||
|
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||||
|
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||||
|
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU Lesser General Public
|
||||||
|
// License and a copy of the GNU General Public License along with
|
||||||
|
// Eigen. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
#include "main.h"
|
||||||
|
|
||||||
|
template<typename MatrixType> void product_selfadjoint(const MatrixType& m)
|
||||||
|
{
|
||||||
|
typedef typename MatrixType::Scalar Scalar;
|
||||||
|
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||||
|
typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
|
||||||
|
|
||||||
|
int rows = m.rows();
|
||||||
|
int cols = m.cols();
|
||||||
|
|
||||||
|
MatrixType m1 = MatrixType::Random(rows, cols),
|
||||||
|
m2 = MatrixType::Random(rows, cols);
|
||||||
|
VectorType v1 = VectorType::Random(rows),
|
||||||
|
v2 = VectorType::Random(rows);
|
||||||
|
|
||||||
|
m1 = m1.adjoint()*m1;
|
||||||
|
|
||||||
|
// col-lower
|
||||||
|
m2.setZero();
|
||||||
|
m2.template part<LowerTriangular>() = m1;
|
||||||
|
ei_product_selfadjoint_vector<Scalar,MatrixType::Flags&RowMajorBit,LowerTriangularBit>
|
||||||
|
(cols,m2.data(),cols, v1.data(), v2.data());
|
||||||
|
VERIFY_IS_APPROX(v2, m1 * v1);
|
||||||
|
|
||||||
|
// col-upper
|
||||||
|
m2.setZero();
|
||||||
|
m2.template part<UpperTriangular>() = m1;
|
||||||
|
ei_product_selfadjoint_vector<Scalar,MatrixType::Flags&RowMajorBit,UpperTriangularBit>(cols,m2.data(),cols, v1.data(), v2.data());
|
||||||
|
VERIFY_IS_APPROX(v2, m1 * v1);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void test_product_selfadjoint()
|
||||||
|
{
|
||||||
|
for(int i = 0; i < g_repeat ; i++) {
|
||||||
|
CALL_SUBTEST( product_selfadjoint(Matrix<float, 1, 1>()) );
|
||||||
|
CALL_SUBTEST( product_selfadjoint(Matrix<float, 2, 2>()) );
|
||||||
|
CALL_SUBTEST( product_selfadjoint(Matrix3d()) );
|
||||||
|
CALL_SUBTEST( product_selfadjoint(MatrixXcf(4, 4)) );
|
||||||
|
CALL_SUBTEST( product_selfadjoint(MatrixXcd(21,21)) );
|
||||||
|
CALL_SUBTEST( product_selfadjoint(MatrixXd(17,17)) );
|
||||||
|
CALL_SUBTEST( product_selfadjoint(Matrix<float,Dynamic,Dynamic,RowMajor>(18,18)) );
|
||||||
|
CALL_SUBTEST( product_selfadjoint(Matrix<std::complex<double>,Dynamic,Dynamic,RowMajor>(19, 19)) );
|
||||||
|
}
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user