mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-09-23 23:03:15 +08:00
Merged latest updates from the parent branch
This commit is contained in:
commit
cc73164aa8
38
Eigen/src/Core/GenericPacketMath.h
Normal file → Executable file
38
Eigen/src/Core/GenericPacketMath.h
Normal file → Executable file
@ -169,6 +169,44 @@ ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
|
||||
|
||||
/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(*a); }
|
||||
|
||||
/** \internal equivalent to
|
||||
* \code
|
||||
* a0 = pload1(a+0);
|
||||
* a1 = pload1(a+1);
|
||||
* a2 = pload1(a+2);
|
||||
* a3 = pload1(a+3);
|
||||
* \endcode
|
||||
* \sa pset1, pload1, ploaddup, pbroadcast2
|
||||
*/
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC
|
||||
inline void pbroadcast4(const typename unpacket_traits<Packet>::type *a,
|
||||
Packet& a0, Packet& a1, Packet& a2, Packet& a3)
|
||||
{
|
||||
a0 = pload1<Packet>(a+0);
|
||||
a1 = pload1<Packet>(a+1);
|
||||
a2 = pload1<Packet>(a+2);
|
||||
a3 = pload1<Packet>(a+3);
|
||||
}
|
||||
|
||||
/** \internal equivalent to
|
||||
* \code
|
||||
* a0 = pload1(a+0);
|
||||
* a1 = pload1(a+1);
|
||||
* \endcode
|
||||
* \sa pset1, pload1, ploaddup, pbroadcast4
|
||||
*/
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC
|
||||
inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
|
||||
Packet& a0, Packet& a1)
|
||||
{
|
||||
a0 = pload1<Packet>(a+0);
|
||||
a1 = pload1<Packet>(a+1);
|
||||
}
|
||||
|
||||
/** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
|
||||
template<typename Scalar> inline typename packet_traits<Scalar>::type
|
||||
plset(const Scalar& a) { return a; }
|
||||
|
45
Eigen/src/Core/arch/SSE/PacketMath.h
Normal file → Executable file
45
Eigen/src/Core/arch/SSE/PacketMath.h
Normal file → Executable file
@ -114,11 +114,22 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { re
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set_epi32(from,from,from,from); }
|
||||
#else
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set1_ps(from); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
|
||||
#endif
|
||||
|
||||
// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
|
||||
// However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203)
|
||||
// Using inline assembly is also not an option because then gcc fails to reorder properly the instructions.
|
||||
// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
|
||||
// Also note that with AVX, we want it to generate a vbroadcastss.
|
||||
#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__)
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
|
||||
return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef EIGEN_VECTORIZE_AVX
|
||||
template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
|
||||
@ -392,6 +403,38 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
|
||||
#endif
|
||||
}
|
||||
|
||||
// with AVX, the default implementations based on pload1 are faster
|
||||
#ifndef __AVX__
|
||||
template<> EIGEN_STRONG_INLINE void
|
||||
pbroadcast4<Packet4f>(const float *a,
|
||||
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
|
||||
{
|
||||
a3 = ploadu<Packet4f>(a);
|
||||
a0 = vec4f_swizzle1(a3, 0,0,0,0);
|
||||
a1 = vec4f_swizzle1(a3, 1,1,1,1);
|
||||
a2 = vec4f_swizzle1(a3, 2,2,2,2);
|
||||
a3 = vec4f_swizzle1(a3, 3,3,3,3);
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE void
|
||||
pbroadcast4<Packet2d>(const double *a,
|
||||
Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
|
||||
{
|
||||
#ifdef EIGEN_VECTORIZE_SSE3
|
||||
a0 = _mm_loaddup_pd(a+0);
|
||||
a1 = _mm_loaddup_pd(a+1);
|
||||
a2 = _mm_loaddup_pd(a+2);
|
||||
a3 = _mm_loaddup_pd(a+3);
|
||||
#else
|
||||
a1 = ploadu<Packet2d>(a);
|
||||
a0 = vec2d_swizzle1(a1, 0,0);
|
||||
a1 = vec2d_swizzle1(a1, 1,1);
|
||||
a3 = ploadu<Packet2d>(a+2);
|
||||
a2 = vec2d_swizzle1(a3, 0,0);
|
||||
a3 = vec2d_swizzle1(a3, 1,1);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
|
||||
{
|
||||
vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -81,9 +81,7 @@ static void run(Index rows, Index cols, Index depth,
|
||||
Index threads = omp_get_num_threads();
|
||||
|
||||
std::size_t sizeA = kc*mc;
|
||||
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
|
||||
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0);
|
||||
ei_declare_aligned_stack_constructed_variable(RhsScalar, w, sizeW, 0);
|
||||
|
||||
RhsScalar* blockB = blocking.blockB();
|
||||
eigen_internal_assert(blockB!=0);
|
||||
@ -122,7 +120,7 @@ static void run(Index rows, Index cols, Index depth,
|
||||
if(shift>0)
|
||||
while(info[j].sync!=k) {}
|
||||
|
||||
gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w);
|
||||
gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0);
|
||||
}
|
||||
|
||||
// Then keep going as usual with the remaining A'
|
||||
@ -134,7 +132,7 @@ static void run(Index rows, Index cols, Index depth,
|
||||
pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc);
|
||||
|
||||
// C_i += A' * B'
|
||||
gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w);
|
||||
gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0);
|
||||
}
|
||||
|
||||
// Release all the sub blocks B'_j of B' for the current thread,
|
||||
@ -152,11 +150,9 @@ static void run(Index rows, Index cols, Index depth,
|
||||
// this is the sequential version!
|
||||
std::size_t sizeA = kc*mc;
|
||||
std::size_t sizeB = kc*cols;
|
||||
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
|
||||
|
||||
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
|
||||
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
|
||||
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW());
|
||||
|
||||
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
|
||||
// (==GEMM_VAR1)
|
||||
@ -182,7 +178,7 @@ static void run(Index rows, Index cols, Index depth,
|
||||
pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
|
||||
|
||||
// Everything is packed, we can now call the block * panel kernel:
|
||||
gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
|
||||
gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -73,11 +73,8 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
|
||||
if(mc > Traits::nr)
|
||||
mc = (mc/Traits::nr)*Traits::nr;
|
||||
|
||||
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
|
||||
std::size_t sizeB = sizeW + kc*size;
|
||||
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
|
||||
ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0);
|
||||
RhsScalar* blockB = allocatedBlockB + sizeW;
|
||||
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0);
|
||||
|
||||
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
|
||||
@ -103,15 +100,15 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
|
||||
// 3 - after the diagonal => processed with gebp or skipped
|
||||
if (UpLo==Lower)
|
||||
gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
|
||||
-1, -1, 0, 0, allocatedBlockB);
|
||||
-1, -1, 0, 0);
|
||||
|
||||
sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB);
|
||||
sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
|
||||
|
||||
if (UpLo==Upper)
|
||||
{
|
||||
Index j2 = i2+actual_mc;
|
||||
gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
|
||||
-1, -1, 0, 0, allocatedBlockB);
|
||||
-1, -1, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -136,7 +133,7 @@ struct tribb_kernel
|
||||
enum {
|
||||
BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr)
|
||||
};
|
||||
void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha, RhsScalar* workspace)
|
||||
void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
|
||||
{
|
||||
gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
|
||||
Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
|
||||
@ -150,7 +147,7 @@ struct tribb_kernel
|
||||
|
||||
if(UpLo==Upper)
|
||||
gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
|
||||
-1, -1, 0, 0, workspace);
|
||||
-1, -1, 0, 0);
|
||||
|
||||
// selfadjoint micro block
|
||||
{
|
||||
@ -158,7 +155,7 @@ struct tribb_kernel
|
||||
buffer.setZero();
|
||||
// 1 - apply the kernel on the temporary buffer
|
||||
gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
|
||||
-1, -1, 0, 0, workspace);
|
||||
-1, -1, 0, 0);
|
||||
// 2 - triangular accumulation
|
||||
for(Index j1=0; j1<actualBlockSize; ++j1)
|
||||
{
|
||||
@ -173,7 +170,7 @@ struct tribb_kernel
|
||||
{
|
||||
Index i = j+actualBlockSize;
|
||||
gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
|
||||
-1, -1, 0, 0, workspace);
|
||||
-1, -1, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -63,7 +63,7 @@ struct symm_pack_lhs
|
||||
for(Index i=peeled_mc; i<rows; i++)
|
||||
{
|
||||
for(Index k=0; k<i; k++)
|
||||
blockA[count++] = lhs(i, k); // normal
|
||||
blockA[count++] = lhs(i, k); // normal
|
||||
|
||||
blockA[count++] = numext::real(lhs(i, i)); // real (diagonal)
|
||||
|
||||
@ -91,11 +91,18 @@ struct symm_pack_rhs
|
||||
{
|
||||
blockB[count+0] = rhs(k,j2+0);
|
||||
blockB[count+1] = rhs(k,j2+1);
|
||||
if (nr==4)
|
||||
if (nr>=4)
|
||||
{
|
||||
blockB[count+2] = rhs(k,j2+2);
|
||||
blockB[count+3] = rhs(k,j2+3);
|
||||
}
|
||||
if (nr>=8)
|
||||
{
|
||||
blockB[count+4] = rhs(k,j2+4);
|
||||
blockB[count+5] = rhs(k,j2+5);
|
||||
blockB[count+6] = rhs(k,j2+6);
|
||||
blockB[count+7] = rhs(k,j2+7);
|
||||
}
|
||||
count += nr;
|
||||
}
|
||||
}
|
||||
@ -109,11 +116,18 @@ struct symm_pack_rhs
|
||||
{
|
||||
blockB[count+0] = numext::conj(rhs(j2+0,k));
|
||||
blockB[count+1] = numext::conj(rhs(j2+1,k));
|
||||
if (nr==4)
|
||||
if (nr>=4)
|
||||
{
|
||||
blockB[count+2] = numext::conj(rhs(j2+2,k));
|
||||
blockB[count+3] = numext::conj(rhs(j2+3,k));
|
||||
}
|
||||
if (nr>=8)
|
||||
{
|
||||
blockB[count+4] = numext::conj(rhs(j2+4,k));
|
||||
blockB[count+5] = numext::conj(rhs(j2+5,k));
|
||||
blockB[count+6] = numext::conj(rhs(j2+6,k));
|
||||
blockB[count+7] = numext::conj(rhs(j2+7,k));
|
||||
}
|
||||
count += nr;
|
||||
}
|
||||
// symmetric
|
||||
@ -137,11 +151,18 @@ struct symm_pack_rhs
|
||||
{
|
||||
blockB[count+0] = rhs(k,j2+0);
|
||||
blockB[count+1] = rhs(k,j2+1);
|
||||
if (nr==4)
|
||||
if (nr>=4)
|
||||
{
|
||||
blockB[count+2] = rhs(k,j2+2);
|
||||
blockB[count+3] = rhs(k,j2+3);
|
||||
}
|
||||
if (nr>=8)
|
||||
{
|
||||
blockB[count+4] = rhs(k,j2+4);
|
||||
blockB[count+5] = rhs(k,j2+5);
|
||||
blockB[count+6] = rhs(k,j2+6);
|
||||
blockB[count+7] = rhs(k,j2+7);
|
||||
}
|
||||
count += nr;
|
||||
}
|
||||
}
|
||||
@ -153,11 +174,18 @@ struct symm_pack_rhs
|
||||
{
|
||||
blockB[count+0] = numext::conj(rhs(j2+0,k));
|
||||
blockB[count+1] = numext::conj(rhs(j2+1,k));
|
||||
if (nr==4)
|
||||
if (nr>=4)
|
||||
{
|
||||
blockB[count+2] = numext::conj(rhs(j2+2,k));
|
||||
blockB[count+3] = numext::conj(rhs(j2+3,k));
|
||||
}
|
||||
if (nr>=8)
|
||||
{
|
||||
blockB[count+4] = numext::conj(rhs(j2+4,k));
|
||||
blockB[count+5] = numext::conj(rhs(j2+5,k));
|
||||
blockB[count+6] = numext::conj(rhs(j2+6,k));
|
||||
blockB[count+7] = numext::conj(rhs(j2+7,k));
|
||||
}
|
||||
count += nr;
|
||||
}
|
||||
}
|
||||
@ -422,11 +450,11 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
|
||||
NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
|
||||
internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor>
|
||||
::run(
|
||||
lhs.rows(), rhs.cols(), // sizes
|
||||
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
|
||||
&rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
|
||||
&dst.coeffRef(0,0), dst.outerStride(), // result info
|
||||
actualAlpha // alpha
|
||||
lhs.rows(), rhs.cols(), // sizes
|
||||
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
|
||||
&rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
|
||||
&dst.coeffRef(0,0), dst.outerStride(), // result info
|
||||
actualAlpha // alpha
|
||||
);
|
||||
}
|
||||
};
|
||||
|
@ -125,11 +125,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
|
||||
std::size_t sizeA = kc*mc;
|
||||
std::size_t sizeB = kc*cols;
|
||||
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
|
||||
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
|
||||
|
||||
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
|
||||
triangularBuffer.setZero();
|
||||
@ -187,7 +185,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);
|
||||
|
||||
gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset);
|
||||
|
||||
// GEBP with remaining micro panel
|
||||
if (lengthTarget>0)
|
||||
@ -197,7 +195,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
|
||||
|
||||
gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -211,7 +209,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
|
||||
(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
|
||||
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -266,11 +264,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
|
||||
std::size_t sizeA = kc*mc;
|
||||
std::size_t sizeB = kc*cols;
|
||||
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
|
||||
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
|
||||
|
||||
Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
|
||||
triangularBuffer.setZero();
|
||||
@ -357,14 +353,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
actual_mc, panelLength, actualPanelWidth,
|
||||
alpha,
|
||||
actual_kc, actual_kc, // strides
|
||||
blockOffset, blockOffset,// offsets
|
||||
blockW); // workspace
|
||||
blockOffset, blockOffset);// offsets
|
||||
}
|
||||
}
|
||||
gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
|
||||
blockA, geb, actual_mc, actual_kc, rs,
|
||||
alpha,
|
||||
-1, -1, 0, 0, blockW);
|
||||
-1, -1, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -66,11 +66,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
|
||||
|
||||
std::size_t sizeA = kc*mc;
|
||||
std::size_t sizeB = kc*cols;
|
||||
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
|
||||
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
|
||||
|
||||
conj_if<Conjugate> conj;
|
||||
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
|
||||
@ -158,7 +156,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
|
||||
pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
|
||||
|
||||
gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -174,7 +172,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
|
||||
{
|
||||
pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0, blockW);
|
||||
gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -215,11 +213,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
|
||||
|
||||
std::size_t sizeA = kc*mc;
|
||||
std::size_t sizeB = kc*size;
|
||||
std::size_t sizeW = kc*Traits::WorkSpaceFactor;
|
||||
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
|
||||
|
||||
conj_if<Conjugate> conj;
|
||||
gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
|
||||
@ -285,8 +281,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
|
||||
actual_mc, panelLength, actualPanelWidth,
|
||||
Scalar(-1),
|
||||
actual_kc, actual_kc, // strides
|
||||
panelOffset, panelOffset, // offsets
|
||||
blockW); // workspace
|
||||
panelOffset, panelOffset); // offsets
|
||||
}
|
||||
|
||||
// unblocked triangular solve
|
||||
@ -317,7 +312,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
|
||||
if (rs>0)
|
||||
gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
|
||||
actual_mc, actual_kc, rs, Scalar(-1),
|
||||
-1, -1, 0, 0, blockW);
|
||||
-1, -1, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -700,98 +700,42 @@ template<typename T> class aligned_stack_memory_handler
|
||||
* \sa \ref TopicStlContainers.
|
||||
*/
|
||||
template<class T>
|
||||
class aligned_allocator
|
||||
class aligned_allocator : public std::allocator<T>
|
||||
{
|
||||
public:
|
||||
typedef size_t size_type;
|
||||
typedef std::ptrdiff_t difference_type;
|
||||
typedef T* pointer;
|
||||
typedef const T* const_pointer;
|
||||
typedef T& reference;
|
||||
typedef const T& const_reference;
|
||||
typedef T value_type;
|
||||
typedef size_t size_type;
|
||||
typedef std::ptrdiff_t difference_type;
|
||||
typedef T* pointer;
|
||||
typedef const T* const_pointer;
|
||||
typedef T& reference;
|
||||
typedef const T& const_reference;
|
||||
typedef T value_type;
|
||||
|
||||
template<class U>
|
||||
struct rebind
|
||||
{
|
||||
typedef aligned_allocator<U> other;
|
||||
};
|
||||
template<class U>
|
||||
struct rebind
|
||||
{
|
||||
typedef aligned_allocator<U> other;
|
||||
};
|
||||
|
||||
pointer address( reference value ) const
|
||||
{
|
||||
return &value;
|
||||
}
|
||||
aligned_allocator() : std::allocator<T>() {}
|
||||
|
||||
const_pointer address( const_reference value ) const
|
||||
{
|
||||
return &value;
|
||||
}
|
||||
aligned_allocator(const aligned_allocator& other) : std::allocator<T>(other) {}
|
||||
|
||||
aligned_allocator()
|
||||
{
|
||||
}
|
||||
template<class U>
|
||||
aligned_allocator(const aligned_allocator<U>& other) : std::allocator<T>(other) {}
|
||||
|
||||
aligned_allocator( const aligned_allocator& )
|
||||
{
|
||||
}
|
||||
~aligned_allocator() {}
|
||||
|
||||
template<class U>
|
||||
aligned_allocator( const aligned_allocator<U>& )
|
||||
{
|
||||
}
|
||||
pointer allocate(size_type num, const void* /*hint*/ = 0)
|
||||
{
|
||||
internal::check_size_for_overflow<T>(num);
|
||||
return static_cast<pointer>( internal::aligned_malloc(num * sizeof(T)) );
|
||||
}
|
||||
|
||||
~aligned_allocator()
|
||||
{
|
||||
}
|
||||
|
||||
size_type max_size() const
|
||||
{
|
||||
return (std::numeric_limits<size_type>::max)();
|
||||
}
|
||||
|
||||
pointer allocate( size_type num, const void* hint = 0 )
|
||||
{
|
||||
EIGEN_UNUSED_VARIABLE(hint);
|
||||
internal::check_size_for_overflow<T>(num);
|
||||
return static_cast<pointer>( internal::aligned_malloc( num * sizeof(T) ) );
|
||||
}
|
||||
|
||||
void construct( pointer p, const T& value )
|
||||
{
|
||||
::new( p ) T( value );
|
||||
}
|
||||
|
||||
#if (__cplusplus >= 201103L)
|
||||
template <typename U, typename... Args>
|
||||
void construct( U* u, Args&&... args)
|
||||
{
|
||||
::new( static_cast<void*>(u) ) U( std::forward<Args>( args )... );
|
||||
}
|
||||
#endif
|
||||
|
||||
void destroy( pointer p )
|
||||
{
|
||||
p->~T();
|
||||
}
|
||||
|
||||
#if (__cplusplus >= 201103L)
|
||||
template <typename U>
|
||||
void destroy( U* u )
|
||||
{
|
||||
u->~U();
|
||||
}
|
||||
#endif
|
||||
|
||||
void deallocate( pointer p, size_type /*num*/ )
|
||||
{
|
||||
internal::aligned_free( p );
|
||||
}
|
||||
|
||||
bool operator!=(const aligned_allocator<T>& ) const
|
||||
{ return false; }
|
||||
|
||||
bool operator==(const aligned_allocator<T>& ) const
|
||||
{ return true; }
|
||||
void deallocate(pointer p, size_type /*num*/)
|
||||
{
|
||||
internal::aligned_free(p);
|
||||
}
|
||||
};
|
||||
|
||||
//---------- Cache sizes ----------
|
||||
|
@ -165,8 +165,8 @@ AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived
|
||||
Scalar n2 = q.vec().squaredNorm();
|
||||
if (n2 < NumTraits<Scalar>::dummy_precision()*NumTraits<Scalar>::dummy_precision())
|
||||
{
|
||||
m_angle = 0;
|
||||
m_axis << 1, 0, 0;
|
||||
m_angle = Scalar(0);
|
||||
m_axis << Scalar(1), Scalar(0), Scalar(0);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -48,7 +48,7 @@ void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vec
|
||||
typedef typename MatrixType::Index Index;
|
||||
enum { TFactorSize = MatrixType::ColsAtCompileTime };
|
||||
Index nbVecs = vectors.cols();
|
||||
Matrix<typename MatrixType::Scalar, TFactorSize, TFactorSize> T(nbVecs,nbVecs);
|
||||
Matrix<typename MatrixType::Scalar, TFactorSize, TFactorSize, ColMajor> T(nbVecs,nbVecs);
|
||||
make_block_householder_triangular_factor(T, vectors, hCoeffs);
|
||||
|
||||
const TriangularView<const VectorsType, UnitLower>& V(vectors);
|
||||
|
Loading…
x
Reference in New Issue
Block a user