mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-09-24 07:13:16 +08:00
Remove useless register keyword, and optimize predux_min/max for SSE4
This commit is contained in:
parent
6cf938df53
commit
a7621809fe
@ -504,13 +504,18 @@ template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
|
|||||||
}
|
}
|
||||||
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
|
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
|
||||||
{
|
{
|
||||||
|
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||||
|
Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
|
||||||
|
return pfirst(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
|
||||||
|
#else
|
||||||
// after some experiments, it seems this is the fastest way to implement it
|
// after some experiments, it seems this is the fastest way to implement it
|
||||||
// for GCC (e.g., it does not like using std::min after the pstore !!)
|
// for GCC (e.g., it does not like using std::min after the pstore !!)
|
||||||
EIGEN_ALIGN16 int aux[4];
|
EIGEN_ALIGN16 int aux[4];
|
||||||
pstore(aux, a);
|
pstore(aux, a);
|
||||||
register int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
|
int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
|
||||||
register int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
|
int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
|
||||||
return aux0<aux2 ? aux0 : aux2;
|
return aux0<aux2 ? aux0 : aux2;
|
||||||
|
#endif // EIGEN_VECTORIZE_SSE4_1
|
||||||
}
|
}
|
||||||
|
|
||||||
// max
|
// max
|
||||||
@ -525,13 +530,18 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
|
|||||||
}
|
}
|
||||||
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
|
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
|
||||||
{
|
{
|
||||||
|
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||||
|
Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
|
||||||
|
return pfirst(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
|
||||||
|
#else
|
||||||
// after some experiments, it seems this is the fastest way to implement it
|
// after some experiments, it seems this is the fastest way to implement it
|
||||||
// for GCC (e.g., it does not like using std::min after the pstore !!)
|
// for GCC (e.g., it does not like using std::min after the pstore !!)
|
||||||
EIGEN_ALIGN16 int aux[4];
|
EIGEN_ALIGN16 int aux[4];
|
||||||
pstore(aux, a);
|
pstore(aux, a);
|
||||||
register int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
|
int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
|
||||||
register int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
|
int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
|
||||||
return aux0>aux2 ? aux0 : aux2;
|
return aux0>aux2 ? aux0 : aux2;
|
||||||
|
#endif // EIGEN_VECTORIZE_SSE4_1
|
||||||
}
|
}
|
||||||
|
|
||||||
#if (defined __GNUC__)
|
#if (defined __GNUC__)
|
||||||
|
@ -79,8 +79,8 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
|
|||||||
for (Index j=FirstTriangular ? bound : 0;
|
for (Index j=FirstTriangular ? bound : 0;
|
||||||
j<(FirstTriangular ? size : bound);j+=2)
|
j<(FirstTriangular ? size : bound);j+=2)
|
||||||
{
|
{
|
||||||
register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
|
const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
|
||||||
register const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride;
|
const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride;
|
||||||
|
|
||||||
Scalar t0 = cjAlpha * rhs[j];
|
Scalar t0 = cjAlpha * rhs[j];
|
||||||
Packet ptmp0 = pset1<Packet>(t0);
|
Packet ptmp0 = pset1<Packet>(t0);
|
||||||
@ -147,7 +147,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
|
|||||||
}
|
}
|
||||||
for (Index j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++)
|
for (Index j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++)
|
||||||
{
|
{
|
||||||
register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
|
const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
|
||||||
|
|
||||||
Scalar t1 = cjAlpha * rhs[j];
|
Scalar t1 = cjAlpha * rhs[j];
|
||||||
Scalar t2(0);
|
Scalar t2(0);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user