mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-07-26 06:44:27 +08:00
added a vectorized version of Product::_cacheOptimalEval,
added the possibility to disable vectorization by defining EIGEN_DONT_VECTORIZE (some architectures have SSE support enabled by default)
This commit is contained in:
parent
613c49b475
commit
187b1543ce
@ -1,12 +1,14 @@
|
|||||||
#ifndef EIGEN_CORE_H
|
#ifndef EIGEN_CORE_H
|
||||||
#define EIGEN_CORE_H
|
#define EIGEN_CORE_H
|
||||||
|
|
||||||
|
#ifndef EIGEN_DONT_VECTORIZE
|
||||||
#ifdef __SSE2__
|
#ifdef __SSE2__
|
||||||
#define EIGEN_VECTORIZE
|
#define EIGEN_VECTORIZE
|
||||||
#define EIGEN_VECTORIZE_SSE
|
#define EIGEN_VECTORIZE_SSE
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
#include <xmmintrin.h>
|
#include <xmmintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
@ -108,7 +108,7 @@ struct ei_packet_product_unroller<RowMajor, Index, Dynamic, Lhs, Rhs, PacketScal
|
|||||||
*/
|
*/
|
||||||
template<typename Lhs, typename Rhs> struct ei_product_eval_mode
|
template<typename Lhs, typename Rhs> struct ei_product_eval_mode
|
||||||
{
|
{
|
||||||
enum{ value = Lhs::MaxRowsAtCompileTime >= 8 && Rhs::MaxColsAtCompileTime >= 8
|
enum{ value = Lhs::MaxRowsAtCompileTime >= 16 && Rhs::MaxColsAtCompileTime >= 16
|
||||||
? CacheOptimalProduct : NormalProduct };
|
? CacheOptimalProduct : NormalProduct };
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -139,7 +139,7 @@ struct ei_traits<Product<Lhs, Rhs, EvalMode> >
|
|||||||
| (
|
| (
|
||||||
(
|
(
|
||||||
!(Lhs::Flags & RowMajorBit) && (Lhs::Flags & VectorizableBit)
|
!(Lhs::Flags & RowMajorBit) && (Lhs::Flags & VectorizableBit)
|
||||||
)
|
)
|
||||||
? VectorizableBit
|
? VectorizableBit
|
||||||
: (
|
: (
|
||||||
(
|
(
|
||||||
@ -215,7 +215,6 @@ template<typename Lhs, typename Rhs, int EvalMode> class Product : ei_no_assignm
|
|||||||
? Lhs::ColsAtCompileTime : Dynamic,
|
? Lhs::ColsAtCompileTime : Dynamic,
|
||||||
Lhs, Rhs, PacketScalar>
|
Lhs, Rhs, PacketScalar>
|
||||||
::run(row, col, m_lhs, m_rhs, res);
|
::run(row, col, m_lhs, m_rhs, res);
|
||||||
// std::cout << "vec unrolled product\n";
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -280,25 +279,67 @@ template<typename DestDerived>
|
|||||||
void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res) const
|
void Product<Lhs,Rhs,EvalMode>::_cacheOptimalEval(DestDerived& res) const
|
||||||
{
|
{
|
||||||
res.setZero();
|
res.setZero();
|
||||||
const int cols4 = m_lhs.cols()&0xfffffffC;
|
const int cols4 = m_lhs.cols() & 0xfffffffC;
|
||||||
for (int k=0; k<m_rhs.cols(); ++k)
|
#ifdef EIGEN_VECTORIZE
|
||||||
|
if( (Flags & VectorizableBit) && (!(Lhs::Flags & RowMajorBit)) )
|
||||||
{
|
{
|
||||||
int j=0;
|
for(int k=0; k<m_rhs.cols(); k++)
|
||||||
for (; j<cols4; j+=4)
|
|
||||||
{
|
{
|
||||||
const Scalar tmp0 = m_rhs.coeff(j ,k);
|
int j=0;
|
||||||
const Scalar tmp1 = m_rhs.coeff(j+1,k);
|
for(; j<cols4; j+=4)
|
||||||
const Scalar tmp2 = m_rhs.coeff(j+2,k);
|
{
|
||||||
const Scalar tmp3 = m_rhs.coeff(j+3,k);
|
const typename ei_packet_traits<Scalar>::type tmp0 = ei_pset1(m_rhs.coeff(j+0,k));
|
||||||
for (int i=0; i<m_lhs.rows(); ++i)
|
const typename ei_packet_traits<Scalar>::type tmp1 = ei_pset1(m_rhs.coeff(j+1,k));
|
||||||
res.coeffRef(i,k) += tmp0 * m_lhs.coeff(i,j) + tmp1 * m_lhs.coeff(i,j+1)
|
const typename ei_packet_traits<Scalar>::type tmp2 = ei_pset1(m_rhs.coeff(j+2,k));
|
||||||
+ tmp2 * m_lhs.coeff(i,j+2) + tmp3 * m_lhs.coeff(i,j+3);
|
const typename ei_packet_traits<Scalar>::type tmp3 = ei_pset1(m_rhs.coeff(j+3,k));
|
||||||
|
for (int i=0; i<m_lhs.rows(); i+=ei_packet_traits<Scalar>::size)
|
||||||
|
{
|
||||||
|
res.writePacketCoeff(i,k,
|
||||||
|
ei_padd(
|
||||||
|
res.packetCoeff(i,k),
|
||||||
|
ei_padd(
|
||||||
|
ei_padd(
|
||||||
|
ei_pmul(tmp0, m_lhs.packetCoeff(i,j)),
|
||||||
|
ei_pmul(tmp1, m_lhs.packetCoeff(i,j+1))),
|
||||||
|
ei_padd(
|
||||||
|
ei_pmul(tmp2, m_lhs.packetCoeff(i,j+2)),
|
||||||
|
ei_pmul(tmp3, m_lhs.packetCoeff(i,j+3))
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(; j<m_lhs.cols(); ++j)
|
||||||
|
{
|
||||||
|
const typename ei_packet_traits<Scalar>::type tmp = ei_pset1(m_rhs.coeff(j,k));
|
||||||
|
for (int i=0; i<m_lhs.rows(); ++i)
|
||||||
|
res.writePacketCoeff(i,k,ei_pmul(tmp, m_lhs.packetCoeff(i,j)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for (; j<m_lhs.cols(); ++j)
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
for(int k=0; k<m_rhs.cols(); ++k)
|
||||||
{
|
{
|
||||||
const Scalar tmp = m_rhs.coeff(j,k);
|
int j=0;
|
||||||
for (int i=0; i<m_lhs.rows(); ++i)
|
for(; j<cols4; j+=4)
|
||||||
res.coeffRef(i,k) += tmp * m_lhs.coeff(i,j);
|
{
|
||||||
|
const Scalar tmp0 = m_rhs.coeff(j ,k);
|
||||||
|
const Scalar tmp1 = m_rhs.coeff(j+1,k);
|
||||||
|
const Scalar tmp2 = m_rhs.coeff(j+2,k);
|
||||||
|
const Scalar tmp3 = m_rhs.coeff(j+3,k);
|
||||||
|
for (int i=0; i<m_lhs.rows(); ++i)
|
||||||
|
res.coeffRef(i,k) += tmp0 * m_lhs.coeff(i,j) + tmp1 * m_lhs.coeff(i,j+1)
|
||||||
|
+ tmp2 * m_lhs.coeff(i,j+2) + tmp3 * m_lhs.coeff(i,j+3);
|
||||||
|
}
|
||||||
|
for(; j<m_lhs.cols(); ++j)
|
||||||
|
{
|
||||||
|
const Scalar tmp = m_rhs.coeff(j,k);
|
||||||
|
for (int i=0; i<m_lhs.rows(); ++i)
|
||||||
|
res.coeffRef(i,k) += tmp * m_lhs.coeff(i,j);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user