Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-08-10 10:49:04 +08:00)

Commit 84a1101b36: Merge with default.
@@ -172,6 +172,8 @@ public:
     EIGEN_DEBUG_VAR(MaySliceVectorize)
     std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
     EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost)
+    EIGEN_DEBUG_VAR(DstEvaluator::CoeffReadCost)
+    EIGEN_DEBUG_VAR(Dst::SizeAtCompileTime)
     EIGEN_DEBUG_VAR(UnrollingLimit)
     EIGEN_DEBUG_VAR(MayUnrollCompletely)
     EIGEN_DEBUG_VAR(MayUnrollInner)
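Note: the two added lines extend the traits dump printed when Eigen's assignment debugging is enabled. For readers unfamiliar with the macro, EIGEN_DEBUG_VAR streams a name/value pair to stderr; a minimal standalone sketch of the idea (paraphrased, not copied from Eigen's headers):

    #include <iostream>

    // Sketch of EIGEN_DEBUG_VAR: stream "name = value" to stderr.
    #define EIGEN_DEBUG_VAR(x) std::cerr << #x << " = " << (x) << std::endl;

    int main() {
      enum { SizeAtCompileTime = 16 };
      EIGEN_DEBUG_VAR(SizeAtCompileTime)  // prints: SizeAtCompileTime = 16
    }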
@@ -803,13 +803,21 @@ public:
 
     MatrixFlags = evaluator<MatrixType>::Flags,
     DiagFlags = evaluator<DiagonalType>::Flags,
-    _StorageOrder = MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
+    _StorageOrder = (Derived::MaxRowsAtCompileTime==1 && Derived::MaxColsAtCompileTime!=1) ? RowMajor
+                  : (Derived::MaxColsAtCompileTime==1 && Derived::MaxRowsAtCompileTime!=1) ? ColMajor
+                  : MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
+    _SameStorageOrder = _StorageOrder == (MatrixFlags & RowMajorBit ? RowMajor : ColMajor),
+
     _ScalarAccessOnDiag = !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
                           ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
     _SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
     // FIXME currently we need same types, but in the future the next rule should be the one
     //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))),
-    _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
+    _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit)
+                 && _SameTypes
+                 && (_SameStorageOrder || (MatrixFlags&LinearAccessBit)==LinearAccessBit)
+                 && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
     _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
     Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
     Alignment = evaluator<MatrixType>::Alignment,
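The key addition in this hunk is `_SameStorageOrder`: a diagonal product is now considered vectorizable only if the matrix operand agrees with the chosen storage order or, failing that, is linearly addressable. A small usage example of the kind of expression this evaluator serves (standard Eigen API; the enum above decides whether it takes the packet path):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::VectorXf d = Eigen::VectorXf::Random(64);
      Eigen::MatrixXf m = Eigen::MatrixXf::Random(64, 64);
      // A diagonal * dense product: scales row i of m by d(i). Whether this
      // runs through the packet (SIMD) path is decided by _Vectorizable above.
      Eigen::MatrixXf r = d.asDiagonal() * m;
      std::cout << r(0, 0) << "\n";
    }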
@@ -870,10 +878,10 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
 
   typedef Product<Lhs, Rhs, ProductKind> XprType;
   typedef typename XprType::PlainObject PlainObject;
+  typedef typename Lhs::DiagonalVectorType DiagonalType;
 
-  enum {
-    StorageOrder = int(Rhs::Flags) & RowMajorBit ? RowMajor : ColMajor
-  };
+  enum { StorageOrder = Base::_StorageOrder };
 
   EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
     : Base(xpr.rhs(), xpr.lhs().diagonal())
@@ -917,7 +925,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
   typedef Product<Lhs, Rhs, ProductKind> XprType;
   typedef typename XprType::PlainObject PlainObject;
 
-  enum { StorageOrder = int(Lhs::Flags) & RowMajorBit ? RowMajor : ColMajor };
+  enum { StorageOrder = Base::_StorageOrder };
 
   EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
     : Base(xpr.lhs(), xpr.rhs().diagonal())
@@ -32,14 +32,20 @@ public:
     PacketSize = unpacket_traits<PacketType>::size,
     InnerMaxSize = int(Evaluator::IsRowMajor)
                  ? Evaluator::MaxColsAtCompileTime
-                 : Evaluator::MaxRowsAtCompileTime
+                 : Evaluator::MaxRowsAtCompileTime,
+    OuterMaxSize = int(Evaluator::IsRowMajor)
+                 ? Evaluator::MaxRowsAtCompileTime
+                 : Evaluator::MaxColsAtCompileTime,
+    SliceVectorizedWork = int(InnerMaxSize)==Dynamic ? Dynamic
+                        : int(OuterMaxSize)==Dynamic ? (int(InnerMaxSize)>=int(PacketSize) ? Dynamic : 0)
+                        : (int(InnerMaxSize)/int(PacketSize)) * int(OuterMaxSize)
   };
 
   enum {
     MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit)
                   && (functor_traits<Func>::PacketAccess),
     MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit),
-    MaySliceVectorize = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize
+    MaySliceVectorize = bool(MightVectorize) && (int(SliceVectorizedWork)==Dynamic || int(SliceVectorizedWork)>=3)
   };
 
 public:
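The old rule enabled slice vectorization only when a single inner slice held at least three packets; the new `SliceVectorizedWork` counts whole packets over the entire expression, so many short slices can also qualify. A standalone sketch of the heuristic under the names used above (`Dynamic` stands in for Eigen's unknown-at-compile-time sentinel):

    #include <cstdio>

    const int Dynamic = -1;  // stands in for Eigen's Dynamic sentinel

    // Whole packets processed if each inner slice is vectorized on its own.
    int sliceVectorizedWork(int innerMax, int outerMax, int packetSize) {
      if (innerMax == Dynamic) return Dynamic;
      if (outerMax == Dynamic) return innerMax >= packetSize ? Dynamic : 0;
      return (innerMax / packetSize) * outerMax;
    }

    // The MightVectorize gate from the enum above is assumed to already hold.
    bool maySliceVectorize(int innerMax, int outerMax, int packetSize) {
      int work = sliceVectorizedWork(innerMax, outerMax, packetSize);
      return work == Dynamic || work >= 3;  // same threshold as the diff
    }

    int main() {
      // 5x16 column-major float matrix, 4-float packets: one packet per column
      // times 16 columns = 16 packets of work, so slice vectorization pays off.
      std::printf("%d\n", maySliceVectorize(5, 16, 4));  // 1
      // A 5x1 block yields a single packet, below the >= 3 threshold.
      std::printf("%d\n", maySliceVectorize(5, 1, 4));   // 0
    }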
@@ -69,13 +75,15 @@ public:
   EIGEN_DEBUG_VAR(Evaluator::Flags)
   std::cerr.unsetf(std::ios::hex);
   EIGEN_DEBUG_VAR(InnerMaxSize)
+  EIGEN_DEBUG_VAR(OuterMaxSize)
+  EIGEN_DEBUG_VAR(SliceVectorizedWork)
   EIGEN_DEBUG_VAR(PacketSize)
   EIGEN_DEBUG_VAR(MightVectorize)
   EIGEN_DEBUG_VAR(MayLinearVectorize)
   EIGEN_DEBUG_VAR(MaySliceVectorize)
-  EIGEN_DEBUG_VAR(Traversal)
+  std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
   EIGEN_DEBUG_VAR(UnrollingLimit)
-  EIGEN_DEBUG_VAR(Unrolling)
+  std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl;
   std::cerr << std::endl;
 }
 #endif
@@ -402,7 +410,7 @@ DenseBase<Derived>::redux(const Func& func) const
 
   typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
   ThisEvaluator thisEval(derived());
 
   // The initial expression is passed to the reducer as an additional argument instead of
   // passing it as a member of redux_evaluator to help
   return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
@@ -30,7 +30,14 @@ static int nb_storeu
 
 EIGEN_DECLARE_TEST(unalignedcount)
 {
-#if defined(EIGEN_VECTORIZE_AVX)
+#if defined(EIGEN_VECTORIZE_AVX512)
+  VectorXf a(48), b(48);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 6, 0, 3, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,48) += b.segment(0,48), 3, 3, 3, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,48) -= b.segment(0,48), 3, 3, 3, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,48) *= 3.5, 3, 0, 3, 0);
+  VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,48) /= 3.5, 3, 0, 3, 0);
+#elif defined(EIGEN_VECTORIZE_AVX)
   VectorXf a(40), b(40);
   VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 10, 0, 5, 0);
   VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) += b.segment(0,40), 5, 5, 5, 0);
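The new AVX512 branch sizes the vectors so each is exactly three 16-float packets. Assuming the four counts passed to VERIFY_ALIGNED_UNALIGNED_COUNT are (aligned loads, unaligned loads, aligned stores, unaligned stores), matching the nb_load/nb_loadu/nb_store/nb_storeu counters this test tracks, the first expectation can be sanity-checked by hand:

    #include <cassert>

    int main() {
      const int n = 48;
      const int packet = 16;              // AVX-512: 16 floats per packet
      const int packets = n / packet;     // 3 full packets, no tail
      const int loads  = 2 * packets;     // "a += b" loads one packet of a and one of b
      const int stores = packets;         // and stores one packet back into a
      assert(loads == 6 && stores == 3);  // matches (a += b, 6, 0, 3, 0) above
    }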
@@ -37,6 +37,7 @@ using internal::demangle_unrolling;
 template<typename Dst, typename Src>
 bool test_assign(const Dst&, const Src&, int traversal, int unrolling)
 {
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src);
   typedef internal::copy_using_evaluator_traits<internal::evaluator<Dst>,internal::evaluator<Src>, internal::assign_op<typename Dst::Scalar,typename Src::Scalar> > traits;
   bool res = traits::Traversal==traversal;
   if(unrolling==InnerUnrolling+CompleteUnrolling)
@@ -61,6 +62,7 @@ bool test_assign(const Dst&, const Src&, int traversal, int unrolling)
 template<typename Dst, typename Src>
 bool test_assign(int traversal, int unrolling)
 {
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src);
   typedef internal::copy_using_evaluator_traits<internal::evaluator<Dst>,internal::evaluator<Src>, internal::assign_op<typename Dst::Scalar,typename Src::Scalar> > traits;
   bool res = traits::Traversal==traversal && traits::Unrolling==unrolling;
   if(!res)
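Both overloads of test_assign now reject mismatched operand sizes at compile time. EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE accepts two expression types whose fixed dimensions are compatible (a Dynamic dimension matches anything); a small illustration:

    #include <Eigen/Core>

    template<typename Dst, typename Src>
    void requireSameSize() {
      // Compile-time check: fixed dimensions of Dst and Src must agree.
      EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst, Src);
    }

    int main() {
      requireSameSize<Eigen::Matrix4f, Eigen::Matrix4f>();  // OK
      requireSameSize<Eigen::Matrix4f, Eigen::MatrixXf>();  // OK: Dynamic matches
      // requireSameSize<Eigen::Matrix4f, Eigen::Matrix3f>();  // would fail to compile
    }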
@@ -117,26 +119,26 @@ struct vectorization_logic
   typedef Matrix<Scalar,Dynamic,1> VectorX;
   typedef Matrix<Scalar,Dynamic,Dynamic> MatrixXX;
   typedef Matrix<Scalar,PacketSize,PacketSize> Matrix11;
-  typedef Matrix<Scalar,2*PacketSize,2*PacketSize> Matrix22;
+  typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?8:2*PacketSize,(Matrix11::Flags&RowMajorBit)?2*PacketSize:8> Matrix22;
   typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16> Matrix44;
   typedef Matrix<Scalar,(Matrix11::Flags&RowMajorBit)?16:4*PacketSize,(Matrix11::Flags&RowMajorBit)?4*PacketSize:16,DontAlign|EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION> Matrix44u;
   typedef Matrix<Scalar,4*PacketSize,4*PacketSize,ColMajor> Matrix44c;
   typedef Matrix<Scalar,4*PacketSize,4*PacketSize,RowMajor> Matrix44r;
 
   typedef Matrix<Scalar,
-      (PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
-      (PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1)
+      (PacketSize==16 ? 8 : PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
+      (PacketSize==16 ? 2 : PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1)
     > Matrix1;
 
   typedef Matrix<Scalar,
-      (PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
-      (PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1),
+      (PacketSize==16 ? 8 : PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
+      (PacketSize==16 ? 2 : PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1),
       DontAlign|((Matrix1::Flags&RowMajorBit)?RowMajor:ColMajor)> Matrix1u;
 
   // this type is made such that it can only be vectorized when viewed as a linear 1D vector
   typedef Matrix<Scalar,
-      (PacketSize==8 ? 4 : PacketSize==4 ? 6 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?2:3) : /*PacketSize==1 ?*/ 1),
-      (PacketSize==8 ? 6 : PacketSize==4 ? 2 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?3:2) : /*PacketSize==1 ?*/ 3)
+      (PacketSize==16 ? 4 : PacketSize==8 ? 4 : PacketSize==4 ? 6 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?2:3) : /*PacketSize==1 ?*/ 1),
+      (PacketSize==16 ? 12 : PacketSize==8 ? 6 : PacketSize==4 ? 2 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?3:2) : /*PacketSize==1 ?*/ 3)
     > Matrix3;
 
 #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT
@@ -202,7 +204,7 @@ struct vectorization_logic
     VERIFY(test_assign(Matrix11(), Matrix11()+Matrix11(),InnerVectorizedTraversal,CompleteUnrolling));
 
 
-    VERIFY(test_assign(Matrix11(),Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(8,4),
+    VERIFY(test_assign(Matrix11(),Matrix<Scalar,21,21>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,21,21>().template block<PacketSize,PacketSize>(3,2),
                        (EIGEN_UNALIGNED_VECTORIZE) ? InnerVectorizedTraversal : DefaultTraversal, CompleteUnrolling|InnerUnrolling));
 
     VERIFY(test_assign(Vector1(),Matrix11()*Vector1(),
@@ -230,8 +232,13 @@ struct vectorization_logic
     VERIFY(test_redux(Matrix44(),
                       LinearVectorizedTraversal,NoUnrolling));
 
-    VERIFY(test_redux(Matrix44().template block<(Matrix1::Flags&RowMajorBit)?4:PacketSize,(Matrix1::Flags&RowMajorBit)?PacketSize:4>(1,2),
-                      DefaultTraversal,CompleteUnrolling));
+    if(PacketSize>1) {
+      VERIFY(test_redux(Matrix44().template block<(Matrix1::Flags&RowMajorBit)?4:PacketSize,(Matrix1::Flags&RowMajorBit)?PacketSize:4>(1,2),
+                        SliceVectorizedTraversal,CompleteUnrolling));
+
+      VERIFY(test_redux(Matrix44().template block<(Matrix1::Flags&RowMajorBit)?2:PacketSize,(Matrix1::Flags&RowMajorBit)?PacketSize:2>(1,2),
+                        DefaultTraversal,CompleteUnrolling));
+    }
 
     VERIFY(test_redux(Matrix44c().template block<2*PacketSize,1>(1,2),
                       LinearVectorizedTraversal,CompleteUnrolling));
@@ -289,19 +296,19 @@ struct vectorization_logic_half
   // typedef Matrix<Scalar,4*PacketSize,4*PacketSize,RowMajor> Matrix44r;
 
   typedef Matrix<Scalar,
-      (PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
-      (PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1)
+      (PacketSize==16 ? 8 : PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
+      (PacketSize==16 ? 2 : PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1)
     > Matrix1;
 
   typedef Matrix<Scalar,
-      (PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
-      (PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1),
+      (PacketSize==16 ? 8 : PacketSize==8 ? 4 : PacketSize==4 ? 2 : PacketSize==2 ? 1 : /*PacketSize==1 ?*/ 1),
+      (PacketSize==16 ? 2 : PacketSize==8 ? 2 : PacketSize==4 ? 2 : PacketSize==2 ? 2 : /*PacketSize==1 ?*/ 1),
       DontAlign|((Matrix1::Flags&RowMajorBit)?RowMajor:ColMajor)> Matrix1u;
 
   // this type is made such that it can only be vectorized when viewed as a linear 1D vector
   typedef Matrix<Scalar,
-      (PacketSize==8 ? 4 : PacketSize==4 ? 6 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?2:3) : /*PacketSize==1 ?*/ 1),
-      (PacketSize==8 ? 6 : PacketSize==4 ? 2 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?3:2) : /*PacketSize==1 ?*/ 3)
+      (PacketSize==16 ? 4 : PacketSize==8 ? 4 : PacketSize==4 ? 6 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?2:3) : /*PacketSize==1 ?*/ 1),
+      (PacketSize==16 ? 12 : PacketSize==8 ? 6 : PacketSize==4 ? 2 : PacketSize==2 ? ((Matrix11::Flags&RowMajorBit)?3:2) : /*PacketSize==1 ?*/ 3)
     > Matrix3;
 
 #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT
@@ -354,7 +361,8 @@ struct vectorization_logic_half
                      NoUnrolling));
 
   VERIFY(test_assign(Matrix11(),Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(2,3)+Matrix<Scalar,17,17>().template block<PacketSize,PacketSize>(8,4),
-                     EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling));
+                     EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : DefaultTraversal,InnerUnrolling+CompleteUnrolling));
+
 
   VERIFY(test_assign(Vector1(),Matrix11()*Vector1(),
                      InnerVectorizedTraversal,CompleteUnrolling));
@@ -375,16 +383,21 @@ struct vectorization_logic_half
   VERIFY(test_redux(Matrix35(),
                     LinearVectorizedTraversal,CompleteUnrolling));
 
-  VERIFY(test_redux(Matrix57().template block<PacketSize,3>(1,0),
-                    DefaultTraversal,CompleteUnrolling));
+  VERIFY(test_redux(Matrix57().template block<PacketSize==1?2:PacketSize,3>(1,0),
+                    SliceVectorizedTraversal,CompleteUnrolling));
 
+  if(PacketSize>1) {
+    VERIFY(test_redux(Matrix57().template block<PacketSize,2>(1,0),
+                      DefaultTraversal,CompleteUnrolling));
+  }
 
   VERIFY((test_assign<
             Map<Matrix<Scalar,EIGEN_PLAIN_ENUM_MAX(2,PacketSize),EIGEN_PLAIN_ENUM_MAX(2,PacketSize)>, AlignedMax, InnerStride<3*PacketSize> >,
             Matrix<Scalar,EIGEN_PLAIN_ENUM_MAX(2,PacketSize),EIGEN_PLAIN_ENUM_MAX(2,PacketSize)>
-          >(DefaultTraversal,CompleteUnrolling)));
+          >(DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling)));
 
   VERIFY((test_assign(Matrix57(), Matrix<Scalar,5*PacketSize,3>()*Matrix<Scalar,3,7>(),
-                      InnerVectorizedTraversal, InnerUnrolling|CompleteUnrolling)));
+                      InnerVectorizedTraversal, InnerUnrolling+CompleteUnrolling)));
 #endif
 }
 };
@@ -255,7 +255,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
 
     const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1);
     const Index first = indexPair.first;
-    const Index last = indexPair.second;
+    const Index lastIdx = indexPair.second;
 
     // We can always do optimized packet reads from left hand side right now, because
     // the vertical matrix dimension on the left hand side is never contracting.
@@ -263,7 +263,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
     // been shuffled first.
     if (Tensor::PacketAccess &&
         (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) &&
-        (last - first) == (packet_size - 1)) {
+        (lastIdx - first) == (packet_size - 1)) {
 
       return this->m_tensor.template packet<AlignmentType>(first);
     }
@@ -276,7 +276,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
       data[k] = this->m_tensor.coeff(internal_pair.first);
       data[k + 1] = this->m_tensor.coeff(internal_pair.second);
     }
-    data[packet_size - 1] = this->m_tensor.coeff(last);
+    data[packet_size - 1] = this->m_tensor.coeff(lastIdx);
 
     return pload<PacketT>(data);
   }
@@ -213,17 +213,17 @@ struct ThreadPoolDevice {
     // block_count leaves that do actual computations.
     Barrier barrier(static_cast<unsigned int>(block_count));
     std::function<void(Index, Index)> handleRange;
-    handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
-      if (last - first <= block_size) {
+    handleRange = [=, &handleRange, &barrier, &f](Index firstIdx, Index lastIdx) {
+      if (lastIdx - firstIdx <= block_size) {
         // Single block or less, execute directly.
-        f(first, last);
+        f(firstIdx, lastIdx);
         barrier.Notify();
         return;
       }
       // Split into halves and submit to the pool.
-      Index mid = first + divup((last - first) / 2, block_size) * block_size;
-      pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
-      handleRange(first, mid);
+      Index mid = firstIdx + divup((lastIdx - firstIdx) / 2, block_size) * block_size;
+      pool_->Schedule([=, &handleRange]() { handleRange(mid, lastIdx); });
+      handleRange(firstIdx, mid);
     };
     handleRange(0, n);
     barrier.Wait();
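handleRange bisects [0, n) on block_size boundaries, scheduling the upper half onto the pool and recursing into the lower half on the current thread, so block_count leaves end up executing f in parallel. A sequential sketch of just the splitting arithmetic (assumed names, no thread pool):

    #include <cstdio>
    #include <functional>

    using Index = long;

    Index divup(Index x, Index y) { return (x + y - 1) / y; }

    int main() {
      const Index n = 10, block_size = 3;
      std::function<void(Index, Index)> handleRange =
          [&](Index firstIdx, Index lastIdx) {
            if (lastIdx - firstIdx <= block_size) {
              std::printf("run [%ld, %ld)\n", firstIdx, lastIdx);  // leaf block
              return;
            }
            // Split roughly in half, rounded up to a block_size boundary.
            Index mid = firstIdx + divup((lastIdx - firstIdx) / 2, block_size) * block_size;
            handleRange(mid, lastIdx);   // Eigen schedules this half on the pool
            handleRange(firstIdx, mid);  // and recurses into this half directly
          };
      handleRange(0, n);  // prints [9,10), [6,9), [3,6), [0,3)
    }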
@@ -165,11 +165,11 @@ class TensorExecutor<Expression, DefaultDevice, Vectorizable,
 #ifdef EIGEN_USE_THREADS
 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
 struct EvalRange {
-  static void run(Evaluator* evaluator_in, const StorageIndex first,
-                  const StorageIndex last) {
+  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
+                  const StorageIndex lastIdx) {
     Evaluator evaluator = *evaluator_in;
-    eigen_assert(last >= first);
-    for (StorageIndex i = first; i < last; ++i) {
+    eigen_assert(lastIdx >= firstIdx);
+    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
       evaluator.evalScalar(i);
     }
   }
@@ -182,14 +182,14 @@ struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
   static const int PacketSize =
       unpacket_traits<typename Evaluator::PacketReturnType>::size;
 
-  static void run(Evaluator* evaluator_in, const StorageIndex first,
-                  const StorageIndex last) {
+  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
+                  const StorageIndex lastIdx) {
     Evaluator evaluator = *evaluator_in;
-    eigen_assert(last >= first);
-    StorageIndex i = first;
-    if (last - first >= PacketSize) {
-      eigen_assert(first % PacketSize == 0);
-      StorageIndex last_chunk_offset = last - 4 * PacketSize;
+    eigen_assert(lastIdx >= firstIdx);
+    StorageIndex i = firstIdx;
+    if (lastIdx - firstIdx >= PacketSize) {
+      eigen_assert(firstIdx % PacketSize == 0);
+      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
       // Give compiler a strong possibility to unroll the loop. But don't insist
       // on unrolling, because if the function is expensive compiler should not
       // unroll the loop at the expense of inlining.
@@ -198,12 +198,12 @@ struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
           evaluator.evalPacket(i + j * PacketSize);
         }
       }
-      last_chunk_offset = last - PacketSize;
+      last_chunk_offset = lastIdx - PacketSize;
       for (; i <= last_chunk_offset; i += PacketSize) {
         evaluator.evalPacket(i);
       }
     }
-    for (; i < last; ++i) {
+    for (; i < lastIdx; ++i) {
       evaluator.evalScalar(i);
     }
   }
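The vectorized EvalRange walks its range in three gears: chunks of four packets (an easy unrolling target for the compiler), then single packets, then a scalar tail. A standalone sketch of that partitioning (illustrative values):

    #include <cstdio>

    int main() {
      const long first = 0, last = 23, PacketSize = 4;
      long i = first;
      if (last - first >= PacketSize) {
        // Gear 1: groups of four packets at a time.
        for (; i <= last - 4 * PacketSize; i += 4 * PacketSize)
          std::printf("4-packet chunk at %ld\n", i);   // covers [0,16)
        // Gear 2: one packet at a time.
        for (; i <= last - PacketSize; i += PacketSize)
          std::printf("packet at %ld\n", i);           // covers [16,20)
      }
      // Gear 3: scalar tail.
      for (; i < last; ++i)
        std::printf("scalar at %ld\n", i);             // covers [20,23)
    }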
@@ -234,8 +234,8 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tileable> {
     const StorageIndex size = array_prod(evaluator.dimensions());
     device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                        EvalRange::alignBlockSize,
-                       [&evaluator](StorageIndex first, StorageIndex last) {
-                         EvalRange::run(&evaluator, first, last);
+                       [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
+                         EvalRange::run(&evaluator, firstIdx, lastIdx);
                        });
   }
   evaluator.cleanup();
@@ -292,8 +292,8 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
     void* buf = device.allocate((num_threads + 1) * aligned_blocksize);
     device.parallelFor(
         block_mapper.total_block_count(), cost * block_size,
-        [=, &device, &evaluator, &block_mapper](StorageIndex first,
-                                                StorageIndex last) {
+        [=, &device, &evaluator, &block_mapper](StorageIndex firstIdx,
+                                                StorageIndex lastIdx) {
           // currentThreadId() returns -1 if called from a thread not in the
           // thread pool, such as the main thread dispatching Eigen
           // expressions.
@@ -301,7 +301,7 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tileable*/ tr
           eigen_assert(thread_idx >= -1 && thread_idx < num_threads);
           Scalar* thread_buf = reinterpret_cast<Scalar*>(
               static_cast<char*>(buf) + aligned_blocksize * (thread_idx + 1));
-          for (StorageIndex i = first; i < last; ++i) {
+          for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
            auto block = block_mapper.GetBlockForIndex(i, thread_buf);
            evaluator.evalBlock(&block);
          }
@@ -330,8 +330,8 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable, Tileable> {
 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
 struct EigenMetaKernelEval {
   static __device__ EIGEN_ALWAYS_INLINE
-  void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
-    for (StorageIndex i = first; i < last; i += step_size) {
+  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
+    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
       eval.evalScalar(i);
     }
   }
@@ -340,17 +340,17 @@ struct EigenMetaKernelEval {
 template <typename Evaluator, typename StorageIndex>
 struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
   static __device__ EIGEN_ALWAYS_INLINE
-  void run(Evaluator& eval, StorageIndex first, StorageIndex last, StorageIndex step_size) {
+  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
     const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
-    const StorageIndex vectorized_size = (last / PacketSize) * PacketSize;
+    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
     const StorageIndex vectorized_step_size = step_size * PacketSize;
 
     // Use the vector path
-    for (StorageIndex i = first * PacketSize; i < vectorized_size;
+    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
          i += vectorized_step_size) {
       eval.evalPacket(i);
     }
-    for (StorageIndex i = vectorized_size + first; i < last; i += step_size) {
+    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
       eval.evalScalar(i);
     }
   }
@@ -273,21 +273,21 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     const Index initialIndex = index;
     Index inputIndex = 0;
     for (int i = NumDims - 1; i > 0; --i) {
-      const Index first = index;
-      const Index last = index + PacketSize - 1;
+      const Index firstIdx = index;
+      const Index lastIdx = index + PacketSize - 1;
       const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
       const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
       const Index lastPaddedRight = m_outputStrides[i+1];
 
-      if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
+      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
        // all the coefficient are in the padding zone.
        return internal::pset1<PacketReturnType>(m_paddingValue);
      }
-      else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) {
+      else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
        // all the coefficient are in the padding zone.
        return internal::pset1<PacketReturnType>(m_paddingValue);
      }
-      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
+      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
        // all the coefficient are between the 2 padding zones.
        const Index idx = index / m_outputStrides[i];
        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
@@ -299,21 +299,21 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
       }
     }
 
-    const Index last = index + PacketSize - 1;
-    const Index first = index;
+    const Index lastIdx = index + PacketSize - 1;
+    const Index firstIdx = index;
     const Index lastPaddedLeft = m_padding[0].first;
     const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
     const Index lastPaddedRight = m_outputStrides[1];
 
-    if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) {
+    if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) {
       // all the coefficient are in the padding zone.
       return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-    else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) {
+    else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
       // all the coefficient are in the padding zone.
       return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-    else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
+    else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
       // all the coefficient are between the 2 padding zones.
       inputIndex += (index - m_padding[0].first);
       return m_impl.template packet<Unaligned>(inputIndex);
@@ -331,21 +331,21 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     Index inputIndex = 0;
 
     for (int i = 0; i < NumDims - 1; ++i) {
-      const Index first = index;
-      const Index last = index + PacketSize - 1;
+      const Index firstIdx = index;
+      const Index lastIdx = index + PacketSize - 1;
       const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
       const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
       const Index lastPaddedRight = m_outputStrides[i];
 
-      if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
+      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
        // all the coefficient are in the padding zone.
        return internal::pset1<PacketReturnType>(m_paddingValue);
      }
-      else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) {
+      else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
        // all the coefficient are in the padding zone.
        return internal::pset1<PacketReturnType>(m_paddingValue);
      }
-      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
+      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
        // all the coefficient are between the 2 padding zones.
        const Index idx = index / m_outputStrides[i+1];
        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
@@ -357,21 +357,21 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
       }
     }
 
-    const Index last = index + PacketSize - 1;
-    const Index first = index;
+    const Index lastIdx = index + PacketSize - 1;
+    const Index firstIdx = index;
     const Index lastPaddedLeft = m_padding[NumDims-1].first;
     const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
     const Index lastPaddedRight = m_outputStrides[NumDims-1];
 
-    if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) {
+    if (!isLeftPaddingCompileTimeZero(NumDims-1) && lastIdx < lastPaddedLeft) {
       // all the coefficient are in the padding zone.
       return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-    else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) {
+    else if (!isRightPaddingCompileTimeZero(NumDims-1) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
      // all the coefficient are in the padding zone.
      return internal::pset1<PacketReturnType>(m_paddingValue);
    }
-    else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
+    else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
      // all the coefficient are between the 2 padding zones.
      inputIndex += (index - m_padding[NumDims-1].first);
      return m_impl.template packet<Unaligned>(inputIndex);
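Throughout these four hunks the renamed firstIdx/lastIdx pair delimits one packet's worth of output indices, which is classified against the left padding, the interior, and the right padding of each dimension; only a span falling entirely inside one region can be answered with a single pset1 or packet load. A plain sketch of the classification (hypothetical boundary values):

    #include <cstdio>

    enum Zone { LeftPad, Interior, RightPad, Mixed };

    // Classify the packet span [firstIdx, lastIdx] against one padded dimension.
    // lastPaddedLeft: first interior index; firstPaddedRight: first right-pad index.
    Zone classify(long firstIdx, long lastIdx,
                  long lastPaddedLeft, long firstPaddedRight, long lastPaddedRight) {
      if (lastIdx < lastPaddedLeft) return LeftPad;         // all in left padding
      if (firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight)
        return RightPad;                                    // all in right padding
      if (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)
        return Interior;                                    // no padding touched
      return Mixed;                                         // straddles a boundary: scalar path
    }

    int main() {
      // Size-10 dimension, 2 left-pad and 3 right-pad entries: [0,2) left pad,
      // [2,7) interior, [7,10) right pad. Packet covers 4 consecutive indices.
      std::printf("%d\n", classify(2, 5, 2, 7, 10));  // Interior
      std::printf("%d\n", classify(5, 8, 2, 7, 10));  // Mixed -> scalar fallback
    }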
@@ -208,8 +208,8 @@ __global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Sel
   eigen_assert(blockDim.x == 1);
   eigen_assert(gridDim.x == 1);
   if (num_coeffs % 2 != 0) {
-    half last = input.m_impl.coeff(num_coeffs-1);
-    *scratch = __halves2half2(last, reducer.initialize());
+    half lastCoeff = input.m_impl.coeff(num_coeffs-1);
+    *scratch = __halves2half2(lastCoeff, reducer.initialize());
   } else {
     *scratch = reducer.template initializePacket<half2>();
   }
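When num_coeffs is odd, the kernel seeds the half2 scratch with the stray last coefficient paired with the reducer's identity, so the remaining reduction can proceed strictly in pairs. The same padding idea in ordinary C++, assuming a sum reducer:

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<float> v = {1, 2, 3, 4, 5};  // odd length
      const float identity = 0.0f;             // identity of the sum reducer
      // Seed the two-lane accumulator with the stray element plus identity ...
      float acc[2] = {v.back(), identity};
      // ... then reduce the remaining even-length prefix two at a time.
      for (size_t i = 0; i + 1 < v.size() - 1; i += 2) {
        acc[0] += v[i];
        acc[1] += v[i + 1];
      }
      std::printf("%g\n", acc[0] + acc[1]);    // 15
    }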
@@ -128,7 +128,7 @@ class EventCount {
 
   // Notify wakes one or all waiting threads.
   // Must be called after changing the associated wait predicate.
-  void Notify(bool all) {
+  void Notify(bool notifyAll) {
     std::atomic_thread_fence(std::memory_order_seq_cst);
     uint64_t state = state_.load(std::memory_order_acquire);
     for (;;) {
@@ -137,7 +137,7 @@ class EventCount {
         return;
       uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
       uint64_t newstate;
-      if (all) {
+      if (notifyAll) {
         // Reset prewait counter and empty wait list.
         newstate = (state & kEpochMask) + (kEpochInc * waiters) + kStackMask;
       } else if (waiters) {
@@ -157,10 +157,10 @@ class EventCount {
       }
       if (state_.compare_exchange_weak(state, newstate,
                                        std::memory_order_acquire)) {
-        if (!all && waiters) return;  // unblocked pre-wait thread
+        if (!notifyAll && waiters) return;  // unblocked pre-wait thread
         if ((state & kStackMask) == kStackMask) return;
         Waiter* w = &waiters_[state & kStackMask];
-        if (!all) w->next.store(nullptr, std::memory_order_relaxed);
+        if (!notifyAll) w->next.store(nullptr, std::memory_order_relaxed);
         Unpark(w);
         return;
       }
|
@ -228,6 +228,9 @@ template<typename _Scalar, int _Options, typename _StorageIndex>
|
|||||||
EIGEN_DEPRECATED inline DynamicSparseMatrix()
|
EIGEN_DEPRECATED inline DynamicSparseMatrix()
|
||||||
: m_innerSize(0), m_data(0)
|
: m_innerSize(0), m_data(0)
|
||||||
{
|
{
|
||||||
|
#ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
|
||||||
|
EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
|
||||||
|
#endif
|
||||||
eigen_assert(innerSize()==0 && outerSize()==0);
|
eigen_assert(innerSize()==0 && outerSize()==0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -235,6 +238,9 @@ template<typename _Scalar, int _Options, typename _StorageIndex>
     EIGEN_DEPRECATED inline DynamicSparseMatrix(Index rows, Index cols)
       : m_innerSize(0)
     {
+      #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+      #endif
       resize(rows, cols);
     }
 
@@ -243,12 +249,18 @@ template<typename _Scalar, int _Options, typename _StorageIndex>
     EIGEN_DEPRECATED explicit inline DynamicSparseMatrix(const SparseMatrixBase<OtherDerived>& other)
       : m_innerSize(0)
     {
-      Base::operator=(other.derived());
+      #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+      #endif
+      Base::operator=(other.derived());
     }
 
     inline DynamicSparseMatrix(const DynamicSparseMatrix& other)
       : Base(), m_innerSize(0)
     {
+      #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+      #endif
       *this = other.derived();
     }
 
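These hooks let client code inject a statement into every DynamicSparseMatrix constructor by defining the macro before including the header, typically to count temporaries in tests. A hypothetical counter (the macro name is Eigen's; the counter and its use are ours):

    // Count how many DynamicSparseMatrix objects get constructed
    // (hypothetical example; only the macro name comes from the diff above).
    static int g_sparse_temporaries = 0;
    #define EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN ++g_sparse_temporaries;

    #include <unsupported/Eigen/SparseExtra>  // provides DynamicSparseMatrix

    int main() {
      Eigen::DynamicSparseMatrix<double> m(10, 10);  // bumps the counter once
      return g_sparse_temporaries == 1 ? 0 : 1;
    }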