merge Tensor module within Eigen/unsupported and update gemv BLAS wrapper

Gael Guennebaud 2015-02-12 21:48:41 +01:00
commit 0918c51e60
119 changed files with 24100 additions and 816 deletions

View File

@ -185,6 +185,11 @@
#endif
#endif
#if defined __CUDACC__
#define EIGEN_VECTORIZE_CUDA
#include <vector_types.h>
#endif
#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
#define EIGEN_HAS_OPENMP
#endif
@ -304,6 +309,11 @@ using std::ptrdiff_t;
#include "src/Core/arch/NEON/Complex.h"
#endif
#if defined EIGEN_VECTORIZE_CUDA
#include "src/Core/arch/CUDA/PacketMath.h"
#include "src/Core/arch/CUDA/MathFunctions.h"
#endif
#include "src/Core/arch/Default/Settings.h"
#include "src/Core/functors/BinaryFunctors.h"

View File

@ -11,7 +11,7 @@
#ifndef EIGEN_GENERAL_PRODUCT_H
#define EIGEN_GENERAL_PRODUCT_H
namespace Eigen {
enum {
Large = 2,
@ -252,12 +252,12 @@ template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true>
bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
evalToDest ? dest.data() : static_dest.data());
if(!evalToDest)
{
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
@ -273,11 +273,13 @@ template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true>
MappedDest(actualDestPtr, dest.size()) = dest;
}
typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
general_matrix_vector_product
<Index,LhsScalar,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
<Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
actualLhs.rows(), actualLhs.cols(),
actualLhs.data(), actualLhs.outerStride(),
actualRhs.data(), actualRhs.innerStride(),
LhsMapper(actualLhs.data(), actualLhs.outerStride()),
RhsMapper(actualRhs.data(), actualRhs.innerStride()),
actualDestPtr, 1,
compatibleAlpha);
@ -333,11 +335,13 @@ template<> struct gemv_dense_sense_selector<OnTheRight,RowMajor,true>
Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
}
typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
general_matrix_vector_product
<Index,LhsScalar,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
<Index,LhsScalar,LhsMapper,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
actualLhs.rows(), actualLhs.cols(),
actualLhs.data(), actualLhs.outerStride(),
actualRhsPtr, 1,
LhsMapper(actualLhs.data(), actualLhs.outerStride()),
RhsMapper(actualRhsPtr, 1),
dest.data(), dest.innerStride(),
actualAlpha);
}
@ -410,7 +414,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
#ifdef EIGEN_DEBUG_PRODUCT
internal::product_type<Derived,OtherDerived>::debug();
#endif
return Product<Derived, OtherDerived>(derived(), other.derived());
}
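// Illustration only, not from this commit: a sketch of the mapper-based calling
// convention the updated gemv selector above relies on. A const_blas_data_mapper wraps a
// raw pointer plus its stride, and the kernels read through it (operator(),
// getLinearMapper, ...) instead of doing pointer arithmetic directly. The exact mapper
// interface lives in BlasUtil.h; the scalar type, sizes and function name below are
// illustrative assumptions.
inline void blas_data_mapper_usage_sketch()
{
  typedef internal::const_blas_data_mapper<float, std::ptrdiff_t, ColMajor> Mapper;
  Matrix<float, 8, 8> A;
  A.setRandom();
  Mapper map(A.data(), A.outerStride());                   // wrap pointer + outer stride
  float a21 = map(2, 1);                                    // same value as A(2,1)
  Mapper::LinearMapper col1 = map.getLinearMapper(0, 1);    // 1D view starting at A(0,1)
  float a01 = col1(0);                                      // first entry of column 1
  EIGEN_UNUSED_VARIABLE(a21);
  EIGEN_UNUSED_VARIABLE(a01);
}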

View File

@ -54,6 +54,7 @@ struct default_packet_traits
HasMax = 1,
HasConj = 1,
HasSetLinear = 1,
HasBlend = 0,
HasDiv = 0,
HasSqrt = 0,
@ -94,6 +95,8 @@ template<typename T> struct packet_traits : default_packet_traits
};
};
template<typename T> struct packet_traits<const T> : packet_traits<T> { };
/** \internal \returns a + b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
padd(const Packet& a,
@ -356,7 +359,7 @@ pmadd(const Packet& a,
/** \internal \returns a packet version of \a *from.
* If LoadMode equals #Aligned, \a from must be 16 bytes aligned */
template<typename Packet, int LoadMode>
inline Packet ploadt(const typename unpacket_traits<Packet>::type* from)
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from)
{
if(LoadMode == Aligned)
return pload<Packet>(from);
@ -367,7 +370,7 @@ inline Packet ploadt(const typename unpacket_traits<Packet>::type* from)
/** \internal copy the packet \a from to \a *to.
* If StoreMode equals #Aligned, \a to must be 16 bytes aligned */
template<typename Scalar, typename Packet, int LoadMode>
inline void pstoret(Scalar* to, const Packet& from)
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from)
{
if(LoadMode == Aligned)
pstore(to, from);
@ -375,6 +378,17 @@ inline void pstoret(Scalar* to, const Packet& from)
pstoreu(to, from);
}
/** \internal \returns a packet version of \a *from.
* Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the
* hardware if available to speed up the loading of data that won't be modified
* by the current computation.
*/
template<typename Packet, int LoadMode>
inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
{
return ploadt<Packet, LoadMode>(from);
}
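// Illustration only, not from this commit: a sketch of how ploadt_ro composes with the
// other generic primitives above. On CUDA devices the specializations added in
// CUDA/PacketMath.h route this load through the read-only data path (__ldg); elsewhere it
// falls back to ploadt. The function name, the use of int for sizes, and ignoring the
// scalar tail when n is not a multiple of the packet size are illustrative assumptions.
template<typename Packet, int Mode>
inline void scaled_copy_sketch(const typename unpacket_traits<Packet>::type* src,
                               typename unpacket_traits<Packet>::type* dst,
                               int n, typename unpacket_traits<Packet>::type factor)
{
  const int size = unpacket_traits<Packet>::size;
  Packet pfactor = pset1<Packet>(factor);
  for (int i = 0; i + size <= n; i += size) {
    Packet p = ploadt_ro<Packet, Mode>(src + i);  // src is never written, so the
    p = pmul(p, pfactor);                         // read-only path is safe to use
    pstoret<typename unpacket_traits<Packet>::type, Packet, Mode>(dst + i, p);
  }
}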
/** \internal default implementation of palign() allowing partial specialization */
template<int Offset,typename PacketType>
struct palign_impl
@ -433,6 +447,19 @@ ptranspose(PacketBlock<Packet,1>& /*kernel*/) {
// Nothing to do in the scalar case, i.e. a 1x1 matrix.
}
/***************************************************************************
* Selector, i.e. vector of N boolean values used to select (i.e. blend)
* words from 2 packets.
***************************************************************************/
template <size_t N> struct Selector {
bool select[N];
};
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
return ifPacket.select[0] ? thenPacket : elsePacket;
}
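// Illustration only, not from this commit: building a Selector by hand and blending two
// packets with it. Through a vectorized specialization (e.g. the SSE/AVX overloads added
// below), lane i of the result comes from thenPacket where select[i] is true and from
// elsePacket otherwise; the generic fallback above only honours select[0] because it
// models packets of size one. The helper name is an illustrative assumption.
template<typename Packet>
inline Packet blend_even_lanes_sketch(const Packet& thenPacket, const Packet& elsePacket)
{
  Selector<unpacket_traits<Packet>::size> mask;
  for (int i = 0; i < int(unpacket_traits<Packet>::size); ++i)
    mask.select[i] = (i % 2 == 0);   // even lanes from thenPacket, odd from elsePacket
  return pblend(mask, thenPacket, elsePacket);
}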
} // end namespace internal
} // end namespace Eigen

View File

@ -96,7 +96,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType;
BlockingType blocking(rhs.rows(), rhs.cols(), size);
BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);
triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>

View File

@ -58,7 +58,8 @@ template<> struct packet_traits<float> : default_packet_traits
HasCos = 0,
HasLog = 0,
HasExp = 0,
HasSqrt = 0
HasSqrt = 0,
HasBlend = 1
};
};
template<> struct packet_traits<double> : default_packet_traits
@ -72,7 +73,8 @@ template<> struct packet_traits<double> : default_packet_traits
HasHalfPacket = 1,
HasDiv = 1,
HasExp = 0
HasExp = 0,
HasBlend = 1
};
};
@ -557,6 +559,19 @@ ptranspose(PacketBlock<Packet4d,4>& kernel) {
kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
}
template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) {
const __m256 zero = _mm256_setzero_ps();
const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
__m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ);
return _mm256_blendv_ps(thenPacket, elsePacket, false_mask);
}
template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) {
const __m256d zero = _mm256_setzero_pd();
const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
__m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ);
return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
}
} // end namespace internal
} // end namespace Eigen

View File

@ -0,0 +1,75 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
#define EIGEN_MATH_FUNCTIONS_CUDA_H
namespace Eigen {
namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
template<> EIGEN_STRONG_INLINE
float4 plog<float4>(const float4& a)
{
return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w));
}
template<> EIGEN_STRONG_INLINE
double2 plog<double2>(const double2& a)
{
return make_double2(log(a.x), log(a.y));
}
template<> EIGEN_STRONG_INLINE
float4 pexp<float4>(const float4& a)
{
return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w));
}
template<> EIGEN_STRONG_INLINE
double2 pexp<double2>(const double2& a)
{
return make_double2(exp(a.x), exp(a.y));
}
template<> EIGEN_STRONG_INLINE
float4 psqrt<float4>(const float4& a)
{
return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
}
template<> EIGEN_STRONG_INLINE
double2 psqrt<double2>(const double2& a)
{
return make_double2(sqrt(a.x), sqrt(a.y));
}
template<> EIGEN_STRONG_INLINE
float4 prsqrt<float4>(const float4& a)
{
return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w));
}
template<> EIGEN_STRONG_INLINE
double2 prsqrt<double2>(const double2& a)
{
return make_double2(rsqrt(a.x), rsqrt(a.y));
}
#endif
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_MATH_FUNCTIONS_CUDA_H
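// Illustration only, not from this commit: applying the float4 specializations above from
// a CUDA kernel. Each thread processes four consecutive floats; n is assumed to be a
// multiple of 4 and the buffers 16-byte aligned so that the pload/pstore overloads from
// CUDA/PacketMath.h are legal. The kernel name and indexing scheme are illustrative.
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
__global__ void packet_log_sketch(const float* in, float* out, int n)
{
  int i = (blockIdx.x * blockDim.x + threadIdx.x) * 4;
  if (i + 4 <= n) {
    float4 v = Eigen::internal::pload<float4>(in + i);   // aligned 16-byte load
    Eigen::internal::pstore(out + i, Eigen::internal::plog(v));
  }
}
#endif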

View File

@ -0,0 +1,296 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_CUDA_H
#define EIGEN_PACKET_MATH_CUDA_H
namespace Eigen {
namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
template<> struct is_arithmetic<float4> { enum { value = true }; };
template<> struct is_arithmetic<double2> { enum { value = true }; };
template<> struct packet_traits<float> : default_packet_traits
{
typedef float4 type;
typedef float4 half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=4,
HasHalfPacket = 0,
HasDiv = 1,
HasSin = 0,
HasCos = 0,
HasLog = 1,
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasBlend = 0,
};
};
template<> struct packet_traits<double> : default_packet_traits
{
typedef double2 type;
typedef double2 half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=2,
HasHalfPacket = 0,
HasDiv = 1,
HasLog = 1,
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasBlend = 0,
};
};
template<> struct unpacket_traits<float4> { typedef float type; enum {size=4}; typedef float4 half; };
template<> struct unpacket_traits<double2> { typedef double type; enum {size=2}; typedef double2 half; };
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
return make_float4(from, from, from, from);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
return make_double2(from, from);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float>(const float& a) {
return make_float4(a, a+1, a+2, a+3);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double>(const double& a) {
return make_double2(a, a+1);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
return make_double2(a.x+b.x, a.y+b.y);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
return make_double2(a.x-b.x, a.y-b.y);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
return make_float4(-a.x, -a.y, -a.z, -a.w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
return make_double2(-a.x, -a.y);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
return make_double2(a.x*b.x, a.y*b.y);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
return make_double2(a.x/b.x, a.y/b.y);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
return *reinterpret_cast<const float4*>(from);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
return *reinterpret_cast<const double2*>(from);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
return make_float4(from[0], from[1], from[2], from[3]);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
return make_double2(from[0], from[1]);
}
template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
return make_float4(from[0], from[0], from[1], from[1]);
}
template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
return make_double2(from[0], from[0]);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
*reinterpret_cast<float4*>(to) = from;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
*reinterpret_cast<double2*>(to) = from;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
to[0] = from.x;
to[1] = from.y;
to[2] = from.z;
to[3] = from.w;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
to[0] = from.x;
to[1] = from.y;
}
#ifdef __CUDA_ARCH__
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
return __ldg((const float4*)from);
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
return __ldg((const double2*)from);
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
return make_double2(__ldg(from+0), __ldg(from+1));
}
#endif
template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, int stride) {
return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
}
template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, int stride) {
return make_double2(from[0*stride], from[1*stride]);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, int stride) {
to[stride*0] = from.x;
to[stride*1] = from.y;
to[stride*2] = from.z;
to[stride*3] = from.w;
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, int stride) {
to[stride*0] = from.x;
to[stride*1] = from.y;
}
template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
return a.x;
}
template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
return a.x;
}
template<> EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
return a.x + a.y + a.z + a.w;
}
template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
return a.x + a.y;
}
template<> EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
}
template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
return fmax(a.x, a.y);
}
template<> EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
}
template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
return fmin(a.x, a.y);
}
template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w));
}
template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
return make_double2(abs(a.x), abs(a.y));
}
template<> EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<float4,4>& kernel) {
double tmp = kernel.packet[0].y;
kernel.packet[0].y = kernel.packet[1].x;
kernel.packet[1].x = tmp;
tmp = kernel.packet[0].z;
kernel.packet[0].z = kernel.packet[2].x;
kernel.packet[2].x = tmp;
tmp = kernel.packet[0].w;
kernel.packet[0].w = kernel.packet[3].x;
kernel.packet[3].x = tmp;
tmp = kernel.packet[1].z;
kernel.packet[1].z = kernel.packet[2].y;
kernel.packet[2].y = tmp;
tmp = kernel.packet[1].w;
kernel.packet[1].w = kernel.packet[3].y;
kernel.packet[3].y = tmp;
tmp = kernel.packet[2].w;
kernel.packet[2].w = kernel.packet[3].z;
kernel.packet[3].z = tmp;
}
template<> EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<double2,2>& kernel) {
double tmp = kernel.packet[0].y;
kernel.packet[0].y = kernel.packet[1].x;
kernel.packet[1].x = tmp;
}
#endif
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_PACKET_MATH_CUDA_H
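// Illustration only, not from this commit: using the read-only load path added above.
// Inside device code ploadt_ro<float4, Aligned> maps to __ldg, which is intended for
// inputs the kernel never writes; here each thread reduces one 16-byte aligned row of
// four floats with predux. The kernel name and data layout are illustrative assumptions.
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
__global__ void row_sum_sketch(const float* in, float* out, int rows)
{
  int r = blockIdx.x * blockDim.x + threadIdx.x;
  if (r < rows) {
    float4 v = Eigen::internal::ploadt_ro<float4, Eigen::Aligned>(in + 4 * r);
    out[r] = Eigen::internal::predux(v);   // v.x + v.y + v.z + v.w
  }
}
#endif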

View File

@ -44,7 +44,8 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0
HasSetLinear = 0,
HasBlend = 1
};
};
#endif
@ -472,6 +473,11 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
kernel.packet[1].v = tmp;
}
template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
__m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
return Packet2cf(_mm_castpd_ps(result));
}
} // end namespace internal
} // end namespace Eigen

View File

@ -108,7 +108,8 @@ template<> struct packet_traits<float> : default_packet_traits
HasCos = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasSqrt = 1
HasSqrt = 1,
HasBlend = 1
};
};
template<> struct packet_traits<double> : default_packet_traits
@ -123,7 +124,8 @@ template<> struct packet_traits<double> : default_packet_traits
HasDiv = 1,
HasExp = 1,
HasSqrt = 1
HasSqrt = 1,
HasBlend = 1
};
};
#endif
@ -135,7 +137,9 @@ template<> struct packet_traits<int> : default_packet_traits
// FIXME check the Has*
Vectorizable = 1,
AlignedOnScalar = 1,
size=4
size=4,
HasBlend = 1
};
};
@ -809,6 +813,37 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
}
template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
const __m128i zero = _mm_setzero_si128();
const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
__m128i false_mask = _mm_cmpeq_epi32(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
#else
return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
#endif
}
template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
const __m128 zero = _mm_setzero_ps();
const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
__m128 false_mask = _mm_cmpeq_ps(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
#else
return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
const __m128d zero = _mm_setzero_pd();
const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
__m128d false_mask = _mm_cmpeq_pd(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
#else
return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
#endif
}
} // end namespace internal
} // end namespace Eigen

View File

@ -26,28 +26,37 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff
}
/** \internal */
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
{
static std::ptrdiff_t m_l1CacheSize = 0;
static std::ptrdiff_t m_l2CacheSize = 0;
if(m_l2CacheSize==0)
static bool m_cache_sizes_initialized = false;
static std::ptrdiff_t m_l1CacheSize = 32*1024;
static std::ptrdiff_t m_l2CacheSize = 256*1024;
static std::ptrdiff_t m_l3CacheSize = 2*1024*1024;
if(!m_cache_sizes_initialized)
{
m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024);
m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024);
int l1CacheSize, l2CacheSize, l3CacheSize;
queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
m_l1CacheSize = manage_caching_sizes_helper(l1CacheSize, 8*1024);
m_l2CacheSize = manage_caching_sizes_helper(l2CacheSize, 256*1024);
m_l3CacheSize = manage_caching_sizes_helper(l3CacheSize, 8*1024*1024);
m_cache_sizes_initialized = true;
}
if(action==SetAction)
{
// set the cpu cache size and cache all block sizes from a global cache size in byte
eigen_internal_assert(l1!=0 && l2!=0);
m_l1CacheSize = *l1;
m_l2CacheSize = *l2;
m_l3CacheSize = *l3;
}
else if(action==GetAction)
{
eigen_internal_assert(l1!=0 && l2!=0);
*l1 = m_l1CacheSize;
*l2 = m_l2CacheSize;
*l3 = m_l3CacheSize;
}
else
{
@ -70,10 +79,11 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi
* - the number of scalars that fit into a packet (when vectorization is enabled).
*
* \sa setCpuCacheSizes */
#define CEIL(a, b) ((a)+(b)-1)/(b)
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads)
{
EIGEN_UNUSED_VARIABLE(n);
// Explanations:
// Let's recall that the product algorithm forms kc x nc horizontal panels B' on the rhs and
// mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
@ -81,43 +91,71 @@ void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
// at the register level. For vectorization purpose, these small vertical panels are unpacked,
// e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
// stay in L1 cache.
std::ptrdiff_t l1, l2;
std::ptrdiff_t l1, l2, l3;
manage_caching_sizes(GetAction, &l1, &l2, &l3);
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
enum {
kdiv = KcFactor * 2 * Traits::nr
* Traits::RhsProgress * sizeof(RhsScalar),
mr = gebp_traits<LhsScalar,RhsScalar>::mr,
mr_mask = (0xffffffff/mr)*mr
};
if (num_threads > 1) {
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
typedef typename Traits::ResScalar ResScalar;
enum {
kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
k_mask = (0xffffffff/8)*8,
manage_caching_sizes(GetAction, &l1, &l2);
mr = Traits::mr,
mr_mask = (0xffffffff/mr)*mr,
// k = std::min<SizeType>(k, l1/kdiv);
// SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
// if(_m<m) m = _m & mr_mask;
// In unit tests we do not want to use extra large matrices,
// so we reduce the block size to check the blocking strategy is not flawed
nr = Traits::nr,
nr_mask = (0xffffffff/nr)*nr
};
SizeType k_cache = (l1-ksub)/kdiv;
if (k_cache < k) {
k = k_cache & k_mask;
eigen_assert(k > 0);
}
SizeType n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
SizeType n_per_thread = CEIL(n, num_threads);
if (n_cache <= n_per_thread) {
// Don't exceed the capacity of the l2 cache.
eigen_assert(n_cache >= static_cast<SizeType>(nr));
n = n_cache & nr_mask;
eigen_assert(n > 0);
} else {
n = (std::min<SizeType>)(n, (n_per_thread + nr - 1) & nr_mask);
}
if (l3 > l2) {
// l3 is shared between all cores, so we'll give each thread its own chunk of l3.
SizeType m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
SizeType m_per_thread = CEIL(m, num_threads);
if(m_cache < m_per_thread && m_cache >= static_cast<SizeType>(mr)) {
m = m_cache & mr_mask;
eigen_assert(m > 0);
} else {
m = (std::min<SizeType>)(m, (m_per_thread + mr - 1) & mr_mask);
}
}
}
else {
// In unit tests we do not want to use extra large matrices,
// so we reduce the block size to check the blocking strategy is not flawed
#ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
// k = std::min<SizeType>(k,240);
// n = std::min<SizeType>(n,3840/sizeof(RhsScalar));
// m = std::min<SizeType>(m,3840/sizeof(RhsScalar));
k = std::min<SizeType>(k,sizeof(LhsScalar)<=4 ? 360 : 240);
n = std::min<SizeType>(n,3840/sizeof(RhsScalar));
m = std::min<SizeType>(m,3840/sizeof(RhsScalar));
#else
k = std::min<SizeType>(k,24);
n = std::min<SizeType>(n,384/sizeof(RhsScalar));
m = std::min<SizeType>(m,384/sizeof(RhsScalar));
#endif
}
}
template<typename LhsScalar, typename RhsScalar, typename SizeType>
inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n, int num_threads)
{
computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads);
}
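// Illustration only, not from this commit: a worked trace of the multi-threaded branch
// above with assumed values LhsScalar = RhsScalar = ResScalar = float, KcFactor = 1,
// Traits::mr = 8, Traits::nr = 4, l1 = 32KB, l2 = 256KB, l3 = 8MB, num_threads = 4.
//   kdiv    = 1 * (8*sizeof(float) + 4*sizeof(float)) = 48 bytes of packed data per unit of k
//   ksub    = 8 * 4 * sizeof(float)                   = 128 bytes for the accumulator tile
//   k_cache = (32768 - 128) / 48                      = 680 (already a multiple of 8)
// so k is clipped to 680 when the requested depth is larger. With k = 680:
//   n_cache      = (262144 - 32768) / (4 * sizeof(float) * 680) = 21
//   n_per_thread = CEIL(n, 4)
// and n is clipped to n_cache rounded down to a multiple of nr (20) whenever one thread's
// share of the columns would overflow its L2 slice; the same reasoning caps m with each
// thread's share of L3. The real Traits::mr / Traits::nr depend on the target SIMD width.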
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
@ -667,7 +705,7 @@ protected:
* |real |cplx | no vectorization yet, would require packing A with duplication
* |cplx |real | easy vectorization
*/
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
@ -676,14 +714,15 @@ struct gebp_kernel
typedef typename Traits::RhsPacket RhsPacket;
typedef typename Traits::ResPacket ResPacket;
typedef typename Traits::AccPacket AccPacket;
typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
typedef typename SwappedTraits::ResScalar SResScalar;
typedef typename SwappedTraits::LhsPacket SLhsPacket;
typedef typename SwappedTraits::RhsPacket SRhsPacket;
typedef typename SwappedTraits::ResPacket SResPacket;
typedef typename SwappedTraits::AccPacket SAccPacket;
typedef typename DataMapper::LinearMapper LinearMapper;
enum {
Vectorizable = Traits::Vectorizable,
@ -693,14 +732,16 @@ struct gebp_kernel
};
EIGEN_DONT_INLINE
void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
Index rows, Index depth, Index cols, ResScalar alpha,
Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
Index rows, Index depth, Index cols, ResScalar alpha,
Index strideA, Index strideB, Index offsetA, Index offsetB)
{
Traits traits;
@ -743,15 +784,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
ResScalar* r0 = &res[(j2+0)*resStride + i];
ResScalar* r1 = &res[(j2+1)*resStride + i];
ResScalar* r2 = &res[(j2+2)*resStride + i];
ResScalar* r3 = &res[(j2+3)*resStride + i];
internal::prefetch(r0);
internal::prefetch(r1);
internal::prefetch(r2);
internal::prefetch(r3);
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
r0.prefetch(0);
r1.prefetch(0);
r2.prefetch(0);
r3.prefetch(0);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
@ -820,48 +861,48 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
ResPacket R0, R1, R2;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
R2 = ploadu<ResPacket>(r0+2*Traits::ResPacketSize);
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
R2 = r0.loadPacket(2 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
traits.acc(C8, alphav, R2);
pstoreu(r0+0*Traits::ResPacketSize, R0);
pstoreu(r0+1*Traits::ResPacketSize, R1);
pstoreu(r0+2*Traits::ResPacketSize, R2);
R0 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r1+1*Traits::ResPacketSize);
R2 = ploadu<ResPacket>(r1+2*Traits::ResPacketSize);
r0.storePacket(0 * Traits::ResPacketSize, R0);
r0.storePacket(1 * Traits::ResPacketSize, R1);
r0.storePacket(2 * Traits::ResPacketSize, R2);
R0 = r1.loadPacket(0 * Traits::ResPacketSize);
R1 = r1.loadPacket(1 * Traits::ResPacketSize);
R2 = r1.loadPacket(2 * Traits::ResPacketSize);
traits.acc(C1, alphav, R0);
traits.acc(C5, alphav, R1);
traits.acc(C9, alphav, R2);
pstoreu(r1+0*Traits::ResPacketSize, R0);
pstoreu(r1+1*Traits::ResPacketSize, R1);
pstoreu(r1+2*Traits::ResPacketSize, R2);
R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r2+1*Traits::ResPacketSize);
R2 = ploadu<ResPacket>(r2+2*Traits::ResPacketSize);
r1.storePacket(0 * Traits::ResPacketSize, R0);
r1.storePacket(1 * Traits::ResPacketSize, R1);
r1.storePacket(2 * Traits::ResPacketSize, R2);
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
R1 = r2.loadPacket(1 * Traits::ResPacketSize);
R2 = r2.loadPacket(2 * Traits::ResPacketSize);
traits.acc(C2, alphav, R0);
traits.acc(C6, alphav, R1);
traits.acc(C10, alphav, R2);
pstoreu(r2+0*Traits::ResPacketSize, R0);
pstoreu(r2+1*Traits::ResPacketSize, R1);
pstoreu(r2+2*Traits::ResPacketSize, R2);
R0 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r3+1*Traits::ResPacketSize);
R2 = ploadu<ResPacket>(r3+2*Traits::ResPacketSize);
r2.storePacket(0 * Traits::ResPacketSize, R0);
r2.storePacket(1 * Traits::ResPacketSize, R1);
r2.storePacket(2 * Traits::ResPacketSize, R2);
R0 = r3.loadPacket(0 * Traits::ResPacketSize);
R1 = r3.loadPacket(1 * Traits::ResPacketSize);
R2 = r3.loadPacket(2 * Traits::ResPacketSize);
traits.acc(C3, alphav, R0);
traits.acc(C7, alphav, R1);
traits.acc(C11, alphav, R2);
pstoreu(r3+0*Traits::ResPacketSize, R0);
pstoreu(r3+1*Traits::ResPacketSize, R1);
pstoreu(r3+2*Traits::ResPacketSize, R2);
r3.storePacket(0 * Traits::ResPacketSize, R0);
r3.storePacket(1 * Traits::ResPacketSize, R1);
r3.storePacket(2 * Traits::ResPacketSize, R2);
}
// Deal with remaining columns of the rhs
for(Index j2=packet_cols4; j2<cols; j2++)
{
@ -875,7 +916,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
traits.initAcc(C4);
traits.initAcc(C8);
ResScalar* r0 = &res[(j2+0)*resStride + i];
LinearMapper r0 = res.getLinearMapper(i, j2);
r0.prefetch(0);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
@ -926,19 +968,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
ResPacket R0, R1, R2;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
R2 = ploadu<ResPacket>(r0+2*Traits::ResPacketSize);
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
R2 = r0.loadPacket(2 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
traits.acc(C8 , alphav, R2);
pstoreu(r0+0*Traits::ResPacketSize, R0);
pstoreu(r0+1*Traits::ResPacketSize, R1);
pstoreu(r0+2*Traits::ResPacketSize, R2);
traits.acc(C8, alphav, R2);
r0.storePacket(0 * Traits::ResPacketSize, R0);
r0.storePacket(1 * Traits::ResPacketSize, R1);
r0.storePacket(2 * Traits::ResPacketSize, R2);
}
}
}
//---------- Process 2 * LhsProgress rows at once ----------
if(mr>=2*Traits::LhsProgress)
{
@ -960,15 +1002,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
ResScalar* r0 = &res[(j2+0)*resStride + i];
ResScalar* r1 = &res[(j2+1)*resStride + i];
ResScalar* r2 = &res[(j2+2)*resStride + i];
ResScalar* r3 = &res[(j2+3)*resStride + i];
internal::prefetch(r0+prefetch_res_offset);
internal::prefetch(r1+prefetch_res_offset);
internal::prefetch(r2+prefetch_res_offset);
internal::prefetch(r3+prefetch_res_offset);
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
r0.prefetch(prefetch_res_offset);
r1.prefetch(prefetch_res_offset);
r2.prefetch(prefetch_res_offset);
r3.prefetch(prefetch_res_offset);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
@ -1023,37 +1065,37 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
blA += 2*Traits::LhsProgress;
}
#undef EIGEN_GEBGP_ONESTEP
ResPacket R0, R1, R2, R3;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
R2 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize);
R3 = ploadu<ResPacket>(r1+1*Traits::ResPacketSize);
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
R2 = r1.loadPacket(0 * Traits::ResPacketSize);
R3 = r1.loadPacket(1 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
traits.acc(C1, alphav, R2);
traits.acc(C5, alphav, R3);
pstoreu(r0+0*Traits::ResPacketSize, R0);
pstoreu(r0+1*Traits::ResPacketSize, R1);
pstoreu(r1+0*Traits::ResPacketSize, R2);
pstoreu(r1+1*Traits::ResPacketSize, R3);
R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r2+1*Traits::ResPacketSize);
R2 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize);
R3 = ploadu<ResPacket>(r3+1*Traits::ResPacketSize);
r0.storePacket(0 * Traits::ResPacketSize, R0);
r0.storePacket(1 * Traits::ResPacketSize, R1);
r1.storePacket(0 * Traits::ResPacketSize, R2);
r1.storePacket(1 * Traits::ResPacketSize, R3);
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
R1 = r2.loadPacket(1 * Traits::ResPacketSize);
R2 = r3.loadPacket(0 * Traits::ResPacketSize);
R3 = r3.loadPacket(1 * Traits::ResPacketSize);
traits.acc(C2, alphav, R0);
traits.acc(C6, alphav, R1);
traits.acc(C3, alphav, R2);
traits.acc(C7, alphav, R3);
pstoreu(r2+0*Traits::ResPacketSize, R0);
pstoreu(r2+1*Traits::ResPacketSize, R1);
pstoreu(r3+0*Traits::ResPacketSize, R2);
pstoreu(r3+1*Traits::ResPacketSize, R3);
r2.storePacket(0 * Traits::ResPacketSize, R0);
r2.storePacket(1 * Traits::ResPacketSize, R1);
r3.storePacket(0 * Traits::ResPacketSize, R2);
r3.storePacket(1 * Traits::ResPacketSize, R3);
}
// Deal with remaining columns of the rhs
for(Index j2=packet_cols4; j2<cols; j2++)
{
@ -1066,8 +1108,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
traits.initAcc(C0);
traits.initAcc(C4);
ResScalar* r0 = &res[(j2+0)*resStride + i];
internal::prefetch(r0+prefetch_res_offset);
LinearMapper r0 = res.getLinearMapper(i, j2);
r0.prefetch(prefetch_res_offset);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
@ -1117,12 +1159,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
ResPacket R0, R1;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r0+1*Traits::ResPacketSize);
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
pstoreu(r0+0*Traits::ResPacketSize, R0);
pstoreu(r0+1*Traits::ResPacketSize, R1);
r0.storePacket(0 * Traits::ResPacketSize, R0);
r0.storePacket(1 * Traits::ResPacketSize, R1);
}
}
}
@ -1148,15 +1190,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
traits.initAcc(C2);
traits.initAcc(C3);
ResScalar* r0 = &res[(j2+0)*resStride + i];
ResScalar* r1 = &res[(j2+1)*resStride + i];
ResScalar* r2 = &res[(j2+2)*resStride + i];
ResScalar* r3 = &res[(j2+3)*resStride + i];
internal::prefetch(r0+prefetch_res_offset);
internal::prefetch(r1+prefetch_res_offset);
internal::prefetch(r2+prefetch_res_offset);
internal::prefetch(r3+prefetch_res_offset);
LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
r0.prefetch(prefetch_res_offset);
r1.prefetch(prefetch_res_offset);
r2.prefetch(prefetch_res_offset);
r3.prefetch(prefetch_res_offset);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
@ -1206,25 +1248,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
blA += 1*LhsProgress;
}
#undef EIGEN_GEBGP_ONESTEP
ResPacket R0, R1;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r1+0*Traits::ResPacketSize);
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
R1 = r1.loadPacket(0 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C1, alphav, R1);
pstoreu(r0+0*Traits::ResPacketSize, R0);
pstoreu(r1+0*Traits::ResPacketSize, R1);
R0 = ploadu<ResPacket>(r2+0*Traits::ResPacketSize);
R1 = ploadu<ResPacket>(r3+0*Traits::ResPacketSize);
r0.storePacket(0 * Traits::ResPacketSize, R0);
r1.storePacket(0 * Traits::ResPacketSize, R1);
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
R1 = r3.loadPacket(0 * Traits::ResPacketSize);
traits.acc(C2, alphav, R0);
traits.acc(C3, alphav, R1);
pstoreu(r2+0*Traits::ResPacketSize, R0);
pstoreu(r3+0*Traits::ResPacketSize, R1);
r2.storePacket(0 * Traits::ResPacketSize, R0);
r3.storePacket(0 * Traits::ResPacketSize, R1);
}
// Deal with remaining columns of the rhs
for(Index j2=packet_cols4; j2<cols; j2++)
{
@ -1236,7 +1278,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
AccPacket C0;
traits.initAcc(C0);
ResScalar* r0 = &res[(j2+0)*resStride + i];
LinearMapper r0 = res.getLinearMapper(i, j2);
// performs "inner" products
const RhsScalar* blB = &blockB[j2*strideB+offsetB];
@ -1283,9 +1325,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
#undef EIGEN_GEBGP_ONESTEP
ResPacket R0;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = ploadu<ResPacket>(r0+0*Traits::ResPacketSize);
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
pstoreu(r0+0*Traits::ResPacketSize, R0);
r0.storePacket(0 * Traits::ResPacketSize, R0);
}
}
}
@ -1301,7 +1343,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
const LhsScalar* blA = &blockA[i*strideA+offsetA];
prefetch(&blA[0]);
const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
if( (SwappedTraits::LhsProgress % 4)==0 )
{
// NOTE The following piece of code wont work for 512 bit registers
@ -1310,32 +1352,32 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
straits.initAcc(C1);
straits.initAcc(C2);
straits.initAcc(C3);
const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
const Index endk = (depth/spk)*spk;
const Index endk4 = (depth/(spk*4))*(spk*4);
Index k=0;
for(; k<endk4; k+=4*spk)
{
SLhsPacket A0,A1;
SRhsPacket B_0,B_1;
straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
straits.loadRhsQuad(blA+0*spk, B_0);
straits.loadRhsQuad(blA+1*spk, B_1);
straits.madd(A0,B_0,C0,B_0);
straits.madd(A1,B_1,C1,B_1);
straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
straits.loadRhsQuad(blA+2*spk, B_0);
straits.loadRhsQuad(blA+3*spk, B_1);
straits.madd(A0,B_0,C2,B_0);
straits.madd(A1,B_1,C3,B_1);
blB += 4*SwappedTraits::LhsProgress;
blA += 4*spk;
}
@ -1344,11 +1386,11 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
{
SLhsPacket A0;
SRhsPacket B_0;
straits.loadLhsUnaligned(blB, A0);
straits.loadRhsQuad(blA, B_0);
straits.madd(A0,B_0,C0,B_0);
blB += SwappedTraits::LhsProgress;
blA += spk;
}
@ -1359,10 +1401,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
SResPacketHalf R = pgather<SResScalar, SResPacketHalf>(&res[j2*resStride + i], resStride);
SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
if(depth-endk>0)
{
// We have to handle the last row of the rhs which corresponds to a half-packet
@ -1378,14 +1420,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
{
straits.acc(predux4(C0), alphav, R);
}
pscatter(&res[j2*resStride + i], R, resStride);
res.scatterPacket(i, j2, R);
}
else
{
SResPacket R = pgather<SResScalar, SResPacket>(&res[j2*resStride + i], resStride);
SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
SResPacket alphav = pset1<SResPacket>(alpha);
straits.acc(C0, alphav, R);
pscatter(&res[j2*resStride + i], R, resStride);
res.scatterPacket(i, j2, R);
}
}
else // scalar path
@ -1397,9 +1439,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
{
LhsScalar A0;
RhsScalar B_0, B_1;
A0 = blA[k];
B_0 = blB[0];
B_1 = blB[1];
CJMADD(cj,A0,B_0,C0, B_0);
@ -1412,10 +1454,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
blB += 4;
}
res[(j2+0)*resStride + i] += alpha*C0;
res[(j2+1)*resStride + i] += alpha*C1;
res[(j2+2)*resStride + i] += alpha*C2;
res[(j2+3)*resStride + i] += alpha*C3;
res(i, j2 + 0) += alpha * C0;
res(i, j2 + 1) += alpha * C1;
res(i, j2 + 2) += alpha * C2;
res(i, j2 + 3) += alpha * C3;
}
}
}
@ -1436,7 +1478,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
RhsScalar B_0 = blB[k];
CJMADD(cj, A0, B_0, C0, B_0);
}
res[(j2+0)*resStride + i] += alpha*C0;
res(i, j2) += alpha * C0;
}
}
}
@ -1459,15 +1501,16 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
//
// 32 33 34 35 ...
// 36 37 38 39 ...
template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
{
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
typedef typename DataMapper::LinearMapper LinearMapper;
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
typedef typename packet_traits<Scalar>::type Packet;
enum { PacketSize = packet_traits<Scalar>::size };
@ -1478,30 +1521,29 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
const_blas_data_mapper<Scalar, Index, ColMajor> lhs(_lhs,lhsStride);
Index count = 0;
const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
: Pack2>1 ? (rows/Pack2)*Pack2 : 0;
Index i=0;
// Pack 3 packets
if(Pack1>=3*PacketSize)
{
for(; i<peeled_mc3; i+=3*PacketSize)
{
if(PanelMode) count += (3*PacketSize) * offset;
for(Index k=0; k<depth; k++)
{
Packet A, B, C;
A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
A = lhs.loadPacket(i+0*PacketSize, k);
B = lhs.loadPacket(i+1*PacketSize, k);
C = lhs.loadPacket(i+2*PacketSize, k);
pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
@ -1515,12 +1557,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
for(; i<peeled_mc2; i+=2*PacketSize)
{
if(PanelMode) count += (2*PacketSize) * offset;
for(Index k=0; k<depth; k++)
{
Packet A, B;
A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
A = lhs.loadPacket(i+0*PacketSize, k);
B = lhs.loadPacket(i+1*PacketSize, k);
pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
}
@ -1533,11 +1575,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
for(; i<peeled_mc1; i+=1*PacketSize)
{
if(PanelMode) count += (1*PacketSize) * offset;
for(Index k=0; k<depth; k++)
{
Packet A;
A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
A = lhs.loadPacket(i+0*PacketSize, k);
pstore(blockA+count, cj.pconj(A));
count+=PacketSize;
}
@ -1550,11 +1592,11 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
for(; i<peeled_mc0; i+=Pack2)
{
if(PanelMode) count += Pack2 * offset;
for(Index k=0; k<depth; k++)
for(Index w=0; w<Pack2; w++)
blockA[count++] = cj(lhs(i+w, k));
if(PanelMode) count += Pack2 * (stride-offset-depth);
}
}
@ -1567,15 +1609,16 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, ColMajor, Conj
}
}
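// Illustration only, not from this commit: the packed layout produced above, written out
// for a small assumed configuration (PacketSize = 4, Pack1 = 4, ColMajor lhs, no panel
// mode). For the row block i..i+3, blockA stores one packet per k, i.e. column slices
// laid out back to back:
//   blockA = { lhs(i+0..i+3, 0), lhs(i+0..i+3, 1), ..., lhs(i+0..i+3, depth-1), ... }
// so the micro-kernel can stream whole packets of A with a single aligned load per step.
// Larger Pack1 values interleave two or three packets per k, as the peeled loops above show.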
template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
{
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
typedef typename DataMapper::LinearMapper LinearMapper;
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
template<typename Scalar, typename Index, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
typedef typename packet_traits<Scalar>::type Packet;
enum { PacketSize = packet_traits<Scalar>::size };
@ -1585,13 +1628,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
EIGEN_UNUSED_VARIABLE(offset);
eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
const_blas_data_mapper<Scalar, Index, RowMajor> lhs(_lhs,lhsStride);
Index count = 0;
// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
int pack = Pack1;
Index i = 0;
while(pack>0)
@ -1611,7 +1653,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
for (Index m = 0; m < pack; m += PacketSize)
{
PacketBlock<Packet> kernel;
for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = ploadu<Packet>(&lhs(i+p+m, k));
for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
ptranspose(kernel);
for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
}
@ -1636,15 +1678,15 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
for(;w<pack;++w)
blockA[count++] = cj(lhs(i+w, k));
}
if(PanelMode) count += pack * (stride-offset-depth);
}
pack -= PacketSize;
if(pack<Pack2 && (pack+PacketSize)!=Pack2)
pack = Pack2;
}
for(; i<rows; i++)
{
if(PanelMode) count += offset;
@ -1661,17 +1703,18 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, RowMajor, Conj
// 4 5 6 7 16 17 18 19 25 28
// 8 9 10 11 20 21 22 23 26 29
// . . . . . . . . . .
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
{
typedef typename packet_traits<Scalar>::type Packet;
typedef typename DataMapper::LinearMapper LinearMapper;
enum { PacketSize = packet_traits<Scalar>::size };
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
EIGEN_UNUSED_VARIABLE(stride);
@ -1727,27 +1770,27 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
// if(PanelMode) count += 8 * (stride-offset-depth);
// }
// }
if(nr>=4)
{
for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
{
// skip what we have before
if(PanelMode) count += 4 * offset;
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
const Scalar* b1 = &rhs[(j2+1)*rhsStride];
const Scalar* b2 = &rhs[(j2+2)*rhsStride];
const Scalar* b3 = &rhs[(j2+3)*rhsStride];
const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
Index k=0;
if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
{
for(; k<peeled_k; k+=PacketSize) {
PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
kernel.packet[0] = ploadu<Packet>(&b0[k]);
kernel.packet[1] = ploadu<Packet>(&b1[k]);
kernel.packet[2] = ploadu<Packet>(&b2[k]);
kernel.packet[3] = ploadu<Packet>(&b3[k]);
kernel.packet[0] = dm0.loadPacket(k);
kernel.packet[1] = dm1.loadPacket(k);
kernel.packet[2] = dm2.loadPacket(k);
kernel.packet[3] = dm3.loadPacket(k);
ptranspose(kernel);
pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1]));
@ -1758,10 +1801,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
}
for(; k<depth; k++)
{
blockB[count+0] = cj(b0[k]);
blockB[count+1] = cj(b1[k]);
blockB[count+2] = cj(b2[k]);
blockB[count+3] = cj(b3[k]);
blockB[count+0] = cj(dm0(k));
blockB[count+1] = cj(dm1(k));
blockB[count+2] = cj(dm2(k));
blockB[count+3] = cj(dm3(k));
count += 4;
}
// skip what we have after
@ -1773,10 +1816,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
for(Index j2=packet_cols4; j2<cols; ++j2)
{
if(PanelMode) count += offset;
const Scalar* b0 = &rhs[(j2+0)*rhsStride];
const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
for(Index k=0; k<depth; k++)
{
blockB[count] = cj(b0[k]);
blockB[count] = cj(dm0(k));
count += 1;
}
if(PanelMode) count += (stride-offset-depth);
@ -1784,17 +1827,18 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
}
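
The packing kernels above no longer receive a raw const Scalar* plus an explicit stride: they are handed a DataMapper and address the matrix through getSubMapper/getLinearMapper/loadPacket. The sketch below is only illustrative — the member names follow the calls visible in this diff, while the class name and internals are hypothetical — but it shows a minimal column-major mapper such a kernel could be instantiated with:

#include <Eigen/Core>

// Hypothetical sketch: the subset of the mapper interface used by the packing
// kernels above, implemented for a plain column-major array with a run-time stride.
template<typename Scalar, typename Index>
class SimpleColMajorMapper {
public:
  typedef typename Eigen::internal::packet_traits<Scalar>::type Packet;

  // 1D view starting at coefficient (i,j) and walking down the column.
  class LinearMapper {
  public:
    explicit LinearMapper(const Scalar* p) : m_p(p) {}
    Scalar operator()(Index k) const { return m_p[k]; }                                   // dm0(k)
    Packet loadPacket(Index k) const { return Eigen::internal::ploadu<Packet>(m_p + k); } // dm0.loadPacket(k)
  private:
    const Scalar* m_p;
  };

  SimpleColMajorMapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {}

  Scalar operator()(Index i, Index j) const { return m_data[i + j*m_stride]; }            // rhs(k, j2)
  Packet loadPacket(Index i, Index j) const { return Eigen::internal::ploadu<Packet>(m_data + i + j*m_stride); }
  LinearMapper getLinearMapper(Index i, Index j) const { return LinearMapper(m_data + i + j*m_stride); }
  SimpleColMajorMapper getSubMapper(Index i, Index j) const { return SimpleColMajorMapper(m_data + i + j*m_stride, m_stride); }
  Index stride() const { return m_stride; }

private:
  const Scalar* m_data;
  Index m_stride;
};
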
// this version is optimized for row major matrices
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
{
typedef typename packet_traits<Scalar>::type Packet;
typedef typename DataMapper::LinearMapper LinearMapper;
enum { PacketSize = packet_traits<Scalar>::size };
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
EIGEN_UNUSED_VARIABLE(stride);
@ -1804,7 +1848,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
Index count = 0;
// if(nr>=8)
// {
// for(Index j2=0; j2<packet_cols8; j2+=8)
@ -1847,15 +1891,15 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
for(Index k=0; k<depth; k++)
{
if (PacketSize==4) {
Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
Packet A = rhs.loadPacket(k, j2);
pstoreu(blockB+count, cj.pconj(A));
count += PacketSize;
} else {
const Scalar* b0 = &rhs[k*rhsStride + j2];
blockB[count+0] = cj(b0[0]);
blockB[count+1] = cj(b0[1]);
blockB[count+2] = cj(b0[2]);
blockB[count+3] = cj(b0[3]);
const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
blockB[count+0] = cj(dm0(0));
blockB[count+1] = cj(dm0(1));
blockB[count+2] = cj(dm0(2));
blockB[count+3] = cj(dm0(3));
count += 4;
}
}
@ -1867,10 +1911,9 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
for(Index j2=packet_cols4; j2<cols; ++j2)
{
if(PanelMode) count += offset;
const Scalar* b0 = &rhs[j2];
for(Index k=0; k<depth; k++)
{
blockB[count] = cj(b0[k*rhsStride]);
blockB[count] = cj(rhs(k, j2));
count += 1;
}
if(PanelMode) count += stride-offset-depth;
@ -1883,8 +1926,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
* \sa setCpuCacheSize */
inline std::ptrdiff_t l1CacheSize()
{
std::ptrdiff_t l1, l2;
internal::manage_caching_sizes(GetAction, &l1, &l2);
std::ptrdiff_t l1, l2, l3;
internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
return l1;
}
@ -1892,8 +1935,8 @@ inline std::ptrdiff_t l1CacheSize()
* \sa setCpuCacheSize */
inline std::ptrdiff_t l2CacheSize()
{
std::ptrdiff_t l1, l2;
internal::manage_caching_sizes(GetAction, &l1, &l2);
std::ptrdiff_t l1, l2, l3;
internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
return l2;
}
@ -1902,9 +1945,9 @@ inline std::ptrdiff_t l2CacheSize()
* for the algorithms working per blocks.
*
* \sa computeProductBlockingSizes */
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
{
internal::manage_caching_sizes(SetAction, &l1, &l2);
internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
}
} // end namespace Eigen
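
For reference, with the third cache level threaded through manage_caching_sizes, the public cache-size helpers can be exercised as in the small sketch below (values are purely illustrative; sizes are in bytes and the setter overrides whatever was detected at runtime):

#include <Eigen/Core>
#include <cstdio>

int main()
{
  // Query the cache sizes Eigen currently uses for its blocking heuristics.
  std::ptrdiff_t l1 = Eigen::l1CacheSize();
  std::ptrdiff_t l2 = Eigen::l2CacheSize();
  std::printf("L1: %td bytes, L2: %td bytes\n", l1, l2);

  // Override them, e.g. to experiment with the blocking strategy
  // (example values: 32 KB L1, 256 KB L2, 8 MB L3).
  Eigen::setCpuCacheSizes(32*1024, 256*1024, 8*1024*1024);
  return 0;
}
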

View File

@ -59,21 +59,25 @@ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScal
static void run(Index rows, Index cols, Index depth,
const LhsScalar* _lhs, Index lhsStride,
const RhsScalar* _rhs, Index rhsStride,
ResScalar* res, Index resStride,
ResScalar* _res, Index resStride,
ResScalar alpha,
level3_blocking<LhsScalar,RhsScalar>& blocking,
GemmParallelInfo<Index>* info = 0)
{
const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
#ifdef EIGEN_HAS_OPENMP
if(info)
@ -95,7 +99,7 @@ static void run(Index rows, Index cols, Index depth,
// In order to reduce the chance that a thread has to wait for the other,
// let's start by packing B'.
pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc);
pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc);
// Pack A_k to A' in a parallel fashion:
// each thread packs the sub block A_k,i to A'_i where i is the thread id.
@ -105,8 +109,8 @@ static void run(Index rows, Index cols, Index depth,
// Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
while(info[tid].users!=0) {}
info[tid].users += threads;
pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length);
pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
// Notify the other threads that the part A'_i is ready to go.
info[tid].sync = k;
@ -119,9 +123,12 @@ static void run(Index rows, Index cols, Index depth,
// At this point we have to make sure that A'_i has been updated by the thread i,
// we use testAndSetOrdered to mimic a volatile access.
// However, no need to wait for the B' part which has been updated by the current thread!
if(shift>0)
while(info[i].sync!=k) {}
gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
if (shift>0) {
while(info[i].sync!=k) {
}
}
gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
}
// Then keep going as usual with the remaining B'
@ -130,10 +137,10 @@ static void run(Index rows, Index cols, Index depth,
const Index actual_nc = (std::min)(j+nc,cols)-j;
// pack B_k,j to B'
pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc);
pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc);
// C_j += A' * B'
gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha);
}
// Release all the sub blocks A'_i of A' for the current thread,
@ -159,28 +166,33 @@ static void run(Index rows, Index cols, Index depth,
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
for(Index k2=0; k2<depth; k2+=kc)
for(Index i2=0; i2<rows; i2+=mc)
{
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
const Index actual_mc = (std::min)(i2+mc,rows)-i2;
// OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
// => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
// Note that this panel will be read as many times as the number of blocks in the rhs's
// horizontal panel which is, in practice, a very low number.
pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows);
// For each kc x nc block of the rhs's horizontal panel...
for(Index j2=0; j2<cols; j2+=nc)
for(Index k2=0; k2<depth; k2+=kc)
{
const Index actual_nc = (std::min)(j2+nc,cols)-j2;
// We pack the rhs's block into a sequential chunk of memory (L2 caching)
// Note that this block will be read a very high number of times, which is equal to the number of
// micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
pack_rhs(blockB, &rhs(k2,j2), rhsStride, actual_kc, actual_nc);
// Everything is packed, we can now call the panel * block kernel:
gebp(res+j2*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
// OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
// => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
// Note that this panel will be read as many times as the number of blocks in the rhs's
// horizontal panel which is, in practice, a very low number.
pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc);
// For each kc x nc block of the rhs's horizontal panel...
for(Index j2=0; j2<cols; j2+=nc)
{
const Index actual_nc = (std::min)(j2+nc,cols)-j2;
// We pack the rhs's block into a sequential chunk of memory (L2 caching)
// Note that this block will be read a very high number of times, which is equal to the number of
// micro horizontal panels of the large rhs's panel (e.g., rows/12 times).
pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
// Everything is packed, we can now call the panel * block kernel:
gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
}
}
}
}
@ -287,7 +299,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
public:
gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, bool /*full_rows*/ = false)
gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, int /*num_threads*/, bool /*full_rows = false*/)
{
this->m_mc = ActualRows;
this->m_nc = ActualCols;
@ -319,23 +331,23 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
public:
gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, bool full_rows = false)
gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, int num_threads, bool l3_blocking)
{
this->m_mc = Transpose ? cols : rows;
this->m_nc = Transpose ? rows : cols;
this->m_kc = depth;
if(full_rows)
if(l3_blocking)
{
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads);
}
else // no l3 blocking
{
DenseIndex m = this->m_mc;
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc);
}
else // full columns
{
DenseIndex n = this->m_nc;
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n);
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, n, num_threads);
}
m_sizeA = this->m_mc * this->m_kc;
m_sizeB = this->m_kc * this->m_nc;
}
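
The blocking-space constructors above now forward an explicit thread count to the blocking heuristic. computeProductBlockingSizes is an internal helper and its exact signature may change, but based on the calls in this diff it can be probed roughly as follows (purely illustrative values):

#include <Eigen/Core>
#include <iostream>

int main()
{
  // On input: full problem sizes (depth, rows, cols); on output: the cache
  // blocking sizes kc, mc, nc the heuristic selected for 4 threads.
  std::ptrdiff_t kc = 2048, mc = 2048, nc = 2048;
  Eigen::internal::computeProductBlockingSizes<float,float>(kc, mc, nc, 4);
  std::cout << "kc=" << kc << " mc=" << mc << " nc=" << nc << std::endl;
  return 0;
}
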
@ -445,8 +457,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor;
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true);
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), Dest::Flags&RowMajorBit);
}

View File

@ -58,27 +58,31 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha)
{
const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
Index kc = depth; // cache block size along the K direction
Index mc = size; // cache block size along the M direction
Index nc = size; // cache block size along the N direction
computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc);
computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc, 1);
// !!! mc must be a multiple of nr:
if(mc > Traits::nr)
mc = (mc/Traits::nr)*Traits::nr;
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0);
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel <LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
for(Index k2=0; k2<depth; k2+=kc)
@ -86,29 +90,30 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
// note that the actual rhs is the transpose/adjoint of mat
pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, size);
pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, size);
for(Index i2=0; i2<size; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,size)-i2;
pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
// the selected actual_mc * size panel of res is split into three different part:
// 1 - before the diagonal => processed with gebp or skipped
// 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
// 3 - after the diagonal => processed with gebp or skipped
if (UpLo==Lower)
gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
-1, -1, 0, 0);
gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
(std::min)(size,i2), alpha, -1, -1, 0, 0);
sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
if (UpLo==Upper)
{
Index j2 = i2+actual_mc;
gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
-1, -1, 0, 0);
gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc,
actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0);
}
}
}
@ -129,13 +134,16 @@ struct tribb_kernel
{
typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
typedef typename Traits::ResScalar ResScalar;
enum {
BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr)
};
void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
{
gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper;
ResMapper res(_res, resStride);
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
// let's process the block per panel of actual_mc x BlockSize,
@ -146,7 +154,7 @@ struct tribb_kernel
const RhsScalar* actual_b = blockB+j*depth;
if(UpLo==Upper)
gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
// selfadjoint micro block
@ -154,12 +162,12 @@ struct tribb_kernel
Index i = j;
buffer.setZero();
// 1 - apply the kernel on the temporary buffer
gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
// 2 - triangular accumulation
for(Index j1=0; j1<actualBlockSize; ++j1)
{
ResScalar* r = res + (j+j1)*resStride + i;
ResScalar* r = &res(i, j + j1);
for(Index i1=UpLo==Lower ? j1 : 0;
UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
r[i1] += buffer(i1,j1);
@ -169,8 +177,8 @@ struct tribb_kernel
if(UpLo==Lower)
{
Index i = j+actualBlockSize;
gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i,
depth, actualBlockSize, alpha, -1, -1, 0, 0);
}
}
}

View File

@ -10,7 +10,7 @@
#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H
namespace Eigen {
namespace Eigen {
namespace internal {
@ -48,17 +48,17 @@ namespace internal {
* // we currently fall back to the NoneAligned case
*
* The same reasoning applies for the transposed case.
*
*
* The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
* One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
* strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on an 8-byte boundary are not too slow
* compared to unaligned loads on a 4-byte boundary.
*
*/
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
enum {
Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
@ -78,17 +78,17 @@ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
EIGEN_DONT_INLINE static void run(
Index rows, Index cols,
const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsIncr,
const LhsMapper& lhs,
const RhsMapper& rhs,
ResScalar* res, Index resIncr,
RhsScalar alpha);
};
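
Similarly, the matrix-vector kernel defined below no longer dereferences raw lhs/rhs pointers: it pulls one-dimensional VectorMapper views out of the mappers and loads packets through them with an explicit alignment tag. A hypothetical sketch of the operations it relies on (names follow the calls below; the implementation is only an assumption):

#include <Eigen/Core>
#include <cstddef>

// Hypothetical sketch: a vector view supporting the coefficient access,
// alignment query and alignment-templated packet loads used by the gemv kernel.
template<typename Scalar, typename Index>
class SimpleVectorMapper {
public:
  explicit SimpleVectorMapper(const Scalar* p) : m_p(p) {}

  Scalar operator()(Index i) const { return m_p[i]; }        // lhs0(j)

  template<typename PacketType, int AlignmentType>
  PacketType load(Index i) const {                           // lhs0.template load<LhsPacket, Aligned>(j)
    return Eigen::internal::ploadt<PacketType, AlignmentType>(m_p + i);
  }

  template<typename PacketType>
  bool aligned(Index i) const {                              // lhs0.template aligned<LhsPacket>(j)
    return (reinterpret_cast<std::size_t>(m_p + i) % sizeof(PacketType)) == 0;
  }

private:
  const Scalar* m_p;
};
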
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
Index rows, Index cols,
const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsIncr,
const LhsMapper& lhs,
const RhsMapper& rhs,
ResScalar* res, Index resIncr,
RhsScalar alpha)
{
@ -97,14 +97,16 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif
#define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
#define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
pstore(&res[j], \
padd(pload<ResPacket>(&res[j]), \
padd( \
padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \
pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \
pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))
padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j), ptmp0), \
pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j), ptmp1)), \
padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j), ptmp2), \
pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j), ptmp3)) )))
typedef typename LhsMapper::VectorMapper LhsScalars;
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
@ -118,7 +120,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
const Index ResPacketAlignedMask = ResPacketSize-1;
// const Index PeelAlignedMask = ResPacketSize*peels-1;
const Index size = rows;
const Index lhsStride = lhs.stride();
// How many coeffs of the result do we have to skip to be aligned.
// Here we assume data are at least aligned on the base scalar type.
Index alignedStart = internal::first_aligned(res,size);
@ -131,15 +135,16 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
: FirstAligned;
// we cannot assume the first element is aligned because of sub-matrices
const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);
const Index lhsAlignmentOffset = lhs.firstAligned(size);
// find how many columns we have to skip to be aligned with the result (if possible)
Index skipColumns = 0;
// if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (size_t(res)%sizeof(ResScalar)) )
{
alignedSize = 0;
alignedStart = 0;
alignmentPattern = NoneAligned;
}
else if(LhsPacketSize > 4)
{
@ -149,7 +154,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
}
else if (LhsPacketSize>1)
{
eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
// eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
while (skipColumns<LhsPacketSize &&
alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
@ -166,10 +171,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
// note that the skipped columns are processed later.
}
eigen_internal_assert( (alignmentPattern==NoneAligned)
/* eigen_internal_assert( (alignmentPattern==NoneAligned)
|| (skipColumns + columnsAtOnce >= cols)
|| LhsPacketSize > size
|| (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
|| (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
}
else if(Vectorizable)
{
@ -178,20 +183,20 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
alignmentPattern = AllAligned;
}
Index offset1 = (FirstAligned && alignmentStep==1?3:1);
Index offset3 = (FirstAligned && alignmentStep==1?1:3);
const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
{
RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));
// this helps a lot generating better binary code
const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
*lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1),
lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3);
if (Vectorizable)
{
@ -199,10 +204,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
// process initial unaligned coeffs
for (Index j=0; j<alignedStart; ++j)
{
res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
}
if (alignedSize>alignedStart)
@ -211,11 +216,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
{
case AllAligned:
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,d,d);
_EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
break;
case EvenAligned:
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,du,d);
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
break;
case FirstAligned:
{
@ -225,28 +230,28 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
ResPacket T0, T1;
A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
for (; j<peeledSize; j+=peels*ResPacketSize)
{
A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
A00 = pload<LhsPacket>(&lhs0[j]);
A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
A00 = lhs0.template load<LhsPacket, Aligned>(j);
A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
T0 = pcj.pmadd(A01, ptmp1, T0);
A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
T0 = pcj.pmadd(A02, ptmp2, T0);
A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
T0 = pcj.pmadd(A03, ptmp3, T0);
pstore(&res[j],T0);
A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
T1 = pcj.pmadd(A11, ptmp1, T1);
T1 = pcj.pmadd(A12, ptmp2, T1);
T1 = pcj.pmadd(A13, ptmp3, T1);
@ -254,12 +259,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
}
}
for (; j<alignedSize; j+=ResPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,du,du);
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
break;
}
default:
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
_EIGEN_ACCUMULATE_PACKETS(du,du,du);
_EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
break;
}
}
@ -268,10 +273,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
/* process remaining coeffs (or all if there is no explicit vectorization) */
for (Index j=alignedSize; j<size; ++j)
{
res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
}
}
@ -282,27 +287,27 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
{
for (Index k=start; k<end; ++k)
{
RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
const LhsScalar* lhs0 = lhs + k*lhsStride;
RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
const LhsScalars lhs0 = lhs.getVectorMapper(0, k);
if (Vectorizable)
{
/* explicit vectorization */
// process first unaligned result's coeffs
for (Index j=0; j<alignedStart; ++j)
res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
// process aligned result's coeffs
if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
if (lhs0.template aligned<LhsPacket>(alignedStart))
for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
else
for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
}
// process remaining scalars (or all if no explicit vectorization)
for (Index i=alignedSize; i<size; ++i)
res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
}
if (skipColumns)
{
@ -326,8 +331,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
* - alpha is always a complex (or converted to a complex)
* - no vectorization
*/
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
@ -346,70 +351,75 @@ typedef typename packet_traits<ResScalar>::type _ResPacket;
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
EIGEN_DONT_INLINE static void run(
Index rows, Index cols,
const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsIncr,
const LhsMapper& lhs,
const RhsMapper& rhs,
ResScalar* res, Index resIncr,
ResScalar alpha);
};
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
Index rows, Index cols,
const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsIncr,
const LhsMapper& lhs,
const RhsMapper& rhs,
ResScalar* res, Index resIncr,
ResScalar alpha)
{
EIGEN_UNUSED_VARIABLE(rhsIncr);
eigen_internal_assert(rhsIncr==1);
eigen_internal_assert(rhs.stride()==1);
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif
#define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
RhsPacket b = pload<RhsPacket>(&rhs[j]); \
ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
#define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
typedef typename LhsMapper::VectorMapper LhsScalars;
enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
const Index rowsAtOnce = 4;
const Index peels = 2;
const Index RhsPacketAlignedMask = RhsPacketSize-1;
const Index LhsPacketAlignedMask = LhsPacketSize-1;
// const Index PeelAlignedMask = RhsPacketSize*peels-1;
const Index depth = cols;
const Index lhsStride = lhs.stride();
// How many coeffs of the result do we have to skip to be aligned.
// Here we assume data are at least aligned on the base scalar type
// if that's not the case then vectorization is discarded, see below.
Index alignedStart = internal::first_aligned(rhs, depth);
Index alignedStart = rhs.firstAligned(depth);
Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
Index alignmentPattern = alignmentStep==0 ? AllAligned
: alignmentStep==(LhsPacketSize/2) ? EvenAligned
: FirstAligned;
: alignmentStep==(LhsPacketSize/2) ? EvenAligned
: FirstAligned;
// we cannot assume the first element is aligned because of sub-matrices
const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);
const Index lhsAlignmentOffset = lhs.firstAligned(depth);
const Index rhsAlignmentOffset = rhs.firstAligned(rows);
// find how many rows we have to skip to be aligned with rhs (if possible)
Index skipRows = 0;
// if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) ||
(lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) ||
(rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) )
{
alignedSize = 0;
alignedStart = 0;
alignmentPattern = NoneAligned;
}
else if(LhsPacketSize > 4)
{
@ -418,7 +428,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
}
else if (LhsPacketSize>1)
{
eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
// eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
while (skipRows<LhsPacketSize &&
alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
@ -434,11 +444,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
skipRows = (std::min)(skipRows,Index(rows));
// note that the skipped rows are processed later.
}
eigen_internal_assert( alignmentPattern==NoneAligned
/* eigen_internal_assert( alignmentPattern==NoneAligned
|| LhsPacketSize==1
|| (skipRows + rowsAtOnce >= rows)
|| LhsPacketSize > depth
|| (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
|| (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/
}
else if(Vectorizable)
{
@ -447,8 +457,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
alignmentPattern = AllAligned;
}
Index offset1 = (FirstAligned && alignmentStep==1?3:1);
Index offset3 = (FirstAligned && alignmentStep==1?1:3);
const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
@ -457,8 +467,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
// this helps the compiler generating good binary code
const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
*lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0), lhs1 = lhs.getVectorMapper(i+offset1, 0),
lhs2 = lhs.getVectorMapper(i+2, 0), lhs3 = lhs.getVectorMapper(i+offset3, 0);
if (Vectorizable)
{
@ -470,9 +480,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
// FIXME this loop gets vectorized by the compiler!
for (Index j=0; j<alignedStart; ++j)
{
RhsScalar b = rhs[j];
tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
RhsScalar b = rhs(j, 0);
tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
}
if (alignedSize>alignedStart)
@ -481,11 +491,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
{
case AllAligned:
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,d,d);
_EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
break;
case EvenAligned:
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,du,d);
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
break;
case FirstAligned:
{
@ -499,39 +509,39 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
* than basic unaligned loads.
*/
LhsPacket A01, A02, A03, A11, A12, A13;
A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
for (; j<peeledSize; j+=peels*RhsPacketSize)
{
RhsPacket b = pload<RhsPacket>(&rhs[j]);
A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
ptmp1 = pcj.pmadd(A01, b, ptmp1);
A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
ptmp2 = pcj.pmadd(A02, b, ptmp2);
A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
ptmp3 = pcj.pmadd(A03, b, ptmp3);
A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
ptmp1 = pcj.pmadd(A11, b, ptmp1);
ptmp2 = pcj.pmadd(A12, b, ptmp2);
ptmp3 = pcj.pmadd(A13, b, ptmp3);
}
}
for (; j<alignedSize; j+=RhsPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,du,du);
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
break;
}
default:
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
_EIGEN_ACCUMULATE_PACKETS(du,du,du);
_EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
break;
}
tmp0 += predux(ptmp0);
@ -545,9 +555,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
// FIXME this loop gets vectorized by the compiler!
for (Index j=alignedSize; j<depth; ++j)
{
RhsScalar b = rhs[j];
tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
RhsScalar b = rhs(j, 0);
tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
}
res[i*resIncr] += alpha*tmp0;
res[(i+offset1)*resIncr] += alpha*tmp1;
@ -564,28 +574,28 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
{
EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
ResPacket ptmp0 = pset1<ResPacket>(tmp0);
const LhsScalar* lhs0 = lhs + i*lhsStride;
const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
// process first unaligned result's coeffs
// FIXME this loop gets vectorized by the compiler!
for (Index j=0; j<alignedStart; ++j)
tmp0 += cj.pmul(lhs0[j], rhs[j]);
tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
if (alignedSize>alignedStart)
{
// process aligned rhs coeffs
if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
if (lhs0.template aligned<LhsPacket>(alignedStart))
for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
else
for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
tmp0 += predux(ptmp0);
}
// process remaining scalars
// FIXME this loop gets vectorized by the compiler!
for (Index j=alignedSize; j<depth; ++j)
tmp0 += cj.pmul(lhs0[j], rhs[j]);
tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
res[i*resIncr] += alpha*tmp0;
}
if (skipRows)

View File

@ -49,8 +49,8 @@ inline void initParallel()
{
int nbt;
internal::manage_multi_threading(GetAction, &nbt);
std::ptrdiff_t l1, l2;
internal::manage_caching_sizes(GetAction, &l1, &l2);
std::ptrdiff_t l1, l2, l3;
internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
}
/** \returns the max number of threads reserved for Eigen

View File

@ -324,20 +324,26 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
Index rows, Index cols,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* _res, Index resStride,
const Scalar& alpha)
{
Index size = rows;
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef gebp_traits<Scalar,Scalar> Traits;
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
Index kc = size; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1);
// kc must be smaller than mc
kc = (std::min)(kc,mc);
@ -346,10 +352,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
Scalar* blockB = allocatedBlockB;
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
for(Index k2=0; k2<size; k2+=kc)
{
@ -358,7 +364,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
// we have selected one row panel of rhs and one column panel of lhs
// pack rhs's panel into a sequential chunk of memory
// and expand each coeff to a constant packet for further reuse
pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, cols);
// the selected lhs's panel has to be split in three different parts:
// 1 - the transposed panel above the diagonal block => transposed packed copy
@ -368,9 +374,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
{
const Index actual_mc = (std::min)(i2+mc,k2)-i2;
// transposed packed copy
pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);
pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(i2, k2), actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
// the block diagonal
{
@ -378,16 +384,16 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
// symmetric packed copy
pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
for(Index i2=k2+kc; i2<size; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,size)-i2;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
}
}
@ -414,26 +420,29 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
Index rows, Index cols,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* _res, Index resStride,
const Scalar& alpha)
{
Index size = cols;
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
typedef gebp_traits<Scalar,Scalar> Traits;
Index kc = size; // cache block size along the K direction
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
ResMapper res(_res,resStride);
Index kc = size; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1);
std::size_t sizeB = kc*cols;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
Scalar* blockB = allocatedBlockB;
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=0; k2<size; k2+=kc)
@ -446,9 +455,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
for(Index i2=0; i2<rows; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,rows)-i2;
pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
}
}

View File

@ -108,7 +108,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
Index _rows, Index _cols, Index _depth,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* _res, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
// strip zeros
@ -117,8 +117,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
Index depth = IsLower ? diagSize : _depth;
Index cols = _cols;
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@ -136,9 +140,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
else
triangularBuffer.diagonal().setOnes();
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=IsLower ? depth : 0;
IsLower ? k2>0 : k2<depth;
@ -154,7 +158,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
k2 = k2+actual_kc-kc;
}
pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols);
pack_rhs(blockB, rhs.getSubMapper(actual_k2,0), actual_kc, cols);
// the selected lhs's panel has to be split in three different parts:
// 1 - the part which is zero => skip it
@ -182,9 +186,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i)
triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k);
}
pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);
pack_lhs(blockA, LhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth);
gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
gebp_kernel(res.getSubMapper(startBlock, 0), blockA, blockB,
actualPanelWidth, actualPanelWidth, cols, alpha,
actualPanelWidth, actual_kc, 0, blockBOffset);
// GEBP with remaining micro panel
@ -192,9 +197,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
{
Index startTarget = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2;
pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
pack_lhs(blockA, lhs.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB,
lengthTarget, actualPanelWidth, cols, alpha,
actualPanelWidth, actual_kc, 0, blockBOffset);
}
}
@ -206,10 +212,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
for(Index i2=start; i2<end; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,end)-i2;
gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0);
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
actual_kc, cols, alpha, -1, -1, 0, 0);
}
}
}
@ -247,7 +254,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index _rows, Index _cols, Index _depth,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* _res, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
// strip zeros
@ -256,8 +263,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index depth = IsLower ? _depth : diagSize;
Index cols = IsLower ? diagSize : _cols;
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@ -275,10 +286,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
else
triangularBuffer.diagonal().setOnes();
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
for(Index k2=IsLower ? 0 : depth;
IsLower ? k2<depth : k2>0;
@ -302,7 +313,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Scalar* geb = blockB+ts*ts;
geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar));
pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs);
pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs);
// pack the triangular part of the rhs padding the unrolled blocks with zeros
if(ts>0)
@ -315,7 +326,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
// general part
pack_rhs_panel(blockB+j2*actual_kc,
&rhs(actual_k2+panelOffset, actual_j2), rhsStride,
rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
panelLength, actualPanelWidth,
actual_kc, panelOffset);
@ -329,7 +340,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
}
pack_rhs_panel(blockB+j2*actual_kc,
triangularBuffer.data(), triangularBuffer.outerStride(),
RhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()),
actualPanelWidth, actualPanelWidth,
actual_kc, j2);
}
@ -338,7 +349,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
for (Index i2=0; i2<rows; i2+=mc)
{
const Index actual_mc = (std::min)(mc,rows-i2);
pack_lhs(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
pack_lhs(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
// triangular kernel
if(ts>0)
@ -349,7 +360,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth;
Index blockOffset = IsLower ? j2 : 0;
gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride,
gebp_kernel(res.getSubMapper(i2, actual_k2 + j2),
blockA, blockB+j2*actual_kc,
actual_mc, panelLength, actualPanelWidth,
alpha,
@ -357,7 +368,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
blockOffset, blockOffset);// offsets
}
}
gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2),
blockA, geb, actual_mc, actual_kc, rs,
alpha,
-1, -1, 0, 0);
@ -402,7 +413,7 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows()))
: ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols()));
BlockingType blocking(stripedRows, stripedCols, stripedDepth);
BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false);
internal::product_triangular_matrix_matrix<Scalar, Index,
Mode, LhsIsTriangular,

View File

@ -10,7 +10,7 @@
#ifndef EIGEN_TRIANGULARMATRIXVECTOR_H
#define EIGEN_TRIANGULARMATRIXVECTOR_H
namespace Eigen {
namespace internal {
@ -43,7 +43,7 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap;
const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr));
typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
@ -51,6 +51,9 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap;
ResMap res(_res,rows);
typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
for (Index pi=0; pi<size; pi+=PanelWidth)
{
Index actualPanelWidth = (std::min)(PanelWidth, size-pi);
@ -68,19 +71,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
if (r>0)
{
Index s = IsLower ? pi+actualPanelWidth : 0;
general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run(
general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
r, actualPanelWidth,
&lhs.coeffRef(s,pi), lhsStride,
&rhs.coeffRef(pi), rhsIncr,
LhsMapper(&lhs.coeffRef(s,pi), lhsStride),
RhsMapper(&rhs.coeffRef(pi), rhsIncr),
&res.coeffRef(s), resIncr, alpha);
}
}
if((!IsLower) && cols>size)
{
general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs>::run(
general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
rows, cols-size,
&lhs.coeffRef(0,size), lhsStride,
&rhs.coeffRef(size), rhsIncr,
LhsMapper(&lhs.coeffRef(0,size), lhsStride),
RhsMapper(&rhs.coeffRef(size), rhsIncr),
_res, resIncr, alpha);
}
}
@ -118,7 +121,10 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap;
ResMap res(_res,rows,InnerStride<>(resIncr));
typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
for (Index pi=0; pi<diagSize; pi+=PanelWidth)
{
Index actualPanelWidth = (std::min)(PanelWidth, diagSize-pi);
@ -136,19 +142,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
if (r>0)
{
Index s = IsLower ? 0 : pi + actualPanelWidth;
general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run(
general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
actualPanelWidth, r,
&lhs.coeffRef(pi,s), lhsStride,
&rhs.coeffRef(s), rhsIncr,
LhsMapper(&lhs.coeffRef(pi,s), lhsStride),
RhsMapper(&rhs.coeffRef(s), rhsIncr),
&res.coeffRef(pi), resIncr, alpha);
}
}
if(IsLower && rows>diagSize)
{
general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs>::run(
general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
rows-diagSize, cols,
&lhs.coeffRef(diagSize,0), lhsStride,
&rhs.coeffRef(0), rhsIncr,
LhsMapper(&lhs.coeffRef(diagSize,0), lhsStride),
RhsMapper(&rhs.coeffRef(0), rhsIncr),
&res.coeffRef(diagSize), resIncr, alpha);
}
}
@ -231,7 +237,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
@ -251,7 +257,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
else
MappedDest(actualDestPtr, dest.size()) = dest;
}
internal::triangular_matrix_vector_product
<Index,Mode,
LhsScalar, LhsBlasTraits::NeedToConjugate,
@ -311,7 +317,7 @@ template<int Mode> struct trmv_selector<Mode,RowMajor>
#endif
Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
}
internal::triangular_matrix_vector_product
<Index,Mode,
LhsScalar, LhsBlasTraits::NeedToConjugate,

View File

@ -52,10 +52,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
level3_blocking<Scalar,Scalar>& blocking)
{
Index cols = otherSize;
const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride);
blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride);
typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper;
TriMapper tri(_tri, triStride);
OtherMapper other(_other, otherStride);
typedef gebp_traits<Scalar,Scalar> Traits;
enum {
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
IsLower = (Mode&Lower) == Lower
@ -71,14 +75,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
conj_if<Conjugate> conj;
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs;
gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
// the goal here is to subdivide the Rhs panels such that we keep some cache
// coherence when accessing the rhs elements
std::ptrdiff_t l1, l2;
manage_caching_sizes(GetAction, &l1, &l2);
std::ptrdiff_t l1, l2, l3;
manage_caching_sizes(GetAction, &l1, &l2, &l3);
Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0;
subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
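// (Worked example with assumed numbers, not taken from the patch: for float and an
//  L2 cache of 256 KB with otherStride = 1000, the line above yields
//  subcols = 262144/(4*4*1000) = 16, which is then rounded down to a multiple of
//  Traits::nr and clamped to at least Traits::nr.)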
@ -146,16 +150,16 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
Index blockBOffset = IsLower ? k1 : lengthTarget;
// update the respective rows of B from other
pack_rhs(blockB+actual_kc*j2, &other(startBlock,j2), otherStride, actualPanelWidth, actual_cols, actual_kc, blockBOffset);
pack_rhs(blockB+actual_kc*j2, other.getSubMapper(startBlock,j2), actualPanelWidth, actual_cols, actual_kc, blockBOffset);
// GEBP
if (lengthTarget>0)
{
Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc;
pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
actualPanelWidth, actual_kc, 0, blockBOffset);
}
}
@ -170,9 +174,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
const Index actual_mc = (std::min)(mc,end-i2);
if (actual_mc>0)
{
pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc);
pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? k2 : k2-kc), actual_kc, actual_mc);
gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
}
}
}
@ -198,8 +202,11 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
level3_blocking<Scalar,Scalar>& blocking)
{
Index rows = otherSize;
const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride);
blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride);
typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
LhsMapper lhs(_other, otherStride);
RhsMapper rhs(_tri, triStride);
typedef gebp_traits<Scalar,Scalar> Traits;
enum {
@ -218,10 +225,10 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
conj_if<Conjugate> conj;
gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
for(Index k2=IsLower ? size : 0;
IsLower ? k2>0 : k2<size;
@ -234,7 +241,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
Scalar* geb = blockB+actual_kc*actual_kc;
if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs);
if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs);
// triangular packing (we only pack the panels off the diagonal,
// neglecting the blocks overlapping the diagonal)
@ -248,7 +255,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
if (panelLength>0)
pack_rhs_panel(blockB+j2*actual_kc,
&rhs(actual_k2+panelOffset, actual_j2), triStride,
rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
panelLength, actualPanelWidth,
actual_kc, panelOffset);
}
@ -276,7 +283,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
// GEBP
if(panelLength>0)
{
gebp_kernel(&lhs(i2,absolute_j2), otherStride,
gebp_kernel(lhs.getSubMapper(i2,absolute_j2),
blockA, blockB+j2*actual_kc,
actual_mc, panelLength, actualPanelWidth,
Scalar(-1),
@ -303,14 +310,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
}
// pack the just computed part of lhs to A
pack_lhs_panel(blockA, _other+absolute_j2*otherStride+i2, otherStride,
pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride),
actualPanelWidth, actual_mc,
actual_kc, j2);
}
}
if (rs>0)
gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb,
actual_mc, actual_kc, rs, Scalar(-1),
-1, -1, 0, 0);
}

View File

@ -10,7 +10,7 @@
#ifndef EIGEN_TRIANGULAR_SOLVER_VECTOR_H
#define EIGEN_TRIANGULAR_SOLVER_VECTOR_H
namespace Eigen {
namespace internal {
@ -25,7 +25,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Co
>::run(size, _lhs, lhsStride, rhs);
}
};
// forward and backward substitution, row-major, rhs is a vector
template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor>
@ -37,6 +37,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
{
typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
typename internal::conditional<
Conjugate,
const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
@ -58,10 +62,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
Index startRow = IsLower ? pi : pi-actualPanelWidth;
Index startCol = IsLower ? 0 : pi;
general_matrix_vector_product<Index,LhsScalar,RowMajor,Conjugate,RhsScalar,false>::run(
general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
actualPanelWidth, r,
&lhs.coeffRef(startRow,startCol), lhsStride,
rhs + startCol, 1,
LhsMapper(&lhs.coeffRef(startRow,startCol), lhsStride),
RhsMapper(rhs + startCol, 1),
rhs + startRow, 1,
RhsScalar(-1));
}
@ -72,7 +76,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
Index s = IsLower ? pi : i+1;
if (k>0)
rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
if(!(Mode & UnitDiag))
rhs[i] /= cjLhs(i,i);
}
@ -91,6 +95,8 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
{
typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
typename internal::conditional<Conjugate,
const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
const LhsMap&
@ -122,10 +128,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
// let's directly call the low level product function because:
// 1 - it is faster to compile
// 2 - it is slightly faster at runtime
general_matrix_vector_product<Index,LhsScalar,ColMajor,Conjugate,RhsScalar,false>::run(
general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
r, actualPanelWidth,
&lhs.coeffRef(endBlock,startBlock), lhsStride,
rhs+startBlock, 1,
LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),
RhsMapper(rhs+startBlock, 1),
rhs+endBlock, 1, RhsScalar(-1));
}
}

View File

@ -18,13 +18,13 @@ namespace Eigen {
namespace internal {
// forward declarations
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
struct gebp_kernel;
template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
struct gemm_pack_rhs;
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
struct gemm_pack_lhs;
template<
@ -34,7 +34,9 @@ template<
int ResStorageOrder>
struct general_matrix_matrix_product;
template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version=Specialized>
template<typename Index,
typename LhsScalar, typename LhsMapper, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version=Specialized>
struct general_matrix_vector_product;
@ -117,32 +119,133 @@ template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::R
static EIGEN_STRONG_INLINE typename NumTraits<Scalar>::Real run(const Scalar& x) { return numext::real(x); }
};
// Lightweight helper class to access matrix coefficients.
// Yes, this is somewhat redundant with Map<>, but this version is much lighter,
// which should give better compilation performance (time and code quality).
template<typename Scalar, typename Index, int StorageOrder>
class blas_data_mapper
{
template<typename Scalar, typename Index>
class BlasVectorMapper {
public:
blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
EIGEN_STRONG_INLINE Scalar& operator()(Index i, Index j)
{ return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {}
EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
return m_data[i];
}
template <typename Packet, int AlignmentType>
EIGEN_ALWAYS_INLINE Packet load(Index i) const {
return ploadt<Packet, AlignmentType>(m_data + i);
}
template <typename Packet>
bool aligned(Index i) const {
return (size_t(m_data+i)%sizeof(Packet))==0;
}
protected:
Scalar* EIGEN_RESTRICT m_data;
Index m_stride;
Scalar* m_data;
};
template<typename Scalar, typename Index, int AlignmentType>
class BlasLinearMapper {
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {}
EIGEN_ALWAYS_INLINE void prefetch(int i) const {
internal::prefetch(&operator()(i));
}
EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const {
return m_data[i];
}
EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
return ploadt<Packet, AlignmentType>(m_data + i);
}
EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
return ploadt<HalfPacket, AlignmentType>(m_data + i);
}
EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
pstoret<Scalar, Packet, AlignmentType>(m_data + i, p);
}
protected:
Scalar *m_data;
};
// Lightweight helper class to access matrix coefficients.
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned>
class blas_data_mapper {
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
typedef BlasVectorMapper<Scalar, Index> VectorMapper;
EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>
getSubMapper(Index i, Index j) const {
return blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>(&operator()(i, j), m_stride);
}
EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
return LinearMapper(&operator()(i, j));
}
EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
return VectorMapper(&operator()(i, j));
}
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride];
}
EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
return ploadt<Packet, AlignmentType>(&operator()(i, j));
}
EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
return ploadt<HalfPacket, AlignmentType>(&operator()(i, j));
}
template<typename SubPacket>
EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, SubPacket p) const {
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
}
template<typename SubPacket>
EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
}
const Index stride() const { return m_stride; }
Index firstAligned(Index size) const {
if (size_t(m_data)%sizeof(Scalar)) {
return -1;
}
return internal::first_aligned(m_data, size);
}
protected:
Scalar* EIGEN_RESTRICT m_data;
const Index m_stride;
};
// lightweight helper class to access matrix coefficients (const version)
template<typename Scalar, typename Index, int StorageOrder>
class const_blas_data_mapper
{
class const_blas_data_mapper : public blas_data_mapper<const Scalar, Index, StorageOrder> {
public:
const_blas_data_mapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
EIGEN_STRONG_INLINE const Scalar& operator()(Index i, Index j) const
{ return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
protected:
const Scalar* EIGEN_RESTRICT m_data;
Index m_stride;
EIGEN_ALWAYS_INLINE const_blas_data_mapper(const Scalar *data, Index stride) : blas_data_mapper<const Scalar, Index, StorageOrder>(data, stride) {}
EIGEN_ALWAYS_INLINE const_blas_data_mapper<Scalar, Index, StorageOrder> getSubMapper(Index i, Index j) const {
return const_blas_data_mapper<Scalar, Index, StorageOrder>(&(this->operator()(i, j)), this->m_stride);
}
};
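For orientation, here is a minimal standalone sketch (not part of the patch; the buffer, sizes and the float/int choice are assumptions) of how these mappers are driven by the packing routines and kernels: a blas_data_mapper wraps a raw pointer plus an outer stride, operator()(i,j) resolves coefficients according to the storage order, and getSubMapper() re-bases the view onto a sub-block.

#include <Eigen/Core>

int main() {
  typedef Eigen::internal::blas_data_mapper<float, int, Eigen::ColMajor> Mapper;

  float buf[6] = {1, 2, 3, 4, 5, 6};   // a 3x2 column-major block
  Mapper A(buf, /*outer stride=*/3);

  float a11 = A(1, 1);                 // buf[1 + 1*3] == 5
  Mapper  B = A.getSubMapper(1, 0);    // view starting at row 1, column 0
  float b01 = B(0, 1);                 // same coefficient as A(1, 1)

  return (a11 == b01) ? 0 : 1;         // the two accesses agree
}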

View File

@ -382,6 +382,17 @@
#define EIGEN_HAVE_RVALUE_REFERENCES
#endif
// Does the compiler support variadic templates?
#if __cplusplus > 199711L
#define EIGEN_HAS_VARIADIC_TEMPLATES 1
#endif
// Does the compiler support const expressions?
#if (defined(__cplusplus) && __cplusplus >= 201402L) || \
EIGEN_GNUC_AT_LEAST(4,9)
#define EIGEN_HAS_CONSTEXPR 1
#endif
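A small usage sketch (my own illustration, not part of the patch) of how downstream code can consume the new feature macro once Eigen/Core has been included:

#include <Eigen/Core>

#ifdef EIGEN_HAS_CONSTEXPR
constexpr int doc_square(int x) { return x * x; }   // usable in constant expressions
#else
inline int doc_square(int x) { return x * x; }      // plain fallback on older compilers
#endif

int main() { return doc_square(3) == 9 ? 0 : 1; }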
/** Allows to disable some optimizations which might affect the accuracy of the result.
* Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them.
* They currently include:
@ -546,7 +557,9 @@ namespace Eigen {
* If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
* vectorized and non-vectorized code.
*/
#if EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
#if (defined __CUDACC__)
#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
#define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
#elif EIGEN_COMP_MSVC
#define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))

View File

@ -143,8 +143,8 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t =
*** Implementation of generic aligned realloc (when no realloc can be used)***
*****************************************************************************/
void* aligned_malloc(std::size_t size);
void aligned_free(void *ptr);
EIGEN_DEVICE_FUNC void* aligned_malloc(std::size_t size);
EIGEN_DEVICE_FUNC void aligned_free(void *ptr);
/** \internal
* \brief Reallocates aligned memory.
@ -185,33 +185,33 @@ inline void* generic_aligned_realloc(void* ptr, size_t size, size_t old_size)
*****************************************************************************/
#ifdef EIGEN_NO_MALLOC
inline void check_that_malloc_is_allowed()
EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
{
eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)");
}
#elif defined EIGEN_RUNTIME_NO_MALLOC
inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
{
static bool value = true;
if (update == 1)
value = new_value;
return value;
}
inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
inline void check_that_malloc_is_allowed()
EIGEN_DEVICE_FUNC inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
EIGEN_DEVICE_FUNC inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
{
eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)");
}
#else
inline void check_that_malloc_is_allowed()
EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
{}
#endif
/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements.
* On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
*/
inline void* aligned_malloc(size_t size)
EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size)
{
check_that_malloc_is_allowed();
@ -237,7 +237,7 @@ inline void* aligned_malloc(size_t size)
}
/** \internal Frees memory allocated with aligned_malloc. */
inline void aligned_free(void *ptr)
EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
{
#if !EIGEN_ALIGN
std::free(ptr);
@ -298,12 +298,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
/** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned.
* On allocation error, the returned pointer is null, and a std::bad_alloc is thrown.
*/
template<bool Align> inline void* conditional_aligned_malloc(size_t size)
template<bool Align> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size)
{
return aligned_malloc(size);
}
template<> inline void* conditional_aligned_malloc<false>(size_t size)
template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(size_t size)
{
check_that_malloc_is_allowed();
@ -314,12 +314,12 @@ template<> inline void* conditional_aligned_malloc<false>(size_t size)
}
/** \internal Frees memory allocated with conditional_aligned_malloc */
template<bool Align> inline void conditional_aligned_free(void *ptr)
template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr)
{
aligned_free(ptr);
}
template<> inline void conditional_aligned_free<false>(void *ptr)
template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
{
std::free(ptr);
}
@ -341,7 +341,7 @@ template<> inline void* conditional_aligned_realloc<false>(void* ptr, size_t new
/** \internal Destructs the elements of an array.
* The \a size parameters tells on how many objects to call the destructor of T.
*/
template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size)
template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, size_t size)
{
// always destruct an array starting from the end.
if(ptr)
@ -351,7 +351,7 @@ template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size)
/** \internal Constructs the elements of an array.
* The \a size parameter tells on how many objects to call the constructor of T.
*/
template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size)
template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, size_t size)
{
size_t i;
EIGEN_TRY
@ -371,7 +371,7 @@ template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size)
*****************************************************************************/
template<typename T>
EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
{
if(size > size_t(-1) / sizeof(T))
throw_std_bad_alloc();
@ -381,7 +381,7 @@ EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
* On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown.
* The default constructor of T is called.
*/
template<typename T> inline T* aligned_new(size_t size)
template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size)
{
check_size_for_overflow<T>(size);
T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size));
@ -396,7 +396,7 @@ template<typename T> inline T* aligned_new(size_t size)
}
}
template<typename T, bool Align> inline T* conditional_aligned_new(size_t size)
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size)
{
check_size_for_overflow<T>(size);
T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
@ -414,7 +414,7 @@ template<typename T, bool Align> inline T* conditional_aligned_new(size_t size)
/** \internal Deletes objects constructed with aligned_new
* The \a size parameters tells on how many objects to call the destructor of T.
*/
template<typename T> inline void aligned_delete(T *ptr, size_t size)
template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, size_t size)
{
destruct_elements_of_array<T>(ptr, size);
aligned_free(ptr);
@ -423,13 +423,13 @@ template<typename T> inline void aligned_delete(T *ptr, size_t size)
/** \internal Deletes objects constructed with conditional_aligned_new
* The \a size parameters tells on how many objects to call the destructor of T.
*/
template<typename T, bool Align> inline void conditional_aligned_delete(T *ptr, size_t size)
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, size_t size)
{
destruct_elements_of_array<T>(ptr, size);
conditional_aligned_free<Align>(ptr);
}
template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size)
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size)
{
check_size_for_overflow<T>(new_size);
check_size_for_overflow<T>(old_size);
@ -452,7 +452,7 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pt
}
template<typename T, bool Align> inline T* conditional_aligned_new_auto(size_t size)
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(size_t size)
{
if(size==0)
return 0; // short-cut. Also fixes Bug 884
@ -495,7 +495,7 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(
return result;
}
template<typename T, bool Align> inline void conditional_aligned_delete_auto(T *ptr, size_t size)
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, size_t size)
{
if(NumTraits<T>::RequireInitialization)
destruct_elements_of_array<T>(ptr, size);

View File

@ -446,6 +446,21 @@ template<typename XprType, typename CastType> struct cast_return_type
const XprType&,CastType>::type type;
};
template <typename A, typename B> struct promote_storage_type;
template <typename A> struct promote_storage_type<A,A>
{
typedef A ret;
};
template <typename A> struct promote_storage_type<A, const A>
{
typedef A ret;
};
template <typename A> struct promote_storage_type<const A, A>
{
typedef A ret;
};
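A quick compile-time check, written for this note rather than taken from the patch, of what the new specializations provide: const-qualified mixes of the same storage kind collapse to the unqualified kind.

#include <Eigen/Core>
#include <type_traits>

static_assert(std::is_same<Eigen::internal::promote_storage_type<Eigen::Dense, const Eigen::Dense>::ret,
                           Eigen::Dense>::value, "Dense / const Dense promotes to Dense");
static_assert(std::is_same<Eigen::internal::promote_storage_type<const Eigen::Dense, Eigen::Dense>::ret,
                           Eigen::Dense>::value, "const Dense / Dense promotes to Dense");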
/** \internal Specify the "storage kind" of applying a coefficient-wise
* binary operations between two expressions of kinds A and B respectively.
* The template parameter Functor permits to specialize the resulting storage kind wrt to

View File

@ -97,6 +97,7 @@ ENABLE_TESTING()
add_subdirectory(libs/eigen3)
add_subdirectory(libs/eigen2)
add_subdirectory(libs/tensors)
add_subdirectory(libs/BLAS)
add_subdirectory(libs/ublas)
add_subdirectory(libs/gmm)

View File

@ -0,0 +1,44 @@
if((NOT TENSOR_INCLUDE_DIR) AND Eigen_SOURCE_DIR)
# unless TENSOR_INCLUDE_DIR is defined, let's use current Eigen version
set(TENSOR_INCLUDE_DIR ${Eigen_SOURCE_DIR})
set(TENSOR_FOUND TRUE)
else()
find_package(Tensor)
endif()
if (TENSOR_FOUND)
include_directories(${TENSOR_INCLUDE_DIR})
btl_add_bench(btl_tensor_linear main_linear.cpp)
btl_add_bench(btl_tensor_vecmat main_vecmat.cpp)
btl_add_bench(btl_tensor_matmat main_matmat.cpp)
btl_add_target_property(btl_tensor_linear COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor")
btl_add_target_property(btl_tensor_vecmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor")
btl_add_target_property(btl_tensor_matmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor")
option(BTL_BENCH_NOGCCVEC "also bench Eigen explicit vec without GCC's auto vec" OFF)
if(CMAKE_COMPILER_IS_GNUCXX AND BTL_BENCH_NOGCCVEC)
btl_add_bench(btl_tensor_nogccvec_linear main_linear.cpp)
btl_add_bench(btl_tensor_nogccvec_vecmat main_vecmat.cpp)
btl_add_bench(btl_tensor_nogccvec_matmat main_matmat.cpp)
btl_add_target_property(btl_tensor_nogccvec_linear COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec")
btl_add_target_property(btl_tensor_nogccvec_vecmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec")
btl_add_target_property(btl_tensor_nogccvec_matmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec")
endif()
if(NOT BTL_NOVEC)
btl_add_bench(btl_tensor_novec_linear main_linear.cpp OFF)
btl_add_bench(btl_tensor_novec_vecmat main_vecmat.cpp OFF)
btl_add_bench(btl_tensor_novec_matmat main_matmat.cpp OFF)
btl_add_target_property(btl_tensor_novec_linear COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
btl_add_target_property(btl_tensor_novec_vecmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
btl_add_target_property(btl_tensor_novec_matmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
endif(NOT BTL_NOVEC)
endif (TENSOR_FOUND)

View File

@ -0,0 +1,23 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "utilities.h"
#include "tensor_interface.hh"
#include "bench.hh"
#include "basic_actions.hh"
BTL_MAIN;
int main()
{
bench<Action_axpy<tensor_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
bench<Action_axpby<tensor_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
return 0;
}

View File

@ -0,0 +1,21 @@
//=====================================================
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//=====================================================
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
#include "utilities.h"
#include "tensor_interface.hh"
#include "bench.hh"
#include "basic_actions.hh"
BTL_MAIN;
int main()
{
bench<Action_matrix_matrix_product<tensor_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
return 0;
}

View File

@ -0,0 +1,21 @@
//=====================================================
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//=====================================================
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
#include "utilities.h"
#include "tensor_interface.hh"
#include "bench.hh"
#include "basic_actions.hh"
BTL_MAIN;
int main()
{
bench<Action_matrix_vector_product<tensor_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
return 0;
}

View File

@ -0,0 +1,105 @@
//=====================================================
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//=====================================================
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
#ifndef TENSOR_INTERFACE_HH
#define TENSOR_INTERFACE_HH
#include <unsupported/Eigen/CXX11/Tensor>
#include <vector>
#include "btl.hh"
using namespace Eigen;
template<class real>
class tensor_interface
{
public :
typedef real real_type;
typedef typename Eigen::Tensor<real,2>::Index Index;
typedef std::vector<real> stl_vector;
typedef std::vector<stl_vector> stl_matrix;
typedef Eigen::Tensor<real,2> gene_matrix;
typedef Eigen::Tensor<real,1> gene_vector;
static inline std::string name( void )
{
return EIGEN_MAKESTRING(BTL_PREFIX);
}
static void free_matrix(gene_matrix & /*A*/, int /*N*/) {}
static void free_vector(gene_vector & /*B*/) {}
static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
A.resize(Eigen::array<Index,2>(A_stl[0].size(), A_stl.size()));
for (unsigned int j=0; j<A_stl.size() ; j++){
for (unsigned int i=0; i<A_stl[j].size() ; i++){
A.coeffRef(Eigen::array<Index,2>(i,j)) = A_stl[j][i];
}
}
}
static BTL_DONT_INLINE void vector_from_stl(gene_vector & B, stl_vector & B_stl){
B.resize(B_stl.size());
for (unsigned int i=0; i<B_stl.size() ; i++){
B.coeffRef(i) = B_stl[i];
}
}
static BTL_DONT_INLINE void vector_to_stl(gene_vector & B, stl_vector & B_stl){
for (unsigned int i=0; i<B_stl.size() ; i++){
B_stl[i] = B.coeff(i);
}
}
static BTL_DONT_INLINE void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){
int N=A_stl.size();
for (int j=0;j<N;j++){
A_stl[j].resize(N);
for (int i=0;i<N;i++){
A_stl[j][i] = A.coeff(Eigen::array<Index,2>(i,j));
}
}
}
static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int /*N*/){
typedef typename Eigen::Tensor<real_type, 1>::DimensionPair DimPair;
const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
X/*.noalias()*/ = A.contract(B, dims);
}
static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){
typedef typename Eigen::Tensor<real_type, 1>::DimensionPair DimPair;
const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
X/*.noalias()*/ = A.contract(B, dims);
}
static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int /*N*/){
Y += X.constant(coef) * X;
}
static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int /*N*/){
Y = X.constant(a)*X + Y.constant(b)*Y;
}
static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int /*N*/){
cible = source;
}
static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int /*N*/){
cible = source;
}
};
#endif
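For readers new to the Tensor contraction API used by this interface, here is a minimal standalone sketch (sizes and values are assumptions made for illustration) showing that contracting dimension 1 of A against dimension 0 of B reproduces an ordinary matrix product:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  typedef Eigen::Tensor<float, 2>::Index Index;

  Eigen::Tensor<float, 2> A, B, C;
  A.resize(Eigen::array<Index, 2>(2, 3));
  B.resize(Eigen::array<Index, 2>(3, 2));
  C.resize(Eigen::array<Index, 2>(2, 2));
  A = A.constant(1.0f);
  B = B.constant(2.0f);

  typedef Eigen::Tensor<float, 2>::DimensionPair DimPair;
  const Eigen::array<DimPair, 1> dims(DimPair(1, 0));  // contract dim 1 of A with dim 0 of B
  C = A.contract(B, dims);                             // every C(i,j) == 1*2 + 1*2 + 1*2 == 6

  return 0;
}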

View File

@ -0,0 +1,305 @@
#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
typedef int TensorIndex;
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "testing/base/public/benchmark.h"
using Eigen::Tensor;
using Eigen::TensorMap;
// TODO(bsteiner): also templatize on the input type since we have users
// for int8 as well as floats.
template <typename Device> class BenchmarkSuite {
public:
BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
: m_(m), k_(k), n_(n), device_(device) {
initialize();
}
BenchmarkSuite(const Device& device, size_t m)
: m_(m), k_(m), n_(m), device_(device) {
initialize();
}
~BenchmarkSuite() {
device_.deallocate(a_);
device_.deallocate(b_);
device_.deallocate(c_);
}
void memcpy(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
device_.memcpy(c_, a_, m_ * m_ * sizeof(float));
}
// Record the number of values copied per second
finalizeBenchmark(m_ * m_ * num_iters);
}
void random(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = C.random();
}
// Record the number of random numbers generated per second
finalizeBenchmark(m_ * m_ * num_iters);
}
void slicing(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
const Eigen::DSizes<TensorIndex, 2> quarter_sizes(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
const Eigen::DSizes<TensorIndex, 2> first_quadrant(Eigen::array<TensorIndex, 2>(0, 0));
const Eigen::DSizes<TensorIndex, 2> second_quadrant(Eigen::array<TensorIndex, 2>(0, m_/2));
const Eigen::DSizes<TensorIndex, 2> third_quadrant(Eigen::array<TensorIndex, 2>(m_/2, 0));
const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.slice(first_quadrant, quarter_sizes).device(device_) =
A.slice(first_quadrant, quarter_sizes);
C.slice(second_quadrant, quarter_sizes).device(device_) =
B.slice(second_quadrant, quarter_sizes);
C.slice(third_quadrant, quarter_sizes).device(device_) =
A.slice(third_quadrant, quarter_sizes);
C.slice(fourth_quadrant, quarter_sizes).device(device_) =
B.slice(fourth_quadrant, quarter_sizes);
}
// Record the number of values copied from the rhs slice to the lhs slice
// each second
finalizeBenchmark(m_ * m_ * num_iters);
}
void shuffling(int num_iters) {
eigen_assert(m_ == n_);
const Eigen::array<TensorIndex, 2> size_a(m_, k_);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
const Eigen::array<TensorIndex, 2> size_b(k_, m_);
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
const Eigen::array<int, 2> shuffle(1, 0);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
B.device(device_) = A.shuffle(shuffle);
}
// Record the number of values shuffled from A and copied to B each second
finalizeBenchmark(m_ * k_ * num_iters);
}
void padding(int num_iters) {
eigen_assert(m_ == k_);
const Eigen::array<TensorIndex, 2> size_a(m_, k_-3);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
const Eigen::array<TensorIndex, 2> size_b(k_, m_);
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
B.device(device_) = A.pad(paddings);
}
// Record the number of values copied from the padded tensor A each second
finalizeBenchmark(m_ * k_ * num_iters);
}
void striding(int num_iters) {
eigen_assert(m_ == k_);
const Eigen::array<TensorIndex, 2> size_a(m_, k_);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
const Eigen::array<TensorIndex, 2> size_b(m_, k_ / 2);
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
const Eigen::array<TensorIndex, 2> strides(1, 2);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
B.device(device_) = A.stride(strides);
}
// Record the number of values copied from the strided tensor A each second
finalizeBenchmark(m_ * k_ * num_iters);
}
void broadcasting(int num_iters) {
const Eigen::array<TensorIndex, 2> size_a(m_, 1);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
const Eigen::array<TensorIndex, 2> size_c(m_, n_);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c);
#if defined(__CUDACC__)
// nvcc doesn't support cxx11
const Eigen::array<int, 2> broadcast(1, n_);
#else
// Take advantage of cxx11 to give the compiler information it can use to
// optimize the code.
Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
broadcast.set(1, n_);
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.broadcast(broadcast);
}
// Record the number of values broadcasted from A and copied to C each second
finalizeBenchmark(m_ * n_ * num_iters);
}
void coeffWiseOp(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A * A.constant(3.14) + B * B.constant(2.7);
}
// Record the number of FLOP executed per second (2 multiplications and
// 1 addition per value)
finalizeBenchmark(3 * m_ * m_ * num_iters);
}
void algebraicFunc(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
finalizeBenchmark(m_ * m_ * num_iters);
}
void transcendentalFunc(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.exp() + B.log();
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
finalizeBenchmark(m_ * m_ * num_iters);
}
// Simple reduction
void reduction(int num_iters) {
const Eigen::array<TensorIndex, 2> input_size(k_, n_);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size);
const Eigen::array<TensorIndex, 1> output_size(n_);
TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size);
const Eigen::array<TensorIndex, 1> sum_along_dim(0);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = B.sum(sum_along_dim);
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
finalizeBenchmark(m_ * m_ * num_iters);
}
// do a contraction which is equivalent to a matrix multiplication
void contraction(int num_iters) {
const Eigen::array<TensorIndex, 2> sizeA(m_, k_);
const Eigen::array<TensorIndex, 2> sizeB(k_, n_);
const Eigen::array<TensorIndex, 2> sizeC(m_, n_);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC);
typedef typename Tensor<float, 2>::DimensionPair DimPair;
const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.contract(B, dims);
}
// Record the number of FLOP executed per second (k_ multiplications and
// additions for each value in the resulting tensor)
finalizeBenchmark(static_cast<int64>(2) * m_ * n_ * k_ * num_iters);
}
void convolution(int num_iters, int kernel_x, int kernel_y) {
const Eigen::array<TensorIndex, 2> input_sizes(m_, n_);
TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes);
const Eigen::array<TensorIndex, 2> kernel_sizes(kernel_x, kernel_y);
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes);
const Eigen::array<TensorIndex, 2> result_sizes(
m_ - kernel_x + 1, n_ - kernel_y + 1);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes);
Eigen::array<Tensor<float, 2>::Index, 2> dims(0, 1);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.convolve(B, dims);
}
// Record the number of FLOP executed per second (kernel_size
// multiplications and additions for each value in the resulting tensor)
finalizeBenchmark(
(m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * 2 * num_iters);
}
private:
void initialize() {
a_ = (float *) device_.allocate(m_ * k_ * sizeof(float));
b_ = (float *) device_.allocate(k_ * n_ * sizeof(float));
c_ = (float *) device_.allocate(m_ * n_ * sizeof(float));
// Initialize the content of the memory pools to prevent asan from
// complaining.
device_.memset(a_, 12, m_ * k_ * sizeof(float));
device_.memset(b_, 23, k_ * n_ * sizeof(float));
device_.memset(c_, 31, m_ * n_ * sizeof(float));
BenchmarkUseRealTime();
}
inline void finalizeBenchmark(int64 num_items) {
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
device_.synchronize();
}
#endif
StopBenchmarkTiming();
SetBenchmarkItemsProcessed(num_items);
}
size_t m_;
size_t k_;
size_t n_;
float* a_;
float* b_;
float* c_;
Device device_;
};
#endif // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_

View File

@ -0,0 +1,156 @@
#define EIGEN_USE_THREADS
#include "base/sysinfo.h"
#include "strings/strcat.h"
#include "third_party/eigen3/tensor_benchmarks.h"
#include "thread/threadpool.h"
#ifdef __ANDROID__
#define CREATE_THREAD_POOL(threads) \
Eigen::ThreadPoolDevice device(threads);
#else
#define CREATE_THREAD_POOL(threads) \
ThreadPool tp(threads); \
tp.StartWorkers(); \
Eigen::ThreadPoolDevice device(&tp, threads);
#endif
// Simple functions
#define BM_FuncCPU(FUNC, THREADS) \
static void BM_##FUNC##_##THREADS##T(int iters, int N) { \
StopBenchmarkTiming(); \
CREATE_THREAD_POOL(THREADS); \
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
suite.FUNC(iters); \
SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
BM_FuncCPU(memcpy, 4);
BM_FuncCPU(memcpy, 8);
BM_FuncCPU(memcpy, 12);
BM_FuncCPU(random, 4);
BM_FuncCPU(random, 8);
BM_FuncCPU(random, 12);
BM_FuncCPU(slicing, 4);
BM_FuncCPU(slicing, 8);
BM_FuncCPU(slicing, 12);
BM_FuncCPU(shuffling, 4);
BM_FuncCPU(shuffling, 8);
BM_FuncCPU(shuffling, 12);
BM_FuncCPU(padding, 4);
BM_FuncCPU(padding, 8);
BM_FuncCPU(padding, 12);
BM_FuncCPU(striding, 4);
BM_FuncCPU(striding, 8);
BM_FuncCPU(striding, 12);
BM_FuncCPU(broadcasting, 4);
BM_FuncCPU(broadcasting, 8);
BM_FuncCPU(broadcasting, 12);
BM_FuncCPU(coeffWiseOp, 4);
BM_FuncCPU(coeffWiseOp, 8);
BM_FuncCPU(coeffWiseOp, 12);
BM_FuncCPU(algebraicFunc, 4);
BM_FuncCPU(algebraicFunc, 8);
BM_FuncCPU(algebraicFunc, 12);
BM_FuncCPU(transcendentalFunc, 4);
BM_FuncCPU(transcendentalFunc, 8);
BM_FuncCPU(transcendentalFunc, 12);
BM_FuncCPU(reduction, 4);
BM_FuncCPU(reduction, 8);
BM_FuncCPU(reduction, 12);
// Contractions
#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \
static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) {\
StopBenchmarkTiming(); \
if (THREADS == 1) { \
Eigen::DefaultDevice device; \
BenchmarkSuite<Eigen::DefaultDevice> suite(device, D1, D2, D3); \
suite.FUNC(iters); \
} else { \
CREATE_THREAD_POOL(THREADS); \
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3); \
suite.FUNC(iters); \
} \
SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000);
BM_FuncWithInputDimsCPU(contraction, N, N, N, 1);
BM_FuncWithInputDimsCPU(contraction, N, N, N, 4);
BM_FuncWithInputDimsCPU(contraction, N, N, N, 8);
BM_FuncWithInputDimsCPU(contraction, N, N, N, 12);
BM_FuncWithInputDimsCPU(contraction, N, N, N, 16);
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 1);
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 4);
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 8);
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 12);
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 16);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 1);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 4);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 12);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 16);
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 1);
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 4);
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 8);
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 12);
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
// Convolutions
#define BM_FuncWithKernelDimsCPU(FUNC, DIM1, DIM2, THREADS) \
static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) { \
StopBenchmarkTiming(); \
CREATE_THREAD_POOL(THREADS); \
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
suite.FUNC(iters, DIM1, DIM2); \
SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);
BM_FuncWithKernelDimsCPU(convolution, 7, 1, 4);
BM_FuncWithKernelDimsCPU(convolution, 7, 1, 8);
BM_FuncWithKernelDimsCPU(convolution, 7, 1, 12);
BM_FuncWithKernelDimsCPU(convolution, 1, 7, 4);
BM_FuncWithKernelDimsCPU(convolution, 1, 7, 8);
BM_FuncWithKernelDimsCPU(convolution, 1, 7, 12);
BM_FuncWithKernelDimsCPU(convolution, 7, 4, 4);
BM_FuncWithKernelDimsCPU(convolution, 7, 4, 8);
BM_FuncWithKernelDimsCPU(convolution, 7, 4, 12);
BM_FuncWithKernelDimsCPU(convolution, 4, 7, 4);
BM_FuncWithKernelDimsCPU(convolution, 4, 7, 8);
BM_FuncWithKernelDimsCPU(convolution, 4, 7, 12);
BM_FuncWithKernelDimsCPU(convolution, 7, 64, 4);
BM_FuncWithKernelDimsCPU(convolution, 7, 64, 8);
BM_FuncWithKernelDimsCPU(convolution, 7, 64, 12);
BM_FuncWithKernelDimsCPU(convolution, 64, 7, 4);
BM_FuncWithKernelDimsCPU(convolution, 64, 7, 8);
BM_FuncWithKernelDimsCPU(convolution, 64, 7, 12);

View File

@ -0,0 +1,75 @@
#define EIGEN_USE_GPU
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include "strings/strcat.h"
#include "third_party/eigen3/tensor_benchmarks.h"
// Simple functions
#define BM_FuncGPU(FUNC) \
static void BM_##FUNC(int iters, int N) { \
StopBenchmarkTiming(); \
cudaStream_t stream; \
cudaStreamCreate(&stream); \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
cudaDeviceSynchronize(); \
suite.FUNC(iters); \
cudaStreamDestroy(stream); \
} \
BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
BM_FuncGPU(memcpy);
BM_FuncGPU(random);
BM_FuncGPU(slicing);
BM_FuncGPU(shuffling);
BM_FuncGPU(padding);
BM_FuncGPU(striding);
BM_FuncGPU(broadcasting);
BM_FuncGPU(coeffWiseOp);
BM_FuncGPU(reduction);
// Contractions
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
StopBenchmarkTiming(); \
cudaStream_t stream; \
cudaStreamCreate(&stream); \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3); \
cudaDeviceSynchronize(); \
suite.FUNC(iters); \
cudaStreamDestroy(stream); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
BM_FuncWithInputDimsGPU(contraction, N, N, N);
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
BM_FuncWithInputDimsGPU(contraction, N, 64, N);
// Convolutions
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
StopBenchmarkTiming(); \
cudaStream_t stream; \
cudaStreamCreate(&stream); \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
cudaDeviceSynchronize(); \
suite.FUNC(iters, DIM1, DIM2); \
cudaStreamDestroy(stream); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
BM_FuncWithKernelDimsGPU(convolution, 7, 1);
BM_FuncWithKernelDimsGPU(convolution, 1, 7);
BM_FuncWithKernelDimsGPU(convolution, 7, 4);
BM_FuncWithKernelDimsGPU(convolution, 4, 7);
BM_FuncWithKernelDimsGPU(convolution, 7, 64);
BM_FuncWithKernelDimsGPU(convolution, 64, 7);
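For reference, BM_FuncGPU(memcpy) above expands roughly as follows; each GPU benchmark owns its own CUDA stream and synchronizes the device before the timed launches (illustrative only, not part of the commit):

static void BM_memcpy(int iters, int N) {
  StopBenchmarkTiming();
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  Eigen::GpuDevice device(&stream);
  BenchmarkSuite<Eigen::GpuDevice> suite(device, N);
  cudaDeviceSynchronize();
  suite.memcpy(iters);
  cudaStreamDestroy(stream);
}
BENCHMARK_RANGE(BM_memcpy, 10, 5000);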

View File

@ -9,6 +9,20 @@
#include "common.h"
template<typename Index, typename Scalar, int StorageOrder, bool ConjugateLhs, bool ConjugateRhs>
struct general_matrix_vector_product_wrapper
{
static void run(Index rows, Index cols,const Scalar *lhs, Index lhsStride, const Scalar *rhs, Index rhsIncr, Scalar* res, Index resIncr, Scalar alpha)
{
typedef internal::const_blas_data_mapper<Scalar,Index,StorageOrder> LhsMapper;
typedef internal::const_blas_data_mapper<Scalar,Index,RowMajor> RhsMapper;
internal::general_matrix_vector_product
<Index,Scalar,LhsMapper,StorageOrder,ConjugateLhs,Scalar,RhsMapper,ConjugateRhs>::run(
rows, cols, LhsMapper(lhs, lhsStride), RhsMapper(rhs, rhsIncr), res, resIncr, alpha);
}
};
int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealScalar *pa, int *lda, RealScalar *pb, int *incb, RealScalar *pbeta, RealScalar *pc, int *incc)
{
typedef void (*functype)(int, int, const Scalar *, int, const Scalar *, int , Scalar *, int, Scalar);
@ -20,9 +34,9 @@ int EIGEN_BLAS_FUNC(gemv)(char *opa, int *m, int *n, RealScalar *palpha, RealSca
for(int k=0; k<4; ++k)
func[k] = 0;
func[NOTR] = (internal::general_matrix_vector_product<int,Scalar,ColMajor,false,Scalar,false>::run);
func[TR ] = (internal::general_matrix_vector_product<int,Scalar,RowMajor,false,Scalar,false>::run);
func[ADJ ] = (internal::general_matrix_vector_product<int,Scalar,RowMajor,Conj, Scalar,false>::run);
func[NOTR] = (general_matrix_vector_product_wrapper<int,Scalar,ColMajor,false,false>::run);
func[TR ] = (general_matrix_vector_product_wrapper<int,Scalar,RowMajor,false,false>::run);
func[ADJ ] = (general_matrix_vector_product_wrapper<int,Scalar,RowMajor,Conj ,false>::run);
init = true;
}
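Illustrative only (not part of the commit): a direct call through the wrapper defined above, computing y += alpha * A * x for a column-major matrix of doubles; all variable names below are placeholders.

static void example_gemv_wrapper()
{
  int rows = 4, cols = 3, lda = 4;
  double A[12] = {0}, x[3] = {0}, y[4] = {0};
  double alpha = 1.0;
  general_matrix_vector_product_wrapper<int,double,ColMajor,false,false>::run(
      rows, cols, A, lda, x, /*rhsIncr=*/1, y, /*resIncr=*/1, alpha);
}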

View File

@ -56,7 +56,7 @@ int EIGEN_BLAS_FUNC(gemm)(char *opa, char *opb, int *m, int *n, int *k, RealScal
else matrix(c, *m, *n, *ldc) *= beta;
}
internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,*k,true);
internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic> blocking(*m,*n,*k,1,true);
int code = OP(*opa) | (OP(*opb) << 2);
func[code](*m, *n, *k, a, *lda, b, *ldb, c, *ldc, alpha, blocking, 0);
@ -131,12 +131,12 @@ int EIGEN_BLAS_FUNC(trsm)(char *side, char *uplo, char *opa, char *diag, int *m,
if(SIDE(*side)==LEFT)
{
internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m);
internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m,1,false);
func[code](*m, *n, a, *lda, b, *ldb, blocking);
}
else
{
internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n);
internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n,1,false);
func[code](*n, *m, a, *lda, b, *ldb, blocking);
}
@ -222,12 +222,12 @@ int EIGEN_BLAS_FUNC(trmm)(char *side, char *uplo, char *opa, char *diag, int *m,
if(SIDE(*side)==LEFT)
{
internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m);
internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*m,1,false);
func[code](*m, *n, *m, a, *lda, tmp.data(), tmp.outerStride(), b, *ldb, alpha, blocking);
}
else
{
internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n);
internal::gemm_blocking_space<ColMajor,Scalar,Scalar,Dynamic,Dynamic,Dynamic,4> blocking(*m,*n,*n,1,false);
func[code](*m, *n, *n, tmp.data(), tmp.outerStride(), a, *lda, b, *ldb, alpha, blocking);
}
return 1;
@ -577,7 +577,7 @@ int EIGEN_BLAS_FUNC(her2k)(char *uplo, char *op, int *n, int *k, RealScalar *pal
else if(*n<0) info = 3;
else if(*k<0) info = 4;
else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k)) info = 7;
else if(*lda<std::max(1,(OP(*op)==NOTR)?*n:*k)) info = 9;
else if(*ldb<std::max(1,(OP(*op)==NOTR)?*n:*k)) info = 9;
else if(*ldc<std::max(1,*n)) info = 12;
if(info)
return xerbla_(SCALAR_SUFFIX_UP"HER2K",&info,6);

View File

@ -47,8 +47,8 @@
// protected by parenthesis against macro expansion, the min()/max() macros
// are defined here and any not-parenthesized min/max call will cause a
// compiler error.
#define min(A,B) please_protect_your_min_with_parentheses
#define max(A,B) please_protect_your_max_with_parentheses
//#define min(A,B) please_protect_your_min_with_parentheses
//#define max(A,B) please_protect_your_max_with_parentheses
#define FORBIDDEN_IDENTIFIER (this_identifier_is_forbidden_to_avoid_clashes) this_identifier_is_forbidden_to_avoid_clashes
// B0 is defined in POSIX header termios.h
@ -237,6 +237,7 @@ inline void verify_impl(bool condition, const char *testname, const char *file,
#define VERIFY(a) ::verify_impl(a, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a))
#define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b))
#define VERIFY_IS_NOT_EQUAL(a, b) VERIFY(!test_is_equal(a, b))
#define VERIFY_IS_APPROX(a, b) VERIFY(test_isApprox(a, b))
#define VERIFY_IS_NOT_APPROX(a, b) VERIFY(!test_isApprox(a, b))
#define VERIFY_IS_MUCH_SMALLER_THAN(a, b) VERIFY(test_isMuchSmallerThan(a, b))

View File

@ -261,6 +261,22 @@ template<typename Scalar> void packetmath()
VERIFY(isApproxAbs(data2[j], data1[i+j*PacketSize], refvalue) && "ptranspose");
}
}
if (internal::packet_traits<Scalar>::HasBlend) {
Packet thenPacket = internal::pload<Packet>(data1);
Packet elsePacket = internal::pload<Packet>(data2);
EIGEN_ALIGN_DEFAULT internal::Selector<PacketSize> selector;
for (int i = 0; i < PacketSize; ++i) {
selector.select[i] = i;
}
Packet blend = internal::pblend(selector, thenPacket, elsePacket);
EIGEN_ALIGN_DEFAULT Scalar result[size];
internal::pstore(result, blend);
for (int i = 0; i < PacketSize; ++i) {
VERIFY(isApproxAbs(result[i], (selector.select[i] ? data1[i] : data2[i]), refvalue));
}
}
}
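The blend check above boils down to the following scalar reference (illustrative sketch, not part of the commit): in every lane whose selector entry is non-zero the "then" packet is taken, otherwise the "else" packet.

template<typename Scalar>
void blend_reference(const Scalar* thenData, const Scalar* elseData,
                     const bool* select, int packetSize, Scalar* out)
{
  for (int i = 0; i < packetSize; ++i)
    out[i] = select[i] ? thenData[i] : elseData[i];
}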
template<typename Scalar> void packetmath_real()

View File

@ -39,15 +39,16 @@ void test_product_large()
// check the functions to setup blocking sizes compile and do not segfault
// FIXME check they do what they are supposed to do !!
std::ptrdiff_t l1 = internal::random<int>(10000,20000);
std::ptrdiff_t l2 = internal::random<int>(1000000,2000000);
setCpuCacheSizes(l1,l2);
std::ptrdiff_t l2 = internal::random<int>(100000,200000);
std::ptrdiff_t l3 = internal::random<int>(1000000,2000000);
setCpuCacheSizes(l1,l2,l3);
VERIFY(l1==l1CacheSize());
VERIFY(l2==l2CacheSize());
std::ptrdiff_t k1 = internal::random<int>(10,100)*16;
std::ptrdiff_t m1 = internal::random<int>(10,100)*16;
std::ptrdiff_t n1 = internal::random<int>(10,100)*16;
// only makes sure it compiles fine
internal::computeProductBlockingSizes<float,float>(k1,m1,n1);
internal::computeProductBlockingSizes<float,float>(k1,m1,n1,1);
}
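The extra trailing argument added to computeProductBlockingSizes (and to the gemm_blocking_space constructors in the BLAS wrappers above) appears to be a thread count; a minimal sketch of the updated call, with assumed parameter meaning (not part of the commit):

std::ptrdiff_t k = 1024, m = 1024, n = 1024;
int num_threads = 1;  // assumed meaning of the new fourth argument
internal::computeProductBlockingSizes<float,float>(k, m, n, num_threads);
// k, m and n are updated in place with cache-friendly block sizes.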
{

View File

@ -2,6 +2,7 @@
// for linear algebra.
//
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@ -21,20 +22,26 @@
* module. Note that at this stage, you should not need to include
* this module directly.
*
* It also provides a limited fallback for compilers that don't support
* CXX11 yet, such as nvcc.
*
* \code
* #include <Eigen/CXX11/Core>
* \endcode
*/
#include <array>
#include <vector>
// Emulate the cxx11 functionality that we need if the compiler doesn't support it.
#if __cplusplus <= 199711L
#include "src/Core/util/EmulateCXX11Meta.h"
#else
#include <array>
#include "src/Core/util/CXX11Workarounds.h"
#include "src/Core/util/CXX11Meta.h"
#endif
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
#endif // EIGEN_CXX11_CORE_MODULE
/*
* kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
*/
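A minimal usage sketch (not part of the commit): the same client code compiles against either the native C++11 facilities or the emulation layer selected above.

#include <Eigen/CXX11/Core>

int main()
{
  // Resolves to std::array under C++11, and to the emulated Eigen::array
  // from EmulateCXX11Meta.h under C++03 compilers such as nvcc at the time.
  Eigen::array<int, 3> dims;
  dims[0] = 2; dims[1] = 3; dims[2] = 4;
  return Eigen::internal::array_prod(dims) == 24 ? 0 : 1;
}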

View File

@ -1,6 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
//
// This Source Code Form is subject to the terms of the Mozilla
@ -10,9 +11,10 @@
#ifndef EIGEN_CXX11_TENSOR_MODULE
#define EIGEN_CXX11_TENSOR_MODULE
#include <unsupported/Eigen/CXX11/Core>
#include "Eigen/src/Core/util/StaticAssert.h"
#include "unsupported/Eigen/CXX11/Core"
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
#include "Eigen/src/Core/util/DisableStupidWarnings.h"
/** \defgroup CXX11_Tensor_Module Tensor Module
*
@ -26,14 +28,69 @@
#include <cstddef>
#include <cstring>
#include <stdint.h>
#include "src/Tensor/TensorStorage.h"
#include "src/Tensor/Tensor.h"
#if __cplusplus > 199711
#include <random>
#endif
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
#ifdef EIGEN_USE_THREADS
#include <future>
#endif
#ifdef EIGEN_USE_GPU
#include <cuda_runtime.h>
#if defined(__CUDACC__)
#include <curand_kernel.h>
#endif
#endif
#include "Eigen/Core"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h"
#include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorRef.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorIO.h"
#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
#endif // EIGEN_CXX11_TENSOR_MODULE
/*
* kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
*/
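A minimal end-to-end sketch of the merged module (illustrative only, not part of the commit; assumes the Eigen source root is on the include path):

#include <unsupported/Eigen/CXX11/Tensor>

int main()
{
  Eigen::Tensor<float, 3> t(2, 3, 4);  // rank-3 tensor holding 2*3*4 floats
  t.setZero();
  t(1, 2, 3) = 7.0f;
  return t.size() == 24 ? 0 : 1;
}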

View File

@ -317,7 +317,7 @@ constexpr inline decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts
template<typename Array, int... n>
constexpr inline Array h_array_reverse(Array arr, numeric_list<int, n...>)
{
return {{std_array_get<sizeof...(n) - n - 1>(arr)...}};
return {{array_get<sizeof...(n) - n - 1>(arr)...}};
}
template<typename T, std::size_t N>
@ -335,9 +335,9 @@ constexpr inline std::array<T, N> array_reverse(std::array<T, N> arr)
// an infinite loop)
template<typename Reducer, typename T, std::size_t N, std::size_t n = N - 1>
struct h_array_reduce {
constexpr static inline auto run(std::array<T, N> arr) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), std_array_get<n>(arr)))
constexpr static inline auto run(std::array<T, N> arr) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), array_get<n>(arr)))
{
return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), std_array_get<n>(arr));
return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr), array_get<n>(arr));
}
};
@ -346,7 +346,7 @@ struct h_array_reduce<Reducer, T, N, 0>
{
constexpr static inline T run(std::array<T, N> arr)
{
return std_array_get<0>(arr);
return array_get<0>(arr);
}
};
@ -370,12 +370,20 @@ constexpr inline auto array_prod(std::array<T, N> arr) -> decltype(array_reduce<
return array_reduce<product_op, T, N>(arr);
}
template<typename t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
eigen_assert(a.size() > 0);
t prod = 1;
for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; }
return prod;
}
/* zip an array */
template<typename Op, typename A, typename B, std::size_t N, int... n>
constexpr inline std::array<decltype(Op::run(A(), B())),N> h_array_zip(std::array<A, N> a, std::array<B, N> b, numeric_list<int, n...>)
{
return std::array<decltype(Op::run(A(), B())),N>{{ Op::run(std_array_get<n>(a), std_array_get<n>(b))... }};
return std::array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }};
}
template<typename Op, typename A, typename B, std::size_t N>
@ -387,9 +395,9 @@ constexpr inline std::array<decltype(Op::run(A(), B())),N> array_zip(std::array<
/* zip an array and reduce the result */
template<typename Reducer, typename Op, typename A, typename B, std::size_t N, int... n>
constexpr inline auto h_array_zip_and_reduce(std::array<A, N> a, std::array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(std_array_get<n>(a), std_array_get<n>(b))...))
constexpr inline auto h_array_zip_and_reduce(std::array<A, N> a, std::array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...))
{
return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(std_array_get<n>(a), std_array_get<n>(b))...);
return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...);
}
template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
@ -403,7 +411,7 @@ constexpr inline auto array_zip_and_reduce(std::array<A, N> a, std::array<B, N>
template<typename Op, typename A, std::size_t N, int... n>
constexpr inline std::array<decltype(Op::run(A())),N> h_array_apply(std::array<A, N> a, numeric_list<int, n...>)
{
return std::array<decltype(Op::run(A())),N>{{ Op::run(std_array_get<n>(a))... }};
return std::array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }};
}
template<typename Op, typename A, std::size_t N>
@ -415,9 +423,9 @@ constexpr inline std::array<decltype(Op::run(A())),N> array_apply(std::array<A,
/* apply stuff to an array and reduce */
template<typename Reducer, typename Op, typename A, std::size_t N, int... n>
constexpr inline auto h_array_apply_and_reduce(std::array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(std_array_get<n>(arr))...))
constexpr inline auto h_array_apply_and_reduce(std::array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...))
{
return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(std_array_get<n>(arr))...);
return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...);
}
template<typename Reducer, typename Op, typename A, std::size_t N>
@ -497,7 +505,3 @@ InstType instantiate_by_c_array(ArrType* arr)
} // end namespace Eigen
#endif // EIGEN_CXX11META_H
/*
* kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
*/
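Illustrative usage of the renamed helpers (not part of the commit; a C++11 build is assumed): with the std_array_get -> array_get rename, a single name now works on std::array, std::vector and the emulated array alike.

#include <Eigen/CXX11/Core>
#include <vector>

static int array_helpers_example()
{
  Eigen::array<int, 3> a;
  a[0] = 2; a[1] = 3; a[2] = 4;
  int second = Eigen::internal::array_get<1>(a);   // == 3
  std::vector<int> v(3, 5);
  return second + Eigen::internal::array_prod(v);  // 3 + 125
}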

View File

@ -17,9 +17,6 @@
#error Intel Compiler only supports required C++ features since version 13.1.
// note that most stuff in principle works with 13.0 but when combining
// some features, at some point 13.0 will just fail with an internal assertion
#elif defined(__clang__) && (__clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 1))
// note that it _should_ work with 3.1 but it was only tested with 3.2
#error Clang C++ Compiler (clang++) only supports required C++ features since version 3.1.
#elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
// G++ < 4.6 by default will continue processing the source files - even if we use #error to make
// it error out. For this reason, we use the pragma to make sure G++ aborts at the first error
@ -42,32 +39,46 @@
namespace Eigen {
// Use std::array as Eigen array
template <typename T, std::size_t N> using array = std::array<T, N>;
namespace internal {
/* std::get is only constexpr in C++14, not yet in C++11
* - libstdc++ from version 4.7 onwards has it nevertheless,
* so use that
* - libstdc++ older versions: use _M_instance directly
* - libc++ from version 3.4 onwards has it IF compiled with
* -std=c++1y
* - libc++ older versions or -std=c++11: use __elems_ directly
* - libc++ all versions so far: use __elems_ directly
* - all other libs: use std::get to be portable, but
* this may not be constexpr
*/
#if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322
#define STD_GET_ARR_HACK a._M_instance[I]
#elif defined(_LIBCPP_VERSION) && (!defined(_LIBCPP_STD_VER) || _LIBCPP_STD_VER <= 11)
#elif defined(_LIBCPP_VERSION)
#define STD_GET_ARR_HACK a.__elems_[I]
#else
#define STD_GET_ARR_HACK std::template get<I, T, N>(a)
#endif
template<std::size_t I, class T, std::size_t N> constexpr inline T& std_array_get(std::array<T,N>& a) { return (T&) STD_GET_ARR_HACK; }
template<std::size_t I, class T, std::size_t N> constexpr inline T&& std_array_get(std::array<T,N>&& a) { return (T&&) STD_GET_ARR_HACK; }
template<std::size_t I, class T, std::size_t N> constexpr inline T const& std_array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
template<std::size_t I, class T, std::size_t N> constexpr inline T& array_get(std::array<T,N>& a) { return (T&) STD_GET_ARR_HACK; }
template<std::size_t I, class T, std::size_t N> constexpr inline T&& array_get(std::array<T,N>&& a) { return (T&&) STD_GET_ARR_HACK; }
template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
template<std::size_t I, class T> constexpr inline T& array_get(std::vector<T>& a) { return a[I]; }
template<std::size_t I, class T> constexpr inline T&& array_get(std::vector<T>&& a) { return a[I]; }
template<std::size_t I, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I]; }
#undef STD_GET_ARR_HACK
template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<const std::array<T,N> > {
static const size_t value = N;
};
template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<std::array<T,N> > {
static const size_t value = N;
};
/* Suppose you have a template of the form
* template<typename T> struct X;
* And you want to specialize it in such a way:

View File

@ -0,0 +1,435 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_EMULATE_CXX11_META_H
#define EIGEN_EMULATE_CXX11_META_H
namespace Eigen {
// The array class is only available starting with cxx11. Emulate our own here
// if needed
template <typename T, size_t n> class array {
public:
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; }
static const std::size_t size = n;
T values[n];
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array() { }
explicit EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array(const T& v) {
EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE)
values[0] = v;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array(const T& v1, const T& v2) {
EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
values[0] = v1;
values[1] = v2;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) {
EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE)
values[0] = v1;
values[1] = v2;
values[2] = v3;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3,
const T& v4) {
EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE)
values[0] = v1;
values[1] = v2;
values[2] = v3;
values[3] = v4;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
const T& v5) {
EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE)
values[0] = v1;
values[1] = v2;
values[2] = v3;
values[3] = v4;
values[4] = v5;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
const T& v5, const T& v6) {
EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE)
values[0] = v1;
values[1] = v2;
values[2] = v3;
values[3] = v4;
values[4] = v5;
values[5] = v6;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
const T& v5, const T& v6, const T& v7) {
EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE)
values[0] = v1;
values[1] = v2;
values[2] = v3;
values[3] = v4;
values[4] = v5;
values[5] = v6;
values[6] = v7;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array(
const T& v1, const T& v2, const T& v3, const T& v4,
const T& v5, const T& v6, const T& v7, const T& v8) {
EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE)
values[0] = v1;
values[1] = v2;
values[2] = v3;
values[3] = v4;
values[4] = v5;
values[5] = v6;
values[6] = v7;
values[7] = v8;
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
array(std::initializer_list<T> l) {
eigen_assert(l.size() == n);
std::copy(l.begin(), l.end(), values);
}
#endif
};
namespace internal {
/** \internal
* \file CXX11/Core/util/EmulateCXX11Meta.h
 * This file emulates a subset of the functionality provided by CXX11Meta.h for
* compilers that don't yet support cxx11 such as nvcc.
*/
struct empty_list { static const std::size_t count = 0; };
template<typename T, typename Tail=empty_list> struct type_list {
typedef T HeadType;
typedef Tail TailType;
static const T head;
static const Tail tail;
static const std::size_t count = 1 + Tail::count;
};
struct null_type { };
template<typename T1 = null_type, typename T2 = null_type, typename T3 = null_type,
typename T4 = null_type, typename T5 = null_type, typename T6 = null_type,
typename T7 = null_type, typename T8 = null_type>
struct make_type_list {
typedef typename make_type_list<T2, T3, T4, T5, T6, T7, T8>::type tailresult;
typedef type_list<T1, tailresult> type;
};
template<> struct make_type_list<> {
typedef empty_list type;
};
template <std::size_t index, class TList> struct get_type;
template <class Head, class Tail>
struct get_type<0, type_list<Head, Tail> >
{
typedef Head type;
};
template <std::size_t i, class Head, class Tail>
struct get_type<i, type_list<Head, Tail> >
{
typedef typename get_type<i-1, Tail>::type type;
};
/* numeric list */
template <typename T, T n>
struct type2val {
typedef T type;
static const T value = n;
};
template<typename T, size_t n, T V> struct gen_numeric_list_repeated;
template<typename T, T V> struct gen_numeric_list_repeated<T, 1, V> {
typedef typename make_type_list<type2val<T, V> >::type type;
};
template<typename T, T V> struct gen_numeric_list_repeated<T, 2, V> {
typedef typename make_type_list<type2val<T, V>, type2val<T, V> >::type type;
};
template<typename T, T V> struct gen_numeric_list_repeated<T, 3, V> {
typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
};
template<typename T, T V> struct gen_numeric_list_repeated<T, 4, V> {
typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
};
template<typename T, T V> struct gen_numeric_list_repeated<T, 5, V> {
typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
};
template<typename T, T V> struct gen_numeric_list_repeated<T, 6, V> {
typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
};
template<typename T, T V> struct gen_numeric_list_repeated<T, 7, V> {
typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
type2val<T, V>, type2val<T, V>, type2val<T, V>,
type2val<T, V> >::type type;
};
template<typename T, T V> struct gen_numeric_list_repeated<T, 8, V> {
typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
type2val<T, V>, type2val<T, V>, type2val<T, V>,
type2val<T, V>, type2val<T, V> >::type type;
};
template <std::size_t index, class NList> struct get;
template <std::size_t i>
struct get<i, empty_list>
{
get() { eigen_assert(false && "index overflow"); }
typedef void type;
static const char value = '\0';
};
template <std::size_t i, class Head>
struct get<i, type_list<Head, empty_list> >
{
get() { eigen_assert(false && "index overflow"); }
typedef void type;
static const char value = '\0';
};
template <class Head>
struct get<0, type_list<Head, empty_list> >
{
typedef typename Head::type type;
static const type value = Head::value;
};
template <class Head, class Tail>
struct get<0, type_list<Head, Tail> >
{
typedef typename Head::type type;
static const type value = Head::value;
};
template <std::size_t i, class Head, class Tail>
struct get<i, type_list<Head, Tail> >
{
typedef typename Tail::HeadType::type type;
static const type value = get<i-1, Tail>::value;
};
template <class NList> struct arg_prod {
static const typename NList::HeadType::type value = get<0, NList>::value * arg_prod<typename NList::TailType>::value;
};
template <> struct arg_prod<empty_list> {
static const int value = 1;
};
template<int n, typename t>
array<t, n> repeat(t v) {
array<t, n> array;
array.fill(v);
return array;
}
template<std::size_t I, class Head, class Tail>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list<Head, Tail>& a) {
return get<I, type_list<Head, Tail> >::value;
}
template<std::size_t I, class Head, class Tail>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list<Head, Tail>& a) {
return get<I, type_list<Head, Tail> >::value;
}
template <class NList>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList& l) {
return arg_prod<NList>::value;
};
template<std::size_t n, typename t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, n>& a) {
t prod = 1;
for (size_t i = 0; i < n; ++i) { prod *= a[i]; }
return prod;
}
template<typename t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, 0>& /*a*/) {
return 0;
}
template<typename t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
eigen_assert(a.size() > 0);
t prod = 1;
for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; }
return prod;
}
template<std::size_t I, class T, std::size_t N>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) {
return a[I];
}
template<std::size_t I, class T, std::size_t N>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
return a[I];
}
template<std::size_t I, class T>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector<T>& a) {
return a[I];
}
template<std::size_t I, class T>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector<T>& a) {
return a[I];
}
template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<array<T,N> > {
static const size_t value = N;
};
template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<array<T,N>& > {
static const size_t value = N;
};
template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<const array<T,N> > {
static const size_t value = N;
};
template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<const array<T,N>& > {
static const size_t value = N;
};
struct sum_op {
template<typename A, typename B> static inline bool run(A a, B b) { return a + b; }
};
struct product_op {
template<typename A, typename B> static inline bool run(A a, B b) { return a * b; }
};
struct logical_and_op {
template<typename A, typename B> static inline bool run(A a, B b) { return a && b; }
};
struct logical_or_op {
template<typename A, typename B> static inline bool run(A a, B b) { return a || b; }
};
struct equal_op {
template<typename A, typename B> static inline bool run(A a, B b) { return a == b; }
};
struct not_equal_op {
template<typename A, typename B> static inline bool run(A a, B b) { return a != b; }
};
struct lesser_op {
template<typename A, typename B> static inline bool run(A a, B b) { return a < b; }
};
struct lesser_equal_op {
template<typename A, typename B> static inline bool run(A a, B b) { return a <= b; }
};
struct greater_op {
template<typename A, typename B> static inline bool run(A a, B b) { return a > b; }
};
struct greater_equal_op {
template<typename A, typename B> static inline bool run(A a, B b) { return a >= b; }
};
struct not_op {
template<typename A> static inline bool run(A a) { return !a; }
};
struct negation_op {
template<typename A> static inline bool run(A a) { return -a; }
};
struct greater_equal_zero_op {
template<typename A> static inline bool run(A a) { return a >= 0; }
};
template<typename Reducer, typename Op, typename A, std::size_t N>
struct ArrayApplyAndReduce {
static inline bool run(const array<A, N>& a) {
EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
bool result = Reducer::run(Op::run(a[0]), Op::run(a[1]));
for (size_t i = 2; i < N; ++i) {
result = Reducer::run(result, Op::run(a[i]));
}
return result;
}
};
template<typename Reducer, typename Op, typename A>
struct ArrayApplyAndReduce<Reducer, Op, A, 1> {
static inline bool run(const array<A, 1>& a) {
return Op::run(a[0]);
}
};
template<typename Reducer, typename Op, typename A, std::size_t N>
inline bool array_apply_and_reduce(const array<A, N>& a) {
return ArrayApplyAndReduce<Reducer, Op, A, N>::run(a);
}
template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
struct ArrayZipAndReduce {
static inline bool run(const array<A, N>& a, const array<B, N>& b) {
EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
bool result = Reducer::run(Op::run(a[0], b[0]), Op::run(a[1], b[1]));
for (size_t i = 2; i < N; ++i) {
result = Reducer::run(result, Op::run(a[i], b[i]));
}
return result;
}
};
template<typename Reducer, typename Op, typename A, typename B>
struct ArrayZipAndReduce<Reducer, Op, A, B, 1> {
static inline bool run(const array<A, 1>& a, const array<B, 1>& b) {
return Op::run(a[0], b[0]);
}
};
template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
inline bool array_zip_and_reduce(const array<A, N>& a, const array<B, N>& b) {
return ArrayZipAndReduce<Reducer, Op, A, B, N>::run(a, b);
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_EMULATE_CXX11_META_H
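A brief sketch of the emulation in use (illustrative only, not part of the commit): in a pre-C++11 build the fixed-arity constructors above take the place of initializer lists.

static int emulated_array_example()
{
  Eigen::array<int, 3> dims(2, 3, 4);               // 3-argument constructor defined above
  int count = Eigen::internal::array_prod(dims);    // 2*3*4 = 24
  int first = Eigen::internal::array_get<0>(dims);  // 2
  return count + first;
}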

File diff suppressed because it is too large

View File

@ -1,6 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
//
// This Source Code Form is subject to the terms of the Mozilla
@ -55,70 +56,46 @@ namespace Eigen {
* change dramatically.</dd>
* </dl>
*
* \ref TopicStorageOrders
* \ref TopicStorageOrders
*/
template<typename Scalar_, std::size_t NumIndices_, int Options_ = 0>
class Tensor;
namespace internal {
template<typename Scalar_, std::size_t NumIndices_, int Options_>
struct traits<Tensor<Scalar_, NumIndices_, Options_>>
{
typedef Scalar_ Scalar;
typedef Dense StorageKind;
typedef DenseIndex Index;
enum {
Options = Options_
};
};
template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
struct tensor_index_linearization_helper
{
constexpr static inline Index run(std::array<Index, NumIndices> const& indices, std::array<Index, NumIndices> const& dimensions)
{
return std_array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
std_array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) *
tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
}
};
template<typename Index, std::size_t NumIndices, bool RowMajor>
struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
{
constexpr static inline Index run(std::array<Index, NumIndices> const& indices, std::array<Index, NumIndices> const&)
{
return std_array_get<RowMajor ? 0 : NumIndices - 1>(indices);
}
};
} // end namespace internal
template<typename Scalar_, std::size_t NumIndices_, int Options_>
class Tensor
class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_> >
{
static_assert(NumIndices_ >= 1, "A tensor must have at least one index.");
public:
typedef Tensor<Scalar_, NumIndices_, Options_> Self;
typedef TensorBase<Tensor<Scalar_, NumIndices_, Options_> > Base;
typedef typename Eigen::internal::nested<Self>::type Nested;
typedef typename internal::traits<Self>::StorageKind StorageKind;
typedef typename internal::traits<Self>::Index Index;
typedef typename internal::traits<Self>::Scalar Scalar;
typedef typename internal::packet_traits<Scalar>::type PacketScalar;
typedef Scalar_ Scalar;
typedef typename internal::packet_traits<Scalar>::type Packet;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef Self DenseType;
typedef typename Base::CoeffReturnType CoeffReturnType;
typedef typename Base::PacketReturnType PacketReturnType;
constexpr static int Options = Options_;
constexpr static std::size_t NumIndices = NumIndices_;
enum {
IsAligned = bool(EIGEN_ALIGN) & !(Options_&DontAlign),
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
CoordAccess = true,
};
static const int Options = Options_;
static const std::size_t NumIndices = NumIndices_;
typedef DSizes<Index, NumIndices_> Dimensions;
protected:
TensorStorage<Scalar, NumIndices, Dynamic, Options> m_storage;
public:
EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
EIGEN_STRONG_INLINE std::array<Index, NumIndices> dimensions() const { return m_storage.dimensions(); }
EIGEN_STRONG_INLINE Index size() const { return internal::array_prod(m_storage.dimensions()); }
EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); }
EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); }
// Metadata
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<DenseIndex, NumIndices_>& dimensions() const { return m_storage.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); }
// This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
// work, because that uses base().coeffRef() - and we don't yet
@ -126,146 +103,254 @@ class Tensor
inline Self& base() { return *this; }
inline const Self& base() const { return *this; }
void setZero()
{
// FIXME: until we have implemented packet access and the
// expression engine w.r.t. nullary ops, use this
// as a kludge. Only works with POD types, but for
// any standard usage, this shouldn't be a problem
memset((void *)data(), 0, size() * sizeof(Scalar));
}
inline Self& operator=(Self const& other)
{
m_storage = other.m_storage;
return *this;
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
{
static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
return coeff(std::array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
}
#endif
inline const Scalar& coeff(const std::array<Index, NumIndices>& indices) const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const
{
eigen_internal_assert(checkIndexRange(indices));
return m_storage.data()[linearizedIndex(indices)];
}
inline const Scalar& coeff(Index index) const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
{
eigen_internal_assert(index >= 0 && index < size());
return m_storage.data()[index];
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
{
static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
return coeffRef(std::array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
}
#endif
inline Scalar& coeffRef(const std::array<Index, NumIndices>& indices)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
{
eigen_internal_assert(checkIndexRange(indices));
return m_storage.data()[linearizedIndex(indices)];
}
inline Scalar& coeffRef(Index index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
{
eigen_internal_assert(index >= 0 && index < size());
return m_storage.data()[index];
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
{
static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
return this->operator()(std::array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
}
#else
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
{
return coeff(array<Index, 2>(i0, i1));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
{
return coeff(array<Index, 3>(i0, i1, i2));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
{
return coeff(array<Index, 4>(i0, i1, i2, i3));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
{
return coeff(array<Index, 5>(i0, i1, i2, i3, i4));
}
#endif
inline const Scalar& operator()(const std::array<Index, NumIndices>& indices) const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
{
eigen_assert(checkIndexRange(indices));
return coeff(indices);
}
inline const Scalar& operator()(Index index) const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
{
eigen_internal_assert(index >= 0 && index < size());
return coeff(index);
}
inline const Scalar& operator[](Index index) const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const
{
static_assert(NumIndices == 1, "The bracket operator is only for vectors, use the parenthesis operator instead.");
// The bracket operator is only for vectors, use the parenthesis operator instead.
EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
return coeff(index);
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
{
static_assert(sizeof...(otherIndices) + 2 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
return operator()(std::array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
}
#else
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
{
return coeffRef(array<Index, 2>(i0, i1));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
{
return coeffRef(array<Index, 3>(i0, i1, i2));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
{
return coeffRef(array<Index, 4>(i0, i1, i2, i3));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
{
return coeffRef(array<Index, 5>(i0, i1, i2, i3, i4));
}
#endif
inline Scalar& operator()(const std::array<Index, NumIndices>& indices)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
{
eigen_assert(checkIndexRange(indices));
return coeffRef(indices);
}
inline Scalar& operator()(Index index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index)
{
eigen_assert(index >= 0 && index < size());
return coeffRef(index);
}
inline Scalar& operator[](Index index)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index)
{
static_assert(NumIndices == 1, "The bracket operator is only for vectors, use the parenthesis operator instead.");
// The bracket operator is only for vectors, use the parenthesis operator instead
EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
return coeffRef(index);
}
inline Tensor()
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Tensor()
: m_storage()
{
}
inline Tensor(const Self& other)
: m_storage(other.m_storage)
{
}
inline Tensor(Self&& other)
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Tensor(const Self& other)
: m_storage(other.m_storage)
{
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline Tensor(Index firstDimension, IndexTypes... otherDimensions)
: m_storage()
: m_storage(internal::array_prod(array<Index, NumIndices>{{firstDimension, otherDimensions...}}), array<Index, NumIndices>{{firstDimension, otherDimensions...}})
{
static_assert(sizeof...(otherDimensions) + 1 == NumIndices, "Number of dimensions used to construct a tensor must be equal to the rank of the tensor.");
resize(std::array<Index, NumIndices>{{firstDimension, otherDimensions...}});
// The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
#else
inline explicit Tensor(Index dim1)
: m_storage(dim1, array<Index, 1>(dim1))
{
EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
inline explicit Tensor(Index dim1, Index dim2)
: m_storage(dim1*dim2, array<Index, 2>(dim1, dim2))
{
EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
inline explicit Tensor(Index dim1, Index dim2, Index dim3)
: m_storage(dim1*dim2*dim3, array<Index, 3>(dim1, dim2, dim3))
{
EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4)
: m_storage(dim1*dim2*dim3*dim4, array<Index, 4>(dim1, dim2, dim3, dim4))
{
EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
inline explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5)
      : m_storage(dim1*dim2*dim3*dim4*dim5, array<Index, 5>(dim1, dim2, dim3, dim4, dim5))
{
EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
#endif
inline Tensor(std::array<Index, NumIndices> dimensions)
: m_storage(internal::array_prod(dimensions), dimensions)
inline explicit Tensor(const array<Index, NumIndices>& dimensions)
: m_storage(internal::array_prod(dimensions), dimensions)
{
EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, ReadOnlyAccessors>& other)
{
typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
Assign assign(*this, other.derived());
resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, WriteAccessors>& other)
{
typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
Assign assign(*this, other.derived());
resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other)
{
typedef TensorAssignOp<Tensor, const Tensor> Assign;
Assign assign(*this, other);
resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
return *this;
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other)
{
typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
Assign assign(*this, other);
resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
return *this;
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
void resize(Index firstDimension, IndexTypes... otherDimensions)
{
static_assert(sizeof...(otherDimensions) + 1 == NumIndices, "Number of dimensions used to resize a tensor must be equal to the rank of the tensor.");
resize(std::array<Index, NumIndices>{{firstDimension, otherDimensions...}});
// The number of dimensions used to resize a tensor must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
resize(array<Index, NumIndices>{{firstDimension, otherDimensions...}});
}
#endif
void resize(const std::array<Index, NumIndices>& dimensions)
EIGEN_DEVICE_FUNC void resize(const array<Index, NumIndices>& dimensions)
{
std::size_t i;
Index size = Index(1);
@ -282,8 +367,17 @@ class Tensor
#endif
}
EIGEN_DEVICE_FUNC void resize(const DSizes<Index, NumIndices>& dimensions) {
array<Index, NumIndices> dims;
for (std::size_t i = 0; i < NumIndices; ++i) {
dims[i] = dimensions[i];
}
resize(dims);
}
protected:
bool checkIndexRange(const std::array<Index, NumIndices>& indices) const
bool checkIndexRange(const array<Index, NumIndices>& indices) const
{
using internal::array_apply_and_reduce;
using internal::array_zip_and_reduce;
@ -298,16 +392,16 @@ class Tensor
array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());
}
inline Index linearizedIndex(const std::array<Index, NumIndices>& indices) const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const
{
return internal::tensor_index_linearization_helper<Index, NumIndices, NumIndices - 1, Options&RowMajor>::run(indices, m_storage.dimensions());
if (Options&RowMajor) {
return m_storage.dimensions().IndexOfRowMajor(indices);
} else {
return m_storage.dimensions().IndexOfColMajor(indices);
}
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_H
/*
* kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
*/
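With the TensorBase parent and the expression-based assignment operators added above, tensors can now be built directly from expressions; a minimal sketch (not part of the commit):

static void tensor_expression_example()
{
  Eigen::Tensor<float, 2> a(3, 4);
  a.setZero();
  // Built from the cwise ops declared in TensorBase and evaluated through
  // TensorAssignOp / TensorExecutor by the converting constructor above.
  Eigen::Tensor<float, 2> b = a.square() + a.constant(1.0f);
  (void)b;
}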

View File

@ -0,0 +1,164 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
namespace Eigen {
/** \class TensorAssign
* \ingroup CXX11_Tensor_Module
*
* \brief The tensor assignment class.
*
  * This class represents the assignment of the values resulting from the evaluation of
* the rhs expression to the memory locations denoted by the lhs expression.
*/
namespace internal {
template<typename LhsXprType, typename RhsXprType>
struct traits<TensorAssignOp<LhsXprType, RhsXprType> >
{
typedef typename LhsXprType::Scalar Scalar;
typedef typename internal::packet_traits<Scalar>::type Packet;
typedef typename traits<LhsXprType>::StorageKind StorageKind;
typedef typename promote_index_type<typename traits<LhsXprType>::Index,
typename traits<RhsXprType>::Index>::type Index;
typedef typename LhsXprType::Nested LhsNested;
typedef typename RhsXprType::Nested RhsNested;
typedef typename remove_reference<LhsNested>::type _LhsNested;
typedef typename remove_reference<RhsNested>::type _RhsNested;
static const std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions;
static const int Layout = internal::traits<LhsXprType>::Layout;
enum {
Flags = 0,
};
};
template<typename LhsXprType, typename RhsXprType>
struct eval<TensorAssignOp<LhsXprType, RhsXprType>, Eigen::Dense>
{
typedef const TensorAssignOp<LhsXprType, RhsXprType>& type;
};
template<typename LhsXprType, typename RhsXprType>
struct nested<TensorAssignOp<LhsXprType, RhsXprType>, 1, typename eval<TensorAssignOp<LhsXprType, RhsXprType> >::type>
{
typedef TensorAssignOp<LhsXprType, RhsXprType> type;
};
} // end namespace internal
template<typename LhsXprType, typename RhsXprType>
class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType> >
{
public:
typedef typename Eigen::internal::traits<TensorAssignOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorAssignOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename LhsXprType::CoeffReturnType CoeffReturnType;
typedef typename LhsXprType::PacketReturnType PacketReturnType;
typedef typename Eigen::internal::nested<TensorAssignOp>::type Nested;
typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
: m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}
/** \returns the nested expressions */
EIGEN_DEVICE_FUNC
typename internal::remove_all<typename LhsXprType::Nested>::type&
lhsExpression() const { return *((typename internal::remove_all<typename LhsXprType::Nested>::type*)&m_lhs_xpr); }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename RhsXprType::Nested>::type&
rhsExpression() const { return m_rhs_xpr; }
protected:
typename internal::remove_all<typename LhsXprType::Nested>::type& m_lhs_xpr;
const typename internal::remove_all<typename RhsXprType::Nested>::type& m_rhs_xpr;
};
template<typename LeftArgType, typename RightArgType, typename Device>
struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
{
typedef TensorAssignOp<LeftArgType, RightArgType> XprType;
enum {
IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
m_leftImpl(op.lhsExpression(), device),
m_rightImpl(op.rhsExpression(), device)
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
// The dimensions of the lhs and the rhs tensors should be equal to prevent
// overflows and ensure the result is fully initialized.
    eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
}
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// TODO: use left impl instead if right impl dimensions are known at compile time.
return m_rightImpl.dimensions();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
m_leftImpl.evalSubExprsIfNeeded(NULL);
// If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non
// null value), attempt to evaluate the rhs expression in place. Returns true iff in place
// evaluation isn't supported and the caller still needs to manually assign the values generated
// by the rhs to the lhs.
return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_leftImpl.cleanup();
m_rightImpl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
}
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
{
return m_leftImpl.coeff(index);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
{
return m_leftImpl.template packet<LoadMode>(index);
}
private:
TensorEvaluator<LeftArgType, Device> m_leftImpl;
TensorEvaluator<RightArgType, Device> m_rightImpl;
};
}
#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H

View File

@ -0,0 +1,573 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H
#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H
namespace Eigen {
/** \class TensorBase
* \ingroup CXX11_Tensor_Module
*
* \brief The tensor base class.
*
* This class is the common parent of the Tensor and TensorMap class, thus
* making it possible to use either class interchangeably in expressions.
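*
* A minimal usage sketch (illustrative only, not part of this commit; the
* tensor below and its dimensions are assumptions):
* \code
* Eigen::Tensor<float, 2> t(3, 4);
* t.setRandom();
* // Expressions built through TensorBase are evaluated lazily on assignment.
* Eigen::Tensor<float, 2> u = (t + t.constant(1.0f)).sqrt();
* Eigen::Tensor<float, 0> s = u.sum();  // full reduction to a rank-0 tensor
* \endcode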
*/
template<typename Derived>
class TensorBase<Derived, ReadOnlyAccessors>
{
public:
typedef internal::traits<Derived> DerivedTraits;
typedef typename DerivedTraits::Scalar Scalar;
typedef typename DerivedTraits::Index Index;
typedef typename internal::remove_const<Scalar>::type CoeffReturnType;
typedef typename internal::packet_traits<CoeffReturnType>::type PacketReturnType;
static const int NumDimensions = DerivedTraits::NumDimensions;
// Generic nullary operation support.
template <typename CustomNullaryOp> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<CustomNullaryOp, const Derived>
nullaryExpr(const CustomNullaryOp& func) const {
return TensorCwiseNullaryOp<CustomNullaryOp, const Derived>(derived(), func);
}
// Coefficient-wise nullary operators
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived>
constant(const Scalar& value) const {
return nullaryExpr(internal::scalar_constant_op<Scalar>(value));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::UniformRandomGenerator<Scalar>, const Derived>
random() const {
return nullaryExpr(internal::UniformRandomGenerator<Scalar>());
}
template <typename RandomGenerator> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<RandomGenerator, const Derived>
random() const {
return nullaryExpr(RandomGenerator());
}
// Generic unary operation support.
template <typename CustomUnaryOp> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<CustomUnaryOp, const Derived>
unaryExpr(const CustomUnaryOp& func) const {
return TensorCwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func);
}
// Coefficient-wise unary operators
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived>
operator-() const {
return unaryExpr(internal::scalar_opposite_op<Scalar>());
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
sqrt() const {
return unaryExpr(internal::scalar_sqrt_op<Scalar>());
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived>
square() const {
return unaryExpr(internal::scalar_square_op<Scalar>());
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived>
cube() const {
return unaryExpr(internal::scalar_cube_op<Scalar>());
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived>
inverse() const {
return unaryExpr(internal::scalar_inverse_op<Scalar>());
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived>
exp() const {
return unaryExpr(internal::scalar_exp_op<Scalar>());
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived>
log() const {
return unaryExpr(internal::scalar_log_op<Scalar>());
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
abs() const {
return unaryExpr(internal::scalar_abs_op<Scalar>());
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived>
pow(Scalar exponent) const {
return unaryExpr(internal::scalar_pow_op<Scalar>(exponent));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
operator+ (Scalar rhs) const {
return unaryExpr(internal::scalar_add_op<Scalar>(rhs));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived>
operator- (Scalar rhs) const {
EIGEN_STATIC_ASSERT((std::numeric_limits<Scalar>::is_signed || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
return unaryExpr(internal::scalar_sub_op<Scalar>(rhs));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived>
operator* (Scalar rhs) const {
return unaryExpr(internal::scalar_multiple_op<Scalar>(rhs));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived>
operator/ (Scalar rhs) const {
// EIGEN_STATIC_ASSERT(!std::numeric_limits<Scalar>::is_integer, YOU_MADE_A_PROGRAMMING_MISTAKE);
return unaryExpr(internal::scalar_quotient1_op<Scalar>(rhs));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
cwiseMax(Scalar threshold) const {
return cwiseMax(constant(threshold));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
cwiseMin(Scalar threshold) const {
return cwiseMin(constant(threshold));
}
template <typename NewType> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_cast_op<Scalar, NewType>, const Derived>
cast() const {
return unaryExpr(internal::scalar_cast_op<Scalar, NewType>());
}
// Generic binary operation support.
template <typename CustomBinaryOp, typename OtherDerived> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>
binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const {
return TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>(derived(), other, func);
}
// Coefficient-wise binary operators.
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const Derived, const OtherDerived>
operator+(const OtherDerived& other) const {
return binaryExpr(other.derived(), internal::scalar_sum_op<Scalar>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const Derived, const OtherDerived>
operator-(const OtherDerived& other) const {
return binaryExpr(other.derived(), internal::scalar_difference_op<Scalar>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorCwiseBinaryOp<internal::scalar_product_op<Scalar>, const Derived, const OtherDerived>
operator*(const OtherDerived& other) const {
return binaryExpr(other.derived(), internal::scalar_product_op<Scalar>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>
operator/(const OtherDerived& other) const {
return binaryExpr(other.derived(), internal::scalar_quotient_op<Scalar>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived>
cwiseMax(const OtherDerived& other) const {
return binaryExpr(other.derived(), internal::scalar_max_op<Scalar>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived>
cwiseMin(const OtherDerived& other) const {
return binaryExpr(other.derived(), internal::scalar_min_op<Scalar>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorCwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>
operator&&(const OtherDerived& other) const {
return binaryExpr(other.derived(), internal::scalar_boolean_and_op());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorCwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>
operator||(const OtherDerived& other) const {
return binaryExpr(other.derived(), internal::scalar_boolean_or_op());
}
// Comparisons and tests.
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorCwiseBinaryOp<std::less<Scalar>, const Derived, const OtherDerived>
operator<(const OtherDerived& other) const {
return binaryExpr(other.derived(), std::less<Scalar>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorCwiseBinaryOp<std::less_equal<Scalar>, const Derived, const OtherDerived>
operator<=(const OtherDerived& other) const {
return binaryExpr(other.derived(), std::less_equal<Scalar>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorCwiseBinaryOp<std::greater<Scalar>, const Derived, const OtherDerived>
operator>(const OtherDerived& other) const {
return binaryExpr(other.derived(), std::greater<Scalar>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorCwiseBinaryOp<std::greater_equal<Scalar>, const Derived, const OtherDerived>
operator>=(const OtherDerived& other) const {
return binaryExpr(other.derived(), std::greater_equal<Scalar>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorCwiseBinaryOp<std::equal_to<Scalar>, const Derived, const OtherDerived>
operator==(const OtherDerived& other) const {
return binaryExpr(other.derived(), std::equal_to<Scalar>());
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorCwiseBinaryOp<std::not_equal_to<Scalar>, const Derived, const OtherDerived>
operator!=(const OtherDerived& other) const {
return binaryExpr(other.derived(), std::not_equal_to<Scalar>());
}
// Coefficient-wise ternary operators.
template<typename ThenDerived, typename ElseDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorSelectOp<const Derived, const ThenDerived, const ElseDerived>
select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const {
return TensorSelectOp<const Derived, const ThenDerived, const ElseDerived>(derived(), thenTensor.derived(), elseTensor.derived());
}
// Contractions.
typedef Eigen::IndexPair<Index> DimensionPair;
template<typename OtherDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorContractionOp<const Dimensions, const Derived, const OtherDerived>
contract(const OtherDerived& other, const Dimensions& dims) const {
return TensorContractionOp<const Dimensions, const Derived, const OtherDerived>(derived(), other.derived(), dims);
}
// Convolutions.
template<typename KernelDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived>
convolve(const KernelDerived& kernel, const Dimensions& dims) const {
return TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived>(derived(), kernel.derived(), dims);
}
// Reductions.
template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>
sum(const Dims& dims) const {
return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::SumReducer<CoeffReturnType>());
}
const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>
sum() const {
array<Index, NumDimensions> in_dims;
for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i;
return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::SumReducer<CoeffReturnType>());
}
template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived>
mean(const Dims& dims) const {
return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MeanReducer<CoeffReturnType>());
}
const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>
mean() const {
array<Index, NumDimensions> in_dims;
for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i;
return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MeanReducer<CoeffReturnType>());
}
template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived>
prod(const Dims& dims) const {
return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::ProdReducer<CoeffReturnType>());
}
const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>
prod() const {
array<Index, NumDimensions> in_dims;
for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i;
return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::ProdReducer<CoeffReturnType>());
}
template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived>
maximum(const Dims& dims) const {
return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MaxReducer<CoeffReturnType>());
}
const TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>
maximum() const {
array<Index, NumDimensions> in_dims;
for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i;
return TensorReductionOp<internal::MaxReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MaxReducer<CoeffReturnType>());
}
template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived>
minimum(const Dims& dims) const {
return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MinReducer<CoeffReturnType>());
}
const TensorReductionOp<internal::MinReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>
minimum() const {
array<Index, NumDimensions> in_dims;
for (int i = 0; i < NumDimensions; ++i) in_dims[i] = i;
return TensorReductionOp<internal::MinReducer<CoeffReturnType>, const array<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MinReducer<CoeffReturnType>());
}
template <typename Reducer, typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorReductionOp<Reducer, const Dims, const Derived>
reduce(const Dims& dims, const Reducer& reducer) const {
return TensorReductionOp<Reducer, const Dims, const Derived>(derived(), dims, reducer);
}
template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorBroadcastingOp<const Broadcast, const Derived>
broadcast(const Broadcast& broadcast) const {
return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), broadcast);
}
template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorConcatenationOp<Axis, const Derived, const OtherDerived>
concatenate(const OtherDerived& other, Axis axis) const {
return TensorConcatenationOp<Axis, const Derived, const OtherDerived>(derived(), other.derived(), axis);
}
template <typename PatchDims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorPatchOp<const PatchDims, const Derived>
extract_patches(const PatchDims& patch_dims) const {
return TensorPatchOp<const PatchDims, const Derived>(derived(), patch_dims);
}
template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorImagePatchOp<Rows, Cols, const Derived>
extract_image_patches() const {
return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, 1, 1, PADDING_SAME);
}
template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorImagePatchOp<Rows, Cols, const Derived>
extract_image_patches(const PaddingType padding_type) const {
return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, 1, 1, padding_type);
}
template <Index Rows, Index Cols> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorImagePatchOp<Rows, Cols, const Derived>
extract_image_patches(const Index stride, const PaddingType padding_type) const {
return TensorImagePatchOp<Rows, Cols, const Derived>(derived(), Rows, Cols, stride, stride, padding_type);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
extract_image_patches(const Index patch_rows, const Index patch_cols,
const Index row_stride = 1, const Index col_stride = 1) const {
return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
PADDING_SAME);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
extract_image_patches(const Index patch_rows, const Index patch_cols,
const Index row_stride, const Index col_stride,
const PaddingType padding_type) const {
return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
padding_type);
}
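// Illustrative note (an assumption, not from this commit): for a 4-d ColMajor
// input laid out as (channels, rows, cols, batches), extract_image_patches(3, 3)
// extracts every 3x3 patch with unit strides and SAME padding, while the
// extract_image_patches<3, 3>() variant fixes the patch size at compile time.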
// Morphing operators.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorLayoutSwapOp<const Derived>
swap_layout() const {
return TensorLayoutSwapOp<const Derived>(derived());
}
template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorReshapingOp<const NewDimensions, const Derived>
reshape(const NewDimensions& newDimensions) const {
return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions);
}
template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorSlicingOp<const StartIndices, const Sizes, const Derived>
slice(const StartIndices& startIndices, const Sizes& sizes) const {
return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes);
}
template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorChippingOp<DimId, const Derived>
chip(const Index offset) const {
return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorChippingOp<Dynamic, const Derived>
chip(const Index offset, const Index dim) const {
return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim);
}
template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorReverseOp<const ReverseDimensions, const Derived>
reverse(const ReverseDimensions& rev) const {
return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev);
}
template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorPaddingOp<const PaddingDimensions, const Derived>
pad(const PaddingDimensions& padding) const {
return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding);
}
template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorShufflingOp<const Shuffle, const Derived>
shuffle(const Shuffle& shuffle) const {
return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle);
}
template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorStridingOp<const Strides, const Derived>
stride(const Strides& strides) const {
return TensorStridingOp<const Strides, const Derived>(derived(), strides);
}
// Force the evaluation of the expression.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const TensorForcedEvalOp<const Derived> eval() const {
return TensorForcedEvalOp<const Derived>(derived());
}
protected:
template <typename Scalar, std::size_t NumIndices, int Options> friend class Tensor;
template <typename Scalar, int Options> friend class TensorVarDim;
template <typename OtherDerived, int AccessLevel> friend class TensorBase;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
};
template<typename Derived>
class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyAccessors> {
public:
typedef internal::traits<Derived> DerivedTraits;
typedef typename DerivedTraits::Scalar Scalar;
typedef typename DerivedTraits::Index Index;
typedef Scalar CoeffReturnType;
typedef typename internal::packet_traits<Scalar>::type PacketReturnType;
static const int NumDimensions = DerivedTraits::NumDimensions;
template <typename Scalar, std::size_t NumIndices, int Options> friend class Tensor;
template <typename Scalar, int Options> friend class TensorVarDim;
template <typename OtherDerived, int AccessLevel> friend class TensorBase;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& setZero() {
return setConstant(Scalar(0));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) {
return derived() = this->constant(val);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& setRandom() {
return derived() = this->random();
}
template <typename RandomGenerator> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& setRandom() {
return derived() = this->template random<RandomGenerator>();
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& setValues(
const typename internal::Initializer<Derived, NumDimensions>::InitList& vals) {
TensorEvaluator<Derived, DefaultDevice> eval(derived(), DefaultDevice());
internal::initialize_tensor<Derived, NumDimensions>(eval, vals);
return derived();
}
#endif // EIGEN_HAS_VARIADIC_TEMPLATES
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator+=(const OtherDerived& other) {
return derived() = derived() + other.derived();
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator-=(const OtherDerived& other) {
return derived() = derived() - other.derived();
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator*=(const OtherDerived& other) {
return derived() = derived() * other.derived();
}
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator/=(const OtherDerived& other) {
return derived() = derived() / other.derived();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
TensorLayoutSwapOp<Derived>
swap_layout() const {
return TensorLayoutSwapOp<Derived>(derived());
}
template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
TensorReshapingOp<const NewDimensions, Derived>
reshape(const NewDimensions& newDimensions) const {
return TensorReshapingOp<const NewDimensions, Derived>(derived(), newDimensions);
}
template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
TensorSlicingOp<const StartIndices, const Sizes, Derived>
slice(const StartIndices& startIndices, const Sizes& sizes) const {
return TensorSlicingOp<const StartIndices, const Sizes, Derived>(derived(), startIndices, sizes);
}
template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
TensorChippingOp<DimId, Derived>
chip(const Index offset) const {
return TensorChippingOp<DimId, Derived>(derived(), offset, DimId);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
TensorChippingOp<Dynamic, Derived>
chip(const Index offset, const Index dim) const {
return TensorChippingOp<Dynamic, Derived>(derived(), offset, dim);
}
template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
TensorShufflingOp<const Shuffle, Derived>
shuffle(const Shuffle& shuffle) const {
return TensorShufflingOp<const Shuffle, Derived>(derived(), shuffle);
}
template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
TensorStridingOp<const Strides, Derived>
stride(const Strides& strides) const {
return TensorStridingOp<const Strides, Derived>(derived(), strides);
}
// Select the device on which to evaluate the expression.
template <typename DeviceType>
TensorDevice<Derived, DeviceType> device(const DeviceType& device) {
return TensorDevice<Derived, DeviceType>(device, derived());
}
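// Illustrative note (an assumption, not from this commit): assigning through
// device() evaluates the expression on the chosen device, e.g.
//   result.device(my_gpu_device) = a + b;
// where my_gpu_device is a hypothetical Eigen::GpuDevice instance.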
protected:
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H

View File

@ -0,0 +1,341 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
namespace Eigen {
/** \class TensorBroadcasting
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor broadcasting class.
*
*
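* A hedged usage sketch (the tensor and broadcast factors are assumptions, not
* part of this commit):
* \code
* Eigen::Tensor<float, 2> t(2, 3);
* t.setRandom();
* Eigen::array<int, 2> bcast{{2, 2}};              // replicate each dimension twice
* Eigen::Tensor<float, 2> b = t.broadcast(bcast);  // result has dimensions (4, 6)
* \endcode
*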
*/
namespace internal {
template<typename Broadcast, typename XprType>
struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename packet_traits<Scalar>::type Packet;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
};
template<typename Broadcast, typename XprType>
struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense>
{
typedef const TensorBroadcastingOp<Broadcast, XprType>& type;
};
template<typename Broadcast, typename XprType>
struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorBroadcastingOp<Broadcast, XprType> >::type>
{
typedef TensorBroadcastingOp<Broadcast, XprType> type;
};
} // end namespace internal
template<typename Broadcast, typename XprType>
class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested;
typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast)
: m_xpr(expr), m_broadcast(broadcast) {}
EIGEN_DEVICE_FUNC
const Broadcast& broadcast() const { return m_broadcast; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
protected:
typename XprType::Nested m_xpr;
const Broadcast m_broadcast;
};
// Eval as rvalue
template<typename Broadcast, typename ArgType, typename Device>
struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
{
typedef TensorBroadcastingOp<Broadcast, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
const Broadcast& broadcast = op.broadcast();
for (int i = 0; i < NumDims; ++i) {
eigen_assert(input_dims[i] > 0);
m_dimensions[i] = input_dims[i] * broadcast[i];
}
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_inputStrides[0] = 1;
m_outputStrides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
}
} else {
m_inputStrides[NumDims-1] = 1;
m_outputStrides[NumDims-1] = 1;
for (int i = NumDims-2; i >= 0; --i) {
m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
}
}
}
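// Worked example (illustrative, not in the original): with ColMajor input
// dimensions (2, 3) and broadcast factors (2, 2), the output dimensions are
// (4, 6), m_inputStrides == (1, 2) and m_outputStrides == (1, 4). Output index 5,
// i.e. coordinate (1, 1), maps back to input coordinate (1 % 2, 1 % 3) == (1, 1),
// i.e. input index 3, in coeffColMajor() below.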
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const
{
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return coeffColMajor(index);
} else {
return coeffRowMajor(index);
}
}
// TODO: attempt to speed this up. The integer divisions and modulo are slow
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
{
Index inputIndex = 0;
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStrides[i];
if (internal::index_statically_eq<Broadcast>()(i, 1)) {
eigen_assert(idx < m_impl.dimensions()[i]);
inputIndex += idx * m_inputStrides[i];
} else {
if (internal::index_statically_eq<InputDimensions>()(i, 1)) {
eigen_assert(idx % m_impl.dimensions()[i] == 0);
} else {
inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
}
}
index -= idx * m_outputStrides[i];
}
if (internal::index_statically_eq<Broadcast>()(0, 1)) {
eigen_assert(index < m_impl.dimensions()[0]);
inputIndex += index;
} else {
if (internal::index_statically_eq<InputDimensions>()(0, 1)) {
eigen_assert(index % m_impl.dimensions()[0] == 0);
} else {
inputIndex += (index % m_impl.dimensions()[0]);
}
}
return m_impl.coeff(inputIndex);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
{
Index inputIndex = 0;
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStrides[i];
if (internal::index_statically_eq<Broadcast>()(i, 1)) {
eigen_assert(idx < m_impl.dimensions()[i]);
inputIndex += idx * m_inputStrides[i];
} else {
if (internal::index_statically_eq<InputDimensions>()(i, 1)) {
eigen_assert(idx % m_impl.dimensions()[i] == 0);
} else {
inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
}
}
index -= idx * m_outputStrides[i];
}
if (internal::index_statically_eq<Broadcast>()(NumDims-1, 1)) {
eigen_assert(index < m_impl.dimensions()[NumDims-1]);
inputIndex += index;
} else {
if (internal::index_statically_eq<InputDimensions>()(NumDims-1, 1)) {
eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
} else {
inputIndex += (index % m_impl.dimensions()[NumDims-1]);
}
}
return m_impl.coeff(inputIndex);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const
{
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return packetColMajor<LoadMode>(index);
} else {
return packetRowMajor<LoadMode>(index);
}
}
// Ignore the LoadMode and always use unaligned loads since we can't guarantee
// the alignment at compile time.
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
const Index originalIndex = index;
Index inputIndex = 0;
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStrides[i];
if (internal::index_statically_eq<Broadcast>()(i, 1)) {
eigen_assert(idx < m_impl.dimensions()[i]);
inputIndex += idx * m_inputStrides[i];
} else {
if (internal::index_statically_eq<InputDimensions>()(i, 1)) {
eigen_assert(idx % m_impl.dimensions()[i] == 0);
} else {
inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
}
}
index -= idx * m_outputStrides[i];
}
Index innermostLoc;
if (internal::index_statically_eq<Broadcast>()(0, 1)) {
eigen_assert(index < m_impl.dimensions()[0]);
innermostLoc = index;
} else {
if (internal::index_statically_eq<InputDimensions>()(0, 1)) {
eigen_assert(index % m_impl.dimensions()[0] == 0);
innermostLoc = 0;
} else {
innermostLoc = index % m_impl.dimensions()[0];
}
}
inputIndex += innermostLoc;
// TODO: this could be extended to the second dimension if we're not
// broadcasting along the first dimension, and so on.
if (innermostLoc + packetSize <= m_impl.dimensions()[0]) {
return m_impl.template packet<Unaligned>(inputIndex);
} else {
EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
values[0] = m_impl.coeff(inputIndex);
for (int i = 1; i < packetSize; ++i) {
values[i] = coeffColMajor(originalIndex+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
const Index originalIndex = index;
Index inputIndex = 0;
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStrides[i];
if (internal::index_statically_eq<Broadcast>()(i, 1)) {
eigen_assert(idx < m_impl.dimensions()[i]);
inputIndex += idx * m_inputStrides[i];
} else {
if (internal::index_statically_eq<InputDimensions>()(i, 1)) {
eigen_assert(idx % m_impl.dimensions()[i] == 0);
} else {
inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
}
}
index -= idx * m_outputStrides[i];
}
Index innermostLoc;
if (internal::index_statically_eq<Broadcast>()(NumDims-1, 1)) {
eigen_assert(index < m_impl.dimensions()[NumDims-1]);
innermostLoc = index;
} else {
if (internal::index_statically_eq<InputDimensions>()(NumDims-1, 1)) {
eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
innermostLoc = 0;
} else {
innermostLoc = index % m_impl.dimensions()[NumDims-1];
}
}
inputIndex += innermostLoc;
// TODO: this could be extended to the second dimension if we're not
// broadcasting along the first dimension, and so on.
if (innermostLoc + packetSize <= m_impl.dimensions()[NumDims-1]) {
return m_impl.template packet<Unaligned>(inputIndex);
} else {
EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
values[0] = m_impl.coeff(inputIndex);
for (int i = 1; i < packetSize; ++i) {
values[i] = coeffRowMajor(originalIndex+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
}
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H

View File

@ -0,0 +1,363 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
namespace Eigen {
/** \class TensorChippingOp
* \ingroup CXX11_Tensor_Module
*
* \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor.
*
*
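* A hedged usage sketch (the tensor below is an assumption, not part of this
* commit):
* \code
* Eigen::Tensor<float, 3> t(4, 5, 6);
* t.setRandom();
* // Fix dimension 1 at offset 2: the result is a rank-2 tensor of dimensions (4, 6).
* Eigen::Tensor<float, 2> slice = t.chip<1>(2);
* \endcode
*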
*/
namespace internal {
template<DenseIndex DimId, typename XprType>
struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename packet_traits<Scalar>::type Packet;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions - 1;
static const int Layout = XprTraits::Layout;
};
template<DenseIndex DimId, typename XprType>
struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense>
{
typedef const TensorChippingOp<DimId, XprType>& type;
};
template<DenseIndex DimId, typename XprType>
struct nested<TensorChippingOp<DimId, XprType>, 1, typename eval<TensorChippingOp<DimId, XprType> >::type>
{
typedef TensorChippingOp<DimId, XprType> type;
};
template <DenseIndex DimId>
struct DimensionId
{
DimensionId(DenseIndex dim) {
eigen_assert(dim == DimId);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
return DimId;
}
};
template <>
struct DimensionId<Dynamic>
{
DimensionId(DenseIndex dim) : actual_dim(dim) {
eigen_assert(dim >= 0);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
return actual_dim;
}
private:
const DenseIndex actual_dim;
};
} // end namespace internal
template<DenseIndex DimId, typename XprType>
class TensorChippingOp : public TensorBase<TensorChippingOp<DimId, XprType> >
{
public:
typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorChippingOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested;
typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim)
: m_xpr(expr), m_offset(offset), m_dim(dim) {
}
EIGEN_DEVICE_FUNC
const Index offset() const { return m_offset; }
EIGEN_DEVICE_FUNC
const Index dim() const { return m_dim.actualDim(); }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other)
{
typedef TensorAssignOp<TensorChippingOp, const TensorChippingOp> Assign;
Assign assign(*this, other);
static const bool Vectorize = TensorEvaluator<const Assign, DefaultDevice>::PacketAccess;
internal::TensorExecutor<const Assign, DefaultDevice, Vectorize>::run(assign, DefaultDevice());
return *this;
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other)
{
typedef TensorAssignOp<TensorChippingOp, const OtherDerived> Assign;
Assign assign(*this, other);
static const bool Vectorize = TensorEvaluator<const Assign, DefaultDevice>::PacketAccess;
internal::TensorExecutor<const Assign, DefaultDevice, Vectorize>::run(assign, DefaultDevice());
return *this;
}
protected:
typename XprType::Nested m_xpr;
const Index m_offset;
const internal::DimensionId<DimId> m_dim;
};
// Eval as rvalue
template<DenseIndex DimId, typename ArgType, typename Device>
struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
{
typedef TensorChippingOp<DimId, ArgType> XprType;
static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
static const int NumDims = NumInputDims-1;
typedef typename XprType::Index Index;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
enum {
// Alignment can't be guaranteed at compile time since it depends on the
// slice offsets.
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
{
// We could also support the case where NumInputDims==1 if needed.
EIGEN_STATIC_ASSERT(NumInputDims >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(NumInputDims > m_dim.actualDim());
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
int j = 0;
for (int i = 0; i < NumInputDims; ++i) {
if (i != m_dim.actualDim()) {
m_dimensions[j] = input_dims[i];
++j;
}
}
m_stride = 1;
m_inputStride = 1;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = 0; i < m_dim.actualDim(); ++i) {
m_stride *= input_dims[i];
m_inputStride *= input_dims[i];
}
} else {
for (int i = NumInputDims-1; i > m_dim.actualDim(); --i) {
m_stride *= input_dims[i];
m_inputStride *= input_dims[i];
}
}
m_inputStride *= input_dims[m_dim.actualDim()];
m_inputOffset = m_stride * op.offset();
}
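// Worked example (illustrative, not in the original): chipping dimension 1 at
// offset 2 of a ColMajor tensor with dimensions (4, 5, 6) gives output
// dimensions (4, 6), m_stride == 4, m_inputStride == 20 and m_inputOffset == 8.
// Output index 5 then maps to input index (5 / 4) * 20 + 8 + (5 % 4) == 29,
// i.e. input coordinate (1, 2, 1), in srcCoeff() below.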
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
return m_impl.coeff(srcCoeff(index));
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(m_stride == 1);
Index inputIndex = index * m_inputStride + m_inputOffset;
EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
for (int i = 0; i < packetSize; ++i) {
values[i] = m_impl.coeff(inputIndex);
inputIndex += m_inputStride;
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
} else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims - 1) ||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
// m_stride is always greater than index, so let's avoid the integer division.
eigen_assert(m_stride > index);
return m_impl.template packet<LoadMode>(index + m_inputOffset);
} else {
const Index idx = index / m_stride;
const Index rem = index - idx * m_stride;
if (rem + packetSize <= m_stride) {
Index inputIndex = idx * m_inputStride + m_inputOffset + rem;
return m_impl.template packet<LoadMode>(inputIndex);
} else {
// Crossing the stride boundary. Fall back to the slow path.
EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
for (int i = 0; i < packetSize; ++i) {
values[i] = coeff(index);
++index;
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
Scalar* result = m_impl.data();
if (m_dim.actualDim() == NumDims && result) {
return result + m_inputOffset;
} else {
return NULL;
}
}
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
Index inputIndex;
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(m_stride == 1);
inputIndex = index * m_inputStride + m_inputOffset;
} else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims-1) ||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
// m_stride is always greater than index, so let's avoid the integer division.
eigen_assert(m_stride > index);
inputIndex = index + m_inputOffset;
} else {
const Index idx = index / m_stride;
inputIndex = idx * m_inputStride + m_inputOffset;
index -= idx * m_stride;
inputIndex += index;
}
return inputIndex;
}
Dimensions m_dimensions;
Index m_stride;
Index m_inputOffset;
Index m_inputStride;
TensorEvaluator<ArgType, Device> m_impl;
const internal::DimensionId<DimId> m_dim;
const Device& m_device;
};
// Eval as lvalue
template<DenseIndex DimId, typename ArgType, typename Device>
struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
: public TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
{
typedef TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> Base;
typedef TensorChippingOp<DimId, ArgType> XprType;
static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
static const int NumDims = NumInputDims-1;
typedef typename XprType::Index Index;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
{
return this->m_impl.coeffRef(this->srcCoeff(index));
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) ||
(static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(this->m_stride == 1);
EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
Index inputIndex = index * this->m_inputStride + this->m_inputOffset;
for (int i = 0; i < packetSize; ++i) {
this->m_impl.coeffRef(inputIndex) = values[i];
inputIndex += this->m_inputStride;
}
} else if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) ||
(static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) {
// m_stride is always greater than index, so let's avoid the integer division.
eigen_assert(this->m_stride > index);
this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x);
} else {
const Index idx = index / this->m_stride;
const Index rem = index - idx * this->m_stride;
if (rem + packetSize <= this->m_stride) {
const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem;
this->m_impl.template writePacket<StoreMode>(inputIndex, x);
} else {
// Crossing the stride boundary. Fall back to the slow path.
EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
for (int i = 0; i < packetSize; ++i) {
this->coeffRef(index) = values[i];
++index;
}
}
}
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H

View File

@ -0,0 +1,258 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
namespace Eigen {
/** \class TensorConcatenationOp
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor concatenation class.
*
*
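* A hedged usage sketch (the tensors below are assumptions, not part of this
* commit):
* \code
* Eigen::Tensor<float, 2> a(2, 3), b(2, 4);
* a.setRandom();
* b.setRandom();
* // Concatenate along dimension 1: the result has dimensions (2, 7).
* Eigen::Tensor<float, 2> c = a.concatenate(b, 1);
* \endcode
*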
*/
namespace internal {
template<typename Axis, typename LhsXprType, typename RhsXprType>
struct traits<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
typedef typename promote_storage_type<typename LhsXprType::Scalar,
typename RhsXprType::Scalar>::ret Scalar;
typedef typename packet_traits<Scalar>::type Packet;
typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
typename traits<RhsXprType>::StorageKind>::ret StorageKind;
typedef typename promote_index_type<typename traits<LhsXprType>::Index,
typename traits<RhsXprType>::Index>::type Index;
typedef typename LhsXprType::Nested LhsNested;
typedef typename RhsXprType::Nested RhsNested;
typedef typename remove_reference<LhsNested>::type _LhsNested;
typedef typename remove_reference<RhsNested>::type _RhsNested;
static const int NumDimensions = traits<LhsXprType>::NumDimensions;
static const int Layout = traits<LhsXprType>::Layout;
enum { Flags = 0 };
};
template<typename Axis, typename LhsXprType, typename RhsXprType>
struct eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, Eigen::Dense>
{
typedef const TensorConcatenationOp<Axis, LhsXprType, RhsXprType>& type;
};
template<typename Axis, typename LhsXprType, typename RhsXprType>
struct nested<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, 1, typename eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >::type>
{
typedef TensorConcatenationOp<Axis, LhsXprType, RhsXprType> type;
};
} // end namespace internal
template<typename Axis, typename LhsXprType, typename RhsXprType>
class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors>
{
public:
typedef typename internal::traits<TensorConcatenationOp>::Scalar Scalar;
typedef typename internal::traits<TensorConcatenationOp>::Packet Packet;
typedef typename internal::traits<TensorConcatenationOp>::StorageKind StorageKind;
typedef typename internal::traits<TensorConcatenationOp>::Index Index;
typedef typename internal::nested<TensorConcatenationOp>::type Nested;
typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
typedef typename internal::promote_storage_type<typename LhsXprType::PacketReturnType,
typename RhsXprType::PacketReturnType>::ret PacketReturnType;
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis)
: m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename LhsXprType::Nested>::type&
lhsExpression() const { return m_lhs_xpr; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename RhsXprType::Nested>::type&
rhsExpression() const { return m_rhs_xpr; }
EIGEN_DEVICE_FUNC Axis axis() const { return m_axis; }
protected:
typename LhsXprType::Nested m_lhs_xpr;
typename RhsXprType::Nested m_rhs_xpr;
const Axis m_axis;
};
// Eval as rvalue
template<typename Axis, typename LeftArgType, typename RightArgType, typename Device>
struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
{
typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
static const int RightNumDims = internal::array_size<typename TensorEvaluator<RightArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis())
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(0 <= m_axis && m_axis < NumDims);
const Dimensions& lhs_dims = m_leftImpl.dimensions();
const Dimensions& rhs_dims = m_rightImpl.dimensions();
int i = 0;
for (; i < m_axis; ++i) {
eigen_assert(lhs_dims[i] > 0);
eigen_assert(lhs_dims[i] == rhs_dims[i]);
m_dimensions[i] = lhs_dims[i];
}
eigen_assert(lhs_dims[i] > 0); // Now i == m_axis.
eigen_assert(rhs_dims[i] > 0);
m_dimensions[i] = lhs_dims[i] + rhs_dims[i];
for (++i; i < NumDims; ++i) {
eigen_assert(lhs_dims[i] > 0);
eigen_assert(lhs_dims[i] == rhs_dims[i]);
m_dimensions[i] = lhs_dims[i];
}
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_leftStrides[0] = 1;
m_rightStrides[0] = 1;
m_outputStrides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_leftStrides[i] = m_leftStrides[i-1] * lhs_dims[i-1];
m_rightStrides[i] = m_rightStrides[i-1] * rhs_dims[i-1];
m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
}
} else {
m_leftStrides[NumDims - 1] = 1;
m_rightStrides[NumDims - 1] = 1;
m_outputStrides[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_leftStrides[i] = m_leftStrides[i+1] * lhs_dims[i+1];
m_rightStrides[i] = m_rightStrides[i+1] * rhs_dims[i+1];
m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
}
}
}
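// Worked example (illustrative, not in the original): concatenating ColMajor
// tensors of dimensions (2, 3) and (2, 4) along axis 1 yields output dimensions
// (2, 7), with m_leftStrides == (1, 2), m_rightStrides == (1, 2) and
// m_outputStrides == (1, 2). An output coordinate with subs[1] >= 3 is routed
// to the rhs after subtracting 3 from subs[1] in coeff() below.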
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
// TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear?
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/)
{
m_leftImpl.evalSubExprsIfNeeded(NULL);
m_rightImpl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
{
m_leftImpl.cleanup();
m_rightImpl.cleanup();
}
// TODO(phli): attempt to speed this up. The integer divisions and modulo are slow.
// See CL/76180724 comments for more ideas.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
// Collect dimension-wise indices (subs).
array<Index, NumDims> subs;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
subs[i] = index / m_outputStrides[i];
index -= subs[i] * m_outputStrides[i];
}
subs[0] = index;
} else {
for (int i = 0; i < NumDims - 1; ++i) {
subs[i] = index / m_outputStrides[i];
index -= subs[i] * m_outputStrides[i];
}
subs[NumDims - 1] = index;
}
const Dimensions& left_dims = m_leftImpl.dimensions();
if (subs[m_axis] < left_dims[m_axis]) {
Index left_index;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
left_index = subs[0];
for (int i = 1; i < NumDims; ++i) {
left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
}
} else {
left_index = subs[NumDims - 1];
for (int i = NumDims - 2; i >= 0; --i) {
left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
}
}
return m_leftImpl.coeff(left_index);
} else {
subs[m_axis] -= left_dims[m_axis];
const Dimensions& right_dims = m_rightImpl.dimensions();
Index right_index;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
right_index = subs[0];
for (int i = 1; i < NumDims; ++i) {
right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
}
} else {
right_index = subs[NumDims - 1];
for (int i = NumDims - 2; i >= 0; --i) {
right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
}
}
return m_rightImpl.coeff(right_index);
}
}
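// Illustrative example (hypothetical sizes): concatenating a ColMajor lhs of dims (4, 3)
// with a rhs of dims (4, 4) along axis 1 yields dims (4, 7) and m_outputStrides = {1, 4}.
// For index 18 this gives subs = {2, 4}; since subs[1] >= left_dims[1] == 3, the value is
// read from the rhs with subs[1] -= 3, i.e. rhs linear index 2 + 1 * 4 = 6.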
// TODO(phli): Add a real vectorization.
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
for (int i = 0; i < packetSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_leftStrides;
array<Index, NumDims> m_rightStrides;
TensorEvaluator<LeftArgType, Device> m_leftImpl;
TensorEvaluator<RightArgType, Device> m_rightImpl;
const Axis m_axis;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
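A minimal usage sketch of the concatenation expression implemented above (illustrative only; it assumes the top-level unsupported Tensor header and the TensorBase::concatenate entry point provided elsewhere in this module):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(4, 3), b(4, 4);
  a.setConstant(1.0f);
  b.setConstant(2.0f);
  // All dimensions except the concatenation axis (here axis 1) must match.
  Eigen::Tensor<float, 2> c = a.concatenate(b, 1);  // result has dims (4, 7)
  return 0;
}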

View File

@ -0,0 +1,992 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
namespace Eigen {
/** \class TensorContraction
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor contraction class.
*
*
*/
namespace internal {
enum {
Rhs = 0,
Lhs = 1,
};
/*
* Implementation of the Eigen blas_data_mapper class for tensors.
*/
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
size_t packet_size, bool inner_dim_contiguous>
class BaseTensorContractionMapper {
public:
EIGEN_DEVICE_FUNC
BaseTensorContractionMapper(const Tensor& tensor,
const nocontract_t& nocontract_strides,
const nocontract_t& ij_strides,
const contract_t& contract_strides,
const contract_t& k_strides) :
m_tensor(tensor),
m_nocontract_strides(nocontract_strides),
m_ij_strides(ij_strides),
m_contract_strides(contract_strides),
m_k_strides(k_strides) { }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar operator()(Index row) const {
// column major assumption
return operator()(row, 0);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const {
return m_tensor.coeff(computeIndex(row, col));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const {
const bool left = (side == Lhs);
Index nocontract_val = left ? row : col;
Index linidx = 0;
for (int i = array_size<nocontract_t>::value - 1; i > 0; i--) {
const Index idx = nocontract_val / m_ij_strides[i];
linidx += idx * m_nocontract_strides[i];
nocontract_val -= idx * m_ij_strides[i];
}
if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
if (side == Lhs && inner_dim_contiguous) {
eigen_assert(m_nocontract_strides[0] == 1);
linidx += nocontract_val;
} else {
linidx += nocontract_val * m_nocontract_strides[0];
}
}
Index contract_val = left ? col : row;
for (int i = array_size<contract_t>::value - 1; i > 0; i--) {
const Index idx = contract_val / m_k_strides[i];
linidx += idx * m_contract_strides[i];
contract_val -= idx * m_k_strides[i];
}
EIGEN_STATIC_ASSERT(array_size<contract_t>::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
if (side == Rhs && inner_dim_contiguous) {
eigen_assert(m_contract_strides[0] == 1);
linidx += contract_val;
} else {
linidx += contract_val * m_contract_strides[0];
}
return linidx;
}
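// Illustrative example (hypothetical setup, Lhs side): contracting a ColMajor rank-3
// tensor of dims (2, 3, 4) on dim 1 leaves non-contracting dims {0, 2}, so
// m_nocontract_strides = {1, 6}, m_ij_strides = {1, 2}, m_contract_strides = {2} and
// m_k_strides = {1}. computeIndex(row = 3, col = 1) then accumulates 1*6 + 1 + 1*2 = 9,
// the ColMajor offset of tensor element (1, 1, 1).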
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const {
const bool left = (side == Lhs);
Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
Index linidx[2] = {0, 0};
for (int i = array_size<nocontract_t>::value - 1; i > 0; i--) {
const Index idx0 = nocontract_val[0] / m_ij_strides[i];
const Index idx1 = nocontract_val[1] / m_ij_strides[i];
linidx[0] += idx0 * m_nocontract_strides[i];
linidx[1] += idx1 * m_nocontract_strides[i];
nocontract_val[0] -= idx0 * m_ij_strides[i];
nocontract_val[1] -= idx1 * m_ij_strides[i];
}
if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
if (side == Lhs && inner_dim_contiguous) {
eigen_assert(m_nocontract_strides[0] == 1);
linidx[0] += nocontract_val[0];
linidx[1] += nocontract_val[1];
} else {
linidx[0] += nocontract_val[0] * m_nocontract_strides[0];
linidx[1] += nocontract_val[1] * m_nocontract_strides[0];
}
}
Index contract_val[2] = {left ? col : row, left ? col : row + distance};
for (int i = array_size<contract_t>::value - 1; i > 0; i--) {
const Index idx0 = contract_val[0] / m_k_strides[i];
const Index idx1 = contract_val[1] / m_k_strides[i];
linidx[0] += idx0 * m_contract_strides[i];
linidx[1] += idx1 * m_contract_strides[i];
contract_val[0] -= idx0 * m_k_strides[i];
contract_val[1] -= idx1 * m_k_strides[i];
}
EIGEN_STATIC_ASSERT(array_size<contract_t>::value > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
if (side == Rhs && inner_dim_contiguous) {
eigen_assert(m_contract_strides[0] == 1);
linidx[0] += contract_val[0];
linidx[1] += contract_val[1];
} else {
linidx[0] += contract_val[0] * m_contract_strides[0];
linidx[1] += contract_val[1] * m_contract_strides[0];
}
return IndexPair<Index>(linidx[0], linidx[1]);
}
Index firstAligned(Index size) const {
return size;
}
Index stride() const {
return 1;
}
protected:
const Tensor m_tensor;
const nocontract_t m_nocontract_strides;
const nocontract_t m_ij_strides;
const contract_t m_contract_strides;
const contract_t m_k_strides;
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
size_t packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionInputMapper;
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
size_t packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionSubMapper {
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
typedef TensorContractionInputMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
typedef Self LinearMapper;
EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
: m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { }
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
return m_base_mapper(i + m_vert_offset, m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const {
return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
return m_base_mapper.loadPacket(i + m_vert_offset, m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
return m_base_mapper.loadPacket(i + m_vert_offset, j + m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
return m_base_mapper.loadHalfPacket(i + m_vert_offset, m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset);
}
template <typename PacketT, int AlignmentType>
EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
EIGEN_STATIC_ASSERT((internal::is_same<PacketT, Packet>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((AlignmentType == Aligned || Alignment == Unaligned), YOU_MADE_A_PROGRAMMING_MISTAKE);
return loadPacket(i);
}
template <typename Packet>
bool aligned(Index /*i*/) const {
return false;
}
private:
const ParentMapper& m_base_mapper;
const Index m_vert_offset;
const Index m_horiz_offset;
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
size_t packet_size = (Tensor::PacketAccess ? packet_traits<Scalar>::size : 1),
bool inner_dim_contiguous = false, bool inner_dim_reordered = (side != Lhs), int Alignment=Unaligned>
class TensorContractionInputMapper
: public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous> {
public:
typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous> Base;
typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
typedef SubMapper VectorMapper;
TensorContractionInputMapper(const Tensor& tensor,
const nocontract_t& nocontract_strides,
const nocontract_t& ij_strides,
const contract_t& contract_strides,
const contract_t& k_strides)
: Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
return SubMapper(*this, i, j);
}
EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
return VectorMapper(*this, i, j);
}
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
// whole method makes column major assumption
// don't need to add offsets for now (because operator handles that)
// current code assumes packet size must be a multiple of 2
EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) {
const Index index = this->computeIndex(i, j);
eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1);
return this->m_tensor.template packet<Alignment>(index);
}
const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1);
const Index first = indexPair.first;
const Index last = indexPair.second;
// We can always do optimized packet reads from left hand side right now, because
// the vertical matrix dimension on the left hand side is never contracting.
// On the right hand side we need to check if the contracting dimensions may have
// been shuffled first.
if (Tensor::PacketAccess &&
(side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) &&
(last - first) == (packet_size - 1)) {
return this->m_tensor.template packet<Alignment>(first);
}
EIGEN_ALIGN_DEFAULT Scalar data[packet_size];
data[0] = this->m_tensor.coeff(first);
for (Index k = 1; k < packet_size - 1; k += 2) {
const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
data[k] = this->m_tensor.coeff(internal_pair.first);
data[k + 1] = this->m_tensor.coeff(internal_pair.second);
}
data[packet_size - 1] = this->m_tensor.coeff(last);
return pload<Packet>(data);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
// whole method makes column major assumption
// don't need to add offsets for now (because operator handles that)
const Index half_packet_size = unpacket_traits<HalfPacket>::size;
if (half_packet_size == packet_size) {
return loadPacket(i, j);
}
EIGEN_ALIGN_DEFAULT Scalar data[half_packet_size];
for (Index k = 0; k < half_packet_size; k++) {
data[k] = operator()(i + k, j);
}
return pload<HalfPacket>(data);
}
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionInputMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment>
: public BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous> {
public:
typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous> Base;
typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
typedef SubMapper VectorMapper;
TensorContractionInputMapper(const Tensor& tensor,
const nocontract_t& nocontract_strides,
const nocontract_t& ij_strides,
const contract_t& contract_strides,
const contract_t& k_strides)
: Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
return SubMapper(*this, i, j);
}
EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
return VectorMapper(*this, i, j);
}
typedef typename packet_traits<Scalar>::type Packet;
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
EIGEN_ALIGN_DEFAULT Scalar data[1];
data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
return pload<typename packet_traits<Scalar>::type>(data);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const {
return loadPacket(i, j);
}
};
template <size_t n> struct max_n_1 {
static const size_t size = n;
};
template <> struct max_n_1<0> {
static const size_t size = 1;
};
template<typename Dimensions, typename LhsXprType, typename RhsXprType>
struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
typedef typename internal::promote_storage_type<typename LhsXprType::Scalar,
typename RhsXprType::Scalar>::ret Scalar;
typedef typename internal::packet_traits<Scalar>::type Packet;
typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
typename traits<RhsXprType>::StorageKind>::ret StorageKind;
typedef typename promote_index_type<typename traits<LhsXprType>::Index,
typename traits<RhsXprType>::Index>::type Index;
typedef typename LhsXprType::Nested LhsNested;
typedef typename RhsXprType::Nested RhsNested;
typedef typename remove_reference<LhsNested>::type _LhsNested;
typedef typename remove_reference<RhsNested>::type _RhsNested;
// From NumDims below.
static const int NumDimensions = max_n_1<traits<LhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value>::size;
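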
static const int Layout = traits<LhsXprType>::Layout;
enum {
Flags = 0,
};
};
template<typename Dimensions, typename LhsXprType, typename RhsXprType>
struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, Eigen::Dense>
{
typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType>& type;
};
template<typename Dimensions, typename LhsXprType, typename RhsXprType>
struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >::type>
{
typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType> type;
};
template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename Device_>
struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_>, Device_> > {
typedef Indices_ Indices;
typedef LeftArgType_ LeftArgType;
typedef RightArgType_ RightArgType;
typedef Device_ Device;
// From NumDims below.
static const int NumDimensions = max_n_1<traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value>::size;
};
} // end namespace internal
template<typename Indices, typename LhsXprType, typename RhsXprType>
class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorContractionOp>::Packet Packet;
typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
typedef typename internal::promote_storage_type<typename LhsXprType::PacketReturnType,
typename RhsXprType::PacketReturnType>::ret PacketReturnType;
typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(
const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims)
: m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {}
EIGEN_DEVICE_FUNC
const Indices& indices() const { return m_indices; }
/** \returns the nested expressions */
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename LhsXprType::Nested>::type&
lhsExpression() const { return m_lhs_xpr; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename RhsXprType::Nested>::type&
rhsExpression() const { return m_rhs_xpr; }
protected:
typename LhsXprType::Nested m_lhs_xpr;
typename RhsXprType::Nested m_rhs_xpr;
const Indices m_indices;
};
template<bool cond> struct Cond {};
template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
const T1& choose(Cond<true>, const T1& first, const T2&) {
return first;
}
template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
const T2& choose(Cond<false>, const T1&, const T2& second) {
return second;
}
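// Cond/choose implement a compile-time selection: in the evaluator constructor below,
// choose(Cond<Layout == ColMajor>(), op.lhsExpression(), op.rhsExpression()) picks the
// lhs expression for ColMajor and the rhs expression for RowMajor, without requiring
// the two expressions to share a common type as a ternary operator would.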
template<typename Derived>
struct TensorContractionEvaluatorBase
{
typedef typename internal::traits<Derived>::Indices Indices;
typedef typename internal::traits<Derived>::LeftArgType LeftArgType;
typedef typename internal::traits<Derived>::RightArgType RightArgType;
typedef typename internal::traits<Derived>::Device Device;
typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef typename XprType::Packet Packet;
typedef typename XprType::Index Index;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
enum {
IsAligned = true,
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
CoordAccess = false, // to be implemented
};
// Most of the code is assuming that both input tensors are ColMajor. If the
// inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
// If we want to compute A * B = C, where A is LHS and B is RHS, the code
// will pretend B is LHS and A is RHS.
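// For example, a RowMajor product C = A * B is evaluated as the ColMajor product
// C^T = B^T * A^T: the RowMajor buffer of an m x k matrix is bit-identical to the
// ColMajor buffer of its k x m transpose, and the ColMajor result buffer holding C^T
// is exactly the RowMajor layout of C.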
typedef typename internal::conditional<
static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
typedef typename internal::conditional<
static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
static const int LDims =
internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
static const int RDims =
internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
static const int ContractDims = internal::array_size<Indices>::value;
static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size;
typedef array<Index, LDims> left_dim_mapper_t;
typedef array<Index, RDims> right_dim_mapper_t;
typedef array<Index, ContractDims> contract_t;
typedef array<Index, internal::max_n_1<LDims - ContractDims>::size> left_nocontract_t;
typedef array<Index, internal::max_n_1<RDims - ContractDims>::size> right_nocontract_t;
typedef DSizes<Index, NumDims> Dimensions;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
TensorContractionEvaluatorBase(const XprType& op, const Device& device)
: m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
op.lhsExpression(), op.rhsExpression()), device),
m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
op.rhsExpression(), op.lhsExpression()), device),
m_device(device),
m_result(NULL) {
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert((internal::array_size<contract_t>::value > 0) && "Must contract on some indices");
DSizes<Index, LDims> eval_left_dims;
DSizes<Index, RDims> eval_right_dims;
array<IndexPair<Index>, ContractDims> eval_op_indices;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
// For ColMajor, we keep using the existing dimensions
for (int i = 0; i < LDims; i++) {
eval_left_dims[i] = m_leftImpl.dimensions()[i];
}
for (int i = 0; i < RDims; i++) {
eval_right_dims[i] = m_rightImpl.dimensions()[i];
}
// We keep the pairs of contracting indices.
for (int i = 0; i < ContractDims; i++) {
eval_op_indices[i].first = op.indices()[i].first;
eval_op_indices[i].second = op.indices()[i].second;
}
} else {
// For RowMajor, we need to reverse the existing dimensions
for (int i = 0; i < LDims; i++) {
eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1];
}
for (int i = 0; i < RDims; i++) {
eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1];
}
// We need to flip all the pairs of contracting indices as well as
// reversing the dimensions.
for (int i = 0; i < ContractDims; i++) {
eval_op_indices[i].first = LDims - 1 - op.indices()[i].second;
eval_op_indices[i].second = RDims - 1 - op.indices()[i].first;
}
}
array<Index, LDims> lhs_strides;
lhs_strides[0] = 1;
for (int i = 0; i < LDims-1; ++i) {
lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i];
}
array<Index, RDims> rhs_strides;
rhs_strides[0] = 1;
for (int i = 0; i < RDims-1; ++i) {
rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i];
}
m_i_strides[0] = 1;
m_j_strides[0] = 1;
m_k_strides[0] = 1;
m_i_size = 1;
m_j_size = 1;
m_k_size = 1;
// To compute the output dimensions, we simply concatenate the non-contracting
// dimensions of the left and then the right tensor. Additionally, we also
// compute the strides corresponding to the left non-contracting
// dimensions and right non-contracting dimensions.
m_lhs_inner_dim_contiguous = true;
int dim_idx = 0;
int nocontract_idx = 0;
for (int i = 0; i < LDims; i++) {
// find if we are contracting on index i of left tensor
bool contracting = false;
for (int j = 0; j < ContractDims; j++) {
if (eval_op_indices[j].first == i) {
contracting = true;
break;
}
}
if (!contracting) {
// add dimension size to output dimensions
m_dimensions[dim_idx] = eval_left_dims[i];
m_left_nocontract_strides[nocontract_idx] = lhs_strides[i];
if (dim_idx != i) {
m_lhs_inner_dim_contiguous = false;
}
if (nocontract_idx+1 < internal::array_size<left_nocontract_t>::value) {
m_i_strides[nocontract_idx+1] =
m_i_strides[nocontract_idx] * eval_left_dims[i];
} else {
m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i];
}
dim_idx++;
nocontract_idx++;
}
}
nocontract_idx = 0;
for (int i = 0; i < RDims; i++) {
bool contracting = false;
// find if we are contracting on index i of right tensor
for (int j = 0; j < ContractDims; j++) {
if (eval_op_indices[j].second == i) {
contracting = true;
break;
}
}
if (!contracting) {
m_dimensions[dim_idx] = eval_right_dims[i];
if (nocontract_idx+1 < internal::array_size<right_nocontract_t>::value) {
m_j_strides[nocontract_idx+1] =
m_j_strides[nocontract_idx] * eval_right_dims[i];
} else {
m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i];
}
m_right_nocontract_strides[nocontract_idx] = rhs_strides[i];
dim_idx++;
nocontract_idx++;
}
}
// Now compute the strides corresponding to the contracting dimensions. We
// assumed above that non-contracting axes are represented in the same order
// in the matrix as they are in the tensor. This is not the case for
// contracting axes. As the contracting axes must be of the same size in
// each tensor, we'll only look at the first tensor here.
m_rhs_inner_dim_contiguous = true;
m_rhs_inner_dim_reordered = false;
for (int i = 0; i < ContractDims; i++) {
Index left = eval_op_indices[i].first;
Index right = eval_op_indices[i].second;
Index size = eval_left_dims[left];
eigen_assert(size == eval_right_dims[right] &&
"Contraction axes must be same size");
if (i+1 < internal::array_size<contract_t>::value) {
m_k_strides[i+1] = m_k_strides[i] * size;
} else {
m_k_size = m_k_strides[i] * size;
}
m_left_contracting_strides[i] = lhs_strides[left];
m_right_contracting_strides[i] = rhs_strides[right];
if (i > 0 && right < eval_op_indices[i-1].second) {
m_rhs_inner_dim_reordered = true;
}
if (right != i) {
m_rhs_inner_dim_contiguous = false;
}
}
// Scalar case. We represent the result as a 1d tensor of size 1.
if (LDims + RDims == 2 * ContractDims) {
m_dimensions[0] = 1;
}
// If the layout is RowMajor, we need to reverse the m_dimensions
if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
std::swap(m_dimensions[i], m_dimensions[j]);
}
}
}
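// Illustrative example (hypothetical sizes): contracting a ColMajor lhs of dims (2, 3, 4)
// with a rhs of dims (3, 5) along the index pair {1, 0} yields output dims (2, 4, 5).
// The evaluation reduces to an m x k times k x n matrix product with m = m_i_size = 8,
// n = m_j_size = 5 and k = m_k_size = 3, and m_left_nocontract_strides = {1, 6},
// m_right_nocontract_strides = {3}, m_left_contracting_strides = {2},
// m_right_contracting_strides = {1}.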
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
m_leftImpl.evalSubExprsIfNeeded(NULL);
m_rightImpl.evalSubExprsIfNeeded(NULL);
if (data) {
evalTo(data);
return false;
} else {
m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
evalTo(m_result);
return true;
}
}
EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
if (this->m_lhs_inner_dim_contiguous) {
if (this->m_rhs_inner_dim_contiguous) {
if (this->m_rhs_inner_dim_reordered) {
static_cast<const Derived*>(this)->template evalProduct<true, true, true, Unaligned>(buffer);
}
else {
static_cast<const Derived*>(this)->template evalProduct<true, true, false, Unaligned>(buffer);
}
}
else {
if (this->m_rhs_inner_dim_reordered) {
static_cast<const Derived*>(this)->template evalProduct<true, false, true, Unaligned>(buffer);
}
else {
static_cast<const Derived*>(this)->template evalProduct<true, false, false, Unaligned>(buffer);
}
}
}
else {
if (this->m_rhs_inner_dim_contiguous) {
if (this->m_rhs_inner_dim_reordered) {
static_cast<const Derived*>(this)->template evalProduct<false, true, true, Unaligned>(buffer);
}
else {
static_cast<const Derived*>(this)->template evalProduct<false, true, false, Unaligned>(buffer);
}
}
else {
if (this->m_rhs_inner_dim_reordered) {
static_cast<const Derived*>(this)->template evalProduct<false, false, true, Unaligned>(buffer);
}
else {
static_cast<const Derived*>(this)->template evalProduct<false, false, false, Unaligned>(buffer);
}
}
}
}
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
void evalGemv(Scalar* buffer) const {
const Index rows = m_i_size;
const Index cols = m_k_size;
typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
const int lhs_packet_size = internal::packet_traits<LhsScalar>::size;
const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
LeftEvaluator, left_nocontract_t,
contract_t, lhs_packet_size,
lhs_inner_dim_contiguous,
false, Unaligned> LhsMapper;
typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
RightEvaluator, right_nocontract_t,
contract_t, rhs_packet_size,
rhs_inner_dim_contiguous,
rhs_inner_dim_reordered, Unaligned> RhsMapper;
LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides,
m_left_contracting_strides, m_k_strides);
RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides,
m_right_contracting_strides, m_k_strides);
const Scalar alpha(1);
const Index resIncr(1);
// zero out the result buffer (which must be of size at least rows * sizeof(Scalar))
m_device.memset(buffer, 0, rows * sizeof(Scalar));
internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,false,RhsScalar,RhsMapper,false>::run(
rows, cols, lhs, rhs,
buffer, resIncr, alpha);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_leftImpl.cleanup();
m_rightImpl.cleanup();
if (m_result != NULL) {
m_device.deallocate(m_result);
m_result = NULL;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
return m_result[index];
}
template<int LoadMode>
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
return internal::ploadt<Packet, LoadMode>(m_result + index);
}
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
// Prevent assignment
TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&);
Dimensions m_dimensions;
contract_t m_k_strides;
contract_t m_left_contracting_strides;
contract_t m_right_contracting_strides;
bool m_lhs_inner_dim_contiguous;
bool m_rhs_inner_dim_contiguous;
bool m_rhs_inner_dim_reordered;
left_nocontract_t m_i_strides;
right_nocontract_t m_j_strides;
left_nocontract_t m_left_nocontract_strides;
right_nocontract_t m_right_nocontract_strides;
Index m_i_size;
Index m_j_size;
Index m_k_size;
TensorEvaluator<EvalLeftArgType, Device> m_leftImpl;
TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
const Device& m_device;
Scalar* m_result;
};
// evaluator for default device
template<typename Indices, typename LeftArgType, typename RightArgType, typename Device>
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> :
public TensorContractionEvaluatorBase<
TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> > {
typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
typedef TensorContractionEvaluatorBase<Self> Base;
typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef typename XprType::Packet Packet;
typedef typename XprType::Index Index;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
enum {
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
};
// Most of the code is assuming that both input tensors are ColMajor. If the
// inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
// If we want to compute A * B = C, where A is LHS and B is RHS, the code
// will pretend B is LHS and A is RHS.
typedef typename internal::conditional<
static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
typedef typename internal::conditional<
static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
static const int LDims =
internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
static const int RDims =
internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
static const int ContractDims = internal::array_size<Indices>::value;
typedef array<Index, LDims> left_dim_mapper_t;
typedef array<Index, RDims> right_dim_mapper_t;
typedef array<Index, ContractDims> contract_t;
typedef array<Index, internal::max_n_1<LDims - ContractDims>::size> left_nocontract_t;
typedef array<Index, internal::max_n_1<RDims - ContractDims>::size> right_nocontract_t;
static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size;
// Could we use NumDimensions here?
typedef DSizes<Index, NumDims> Dimensions;
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
Base(op, device) { }
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
void evalProduct(Scalar* buffer) const {
if (this->m_j_size == 1) {
this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
return;
}
evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
}
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
// columns in left side, rows in right side
const Index k = this->m_k_size;
// rows in left side
const Index m = this->m_i_size;
// columns in right side
const Index n = this->m_j_size;
// zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
// define mr, nr, and all of my data mapper types
typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
const Index nr = Traits::nr;
const Index mr = Traits::mr;
typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
const int lhs_packet_size = internal::packet_traits<LhsScalar>::size;
const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
LeftEvaluator, left_nocontract_t,
contract_t, lhs_packet_size,
lhs_inner_dim_contiguous,
false, Unaligned> LhsMapper;
typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
RightEvaluator, right_nocontract_t,
contract_t, rhs_packet_size,
rhs_inner_dim_contiguous,
rhs_inner_dim_reordered, Unaligned> RhsMapper;
typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
// Declare GEBP packing and kernel structs
internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs;
internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
// initialize data mappers
LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
this->m_left_contracting_strides, this->m_k_strides);
RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
this->m_right_contracting_strides, this->m_k_strides);
OutputMapper output(buffer, m);
typedef typename internal::gemm_blocking_space<ColMajor, LhsScalar, RhsScalar, Dynamic, Dynamic, Dynamic> BlockingType;
// Sizes of the blocks to load in cache. See the Goto paper for details.
BlockingType blocking(m, n, k, 1, true);
const Index kc = blocking.kc();
const Index mc = (std::min)(m, blocking.mc());
const Index nc = (std::min)(n, blocking.nc());
const Index sizeA = mc * kc;
const Index sizeB = kc * nc;
LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)));
RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)));
for(Index i2=0; i2<m; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,m)-i2;
for (Index k2 = 0; k2 < k; k2 += kc) {
// make sure we don't overshoot right edge of left matrix, then pack vertical panel
const Index actual_kc = (std::min)(k2 + kc, k) - k2;
pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
// series of horizontal blocks
for (Index j2 = 0; j2 < n; j2 += nc) {
// make sure we don't overshoot right edge of right matrix, then pack block
const Index actual_nc = (std::min)(j2 + nc, n) - j2;
pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
// call gebp (matrix kernel)
// The parameters here are copied from Eigen's GEMM implementation
gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, 1.0, -1, -1, 0, 0);
}
}
}
this->m_device.deallocate(blockA);
this->m_device.deallocate(blockB);
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
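A minimal usage sketch of the contraction expression implemented above (illustrative only; it assumes the top-level unsupported Tensor header):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(2, 3), b(3, 4);
  a.setRandom();
  b.setRandom();
  // Contract dim 1 of a with dim 0 of b: the tensor equivalent of a 2x3 * 3x4 matrix product.
  Eigen::array<Eigen::IndexPair<int>, 1> dims;
  dims[0] = Eigen::IndexPair<int>(1, 0);
  Eigen::Tensor<float, 2> c = a.contract(b, dims);  // result has dims (2, 4)
  return 0;
}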

File diff suppressed because it is too large

View File

@ -0,0 +1,382 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
// evaluator for thread pool device
#ifdef EIGEN_USE_THREADS
namespace Eigen {
namespace internal {
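// The two structs below bundle the arguments of the tasks enqueued on the thread pool:
// packLhsArg describes one LHS block to pack, and packRhsAndKernelArg describes the RHS
// packing plus the GEBP kernel calls that consume the already-packed LHS blocks.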
template<typename LhsScalar, typename LhsMapper, typename Index>
struct packLhsArg {
LhsScalar* blockA;
const LhsMapper& lhs;
const Index m_start;
const Index k_start;
const Index mc;
const Index kc;
};
template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index>
struct packRhsAndKernelArg {
const std::vector<LhsScalar*>* blockAs;
RhsScalar* blockB;
const RhsMapper& rhs;
OutputMapper& output;
const Index m;
const Index k;
const Index n;
const Index mc;
const Index kc;
const Index nc;
const Index num_threads;
const Index num_blockAs;
const Index max_m;
const Index k_block_idx;
const Index m_block_idx;
const Index n_block_idx;
const Index m_blocks;
const Index n_blocks;
std::vector<Promise>* kernel_promises;
const std::vector<Future>* lhs_futures;
const bool need_to_pack;
};
} // end namespace internal
template<typename Indices, typename LeftArgType, typename RightArgType>
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> > {
typedef ThreadPoolDevice Device;
typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
typedef TensorContractionEvaluatorBase<Self> Base;
typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef typename XprType::Packet Packet;
typedef typename XprType::Index Index;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
enum {
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
};
// Most of the code is assuming that both input tensors are ColMajor. If the
// inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
// If we want to compute A * B = C, where A is LHS and B is RHS, the code
// will pretend B is LHS and A is RHS.
typedef typename internal::conditional<
static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
typedef typename internal::conditional<
static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
static const int LDims =
internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
static const int RDims =
internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
static const int ContractDims = internal::array_size<Indices>::value;
typedef array<Index, LDims> left_dim_mapper_t;
typedef array<Index, RDims> right_dim_mapper_t;
typedef array<Index, ContractDims> contract_t;
typedef array<Index, internal::max_n_1<LDims - ContractDims>::size> left_nocontract_t;
typedef array<Index, internal::max_n_1<RDims - ContractDims>::size> right_nocontract_t;
static const int NumDims = internal::max_n_1<LDims + RDims - 2 * ContractDims>::size;
typedef DSizes<Index, NumDims> Dimensions;
// typedefs needed in evalTo
typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
TensorEvaluator(const XprType& op, const Device& device) :
Base(op, device) {}
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
void evalProduct(Scalar* buffer) const {
if (this->m_j_size == 1) {
this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
return;
}
evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
}
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
void evalGemm(Scalar* buffer) const {
// columns in left side, rows in right side
const Index k = this->m_k_size;
// rows in left side
const Index m = this->m_i_size;
// columns in right side
const Index n = this->m_j_size;
// zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
const int lhs_packet_size = internal::packet_traits<LhsScalar>::size;
const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
LeftEvaluator, left_nocontract_t,
contract_t, lhs_packet_size,
lhs_inner_dim_contiguous,
false, Unaligned> LhsMapper;
typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
RightEvaluator, right_nocontract_t,
contract_t, rhs_packet_size,
rhs_inner_dim_contiguous,
rhs_inner_dim_reordered, Unaligned> RhsMapper;
typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
// TODO: packing could be faster sometimes if we supported row major tensor mappers
typedef internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr,
Traits::LhsProgress, ColMajor> LhsPacker;
typedef internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> RhsPacker;
// TODO: replace false, false with conjugate values?
typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
Traits::mr, Traits::nr, false, false> GebpKernel;
typedef internal::packLhsArg<LhsScalar, LhsMapper, Index> packLArg;
typedef internal::packRhsAndKernelArg<LhsScalar, RhsScalar, RhsMapper, OutputMapper, Index> packRKArg;
// initialize data mappers
LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
this->m_left_contracting_strides, this->m_k_strides);
RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
this->m_right_contracting_strides, this->m_k_strides);
OutputMapper output(buffer, m);
LhsPacker pack_lhs;
// compute block sizes (which depend on number of threads)
const Index num_threads = this->m_device.numThreads();
Index mc = m;
Index nc = n;
Index kc = k;
internal::computeProductBlockingSizes<LhsScalar,RhsScalar,1>(kc, mc, nc, num_threads);
eigen_assert(mc <= m);
eigen_assert(nc <= n);
eigen_assert(kc <= k);
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
const Index k_blocks = CEIL_DIV(k, kc);
const Index n_blocks = CEIL_DIV(n, nc);
const Index m_blocks = CEIL_DIV(m, mc);
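// Example (hypothetical sizes): m = n = k = 1024 with mc = nc = kc = 256 gives
// 4 blocks along each of the three dimensions.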
const int sizeA = mc * kc;
const int sizeB = kc * nc;
/* cout << "m: " << m << " n: " << n << " k: " << k << endl;
cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl;
cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl;
cout << "num threads: " << num_threads << endl;
*/
// note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB
// aren't 16 byte aligned segfaults will happen due to SIMD instructions
// note: You can get away with allocating just a single blockA and using offsets into it, and still meet
// the alignment requirements, with the assumption that
// (Traits::mr * sizeof(ResScalar)) % 16 == 0
const Index numBlockAs = (std::min)(num_threads, m_blocks);
std::vector<LhsScalar *> blockAs;
blockAs.reserve(num_threads);
for (int i = 0; i < num_threads; i++) {
blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))));
}
// To circumvent alignment issues, I'm just going to separately allocate the memory for each thread
// TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful.
// Other options: (1) reuse memory when a thread finishes. con: tricky
// (2) allocate block B memory in each thread. con: overhead
std::vector<RhsScalar *> blockBs;
blockBs.reserve(n_blocks);
for (int i = 0; i < n_blocks; i++) {
blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))));
}
// lhs_futures starts with all null futures
std::vector<Future> lhs_futures(num_threads);
// this should really be numBlockAs * n_blocks;
const Index num_kernel_promises = num_threads * n_blocks;
std::vector<Promise> kernel_promises(num_kernel_promises);
std::vector<Future> kernel_futures(num_kernel_promises);
for (int i = 0; i < kernel_promises.size(); ++i) {
kernel_promises[i].set_value();
kernel_futures[i] = kernel_promises[i].get_future();
}
for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
const Index k_start = k_block_idx * kc;
// make sure we don't overshoot right edge of left matrix
const Index actual_kc = (std::min)(k_start + kc, k) - k_start;
for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) {
const int num_blocks = (std::min)(m_blocks-m_block_idx, numBlockAs);
for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) {
const Index m_start = mt_block_idx * mc;
const Index actual_mc = (std::min)(m_start + mc, m) - m_start;
eigen_assert(actual_mc > 0);
int blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads;
for (int i = 0; i < n_blocks; ++i) {
int future_id = (blockAId * n_blocks + i);
wait_until_ready(&kernel_futures[future_id]);
kernel_promises[future_id] = Promise();
kernel_futures[future_id] = kernel_promises[future_id].get_future();
}
const packLArg arg = {
blockAs[blockAId], // blockA
lhs, // lhs
m_start, // m
k_start, // k
actual_mc, // mc
actual_kc, // kc
};
lhs_futures[blockAId] =
this->m_device.enqueue(&Self::packLhs<packLArg, LhsPacker>, arg);
}
// now start kernels.
const Index m_base_start = m_block_idx * mc;
const bool need_to_pack = m_block_idx == 0;
for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) {
const Index n_start = n_block_idx * nc;
const Index actual_nc = (std::min)(n_start + nc, n) - n_start;
// first make sure the previous kernels are all done before overwriting rhs. Also wait if
// we're going to start new k. In both cases need_to_pack is true.
if (need_to_pack) {
for (int i = num_blocks; i < num_threads; ++i) {
int blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads;
int future_id = (blockAId * n_blocks + n_block_idx);
wait_until_ready(&kernel_futures[future_id]);
}
}
packRKArg arg = {
&blockAs, // blockA
blockBs[n_block_idx], // blockB
rhs, // rhs
output, // output
m_base_start, // m
k_start, // k
n_start, // n
mc, // mc
actual_kc, // kc
actual_nc, // nc
num_threads,
numBlockAs,
m,
k_block_idx,
m_block_idx,
n_block_idx, // n_block_idx
m_blocks, // m_blocks
n_blocks, // n_blocks
&kernel_promises, // kernel_promises
&lhs_futures, // lhs_futures
need_to_pack, // need_to_pack
};
this->m_device.enqueueNoFuture(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg);
}
}
}
// Make sure all the kernels are done.
for (int i = 0; i < kernel_futures.size(); ++i) {
wait_until_ready(&kernel_futures[i]);
}
// deallocate all of the memory for both A and B's
for (int i = 0; i < blockAs.size(); i++) {
this->m_device.deallocate(blockAs[i]);
}
for (int i = 0; i < blockBs.size(); i++) {
this->m_device.deallocate(blockBs[i]);
}
#undef CEIL_DIV
}
/*
* Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing
* the LHS block, check that all of the kernels that worked on the same
* mt_block_idx in the previous m_block are done.
*/
template <typename packLArg, typename LhsPacker>
static void packLhs(const packLArg arg) {
// perform actual packing
LhsPacker pack_lhs;
pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc);
}
/*
* Packs a RHS block of size (kc, nc) starting at (k, n) after checking that
* all kernels in the previous block are done.
* Then for each LHS future, we wait on the future and then call GEBP
* on the area packed by the future (which starts at
* blockA + future_idx * mt * kc) on the LHS and with the full packed
* RHS block.
* The output of this GEBP is written to output(m + i * mt, n).
*/
template <typename packRKArg, typename RhsPacker, typename GebpKernel>
static void packRhsAndKernel(packRKArg arg) {
if (arg.need_to_pack) {
RhsPacker pack_rhs;
pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc);
}
GebpKernel gebp;
for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) {
const Index m_base_start = arg.m + arg.mc*mt_block_idx;
if (m_base_start < arg.max_m) {
int blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads;
wait_until_ready(&(*arg.lhs_futures)[blockAId]);
const Index actual_mc = (std::min)(m_base_start + arg.mc, arg.max_m) - m_base_start;
gebp(arg.output.getSubMapper(m_base_start, arg.n),
(*arg.blockAs)[blockAId], arg.blockB,
actual_mc, arg.kc, arg.nc, 1.0, -1, -1, 0, 0);
const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx;
(*arg.kernel_promises)[set_idx].set_value();
}
}
}
};
} // end namespace Eigen
#endif // EIGEN_USE_THREADS
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H

View File

@ -0,0 +1,912 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
namespace Eigen {
/** \class TensorConvolution
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor convolution class.
*
*
*/
namespace internal {
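// IndexMapper precomputes the strides needed to translate between the index space used
// by the convolution kernels and the linear index space of the input and output tensors:
// the convolved dimensions are moved to the front, and the remaining dimensions are
// collapsed into "planes" addressed via mapCudaInputPlaneToTensorInputOffset and
// mapCudaOutputPlaneToTensorOutputOffset.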
template <typename Index, typename InputDims, size_t NumKernelDims> class IndexMapper {
public:
IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
const array<Index, NumKernelDims>& indices) {
array<Index, NumDims> dimensions = input_dims;
for (int i = 0; i < NumKernelDims; ++i) {
const Index index = indices[i];
const Index input_dim = input_dims[index];
const Index kernel_dim = kernel_dims[i];
const Index result_dim = input_dim - kernel_dim + 1;
dimensions[index] = result_dim;
}
array<Index, NumDims> inputStrides;
array<Index, NumDims> outputStrides;
for (int i = 0; i < NumDims; ++i) {
if (i > 0) {
inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
} else {
inputStrides[0] = 1;
outputStrides[0] = 1;
}
}
array<Index, NumDims> cudaInputDimensions;
array<Index, NumDims> cudaOutputDimensions;
array<Index, NumDims> tmp = dimensions;
array<Index, NumDims> ordering;
for (int i = 0; i < NumKernelDims; ++i) {
ordering[i] = indices[i];
tmp[indices[i]] = -1;
cudaInputDimensions[i] = input_dims[ordering[i]];
cudaOutputDimensions[i] = dimensions[ordering[i]];
}
int written = NumKernelDims;
for (int i = 0; i < NumDims; ++i) {
if (tmp[i] >= 0) {
ordering[written] = i;
cudaInputDimensions[written] = input_dims[i];
cudaOutputDimensions[written] = dimensions[i];
++written;
}
}
for (int i = 0; i < NumDims; ++i) {
m_inputStrides[i] = inputStrides[ordering[i]];
m_outputStrides[i] = outputStrides[ordering[i]];
}
for (int i = 0; i < NumDims; ++i) {
if (i > NumKernelDims) {
m_cudaInputStrides[i] = m_cudaInputStrides[i-1] * cudaInputDimensions[i-1];
m_cudaOutputStrides[i] = m_cudaOutputStrides[i-1] * cudaOutputDimensions[i-1];
} else {
m_cudaInputStrides[i] = 1;
m_cudaOutputStrides[i] = 1;
}
}
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
Index inputIndex = 0;
for (int d = NumDims - 1; d > NumKernelDims; --d) {
const Index idx = p / m_cudaInputStrides[d];
inputIndex += idx * m_inputStrides[d];
p -= idx * m_cudaInputStrides[d];
}
inputIndex += p * m_inputStrides[NumKernelDims];
return inputIndex;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
Index outputIndex = 0;
for (int d = NumDims - 1; d > NumKernelDims; --d) {
const Index idx = p / m_cudaOutputStrides[d];
outputIndex += idx * m_outputStrides[d];
p -= idx * m_cudaOutputStrides[d];
}
outputIndex += p * m_outputStrides[NumKernelDims];
return outputIndex;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
return i * m_inputStrides[0];
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
return i * m_outputStrides[0];
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
return i * m_inputStrides[0] + j*m_inputStrides[1];
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
return i * m_outputStrides[0] + j * m_outputStrides[1];
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
return i * m_inputStrides[0] + j*m_inputStrides[1] + k*m_inputStrides[2];
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
return i * m_outputStrides[0] + j*m_outputStrides[1] + k*m_outputStrides[2];
}
private:
static const size_t NumDims = internal::array_size<InputDims>::value;
array<Index, NumDims> m_inputStrides;
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_cudaInputStrides;
array<Index, NumDims> m_cudaOutputStrides;
};
template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
typedef typename promote_storage_type<typename InputXprType::Scalar,
typename KernelXprType::Scalar>::ret Scalar;
typedef typename packet_traits<Scalar>::type Packet;
typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
typename traits<KernelXprType>::StorageKind>::ret StorageKind;
typedef typename promote_index_type<typename traits<InputXprType>::Index,
typename traits<KernelXprType>::Index>::type Index;
typedef typename InputXprType::Nested LhsNested;
typedef typename KernelXprType::Nested RhsNested;
typedef typename remove_reference<LhsNested>::type _LhsNested;
typedef typename remove_reference<RhsNested>::type _RhsNested;
static const int NumDimensions = traits<InputXprType>::NumDimensions;
static const int Layout = traits<InputXprType>::Layout;
enum {
Flags = 0,
};
};
template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
{
typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
};
template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
{
typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
};
} // end namespace internal
template<typename Indices, typename InputXprType, typename KernelXprType>
class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType> >
{
public:
typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorConvolutionOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
typedef typename internal::promote_storage_type<typename InputXprType::PacketReturnType,
typename KernelXprType::PacketReturnType>::ret PacketReturnType;
typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
: m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Indices& indices() const { return m_indices; }
/** \returns the nested expressions */
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const typename internal::remove_all<typename InputXprType::Nested>::type&
inputExpression() const { return m_input_xpr; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const typename internal::remove_all<typename KernelXprType::Nested>::type&
kernelExpression() const { return m_kernel_xpr; }
protected:
typename InputXprType::Nested m_input_xpr;
typename KernelXprType::Nested m_kernel_xpr;
const Indices m_indices;
};
template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
{
typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
static const int NumKernelDims = internal::array_size<Indices>::value;
typedef typename XprType::Index Index;
typedef DSizes<Index, NumDims> Dimensions;
enum {
IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
Layout = TensorEvaluator<InputArgType, Device>::Layout,
CoordAccess = false, // to be implemented
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
// Only column major tensors are supported for now.
EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
m_inputStride[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1];
}
m_dimensions = m_inputImpl.dimensions();
for (int i = 0; i < NumKernelDims; ++i) {
const Index index = op.indices()[i];
const Index input_dim = input_dims[index];
const Index kernel_dim = kernel_dims[i];
const Index result_dim = input_dim - kernel_dim + 1;
m_dimensions[index] = result_dim;
if (i > 0) {
m_kernelStride[i] = m_kernelStride[i-1] * kernel_dims[i-1];
} else {
m_kernelStride[0] = 1;
}
m_indexStride[i] = m_inputStride[index];
}
m_outputStride[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1];
}
}
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
m_inputImpl.evalSubExprsIfNeeded(NULL);
preloadKernel();
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_inputImpl.cleanup();
if (m_local_kernel) {
m_device.deallocate((void*)m_kernel);
m_local_kernel = false;
}
m_kernel = NULL;
}
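// Note: evalTo() accumulates (+=) into the destination, so the caller is
// expected to hand in a zero-initialized buffer.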
void evalTo(typename XprType::Scalar* buffer) {
evalSubExprsIfNeeded(NULL);
for (int i = 0; i < dimensions().TotalSize(); ++i) {
buffer[i] += coeff(i);
}
cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
CoeffReturnType result = CoeffReturnType(0);
convolve(firstInput(index), 0, NumKernelDims-1, result);
return result;
}
template<int LoadMode>
EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
{
const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
Index indices[2] = {index, index+PacketSize-1};
Index startInputs[2] = {0, 0};
for (int i = NumDims - 1; i > 0; --i) {
const Index idx0 = indices[0] / m_outputStride[i];
const Index idx1 = indices[1] / m_outputStride[i];
startInputs[0] += idx0 * m_inputStride[i];
startInputs[1] += idx1 * m_inputStride[i];
indices[0] -= idx0 * m_outputStride[i];
indices[1] -= idx1 * m_outputStride[i];
}
startInputs[0] += indices[0];
startInputs[1] += indices[1];
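// If the first and last packet lanes map to inputs exactly PacketSize-1
// apart, the whole packet reads from contiguous inputs and a single
// vectorized convolution suffices; otherwise fall back to per-lane scalar
// convolutions gathered into an aligned buffer below.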
if (startInputs[1]-startInputs[0] == PacketSize-1) {
PacketReturnType result = internal::pset1<PacketReturnType>(0);
convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
return result;
} else {
EIGEN_ALIGN_DEFAULT Scalar data[PacketSize];
data[0] = Scalar(0);
convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
for (int i = 1; i < PacketSize-1; ++i) {
data[i] = Scalar(0);
convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
}
data[PacketSize-1] = Scalar(0);
convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
return internal::pload<PacketReturnType>(data);
}
}
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
private:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
Index startInput = 0;
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStride[i];
startInput += idx * m_inputStride[i];
index -= idx * m_outputStride[i];
}
startInput += index;
return startInput;
}
EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
const Index input = firstIndex + j * m_indexStride[DimIndex];
const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
if (DimIndex > 0) {
convolve(input, kernel, DimIndex-1, accum);
} else {
accum += m_inputImpl.coeff(input) * m_kernel[kernel];
}
}
}
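// e.g. with NumKernelDims == 2, convolve(i, 0, 1, acc) (and likewise
// convolvePacket below) loops over the outer kernel dimension and recurses
// into DimIndex == 0, so acc accumulates
//   sum over (j0, j1) of input[i + j0*m_indexStride[0] + j1*m_indexStride[1]]
//                        * kernel[j0 + j1*m_kernelStride[1]]
// since m_kernelStride[0] == 1.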
template <typename Packet>
EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
const Index input = firstIndex + j * m_indexStride[DimIndex];
const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
if (DimIndex > 0) {
convolvePacket(input, kernel, DimIndex-1, accum);
} else {
accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
}
}
}
EIGEN_STRONG_INLINE void preloadKernel() {
// Don't make a local copy of the kernel unless we have to (i.e. it's an
// expression that needs to be evaluated)
const Scalar* in_place = m_kernelImpl.data();
if (in_place) {
m_kernel = in_place;
m_local_kernel = false;
} else {
size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
typedef TensorEvalToOp<const KernelArgType> EvalTo;
EvalTo evalToTmp(local, m_kernelArg);
internal::TensorExecutor<const EvalTo, Device, TensorEvaluator<KernelArgType, Device>::PacketAccess>::run(evalToTmp, m_device);
m_kernel = local;
m_local_kernel = true;
}
}
array<Index, NumDims> m_inputStride;
array<Index, NumDims> m_outputStride;
array<Index, NumKernelDims> m_indexStride;
array<Index, NumKernelDims> m_kernelStride;
TensorEvaluator<InputArgType, Device> m_inputImpl;
TensorEvaluator<KernelArgType, Device> m_kernelImpl;
Dimensions m_dimensions;
KernelArgType m_kernelArg;
const Scalar* m_kernel;
bool m_local_kernel;
const Device& m_device;
};
// Use an optimized implementation of the evaluation code for GPUs whenever possible.
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
template <int StaticKernelSize>
struct GetKernelSize {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
return StaticKernelSize;
}
};
template <>
struct GetKernelSize<Dynamic> {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
return kernelSize;
}
};
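// Dispatch sketch: GetKernelSize<4>()(k) always yields the compile-time
// constant 4, letting the #pragma unroll loops below fully unroll, while
// GetKernelSize<Dynamic>()(k) falls back to the runtime kernel size k.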
template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize>
__global__ void EigenConvolutionKernel1D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, float* buffer) {
extern __shared__ float s[];
const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
const int num_x_output = last_x - first_x + 1;
const int first_plane = blockIdx.y * blockDim.y;
const int plane_stride = blockDim.y * gridDim.y;
for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
// Load inputs to shared memory
const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = threadIdx.y * num_x_input;
#pragma unroll
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
s[i + plane_kernel_offset] = eval.coeff(tensor_index);
}
__syncthreads();
// Compute the convolution
const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
#pragma unroll
for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
const int kernel_offset = plane_kernel_offset + i;
float result = 0.0f;
#pragma unroll
for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
result += s[k + kernel_offset] * kernel[k];
}
const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
buffer[tensor_index] = result;
}
__syncthreads();
}
}
template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY>
__global__ void EigenConvolutionKernel2D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2> indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) {
extern __shared__ float s[];
const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
const int num_x_output = last_x - first_x + 1;
const int first_y = blockIdx.y * maxY;
const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
const int num_y_output = last_y - first_y + 1;
const int first_plane = blockIdx.z * blockDim.z;
const int plane_stride = blockDim.z * gridDim.z;
for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = threadIdx.z * num_y_input;
// Load inputs to shared memory
#pragma unroll
for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
const int input_offset = num_x_input * (j + plane_kernel_offset);
#pragma unroll
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
s[i + input_offset] = eval.coeff(tensor_index);
}
}
__syncthreads();
// Convolution
const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
#pragma unroll
for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
#pragma unroll
for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
float result = 0.0f;
#pragma unroll
for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
const int kernel_offset = kernelSizeX * l;
const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
#pragma unroll
for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
result += s[k + input_offset] * kernel[k + kernel_offset];
}
}
const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
buffer[tensor_index] = result;
}
}
__syncthreads();
}
}
template <typename InputEvaluator, typename Index, typename InputDims>
__global__ void EigenConvolutionKernel3D(InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3> indexMapper, const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t kernelSizeZ, float* buffer) {
extern __shared__ float s[];
// Load inputs to shared memory
const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
const int num_x_input = last_x - first_x + kernelSizeX;
const int first_y = blockIdx.y * maxY;
const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
const int num_y_input = last_y - first_y + kernelSizeY;
const int first_z = blockIdx.z * maxZ;
const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
const int num_z_input = last_z - first_z + kernelSizeZ;
for (int p = 0; p < numPlanes; ++p) {
const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = 0;
for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
}
}
}
__syncthreads();
// Convolution
const int num_z_output = last_z - first_z + 1;
const int num_y_output = last_y - first_y + 1;
const int num_x_output = last_x - first_x + 1;
const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
float result = 0.0f;
for (int n = 0; n < kernelSizeZ; ++n) {
for (int m = 0; m < kernelSizeY; ++m) {
for (int l = 0; l < kernelSizeX; ++l) {
result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
}
}
}
const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
buffer[tensor_index] = result;
}
}
}
__syncthreads();
}
}
template<typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
{
typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
static const int NumKernelDims = internal::array_size<Indices>::value;
typedef typename XprType::Index Index;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
enum {
IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
PacketAccess = false,
Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
CoordAccess = false, // to be implemented
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
: m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
// Only column major tensors are supported for now.
EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
m_dimensions = m_inputImpl.dimensions();
for (int i = 0; i < NumKernelDims; ++i) {
const Index index = op.indices()[i];
const Index input_dim = input_dims[index];
const Index kernel_dim = kernel_dims[i];
const Index result_dim = input_dim - kernel_dim + 1;
m_dimensions[index] = result_dim;
}
}
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
typedef typename InputArgType::Scalar Scalar;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
preloadKernel();
m_inputImpl.evalSubExprsIfNeeded(NULL);
if (data) {
executeEval(data);
return false;
} else {
m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
executeEval(m_buf);
return true;
}
}
EIGEN_STRONG_INLINE void cleanup() {
m_inputImpl.cleanup();
if (m_buf) {
m_device.deallocate(m_buf);
m_buf = NULL;
}
if (m_local_kernel) {
m_device.deallocate((void*)m_kernel);
m_local_kernel = false;
}
m_kernel = NULL;
}
EIGEN_STRONG_INLINE void preloadKernel() {
// Don't make a local copy of the kernel unless we have to (i.e. it's an
// expression that needs to be evaluated)
const Scalar* in_place = m_kernelImpl.data();
if (in_place) {
m_kernel = in_place;
m_local_kernel = false;
} else {
size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
typedef TensorEvalToOp<const KernelArgType> EvalTo;
EvalTo evalToTmp(local, m_kernelArg);
internal::TensorExecutor<const EvalTo, GpuDevice, TensorEvaluator<KernelArgType, GpuDevice>::PacketAccess>::run(evalToTmp, m_device);
m_kernel = local;
m_local_kernel = true;
}
}
static unsigned int ceil(unsigned int num, unsigned int denom) {
const unsigned int rounded_toward_zero = num / denom;
if (num > rounded_toward_zero * denom) {
return rounded_toward_zero + 1;
}
return rounded_toward_zero;
}
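// e.g. ceil(10, 4) == 3 and ceil(12, 4) == 3: the quotient is rounded up so
// that ceil(num, denom) blocks of size denom always cover num elements.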
void executeEval(Scalar* data) const {
typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
const int maxSharedMem = sharedMemPerBlock();
const int maxThreadsPerBlock = maxCudaThreadsPerBlock();
const int maxBlocksPerProcessor = maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
const int numMultiProcessors = getNumCudaMultiProcessors();
const int warpSize = 32;
switch (NumKernelDims) {
case 1: {
const int kernel_size = m_kernelImpl.dimensions().TotalSize();
const int numX = dimensions()[m_indices[0]];
const int numP = dimensions().TotalSize() / numX;
int maxX;
dim3 block_size;
if (m_indices[0] == 0) {
// Maximize the reuse
const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
maxX = (std::min<int>)(inner_dim, numX);
const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
block_size.x = (std::min)(maxThreadsPerBlock, maxX);
block_size.y = (std::min<int>)(maxThreadsPerBlock / block_size.x, maxP);
}
else {
// Read as much as possible along the innermost dimension, that is the plane
const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
const int maxP = (std::min<int>)(inner_dim, numP);
maxX = (std::min<int>)(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
block_size.x = (std::min)(warpSize, maxX);
block_size.y = (std::min<int>)(maxThreadsPerBlock/block_size.x, maxP);
}
const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
assert(shared_mem <= maxSharedMem);
const int num_x_blocks = ceil(numX, maxX);
const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem);
const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
dim3 num_blocks(num_x_blocks, (std::min<int>)(num_y_blocks, ceil(numP, block_size.y)));
//cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
const array<Index, 1> indices(m_indices[0]);
const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
internal::IndexMapper<Index, InputDims, 1> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
switch(kernel_size) {
case 4: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
break;
}
case 7: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
break;
}
default: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
}
}
break;
}
case 2: {
const int kernel_size_x = m_kernelImpl.dimensions()[0];
const int kernel_size_y = m_kernelImpl.dimensions()[1];
const int numX = dimensions()[m_indices[0]];
const int numY = dimensions()[m_indices[1]];
const int numP = dimensions().TotalSize() / (numX*numY);
const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
// Snap maxX to warp size
int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
const int maxX = (std::min<int>)(inner_dim, numX);
const int maxY = (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
dim3 block_size;
block_size.x = (std::min)(1024, maxX);
block_size.y = (std::min<int>)(1024/block_size.x, maxY);
block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxP);
const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
assert(shared_mem <= maxSharedMem);
const int num_x_blocks = ceil(numX, maxX);
const int num_y_blocks = ceil(numY, maxY);
const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem);
const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
dim3 num_blocks(num_x_blocks, num_y_blocks, (std::min<int>)(num_z_blocks, ceil(numP, block_size.z)));
//cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
const array<Index, 2> indices(m_indices[0], m_indices[1]);
const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1]);
internal::IndexMapper<Index, InputDims, 2> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
switch (kernel_size_x) {
case 4: {
switch (kernel_size_y) {
case 7: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
break;
}
default: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
break;
}
}
break;
}
case 7: {
switch (kernel_size_y) {
case 4: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
break;
}
default: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
break;
}
}
break;
}
default: {
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
break;
}
}
break;
}
case 3: {
const int kernel_size_x = m_kernelImpl.dimensions()[0];
const int kernel_size_y = m_kernelImpl.dimensions()[1];
const int kernel_size_z = m_kernelImpl.dimensions()[2];
const int numX = dimensions()[m_indices[0]];
const int numY = dimensions()[m_indices[1]];
const int numZ = dimensions()[m_indices[2]];
const int numP = dimensions().TotalSize() / (numX*numY*numZ);
const int maxX = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
const int maxY = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
const int maxZ = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
dim3 block_size;
block_size.x = (std::min)(32, maxX);
block_size.y = (std::min)(32, maxY);
block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxZ);
dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
assert(shared_mem <= maxSharedMem);
//cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
const array<Index, 3> indices(m_indices[0], m_indices[1], m_indices[2]);
const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[0], m_kernelImpl.dimensions()[1], m_kernelImpl.dimensions()[2]);
internal::IndexMapper<Index, InputDims, 3> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
break;
}
default: {
assert(false && "not supported yet");
}
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
eigen_assert(m_buf);
eigen_assert(index < m_dimensions.TotalSize());
return m_buf[index];
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
{
eigen_assert(m_buf);
eigen_assert(index < m_dimensions.TotalSize());
return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
}
private:
// No assignment (copies are needed by the kernels)
TensorEvaluator& operator = (const TensorEvaluator&);
TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
KernelArgType m_kernelArg;
Indices m_indices;
Dimensions m_dimensions;
Scalar* m_buf;
const Scalar* m_kernel;
bool m_local_kernel;
const GpuDevice& m_device;
};
#endif
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H

View File

@@ -0,0 +1,126 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
namespace Eigen {
/** \class TensorDevice
* \ingroup CXX11_Tensor_Module
*
* \brief Pseudo expression providing an operator = that will evaluate its argument
* on the specified computing 'device' (GPU, thread pool, ...)
*
* Example:
* C.device(EIGEN_GPU) = A + B;
*
* Todo: operator -=, *= and so on.
*/
template <typename ExpressionType, typename DeviceType> class TensorDevice {
public:
TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
template<typename OtherDerived>
EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
Assign assign(m_expression, other);
static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess;
internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device);
return *this;
}
template<typename OtherDerived>
EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
typedef typename OtherDerived::Scalar Scalar;
typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
Sum sum(m_expression, other);
typedef TensorAssignOp<ExpressionType, const Sum> Assign;
Assign assign(m_expression, sum);
static const bool Vectorize = TensorEvaluator<const Assign, DeviceType>::PacketAccess;
internal::TensorExecutor<const Assign, DeviceType, Vectorize>::run(assign, m_device);
return *this;
}
protected:
const DeviceType& m_device;
ExpressionType& m_expression;
};
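// Usage sketch, for any device exposing the allocate/memcpy/memset interface:
//   Eigen::DefaultDevice cpu;
//   C.device(cpu) = A + B;   // evaluates A + B directly into C
//   C.device(cpu) += A;      // rewritten as C = C + A via TensorAssignOp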
#ifdef EIGEN_USE_THREADS
template <typename ExpressionType> class TensorDevice<ExpressionType, ThreadPoolDevice> {
public:
TensorDevice(const ThreadPoolDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
template<typename OtherDerived>
EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
Assign assign(m_expression, other);
static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess;
internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device);
return *this;
}
template<typename OtherDerived>
EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
typedef typename OtherDerived::Scalar Scalar;
typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
Sum sum(m_expression, other);
typedef TensorAssignOp<ExpressionType, const Sum> Assign;
Assign assign(m_expression, sum);
static const bool Vectorize = TensorEvaluator<const Assign, ThreadPoolDevice>::PacketAccess;
internal::TensorExecutor<const Assign, ThreadPoolDevice, Vectorize>::run(assign, m_device);
return *this;
}
protected:
const ThreadPoolDevice& m_device;
ExpressionType& m_expression;
};
#endif
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
template <typename ExpressionType> class TensorDevice<ExpressionType, GpuDevice>
{
public:
TensorDevice(const GpuDevice& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
template<typename OtherDerived>
EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
Assign assign(m_expression, other);
internal::TensorExecutor<const Assign, GpuDevice, false>::run(assign, m_device);
return *this;
}
template<typename OtherDerived>
EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
typedef typename OtherDerived::Scalar Scalar;
typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
Sum sum(m_expression, other);
typedef TensorAssignOp<ExpressionType, const Sum> Assign;
Assign assign(m_expression, sum);
internal::TensorExecutor<const Assign, GpuDevice, false>::run(assign, m_device);
return *this;
}
protected:
const GpuDevice& m_device;
ExpressionType& m_expression;
};
#endif
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H

View File

@@ -0,0 +1,190 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H
namespace Eigen {
// Default device for the machine (typically a single cpu core)
struct DefaultDevice {
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return internal::aligned_malloc(num_bytes);
}
EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
internal::aligned_free(buffer);
}
EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
::memcpy(dst, src, n);
}
EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
::memset(buffer, c, n);
}
EIGEN_STRONG_INLINE size_t numThreads() const {
return 1;
}
};
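// e.g. Eigen::DefaultDevice d; tensor.device(d) = expr; evaluates expr
// single-threaded on the calling core, with temporaries served by
// internal::aligned_malloc.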
// Multiple cpu cores
// We should really use a thread pool here but first we need to find a portable thread pool library.
#ifdef EIGEN_USE_THREADS
typedef std::future<void> Future;
typedef std::promise<void> Promise;
static EIGEN_STRONG_INLINE void wait_until_ready(const Future* f) {
f->wait();
}
static EIGEN_STRONG_INLINE void get_when_ready(Future* f) {
f->get();
}
struct ThreadPoolDevice {
ThreadPoolDevice(size_t num_cores) : num_threads_(num_cores) { }
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return internal::aligned_malloc(num_bytes);
}
EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
internal::aligned_free(buffer);
}
EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
::memcpy(dst, src, n);
}
EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
::memset(buffer, c, n);
}
EIGEN_STRONG_INLINE size_t numThreads() const {
return num_threads_;
}
template <class Function, class... Args>
EIGEN_STRONG_INLINE Future enqueue(Function&& f, Args&&... args) const {
return std::async(std::launch::async, f, args...);
}
template <class Function, class... Args>
EIGEN_STRONG_INLINE void enqueueNoFuture(Function&& f, Args&&... args) const {
std::async(std::launch::async, f, args...);
}
private:
size_t num_threads_;
};
#endif
// GPU offloading
#ifdef EIGEN_USE_GPU
static cudaDeviceProp m_deviceProperties;
static bool m_devicePropInitialized = false;
static void initializeDeviceProp() {
if (!m_devicePropInitialized) {
// Keep the call outside of assert() so it also runs in NDEBUG builds.
cudaError_t status = cudaGetDeviceProperties(&m_deviceProperties, 0);
assert(status == cudaSuccess);
m_devicePropInitialized = true;
}
}
static inline int getNumCudaMultiProcessors() {
initializeDeviceProp();
return m_deviceProperties.multiProcessorCount;
}
static inline int maxCudaThreadsPerBlock() {
initializeDeviceProp();
return m_deviceProperties.maxThreadsPerBlock;
}
static inline int maxCudaThreadsPerMultiProcessor() {
initializeDeviceProp();
return m_deviceProperties.maxThreadsPerMultiProcessor;
}
static inline int sharedMemPerBlock() {
initializeDeviceProp();
return m_deviceProperties.sharedMemPerBlock;
}
static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
cudaError_t status = cudaDeviceSetSharedMemConfig(config);
assert(status == cudaSuccess);
}
struct GpuDevice {
// The CUDA stream is not owned: the caller is responsible for its initialization and eventual destruction.
GpuDevice(const cudaStream_t* stream) : stream_(stream) { eigen_assert(stream); }
EIGEN_STRONG_INLINE const cudaStream_t& stream() const { return *stream_; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
#ifndef __CUDA_ARCH__
void* result;
cudaError_t err = cudaMalloc(&result, num_bytes);
assert(err == cudaSuccess);
assert(result != NULL);
return result;
#else
assert(false && "The default device should be used instead to generate kernel code");
return NULL;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
#ifndef __CUDA_ARCH__
assert(buffer != NULL);
cudaError_t err = cudaFree(buffer);
assert(err == cudaSuccess);
#else
assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, *stream_);
assert(err == cudaSuccess);
#else
assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef __CUDA_ARCH__
cudaError_t err = cudaMemsetAsync(buffer, c, n, *stream_);
assert(err == cudaSuccess);
#else
assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
// FIXME
return 32;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
cudaStreamSynchronize(*stream_);
}
private:
// TODO: multigpu.
const cudaStream_t* stream_;
};
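// Usage sketch, with the stream owned by the caller as noted above:
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   Eigen::GpuDevice gpu(&stream);
//   C.device(gpu) = A + B;   // kernels are launched on `stream`
//   gpu.synchronize();       // block until the stream has drained
//   cudaStreamDestroy(stream);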
#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
(kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
assert(cudaGetLastError() == cudaSuccess);
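// e.g. LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<Eval, Index, Dims, 4>),
// num_blocks, block_size, shared_mem, device, args...) (placeholder template
// arguments) expands to a <<<...>>> launch on device.stream() followed by a
// cudaGetLastError() check.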
#endif
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_TYPE_H

View File

@@ -0,0 +1,380 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
namespace Eigen {
/** \internal
*
* \class TensorDimensions
* \ingroup CXX11_Tensor_Module
*
* \brief Set of classes used to encode and store the dimensions of a Tensor.
*
* The Sizes class encodes as part of the type the number of dimensions and the
* sizes corresponding to each dimension. It uses no storage space since it is
* entirely known at compile time.
* The DSizes class is its dynamic sibling: the number of dimensions is known
* at compile time but the sizes are set during execution.
*
* \sa Tensor
*/
// Can't use std::pair on cuda devices
template <typename Index> struct IndexPair {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { }
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { }
Index first;
Index second;
};
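// e.g. IndexPair<int>(0, 1) is a device-compatible stand-in for
// std::pair<int, int>(0, 1).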
// Boilerplate code
namespace internal {
template<std::size_t n, typename Dimension> struct dget {
static const std::size_t value = get<n, Dimension>::value;
};
template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
struct fixed_size_tensor_index_linearization_helper
{
template <typename Dimensions> EIGEN_DEVICE_FUNC
static inline Index run(array<Index, NumIndices> const& indices,
const Dimensions& dimensions)
{
return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
dget<RowMajor ? n : (NumIndices - n - 1), Dimensions>::value *
fixed_size_tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
}
};
template<typename Index, std::size_t NumIndices, bool RowMajor>
struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
{
template <typename Dimensions> EIGEN_DEVICE_FUNC
static inline Index run(array<Index, NumIndices> const& indices,
const Dimensions&)
{
return array_get<RowMajor ? 0 : NumIndices - 1>(indices);
}
};
} // end namespace internal
// Fixed size
#ifndef EIGEN_EMULATE_CXX11_META_H
template <typename std::size_t... Indices>
struct Sizes : internal::numeric_list<std::size_t, Indices...> {
typedef internal::numeric_list<std::size_t, Indices...> Base;
static const std::size_t total_size = internal::arg_prod(Indices...);
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
return Base::count;
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t TotalSize() {
return internal::arg_prod(Indices...);
}
Sizes() { }
template <typename DenseIndex>
explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
// todo: add assertion
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template <typename... DenseIndex> Sizes(DenseIndex...) { }
explicit Sizes(std::initializer_list<std::size_t> /*l*/) {
// todo: add assertion
}
#endif
template <typename T> Sizes& operator = (const T& /*other*/) {
// add assertion failure if the size of other is different
return *this;
}
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, false>::run(indices, *static_cast<const Base*>(this));
}
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, true>::run(indices, *static_cast<const Base*>(this));
}
};
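// Example: Sizes<2, 3, 4> encodes a fixed 2x3x4 shape purely in the type:
// TotalSize() == 24 and IndexOfColMajor({{i, j, k}}) == i + 2*j + 6*k.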
template <typename std::size_t... Indices>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<Indices...>&) {
return Sizes<Indices...>::total_size;
}
#else
template <std::size_t n>
struct non_zero_size {
typedef internal::type2val<std::size_t, n> type;
};
template <>
struct non_zero_size<0> {
typedef internal::null_type type;
};
template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0, std::size_t V5=0> struct Sizes {
typedef typename internal::make_type_list<typename non_zero_size<V1>::type, typename non_zero_size<V2>::type, typename non_zero_size<V3>::type, typename non_zero_size<V4>::type, typename non_zero_size<V5>::type >::type Base;
static const size_t count = Base::count;
static const std::size_t total_size = internal::arg_prod<Base>::value;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
return count;
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() {
return internal::arg_prod<Base>::value;
}
Sizes() { }
template <typename DenseIndex>
explicit Sizes(const array<DenseIndex, Base::count>& indices) {
// todo: add assertion
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template <typename... DenseIndex> Sizes(DenseIndex... indices) { }
explicit Sizes(std::initializer_list<std::size_t> l) {
// todo: add assertion
}
#else
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0) {
}
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1) {
}
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
}
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
}
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
}
#endif
template <typename T> Sizes& operator = (const T& other) {
// to do: check the size of other
return *this;
}
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, false>::run(indices, *static_cast<const Base*>(this));
}
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count - 1, true>::run(indices, *static_cast<const Base*>(this));
}
};
template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
return Sizes<V1, V2, V3, V4, V5>::total_size;
}
#endif
// Boilerplate
namespace internal {
template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
struct tensor_index_linearization_helper
{
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const& dimensions)
{
return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) *
tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
}
};
template<typename Index, std::size_t NumIndices, bool RowMajor>
struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
{
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const&)
{
return array_get<RowMajor ? 0 : NumIndices - 1>(indices);
}
};
} // end namespace internal
// Dynamic size
template <typename DenseIndex, std::size_t NumDims>
struct DSizes : array<DenseIndex, NumDims> {
typedef array<DenseIndex, NumDims> Base;
static const std::size_t count = NumDims;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
return NumDims;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() const {
return internal::array_prod(*static_cast<const Base*>(this));
}
EIGEN_DEVICE_FUNC DSizes() {
for (int i = 0; i < NumDims; ++i) {
(*this)[i] = 0;
}
}
EIGEN_DEVICE_FUNC explicit DSizes(const array<DenseIndex, NumDims>& a) : Base(a) { }
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, IndexTypes... otherDimensions) {
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
(*this) = array<DenseIndex, NumDims>{{firstDimension, otherDimensions...}};
}
#else
EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) {
eigen_assert(NumDims == 1);
(*this)[0] = i0;
}
EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1) {
eigen_assert(NumDims == 2);
(*this)[0] = i0;
(*this)[1] = i1;
}
EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
eigen_assert(NumDims == 3);
(*this)[0] = i0;
(*this)[1] = i1;
(*this)[2] = i2;
}
EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
eigen_assert(NumDims == 4);
(*this)[0] = i0;
(*this)[1] = i1;
(*this)[2] = i2;
(*this)[3] = i3;
}
EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
eigen_assert(NumDims == 5);
(*this)[0] = i0;
(*this)[1] = i1;
(*this)[2] = i2;
(*this)[3] = i3;
(*this)[4] = i4;
}
#endif
EIGEN_DEVICE_FUNC DSizes& operator = (const array<DenseIndex, NumDims>& other) {
*static_cast<Base*>(this) = other;
return *this;
}
// A constexpr would be so much better here
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const {
return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, false>::run(indices, *static_cast<const Base*>(this));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array<DenseIndex, NumDims>& indices) const {
return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, true>::run(indices, *static_cast<const Base*>(this));
}
};
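// Example: DSizes<DenseIndex, 3> dims(2, 3, 4); then dims.TotalSize() == 24
// and dims.IndexOfColMajor(...) computes i + 2*j + 6*k for indices (i, j, k),
// mirroring the fixed-size Sizes<2, 3, 4> above.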
// Boilerplate
namespace internal {
template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
struct tensor_vsize_index_linearization_helper
{
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const& dimensions)
{
return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) *
tensor_vsize_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
}
};
template<typename Index, std::size_t NumIndices, bool RowMajor>
struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor>
{
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const&)
{
return array_get<RowMajor ? 0 : NumIndices - 1>(indices);
}
};
} // end namespace internal
namespace internal {
template <typename DenseIndex, std::size_t NumDims> struct array_size<const DSizes<DenseIndex, NumDims> > {
static const size_t value = NumDims;
};
template <typename DenseIndex, std::size_t NumDims> struct array_size<DSizes<DenseIndex, NumDims> > {
static const size_t value = NumDims;
};
#ifndef EIGEN_EMULATE_CXX11_META_H
template <typename std::size_t... Indices> struct array_size<const Sizes<Indices...> > {
static const size_t value = Sizes<Indices...>::count;
};
template <typename std::size_t... Indices> struct array_size<Sizes<Indices...> > {
static const size_t value = Sizes<Indices...>::count;
};
template <std::size_t n, typename std::size_t... Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<Indices...>&) {
return get<n, internal::numeric_list<std::size_t, Indices...> >::value;
}
#else
template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > {
static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
};
template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > {
static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
};
template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<V1,V2,V3,V4,V5>& a) {
return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value;
}
#endif
template <typename Dims1, typename Dims2, size_t n>
struct sizes_match_up_to_dim {
static inline bool run(Dims1& dims1, Dims2& dims2) {
return (array_get<n>(dims1) == array_get<n>(dims2)) &
sizes_match_up_to_dim<Dims1, Dims2, n-1>::run(dims1, dims2);
}
};
template <typename Dims1, typename Dims2>
struct sizes_match_up_to_dim<Dims1, Dims2, 0> {
static inline bool run(Dims1& dims1, Dims2& dims2) {
return (array_get<0>(dims1) == array_get<0>(dims2));
}
};
} // end namespace internal
template <typename Dims1, typename Dims2>
bool dimensions_match(Dims1& dims1, Dims2& dims2) {
if (internal::array_size<Dims1>::value != internal::array_size<Dims2>::value) {
return false;
}
return internal::sizes_match_up_to_dim<Dims1, Dims2, internal::array_size<Dims1>::value-1>::run(dims1, dims2);
}
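// Example: for DSizes<int, 2> a(2, 3), b(2, 3), dimensions_match(a, b) is
// true; it is false as soon as the ranks or any single dimension differ.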
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H

View File

@@ -0,0 +1,154 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
namespace Eigen {
/** \class TensorEvalTo
* \ingroup CXX11_Tensor_Module
*
* \brief Expression that evaluates its argument into a user-provided buffer.
*
*/
namespace internal {
template<typename XprType>
struct traits<TensorEvalToOp<XprType> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename packet_traits<Scalar>::type Packet;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
enum {
Flags = 0,
};
};
template<typename XprType>
struct eval<TensorEvalToOp<XprType>, Eigen::Dense>
{
typedef const TensorEvalToOp<XprType>& type;
};
template<typename XprType>
struct nested<TensorEvalToOp<XprType>, 1, typename eval<TensorEvalToOp<XprType> >::type>
{
typedef TensorEvalToOp<XprType> type;
};
} // end namespace internal
template<typename XprType>
class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType> >
{
public:
typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorEvalToOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
typedef typename Eigen::internal::nested<TensorEvalToOp>::type Nested;
typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(CoeffReturnType* buffer, const XprType& expr)
: m_xpr(expr), m_buffer(buffer) {}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
EIGEN_DEVICE_FUNC CoeffReturnType* buffer() const { return m_buffer; }
protected:
typename XprType::Nested m_xpr;
CoeffReturnType* m_buffer;
};
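// Usage sketch: TensorEvalToOp is mainly an internal building block; e.g. the
// convolution evaluators' preloadKernel() earlier in this commit wraps the
// kernel expression as TensorEvalToOp<const KernelArgType>(local, m_kernelArg)
// and runs it through the TensorExecutor to materialize it into `local`.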
template<typename ArgType, typename Device>
struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
{
typedef TensorEvalToOp<ArgType> XprType;
typedef typename ArgType::Scalar Scalar;
typedef typename ArgType::Packet Packet;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
enum {
IsAligned = true,
PacketAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_device(device), m_buffer(op.buffer())
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {
}
typedef typename XprType::Index Index;
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
m_buffer[i] = m_impl.coeff(i);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
return m_buffer[index];
}
template<int LoadMode>
EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
return internal::ploadt<Packet, LoadMode>(m_buffer + index);
}
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
private:
TensorEvaluator<ArgType, Device> m_impl;
const Device& m_device;
CoeffReturnType* m_buffer;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H

View File

@ -0,0 +1,427 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
namespace Eigen {
/** \class TensorEvaluator
* \ingroup CXX11_Tensor_Module
*
* \brief The tensor evaluator classes.
*
* These classes are responsible for the evaluation of the tensor expression.
*
* TODO: add support for more types of expressions, in particular expressions
* leading to lvalues (slicing, reshaping, etc...)
*/
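// All evaluators follow the same protocol; a sketch of the scalar path, as
// driven by internal::TensorExecutor (see TensorExecutor.h):
//
//   TensorEvaluator<Expression, Device> evaluator(expr, device);
//   if (evaluator.evalSubExprsIfNeeded(NULL)) {
//     const Index size = array_prod(evaluator.dimensions());
//     for (Index i = 0; i < size; ++i) {
//       process(evaluator.coeff(i));  // process() is a placeholder consumer
//     }
//   }
//   evaluator.cleanup();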
// Generic evaluator
template<typename Derived, typename Device>
struct TensorEvaluator
{
typedef typename Derived::Index Index;
typedef typename Derived::Scalar Scalar;
typedef typename Derived::Packet Packet;
typedef typename Derived::Scalar CoeffReturnType;
typedef typename Derived::Packet PacketReturnType;
typedef typename Derived::Dimensions Dimensions;
// NumDimensions is -1 for variable dim tensors
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
internal::traits<Derived>::NumDimensions : 0;
enum {
IsAligned = Derived::IsAligned,
PacketAccess = Derived::PacketAccess,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
: m_data(const_cast<Scalar*>(m.data())), m_dims(m.dimensions()), m_device(device)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) {
if (dest) {
m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize());
return false;
}
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
eigen_assert(m_data);
return m_data[index];
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
eigen_assert(m_data);
return m_data[index];
}
template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
PacketReturnType packet(Index index) const
{
return internal::ploadt<Packet, LoadMode>(m_data + index);
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const Packet& x)
{
return internal::pstoret<Scalar, Packet, StoreMode>(m_data + index, x);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
eigen_assert(m_data);
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return m_data[m_dims.IndexOfColMajor(coords)];
} else {
return m_data[m_dims.IndexOfRowMajor(coords)];
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<DenseIndex, NumCoords>& coords) {
eigen_assert(m_data);
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return m_data[m_dims.IndexOfColMajor(coords)];
} else {
return m_data[m_dims.IndexOfRowMajor(coords)];
}
}
EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }
protected:
Scalar* m_data;
Dimensions m_dims;
const Device& m_device;
};
// Default evaluator for rvalues
template<typename Derived, typename Device>
struct TensorEvaluator<const Derived, Device>
{
typedef typename Derived::Index Index;
typedef typename Derived::Scalar Scalar;
typedef typename Derived::Packet Packet;
typedef typename Derived::Scalar CoeffReturnType;
typedef typename Derived::Packet PacketReturnType;
typedef typename Derived::Dimensions Dimensions;
// NumDimensions is -1 for variable dim tensors
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
internal::traits<Derived>::NumDimensions : 0;
enum {
IsAligned = Derived::IsAligned,
PacketAccess = Derived::PacketAccess,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device&)
: m_data(m.data()), m_dims(m.dimensions())
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
eigen_assert(m_data);
#ifdef __CUDA_ARCH__
return __ldg(m_data+index);
#else
return m_data[index];
#endif
}
template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
PacketReturnType packet(Index index) const
{
return internal::ploadt_ro<Packet, LoadMode>(m_data + index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
eigen_assert(m_data);
const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords)
: m_dims.IndexOfRowMajor(coords);
#ifdef __CUDA_ARCH__
return __ldg(m_data+index);
#else
return m_data[index];
#endif
}
EIGEN_DEVICE_FUNC const Scalar* data() const { return m_data; }
protected:
const Scalar* m_data;
Dimensions m_dims;
};
// -------------------- CwiseNullaryOp --------------------
template<typename NullaryOp, typename ArgType, typename Device>
struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
{
typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
enum {
IsAligned = true,
PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
};
EIGEN_DEVICE_FUNC
TensorEvaluator(const XprType& op, const Device& device)
: m_functor(op.functor()), m_argImpl(op.nestedExpression(), device)
{ }
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename internal::traits<XprType>::Packet PacketReturnType;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
{
return m_functor(index);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
return m_functor.packetOp(index);
}
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
private:
const NullaryOp m_functor;
TensorEvaluator<ArgType, Device> m_argImpl;
};
// -------------------- CwiseUnaryOp --------------------
template<typename UnaryOp, typename ArgType, typename Device>
struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
{
typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType;
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & internal::functor_traits<UnaryOp>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
: m_functor(op.functor()),
m_argImpl(op.nestedExpression(), device)
{ }
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename internal::traits<XprType>::Packet PacketReturnType;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
m_argImpl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_argImpl.cleanup();
}
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
{
return m_functor(m_argImpl.coeff(index));
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index));
}
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
private:
const UnaryOp m_functor;
TensorEvaluator<ArgType, Device> m_argImpl;
};
// -------------------- CwiseBinaryOp --------------------
template<typename BinaryOp, typename LeftArgType, typename RightArgType, typename Device>
struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device>
{
typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;
enum {
IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess &
internal::functor_traits<BinaryOp>::PacketAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
CoordAccess = false, // to be implemented
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
: m_functor(op.functor()),
m_leftImpl(op.lhsExpression(), device),
m_rightImpl(op.rhsExpression(), device)
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || internal::traits<XprType>::NumDimensions == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
}
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename internal::traits<XprType>::Packet PacketReturnType;
typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// TODO: use right impl instead if right impl dimensions are known at compile time.
return m_leftImpl.dimensions();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
m_leftImpl.evalSubExprsIfNeeded(NULL);
m_rightImpl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_leftImpl.cleanup();
m_rightImpl.cleanup();
}
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
{
return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index));
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index));
}
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
private:
const BinaryOp m_functor;
TensorEvaluator<LeftArgType, Device> m_leftImpl;
TensorEvaluator<RightArgType, Device> m_rightImpl;
};
// -------------------- SelectOp --------------------
template<typename IfArgType, typename ThenArgType, typename ElseArgType, typename Device>
struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device>
{
typedef TensorSelectOp<IfArgType, ThenArgType, ElseArgType> XprType;
enum {
IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess/* &
TensorEvaluator<IfArgType>::PacketAccess*/,
Layout = TensorEvaluator<IfArgType, Device>::Layout,
CoordAccess = false, // to be implemented
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
: m_condImpl(op.ifExpression(), device),
m_thenImpl(op.thenExpression(), device),
m_elseImpl(op.elseExpression(), device)
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ThenArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ElseArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions()));
eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions()));
}
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename internal::traits<XprType>::Packet PacketReturnType;
typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// TODO: use then or else impl instead if they happen to be known at compile time.
return m_condImpl.dimensions();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
m_condImpl.evalSubExprsIfNeeded(NULL);
m_thenImpl.evalSubExprsIfNeeded(NULL);
m_elseImpl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_condImpl.cleanup();
m_thenImpl.cleanup();
m_elseImpl.cleanup();
}
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
{
return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
{
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
internal::Selector<PacketSize> select;
for (Index i = 0; i < PacketSize; ++i) {
select.select[i] = m_condImpl.coeff(index+i);
}
return internal::pblend(select,
m_thenImpl.template packet<LoadMode>(index),
m_elseImpl.template packet<LoadMode>(index));
}
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
private:
TensorEvaluator<IfArgType, Device> m_condImpl;
TensorEvaluator<ThenArgType, Device> m_thenImpl;
TensorEvaluator<ElseArgType, Device> m_elseImpl;
};
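// The pblend call in packet() above selects lane by lane; conceptually (a
// sketch of its semantics):
//
//   for (int i = 0; i < PacketSize; ++i)
//     result[i] = select.select[i] ? then_packet[i] : else_packet[i];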
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H

View File

@ -0,0 +1,244 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
namespace Eigen {
/** \class TensorExecutor
* \ingroup CXX11_Tensor_Module
*
* \brief The tensor executor class.
*
* This class is responsible for launching the evaluation of the expression on
* the specified computing device.
*/
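// A sketch of how assignment expressions reach the executor; this mirrors
// what Tensor and TensorFixedSize do in operator= (see TensorFixedSize.h):
//
//   typedef TensorAssignOp<LhsTensor, const RhsExpression> Assign;
//   Assign assign(lhs, rhs);
//   internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());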
namespace internal {
template <typename Device, typename Expression>
struct IsVectorizable {
static const bool value = TensorEvaluator<Expression, Device>::PacketAccess;
};
// Default strategy: the expression is evaluated with a single cpu thread.
template<typename Expression, typename Device = DefaultDevice, bool Vectorizable = IsVectorizable<Device, Expression>::value>
class TensorExecutor
{
public:
typedef typename Expression::Index Index;
EIGEN_DEVICE_FUNC
static inline void run(const Expression& expr, const Device& device = Device())
{
TensorEvaluator<Expression, Device> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
{
const Index size = array_prod(evaluator.dimensions());
for (Index i = 0; i < size; ++i) {
evaluator.evalScalar(i);
}
}
evaluator.cleanup();
}
};
template<typename Expression>
class TensorExecutor<Expression, DefaultDevice, true>
{
public:
typedef typename Expression::Index Index;
static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
{
TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
{
const Index size = array_prod(evaluator.dimensions());
static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
const Index VectorizedSize = (size / PacketSize) * PacketSize;
for (Index i = 0; i < VectorizedSize; i += PacketSize) {
evaluator.evalPacket(i);
}
for (Index i = VectorizedSize; i < size; ++i) {
evaluator.evalScalar(i);
}
}
evaluator.cleanup();
}
};
// Multicore strategy: the index space is partitioned and each partition is executed on a single core
#ifdef EIGEN_USE_THREADS
template <typename Evaluator, typename Index, bool Vectorizable = Evaluator::PacketAccess>
struct EvalRange {
static void run(Evaluator evaluator, const Index first, const Index last) {
eigen_assert(last > first);
for (Index i = first; i < last; ++i) {
evaluator.evalScalar(i);
}
}
};
template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, true> {
static void run(Evaluator evaluator, const Index first, const Index last) {
eigen_assert(last > first);
Index i = first;
static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
if (last - first > PacketSize) {
eigen_assert(first % PacketSize == 0);
Index lastPacket = last - (last % PacketSize);
for (; i < lastPacket; i += PacketSize) {
evaluator.evalPacket(i);
}
}
for (; i < last; ++i) {
evaluator.evalScalar(i);
}
}
};
template<typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
{
public:
typedef typename Expression::Index Index;
static inline void run(const Expression& expr, const ThreadPoolDevice& device)
{
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
Evaluator evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
{
const Index size = array_prod(evaluator.dimensions());
static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
// Round the block size to a multiple of PacketSize so each block starts on a packet boundary.
const int blocksz = static_cast<int>(std::ceil(static_cast<float>(size)/device.numThreads())) + PacketSize - 1;
const Index blocksize = std::max<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
const Index numblocks = size / blocksize;
std::vector<Future> results;
results.reserve(numblocks);
for (int i = 0; i < numblocks; ++i) {
results.push_back(device.enqueue(&EvalRange<Evaluator, Index>::run, evaluator, i*blocksize, (i+1)*blocksize));
}
if (numblocks * blocksize < size) {
EvalRange<Evaluator, Index>::run(evaluator, numblocks * blocksize, size);
}
for (int i = 0; i < numblocks; ++i) {
get_when_ready(&results[i]);
}
}
evaluator.cleanup();
}
};
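// Worked example of the partitioning above (illustrative figures): with
// size = 1000, numThreads() = 4 and PacketSize = 4 we get blocksz = 253,
// blocksize = 252 and numblocks = 3; blocks [0,252), [252,504) and [504,756)
// are enqueued, and the [756,1000) remainder runs on the calling thread.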
#endif
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel_NonVectorizable(Evaluator eval, Index size) {
const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
const Index step_size = blockDim.x * gridDim.x;
// Use the scalar path
for (Index i = first_index; i < size; i += step_size) {
eval.evalScalar(i);
}
}
template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel_Vectorizable(Evaluator eval, Index size) {
const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
const Index step_size = blockDim.x * gridDim.x;
// Use the vector path
const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
const Index vectorized_step_size = step_size * PacketSize;
const Index vectorized_size = (size / PacketSize) * PacketSize;
for (Index i = first_index * PacketSize; i < vectorized_size;
i += vectorized_step_size) {
eval.evalPacket(i);
}
for (Index i = vectorized_size + first_index; i < size; i += step_size) {
eval.evalScalar(i);
}
}
template <typename Expression>
struct IsVectorizable<GpuDevice, Expression> {
static const bool value = TensorEvaluator<Expression, GpuDevice>::PacketAccess && TensorEvaluator<Expression, GpuDevice>::IsAligned;
};
template<typename Expression>
class TensorExecutor<Expression, GpuDevice, false>
{
public:
typedef typename Expression::Index Index;
static inline void run(const Expression& expr, const GpuDevice& device)
{
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
{
const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
const int block_size = maxCudaThreadsPerBlock();
const Index size = array_prod(evaluator.dimensions());
LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
}
};
template<typename Expression>
class TensorExecutor<Expression, GpuDevice, true>
{
public:
typedef typename Expression::Index Index;
static inline void run(const Expression& expr, const GpuDevice& device)
{
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign)
{
const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock();
const int block_size = maxCudaThreadsPerBlock();
const Index size = array_prod(evaluator.dimensions());
LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
}
};
#endif
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

View File

@ -0,0 +1,300 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
namespace Eigen {
/** \class TensorExpr
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor expression classes.
*
* The TensorCwiseNullaryOp class applies a nullary operator to an expression.
* This is typically used to generate constants.
*
* The TensorCwiseUnaryOp class represents an expression where a unary operator
* (e.g. cwiseSqrt) is applied to an expression.
*
* The TensorCwiseBinaryOp class represents an expression where a binary
* operator (e.g. addition) is applied to a lhs and a rhs expression.
*
*/
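// For instance (a sketch; a and b are float tensors with matching
// dimensions), the expression below only builds a TensorCwiseBinaryOp node;
// no coefficient is computed until the expression is assigned or eval()ed:
//
//   Eigen::Tensor<float, 2> a(3, 4), b(3, 4);
//   auto sum = a + b;  // TensorCwiseBinaryOp<internal::scalar_sum_op<float>, ...>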
namespace internal {
template<typename NullaryOp, typename XprType>
struct traits<TensorCwiseNullaryOp<NullaryOp, XprType> >
: traits<XprType>
{
typedef typename XprType::Packet Packet;
typedef traits<XprType> XprTraits;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::Nested XprTypeNested;
typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
enum {
Flags = 0,
};
};
} // end namespace internal
template<typename NullaryOp, typename XprType>
class TensorCwiseNullaryOp : public TensorBase<TensorCwiseNullaryOp<NullaryOp, XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
typedef TensorCwiseNullaryOp<NullaryOp, XprType> Nested;
typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp())
: m_xpr(xpr), m_functor(func) {}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
nestedExpression() const { return m_xpr; }
EIGEN_DEVICE_FUNC
const NullaryOp& functor() const { return m_functor; }
protected:
typename XprType::Nested m_xpr;
const NullaryOp m_functor;
};
namespace internal {
template<typename UnaryOp, typename XprType>
struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> >
: traits<XprType>
{
// TODO(phli): Add InputScalar, InputPacket. Check references to
// current Scalar/Packet to see if the intent is Input or Output.
typedef typename result_of<UnaryOp(typename XprType::Scalar)>::type Scalar;
typedef traits<XprType> XprTraits;
typedef typename internal::packet_traits<Scalar>::type Packet;
typedef typename XprType::Nested XprTypeNested;
typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
};
template<typename UnaryOp, typename XprType>
struct eval<TensorCwiseUnaryOp<UnaryOp, XprType>, Eigen::Dense>
{
typedef const TensorCwiseUnaryOp<UnaryOp, XprType>& type;
};
template<typename UnaryOp, typename XprType>
struct nested<TensorCwiseUnaryOp<UnaryOp, XprType>, 1, typename eval<TensorCwiseUnaryOp<UnaryOp, XprType> >::type>
{
typedef TensorCwiseUnaryOp<UnaryOp, XprType> type;
};
} // end namespace internal
template<typename UnaryOp, typename XprType>
class TensorCwiseUnaryOp : public TensorBase<TensorCwiseUnaryOp<UnaryOp, XprType>, ReadOnlyAccessors>
{
public:
// TODO(phli): Add InputScalar, InputPacket. Check references to
// current Scalar/Packet to see if the intent is Input or Output.
typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef Scalar CoeffReturnType;
typedef typename internal::packet_traits<CoeffReturnType>::type PacketReturnType;
typedef typename Eigen::internal::nested<TensorCwiseUnaryOp>::type Nested;
typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
: m_xpr(xpr), m_functor(func) {}
EIGEN_DEVICE_FUNC
const UnaryOp& functor() const { return m_functor; }
/** \returns the nested expression */
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
nestedExpression() const { return m_xpr; }
protected:
typename XprType::Nested m_xpr;
const UnaryOp m_functor;
};
namespace internal {
template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >
{
// Type promotion to handle the case where the types of the lhs and the rhs
// are different.
// TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to
// current Scalar/Packet to see if the intent is Inputs or Output.
typedef typename result_of<
BinaryOp(typename LhsXprType::Scalar,
typename RhsXprType::Scalar)>::type Scalar;
typedef traits<LhsXprType> XprTraits;
typedef typename internal::packet_traits<Scalar>::type Packet;
typedef typename promote_storage_type<
typename traits<LhsXprType>::StorageKind,
typename traits<RhsXprType>::StorageKind>::ret StorageKind;
typedef typename promote_index_type<
typename traits<LhsXprType>::Index,
typename traits<RhsXprType>::Index>::type Index;
typedef typename LhsXprType::Nested LhsNested;
typedef typename RhsXprType::Nested RhsNested;
typedef typename remove_reference<LhsNested>::type _LhsNested;
typedef typename remove_reference<RhsNested>::type _RhsNested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
enum {
Flags = 0,
};
};
template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, Eigen::Dense>
{
typedef const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>& type;
};
template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct nested<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, 1, typename eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >::type>
{
typedef TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> type;
};
} // end namespace internal
template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, ReadOnlyAccessors>
{
public:
// TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to
// current Scalar/Packet to see if the intent is Inputs or Output.
typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef Scalar CoeffReturnType;
typedef typename internal::packet_traits<CoeffReturnType>::type PacketReturnType;
typedef typename Eigen::internal::nested<TensorCwiseBinaryOp>::type Nested;
typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp())
: m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {}
EIGEN_DEVICE_FUNC
const BinaryOp& functor() const { return m_functor; }
/** \returns the nested expressions */
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename LhsXprType::Nested>::type&
lhsExpression() const { return m_lhs_xpr; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename RhsXprType::Nested>::type&
rhsExpression() const { return m_rhs_xpr; }
protected:
typename LhsXprType::Nested m_lhs_xpr;
typename RhsXprType::Nested m_rhs_xpr;
const BinaryOp m_functor;
};
namespace internal {
template<typename IfXprType, typename ThenXprType, typename ElseXprType>
struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
: traits<ThenXprType>
{
typedef typename traits<ThenXprType>::Scalar Scalar;
typedef traits<ThenXprType> XprTraits;
typedef typename packet_traits<Scalar>::type Packet;
typedef typename promote_storage_type<typename traits<ThenXprType>::StorageKind,
typename traits<ElseXprType>::StorageKind>::ret StorageKind;
typedef typename promote_index_type<typename traits<ElseXprType>::Index,
typename traits<ThenXprType>::Index>::type Index;
typedef typename IfXprType::Nested IfNested;
typedef typename ThenXprType::Nested ThenNested;
typedef typename ElseXprType::Nested ElseNested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
};
template<typename IfXprType, typename ThenXprType, typename ElseXprType>
struct eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, Eigen::Dense>
{
typedef const TensorSelectOp<IfXprType, ThenXprType, ElseXprType>& type;
};
template<typename IfXprType, typename ThenXprType, typename ElseXprType>
struct nested<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, 1, typename eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >::type>
{
typedef TensorSelectOp<IfXprType, ThenXprType, ElseXprType> type;
};
} // end namespace internal
template<typename IfXprType, typename ThenXprType, typename ElseXprType>
class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
{
public:
typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorSelectOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename internal::promote_storage_type<typename ThenXprType::CoeffReturnType,
typename ElseXprType::CoeffReturnType>::ret CoeffReturnType;
typedef typename internal::promote_storage_type<typename ThenXprType::PacketReturnType,
typename ElseXprType::PacketReturnType>::ret PacketReturnType;
typedef typename Eigen::internal::nested<TensorSelectOp>::type Nested;
typedef typename Eigen::internal::traits<TensorSelectOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorSelectOp>::Index Index;
TensorSelectOp(const IfXprType& a_condition,
const ThenXprType& a_then,
const ElseXprType& a_else)
: m_condition(a_condition), m_then(a_then), m_else(a_else)
{ }
const IfXprType& ifExpression() const { return m_condition; }
const ThenXprType& thenExpression() const { return m_then; }
const ElseXprType& elseExpression() const { return m_else; }
protected:
typename IfXprType::Nested m_condition;
typename ThenXprType::Nested m_then;
typename ElseXprType::Nested m_else;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H

View File

@ -0,0 +1,253 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
namespace Eigen {
/** \class TensorFixedSize
* \ingroup CXX11_Tensor_Module
*
* \brief The fixed sized version of the tensor class.
*
* The fixed sized equivalent of
* Eigen::Tensor<float, 3> t(3, 5, 7);
* is
* Eigen::TensorFixedSize<float, Sizes<3,5,7> > t;
*/
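// A short usage sketch (Sizes<> is the compile-time dimension list used by
// the Tensor module):
//
//   Eigen::TensorFixedSize<float, Eigen::Sizes<3, 5, 7> > t;
//   t(0, 1, 2) = 1.0f;  // rank is checked at compile time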
template<typename Scalar_, typename Dimensions_, int Options_>
class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_> >
{
public:
typedef TensorFixedSize<Scalar_, Dimensions_, Options_> Self;
typedef TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_> > Base;
typedef typename Eigen::internal::nested<Self>::type Nested;
typedef typename internal::traits<Self>::StorageKind StorageKind;
typedef typename internal::traits<Self>::Index Index;
typedef Scalar_ Scalar;
typedef typename internal::packet_traits<Scalar>::type Packet;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef typename Base::CoeffReturnType CoeffReturnType;
static const int Options = Options_;
enum {
IsAligned = bool(EIGEN_ALIGN),
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
CoordAccess = true,
};
typedef Dimensions_ Dimensions;
static const std::size_t NumIndices = Dimensions::count;
protected:
TensorStorage<Scalar, NumIndices, Dimensions::total_size, Options, Dimensions> m_storage;
public:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); }
// This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
// work, because that uses base().coeffRef() - and we don't yet
// implement a similar class hierarchy
inline Self& base() { return *this; }
inline const Self& base() const { return *this; }
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return coeff(array<Index, NumIndices>{{firstIndex, otherIndices...}});
}
#endif
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const
{
eigen_internal_assert(checkIndexRange(indices));
return m_storage.data()[linearizedIndex(indices)];
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
{
eigen_internal_assert(index >= 0 && index < size());
return m_storage.data()[index];
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return coeffRef(array<Index, NumIndices>{{firstIndex, otherIndices...}});
}
#endif
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
{
eigen_internal_assert(checkIndexRange(indices));
return m_storage.data()[linearizedIndex(indices)];
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
{
eigen_internal_assert(index >= 0 && index < size());
return m_storage.data()[index];
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return this->operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
}
#endif
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
{
eigen_assert(checkIndexRange(indices));
return coeff(indices);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
{
eigen_internal_assert(index >= 0 && index < size());
return coeff(index);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const
{
// The bracket operator is only for vectors, use the parenthesis operator instead.
EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
return coeff(index);
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
}
#endif
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
{
eigen_assert(checkIndexRange(indices));
return coeffRef(indices);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index index)
{
eigen_assert(index >= 0 && index < size());
return coeffRef(index);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator[](Index index)
{
// The bracket operator is only for vectors, use the parenthesis operator instead
EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
return coeffRef(index);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorFixedSize()
: m_storage()
{
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorFixedSize(const Self& other)
: m_storage(other.m_storage)
{
}
#ifdef EIGEN_HAVE_RVALUE_REFERENCES
inline TensorFixedSize(Self&& other)
: m_storage(other.m_storage)
{
}
#endif
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorFixedSize& operator=(const TensorFixedSize& other)
{
// FIXME: check that the dimensions of other match the dimensions of *this.
// Unfortunately this isn't possible yet when the rhs is an expression.
typedef TensorAssignOp<Self, const TensorFixedSize> Assign;
Assign assign(*this, other);
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
return *this;
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other)
{
// FIXME: check that the dimensions of other match the dimensions of *this.
// Unfortunately this isn't possible yet when the rhs is an expression.
typedef TensorAssignOp<Self, const OtherDerived> Assign;
Assign assign(*this, other);
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
return *this;
}
protected:
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE bool checkIndexRange(const array<Index, NumIndices>& /*indices*/) const
{
using internal::array_apply_and_reduce;
using internal::array_zip_and_reduce;
using internal::greater_equal_zero_op;
using internal::logical_and_op;
using internal::lesser_op;
return true;
// check whether the indices are all >= 0
/* array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) &&
// check whether the indices fit in the dimensions
array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());*/
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const
{
if (Options&RowMajor) {
return m_storage.dimensions().IndexOfRowMajor(indices);
} else {
return m_storage.dimensions().IndexOfColMajor(indices);
}
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H

View File

@ -0,0 +1,151 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
namespace Eigen {
/** \class TensorForcedEvalOp
* \ingroup CXX11_Tensor_Module
*
* \brief Expression node that forces the evaluation of its argument into a
* temporary buffer, which downstream expressions then read from.
*
*/
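// A usage sketch: TensorBase::eval() wraps its argument in this node, so a
// shared subexpression is materialized once instead of being recomputed by
// every consumer (a and b are illustrative tensors):
//
//   auto cached = (a + b).eval();  // builds a TensorForcedEvalOp
//   auto expr = cached * cached;   // a + b is computed once; both factors read the buffer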
namespace internal {
template<typename XprType>
struct traits<TensorForcedEvalOp<XprType> >
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename packet_traits<Scalar>::type Packet;
typedef typename traits<XprType>::StorageKind StorageKind;
typedef typename traits<XprType>::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
enum {
Flags = 0,
};
};
template<typename XprType>
struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense>
{
typedef const TensorForcedEvalOp<XprType>& type;
};
template<typename XprType>
struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type>
{
typedef TensorForcedEvalOp<XprType> type;
};
} // end namespace internal
template<typename XprType>
class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType> >
{
public:
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
typedef typename Eigen::internal::nested<TensorForcedEvalOp>::type Nested;
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr)
: m_xpr(expr) {}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
protected:
typename XprType::Nested m_xpr;
};
template<typename ArgType, typename Device>
struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
{
typedef TensorForcedEvalOp<ArgType> XprType;
typedef typename ArgType::Scalar Scalar;
typedef typename ArgType::Packet Packet;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
enum {
IsAligned = true,
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
Layout = TensorEvaluator<ArgType, Device>::Layout,
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
{ }
typedef typename XprType::Index Index;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
m_impl.evalSubExprsIfNeeded(NULL);
const Index numValues = m_impl.dimensions().TotalSize();
m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
// Initialize the memory in case we're dealing with non-POD types.
if (!internal::is_arithmetic<CoeffReturnType>::value) {
for (Index i = 0; i < numValues; ++i) {
new(m_buffer+i) CoeffReturnType();
}
}
typedef TensorEvalToOp<const ArgType> EvalTo;
EvalTo evalToTmp(m_buffer, m_op);
internal::TensorExecutor<const EvalTo, Device, TensorEvaluator<ArgType, Device>::PacketAccess>::run(evalToTmp, m_device);
m_impl.cleanup();
return true;
}
EIGEN_STRONG_INLINE void cleanup() {
m_device.deallocate(m_buffer);
m_buffer = NULL;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
return m_buffer[index];
}
template<int LoadMode>
EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
return internal::ploadt<Packet, LoadMode>(m_buffer + index);
}
EIGEN_DEVICE_FUNC Scalar* data() const { return m_buffer; }
private:
TensorEvaluator<ArgType, Device> m_impl;
const ArgType m_op;
const Device& m_device;
CoeffReturnType* m_buffer;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H

View File

@ -0,0 +1,54 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
namespace Eigen {
template<typename Scalar_, std::size_t NumIndices_, int Options_ = 0> class Tensor;
template<typename Scalar_, typename Dimensions, int Options_ = 0> class TensorFixedSize;
template<typename PlainObjectType, int Options_ = Unaligned> class TensorMap;
template<typename PlainObjectType> class TensorRef;
template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value> class TensorBase;
template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp;
template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp;
template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp;
template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp;
template<typename Op, typename Dims, typename XprType> class TensorReductionOp;
template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp;
template<typename Dimensions, typename LeftXprType, typename RightXprType> class TensorContractionOp;
template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionOp;
template<typename PatchDim, typename XprType> class TensorPatchOp;
template<DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorImagePatchOp;
template<typename Broadcast, typename XprType> class TensorBroadcastingOp;
template<DenseIndex DimId, typename XprType> class TensorChippingOp;
template<typename NewDimensions, typename XprType> class TensorReshapingOp;
template<typename XprType> class TensorLayoutSwapOp;
template<typename StartIndices, typename Sizes, typename XprType> class TensorSlicingOp;
template<typename ReverseDimensions, typename XprType> class TensorReverseOp;
template<typename PaddingDimensions, typename XprType> class TensorPaddingOp;
template<typename Shuffle, typename XprType> class TensorShufflingOp;
template<typename Strides, typename XprType> class TensorStridingOp;
template<typename LeftXprType, typename RightXprType> class TensorAssignOp;
template<typename XprType> class TensorEvalToOp;
template<typename XprType> class TensorForcedEvalOp;
template<typename ExpressionType, typename DeviceType> class TensorDevice;
template<typename Derived, typename Device> struct TensorEvaluator;
namespace internal {
template<typename Expression, typename Device, bool Vectorizable> class TensorExecutor;
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H

View File

@ -0,0 +1,338 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
namespace Eigen {
namespace internal {
// Standard reduction functors
template <typename T> struct SumReducer
{
static const bool PacketAccess = true;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
(*accum) += t;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
(*accum) = padd<Packet>(*accum, p);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
return static_cast<T>(0);
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
return pset1<Packet>(0);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
return accum;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
return vaccum;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
return saccum + predux(vaccum);
}
};
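// The reducers in this file all share the same protocol; a sketch of the
// scalar path, as driven per output coefficient by TensorReductionOp:
//
//   SumReducer<float> reducer;
//   float accum = reducer.initialize();  // 0 for a sum
//   for (Index i = 0; i < n; ++i) reducer.reduce(input[i], &accum);
//   float result = reducer.finalize(accum);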
template <typename T> struct MeanReducer
{
static const bool PacketAccess = true;
MeanReducer() : scalarCount_(0), packetCount_(0) { }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
(*accum) += t;
scalarCount_++;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) {
(*accum) = padd<Packet>(*accum, p);
packetCount_++;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
return static_cast<T>(0);
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
return pset1<Packet>(0);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
return accum / scalarCount_;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
return pdiv(vaccum, pset1<Packet>(packetCount_));
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
return (saccum + predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits<Packet>::size);
}
protected:
int scalarCount_;
int packetCount_;
};
template <typename T> struct MaxReducer
{
static const bool PacketAccess = true;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
if (t > *accum) { *accum = t; }
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
(*accum) = pmax<Packet>(*accum, p);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
return -(std::numeric_limits<T>::max)();
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
return pset1<Packet>(-(std::numeric_limits<T>::max)());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
return accum;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
return vaccum;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
return (std::max)(saccum, predux_max(vaccum));
}
};
template <typename T> struct MinReducer
{
static const bool PacketAccess = true;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
if (t < *accum) { *accum = t; }
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
(*accum) = pmin<Packet>(*accum, p);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
return (std::numeric_limits<T>::max)();
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
return pset1<Packet>((std::numeric_limits<T>::max)());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
return accum;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
return vaccum;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
return (std::min)(saccum, predux_min(vaccum));
}
};
template <typename T> struct ProdReducer
{
static const bool PacketAccess = true;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
(*accum) *= t;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
(*accum) = pmul<Packet>(*accum, p);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
return static_cast<T>(1);
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
return pset1<Packet>(1);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
return accum;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
return vaccum;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
return saccum * predux_mul(vaccum);
}
};
#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)
// We're not compiling a cuda kernel
template <typename T> struct UniformRandomGenerator {
static const bool PacketAccess = true;
template<typename Index>
T operator()(Index, Index = 0) const {
return random<T>();
}
template<typename Index>
typename internal::packet_traits<T>::type packetOp(Index, Index = 0) const {
const int packetSize = internal::packet_traits<T>::size;
EIGEN_ALIGN_DEFAULT T values[packetSize];
for (int i = 0; i < packetSize; ++i) {
values[i] = random<T>();
}
return internal::pload<typename internal::packet_traits<T>::type>(values);
}
};
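// A direct-use sketch of the host path above (these generators normally back
// TensorBase::random(); the index arguments are ignored by this generator):
//
//   UniformRandomGenerator<float> gen;
//   float sample = gen(0);                            // one uniform sample
//   packet_traits<float>::type pkt = gen.packetOp(0); // a packet of samples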
#else
// We're compiling a cuda kernel
template <typename T> struct UniformRandomGenerator;
template <> struct UniformRandomGenerator<float> {
static const bool PacketAccess = true;
EIGEN_DEVICE_FUNC UniformRandomGenerator() {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
curand_init(0, tid, 0, &m_state);
}
template<typename Index> EIGEN_DEVICE_FUNC
float operator()(Index, Index = 0) const {
return curand_uniform(&m_state);
}
template<typename Index> EIGEN_DEVICE_FUNC
float4 packetOp(Index, Index = 0) const {
return curand_uniform4(&m_state);
}
private:
mutable curandStatePhilox4_32_10_t m_state;
};
template <> struct UniformRandomGenerator<double> {
static const bool PacketAccess = true;
EIGEN_DEVICE_FUNC UniformRandomGenerator() {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
curand_init(0, tid, 0, &m_state);
}
template<typename Index> EIGEN_DEVICE_FUNC
double operator()(Index, Index = 0) const {
return curand_uniform_double(&m_state);
}
template<typename Index> EIGEN_DEVICE_FUNC
double2 packetOp(Index, Index = 0) const {
return curand_uniform2_double(&m_state);
}
private:
mutable curandStatePhilox4_32_10_t m_state;
};
#endif
#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && __cplusplus > 199711L
// We're not compiling a cuda kernel
template <typename T> struct NormalRandomGenerator {
static const bool PacketAccess = true;
NormalRandomGenerator() : m_distribution(0, 1) {}
NormalRandomGenerator(const NormalRandomGenerator& other) : m_distribution(other.m_distribution) { }
template<typename Index>
T operator()(Index, Index = 0) const {
return m_distribution(m_generator);
}
template<typename Index>
typename internal::packet_traits<T>::type packetOp(Index, Index = 0) const {
const int packetSize = internal::packet_traits<T>::size;
EIGEN_ALIGN_DEFAULT T values[packetSize];
for (int i = 0; i < packetSize; ++i) {
values[i] = m_distribution(m_generator);
}
return internal::pload<typename internal::packet_traits<T>::type>(values);
}
mutable std::normal_distribution<T> m_distribution;
mutable std::default_random_engine m_generator;
};
#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__)
// We're compiling a cuda kernel
template <typename T> struct NormalRandomGenerator;
template <> struct NormalRandomGenerator<float> {
static const bool PacketAccess = true;
EIGEN_DEVICE_FUNC NormalRandomGenerator() {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
curand_init(0, tid, 0, &m_state);
}
template<typename Index> EIGEN_DEVICE_FUNC
float operator()(Index, Index = 0) const {
return curand_normal(&m_state);
}
template<typename Index> EIGEN_DEVICE_FUNC
float4 packetOp(Index, Index = 0) const {
return curand_normal4(&m_state);
}
private:
mutable curandStatePhilox4_32_10_t m_state;
};
template <> struct NormalRandomGenerator<double> {
static const bool PacketAccess = true;
EIGEN_DEVICE_FUNC NormalRandomGenerator() {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
curand_init(0, tid, 0, &m_state);
}
template<typename Index> EIGEN_DEVICE_FUNC
double operator()(Index, Index = 0) const {
return curand_normal_double(&m_state);
}
template<typename Index> EIGEN_DEVICE_FUNC
double2 packetOp(Index, Index = 0) const {
return curand_normal2_double(&m_state);
}
private:
mutable curandStatePhilox4_32_10_t m_state;
};
#endif
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H

View File

@ -0,0 +1,53 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H
#define EIGEN_CXX11_TENSOR_TENSOR_IO_H
namespace Eigen {
namespace internal {
template<>
struct significant_decimals_impl<std::string>
: significant_decimals_default_impl<std::string, true>
{};
}
template <typename T>
std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) {
// Evaluate the expression if needed
TensorForcedEvalOp<const T> eval = expr.eval();
TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> tensor(eval, DefaultDevice());
tensor.evalSubExprsIfNeeded(NULL);
typedef typename internal::remove_const<typename T::Scalar>::type Scalar;
typedef typename T::Index Index;
typedef typename TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Dimensions Dimensions;
const Index total_size = internal::array_prod(tensor.dimensions());
// Print the tensor as a 1d vector or a 2d matrix.
if (internal::array_size<Dimensions>::value == 1) {
Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
os << array;
} else {
const Index first_dim = tensor.dimensions()[0];
static const int layout = TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Layout;
Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
os << matrix;
}
// Cleanup.
tensor.cleanup();
return os;
}
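// A minimal usage sketch (assuming <iostream> and the public Tensor class):
// any tensor expression can be streamed; it is force-evaluated first, then
// printed as a 1d vector or a 2d matrix.
//
//   Eigen::Tensor<float, 2> t(2, 3);
//   t.setZero();
//   std::cout << t << std::endl;          // 2x3 block of zeros
//   std::cout << t + t.constant(1.0f);    // expressions work too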
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H

View File

@ -0,0 +1,382 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
namespace Eigen {
/** \class TensorImagePatch
* \ingroup CXX11_Tensor_Module
*
* \brief Patch extraction specialized for image processing.
* This assumes that the input has at least 3 dimensions ordered as follows:
* 1st dimension: channels (of size d)
* 2nd dimension: rows (of size r)
* 3rd dimension: columns (of size c)
* There can be additional dimensions such as time (for video) or batch (for
* bulk processing) after the first 3.
* Calling the image patch code with patch_rows and patch_cols is equivalent
* to calling the regular patch extraction code with parameters d, patch_rows,
* patch_cols, and 1 for all the additional dimensions.
*/
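// A minimal usage sketch (assuming the extract_image_patches() entry point
// on TensorBase that constructs this op):
//
//   Eigen::Tensor<float, 4> input(3, 128, 128, 16);  // depth, rows, cols, batch
//   // 5x5 patches, stride 1, same padding -> dimensions (3, 5, 5, 128*128, 16)
//   auto patches = input.extract_image_patches(5, 5, 1, 1, Eigen::PADDING_SAME);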
namespace internal {
template<DenseIndex Rows, DenseIndex Cols, typename XprType>
struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename packet_traits<Scalar>::type Packet;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions + 1;
static const int Layout = XprTraits::Layout;
};
template<DenseIndex Rows, DenseIndex Cols, typename XprType>
struct eval<TensorImagePatchOp<Rows, Cols, XprType>, Eigen::Dense>
{
typedef const TensorImagePatchOp<Rows, Cols, XprType>& type;
};
template<DenseIndex Rows, DenseIndex Cols, typename XprType>
struct nested<TensorImagePatchOp<Rows, Cols, XprType>, 1, typename eval<TensorImagePatchOp<Rows, Cols, XprType> >::type>
{
typedef TensorImagePatchOp<Rows, Cols, XprType> type;
};
} // end namespace internal
template<DenseIndex Rows, DenseIndex Cols, typename XprType>
class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorImagePatchOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorImagePatchOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
typedef typename Eigen::internal::nested<TensorImagePatchOp>::type Nested;
typedef typename Eigen::internal::traits<TensorImagePatchOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorImagePatchOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
DenseIndex row_strides, DenseIndex col_strides,
PaddingType padding_type)
: m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
m_row_strides(row_strides), m_col_strides(col_strides),
m_padding_type(padding_type) {}
EIGEN_DEVICE_FUNC
DenseIndex patch_rows() const { return m_patch_rows; }
EIGEN_DEVICE_FUNC
DenseIndex patch_cols() const { return m_patch_cols; }
EIGEN_DEVICE_FUNC
DenseIndex row_strides() const { return m_row_strides; }
EIGEN_DEVICE_FUNC
DenseIndex col_strides() const { return m_col_strides; }
EIGEN_DEVICE_FUNC
PaddingType padding_type() const { return m_padding_type; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
protected:
typename XprType::Nested m_xpr;
const DenseIndex m_patch_rows;
const DenseIndex m_patch_cols;
const DenseIndex m_row_strides;
const DenseIndex m_col_strides;
const PaddingType m_padding_type;
};
// Eval as rvalue
template<DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device>
struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
{
typedef TensorImagePatchOp<Rows, Cols, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = NumDims == 5,
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
// Only column major tensors are supported for now.
EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT(NumDims >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
// Caches a few variables.
m_inputRows = input_dims[1];
m_inputCols = input_dims[2];
m_row_strides = op.row_strides();
m_col_strides = op.col_strides();
// For now we only support equal strides in both dimensions, which the
// assertion below enforces.
eigen_assert(m_row_strides == m_col_strides);
switch (op.padding_type()) {
case PADDING_VALID:
m_outputRows = ceil((m_inputRows - op.patch_rows() + 1.f) / static_cast<float>(m_row_strides));
m_outputCols = ceil((m_inputCols - op.patch_cols() + 1.f) / static_cast<float>(m_col_strides));
// Calculate the padding
m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + op.patch_rows() - m_inputRows) / 2;
m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + op.patch_cols() - m_inputCols) / 2;
break;
case PADDING_SAME:
m_outputRows = ceil(m_inputRows / static_cast<float>(m_row_strides));
m_outputCols = ceil(m_inputCols / static_cast<float>(m_col_strides));
// Calculate the padding
m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + op.patch_rows() - m_inputRows) / 2;
m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + op.patch_cols() - m_inputCols) / 2;
break;
default:
eigen_assert(false && "unexpected padding");
}
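// Worked example of the formulas above: for a 5x5 input, 3x3 patches and
// stride 1, PADDING_VALID gives ceil((5 - 3 + 1) / 1) = 3 positions per
// dimension with no padding, while PADDING_SAME gives ceil(5 / 1) = 5
// positions with ((5 - 1) * 1 + 3 - 5) / 2 = 1 row/col of implicit zero padding.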
// Dimensions for result of extraction.
// 0: depth
// 1: patch_rows
// 2: patch_cols
// 3: number of patches
// 4 and beyond: anything else (such as batch).
m_dimensions[0] = input_dims[0];
m_dimensions[1] = op.patch_rows();
m_dimensions[2] = op.patch_cols();
m_dimensions[3] = m_outputRows * m_outputCols;
for (int i = 4; i < NumDims; ++i) {
m_dimensions[i] = input_dims[i-1];
}
// Strides for moving the patch in various dimensions.
m_colStride = m_dimensions[1];
m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0];
m_otherStride = m_patchStride * m_dimensions[3];
// Strides for navigating through the input tensor.
m_rowInputStride = input_dims[0];
m_colInputStride = input_dims[0] * input_dims[1];
m_patchInputStride = input_dims[0] * input_dims[1] * input_dims[2];
// Fast representations of different variables.
m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
// Number of patches in the height (row) dimension.
m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows);
m_fastDimZero = internal::TensorIntDivisor<Index>(m_dimensions[0]);
}
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
// Patch index corresponding to the passed in index.
const Index patchIndex = index / m_fastPatchStride;
// Find the offset of the element wrt the location of the first element.
const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastDimZero;
// Other ways to index this element.
const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride;
const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride;
const Index colIndex = patch2DIndex / m_fastOutputRows;
const Index colOffset = patchOffset / m_fastColStride;
// Calculate col index in the input original tensor.
const Index inputCol = colIndex * m_col_strides + colOffset - m_colPaddingLeft;
if (inputCol < 0 || inputCol >= m_inputCols) {
return Scalar(0);
}
const Index rowIndex = patch2DIndex - colIndex * m_outputRows;
const Index rowOffset = patchOffset - colOffset * m_colStride;
// Calculate row index in the original input tensor.
const Index inputRow = rowIndex * m_row_strides + rowOffset - m_rowPaddingTop;
if (inputRow < 0 || inputRow >= m_inputRows) {
return Scalar(0);
}
const Index depth = index - (index / m_fastDimZero) * m_dimensions[0];
const Index inputIndex = depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex * m_patchInputStride;
return m_impl.coeff(inputIndex);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
const Index packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
const Index indices[2] = {index, index + packetSize - 1};
const Index patchIndex = indices[0] / m_fastPatchStride;
if (patchIndex != indices[1] / m_fastPatchStride) {
return packetWithPossibleZero(index);
}
const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride;
eigen_assert(otherIndex == indices[1] / m_fastOtherStride);
// Find the offset of the element wrt the location of the first element.
const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastDimZero,
(indices[1] - patchIndex * m_patchStride) / m_fastDimZero};
const Index patch2DIndex = (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride;
eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride);
const Index colIndex = patch2DIndex / m_fastOutputRows;
const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride};
// Calculate col indices in the original input tensor.
const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] -
m_colPaddingLeft, colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft};
if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) {
// all zeros
return internal::pset1<PacketReturnType>(Scalar(0));
}
if (inputCols[0] == inputCols[1]) {
const Index rowIndex = patch2DIndex - colIndex * m_outputRows;
const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride};
eigen_assert(rowOffsets[0] <= rowOffsets[1]);
// Calculate row indices in the original input tensor.
const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] -
m_rowPaddingTop, rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop};
if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) {
// all zeros
return internal::pset1<PacketReturnType>(Scalar(0));
}
if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) {
// no padding
const Index depth = index - (index / m_fastDimZero) * m_dimensions[0];
const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride;
return m_impl.template packet<Unaligned>(inputIndex);
}
}
return packetWithPossibleZero(index);
}
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
Index rowPaddingTop() const { return m_rowPaddingTop; }
Index colPaddingLeft() const { return m_colPaddingLeft; }
Index outputRows() const { return m_outputRows; }
Index outputCols() const { return m_outputCols; }
Index userRowStride() const { return m_row_strides; }
Index userColStride() const { return m_col_strides; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
{
// Location of the first element of the patch.
// 0: d, 1: patch_rows, 2: patch_cols, 3: number of patches, 4: number of batches
const Index patchIndex = coords[3];
array<Index, NumDims-1> inputCoords;
inputCoords[0] = coords[0]; // depth
inputCoords[1] = patchIndex / m_inputCols + coords[1] - m_rowPaddingTop;
inputCoords[2] = patchIndex - patchIndex / m_inputCols * m_inputCols + coords[2] - m_colPaddingLeft;
inputCoords[3] = coords[4]; // batch
// If the computed coordinates are outside the original image perimeter, return 0.
if (inputCoords[1] < 0 || inputCoords[1] >= m_inputRows ||
inputCoords[2] < 0 || inputCoords[2] >= m_inputCols) {
return Scalar(0);
}
if (TensorEvaluator<ArgType, Device>::CoordAccess) {
return m_impl.coeff(inputCoords);
} else {
Index inputIndex =
inputCoords[3] * m_patchInputStride +
inputCoords[2] * m_colInputStride +
inputCoords[1] * m_rowInputStride +
inputCoords[0];
return m_impl.coeff(inputIndex);
}
}
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
for (int i = 0; i < packetSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
Dimensions m_dimensions;
Index m_otherStride;
Index m_patchStride;
Index m_colStride;
Index m_row_strides;
Index m_col_strides;
internal::TensorIntDivisor<Index> m_fastOtherStride;
internal::TensorIntDivisor<Index> m_fastPatchStride;
internal::TensorIntDivisor<Index> m_fastColStride;
Index m_rowInputStride;
Index m_colInputStride;
Index m_patchInputStride;
Index m_inputRows;
Index m_inputCols;
Index m_outputRows;
Index m_outputCols;
Index m_rowPaddingTop;
Index m_colPaddingLeft;
internal::TensorIntDivisor<Index> m_fastOutputRows;
internal::TensorIntDivisor<Index> m_fastDimZero;
TensorEvaluator<ArgType, Device> m_impl;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H

View File

@ -0,0 +1,419 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
#ifdef EIGEN_HAS_CONSTEXPR
namespace Eigen {
/** \internal
*
* \class TensorIndexList
* \ingroup CXX11_Tensor_Module
*
* \brief Set of classes used to encode a set of Tensor dimensions/indices.
*
* The indices in the list can be known at compile time or at runtime. A mix
* of static and dynamic indices can also be provided if needed. The tensor
* code will attempt to take advantage of the indices that are known at
* compile time to optimize the code it generates.
*
* This functionality requires a C++11 compliant compiler. If your compiler
* is older you need to use arrays of indices instead.
*
* Several examples are provided in the cxx11_tensor_index_list.cpp file.
*
* \sa Tensor
*/
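// A minimal usage sketch (not from the original sources), mixing a
// compile-time index with a runtime one:
//
//   Eigen::IndexList<Eigen::type2index<0>, Eigen::DenseIndex> dims;
//   dims.set(1, 2);  // entry 0 is fixed to 0; entry 1 is set at runtime
//   // dims.value_known_statically(0) == true
//   // dims.value_known_statically(1) == false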
template <DenseIndex n>
struct type2index {
static const DenseIndex value = n;
constexpr operator DenseIndex() const { return n; }
void set(DenseIndex val) {
eigen_assert(val == n);
}
};
namespace internal {
template <typename T>
void update_value(T& val, DenseIndex new_val) {
val = new_val;
}
template <DenseIndex n>
void update_value(type2index<n>& val, DenseIndex new_val) {
val.set(new_val);
}
template <typename T>
struct is_compile_time_constant {
static constexpr bool value = false;
};
template <DenseIndex idx>
struct is_compile_time_constant<type2index<idx> > {
static constexpr bool value = true;
};
template <DenseIndex idx>
struct is_compile_time_constant<const type2index<idx> > {
static constexpr bool value = true;
};
template <DenseIndex idx>
struct is_compile_time_constant<type2index<idx>& > {
static constexpr bool value = true;
};
template <DenseIndex idx>
struct is_compile_time_constant<const type2index<idx>& > {
static constexpr bool value = true;
};
template <DenseIndex Idx>
struct tuple_coeff {
template <typename... T>
static constexpr DenseIndex get(const DenseIndex i, const std::tuple<T...>& t) {
return std::get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx);
}
template <typename... T>
static void set(const DenseIndex i, std::tuple<T...>& t, const DenseIndex value) {
if (i == Idx) {
update_value(std::get<Idx>(t), value);
} else {
tuple_coeff<Idx-1>::set(i, t, value);
}
}
template <typename... T>
static constexpr bool value_known_statically(const DenseIndex i, const std::tuple<T...>& t) {
return ((i == Idx) & is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value) ||
tuple_coeff<Idx-1>::value_known_statically(i, t);
}
template <typename... T>
static constexpr bool values_up_to_known_statically(const std::tuple<T...>& t) {
return is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value &&
tuple_coeff<Idx-1>::values_up_to_known_statically(t);
}
template <typename... T>
static constexpr bool values_up_to_statically_known_to_increase(const std::tuple<T...>& t) {
return is_compile_time_constant<typename std::tuple_element<Idx, std::tuple<T...> >::type>::value &&
is_compile_time_constant<typename std::tuple_element<Idx-1, std::tuple<T...> >::type>::value &&
std::get<Idx>(t) > std::get<Idx-1>(t) &&
tuple_coeff<Idx-1>::values_up_to_statically_known_to_increase(t);
}
};
template <>
struct tuple_coeff<0> {
template <typename... T>
static constexpr DenseIndex get(const DenseIndex i, const std::tuple<T...>& t) {
// eigen_assert (i == 0); // gcc fails to compile assertions in constexpr
return std::get<0>(t) * (i == 0);
}
template <typename... T>
static void set(const DenseIndex i, std::tuple<T...>& t, const DenseIndex value) {
eigen_assert (i == 0);
update_value(std::get<0>(t), value);
}
template <typename... T>
static constexpr bool value_known_statically(const DenseIndex i, const std::tuple<T...>&) {
// eigen_assert (i == 0); // gcc fails to compile assertions in constexpr
return is_compile_time_constant<typename std::tuple_element<0, std::tuple<T...> >::type>::value & (i == 0);
}
template <typename... T>
static constexpr bool values_up_to_known_statically(const std::tuple<T...>&) {
return is_compile_time_constant<typename std::tuple_element<0, std::tuple<T...> >::type>::value;
}
template <typename... T>
static constexpr bool values_up_to_statically_known_to_increase(const std::tuple<T...>&) {
return true;
}
};
} // namespace internal
template<typename FirstType, typename... OtherTypes>
struct IndexList : std::tuple<FirstType, OtherTypes...> {
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const {
return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::get(i, *this);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) {
return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::set(i, *this, value);
}
constexpr IndexList(const std::tuple<FirstType, OtherTypes...>& other) : std::tuple<FirstType, OtherTypes...>(other) { }
constexpr IndexList() : std::tuple<FirstType, OtherTypes...>() { }
constexpr bool value_known_statically(const DenseIndex i) const {
return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::value_known_statically(i, *this);
}
constexpr bool all_values_known_statically() const {
return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::values_up_to_known_statically(*this);
}
constexpr bool values_statically_known_to_increase() const {
return internal::tuple_coeff<std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value-1>::values_up_to_statically_known_to_increase(*this);
}
};
template<typename FirstType, typename... OtherTypes>
constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) {
return std::make_tuple(val1, other_vals...);
}
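// A minimal usage sketch: make_index_list deduces the entry types, so
// type2index entries stay compile-time constants while plain integers become
// runtime entries.
//
//   auto dims = Eigen::make_index_list(Eigen::type2index<0>(), 7);
//   // dims[0] is statically known to be 0; dims[1] holds the runtime value 7.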
namespace internal {
template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
size_t result = 1;
for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
result *= sizes[i];
}
return result;
}
template<typename FirstType, typename... OtherTypes> struct array_size<IndexList<FirstType, OtherTypes...> > {
static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
};
template<typename FirstType, typename... OtherTypes> struct array_size<const IndexList<FirstType, OtherTypes...> > {
static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
};
template<DenseIndex n, typename FirstType, typename... OtherTypes> constexpr DenseIndex array_get(IndexList<FirstType, OtherTypes...>& a) {
return std::get<n>(a);
}
template<DenseIndex n, typename FirstType, typename... OtherTypes> constexpr DenseIndex array_get(const IndexList<FirstType, OtherTypes...>& a) {
return std::get<n>(a);
}
template <typename T>
struct index_known_statically {
constexpr bool operator() (DenseIndex) const {
return false;
}
};
template <typename FirstType, typename... OtherTypes>
struct index_known_statically<IndexList<FirstType, OtherTypes...> > {
constexpr bool operator() (const DenseIndex i) const {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
}
};
template <typename FirstType, typename... OtherTypes>
struct index_known_statically<const IndexList<FirstType, OtherTypes...> > {
constexpr bool operator() (const DenseIndex i) const {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
}
};
template <typename T>
struct all_indices_known_statically {
constexpr bool operator() () const {
return false;
}
};
template <typename FirstType, typename... OtherTypes>
struct all_indices_known_statically<IndexList<FirstType, OtherTypes...> > {
constexpr bool operator() () const {
return IndexList<FirstType, OtherTypes...>().all_values_known_statically();
}
};
template <typename FirstType, typename... OtherTypes>
struct all_indices_known_statically<const IndexList<FirstType, OtherTypes...> > {
constexpr bool operator() () const {
return IndexList<FirstType, OtherTypes...>().all_values_known_statically();
}
};
template <typename T>
struct indices_statically_known_to_increase {
constexpr bool operator() () const {
return false;
}
};
template <typename FirstType, typename... OtherTypes>
struct indices_statically_known_to_increase<IndexList<FirstType, OtherTypes...> > {
constexpr bool operator() () const {
return IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
}
};
template <typename FirstType, typename... OtherTypes>
struct indices_statically_known_to_increase<const IndexList<FirstType, OtherTypes...> > {
constexpr bool operator() () const {
return IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
}
};
template <typename T>
struct index_statically_eq {
constexpr bool operator() (DenseIndex, DenseIndex) const {
return false;
}
};
template <typename FirstType, typename... OtherTypes>
struct index_statically_eq<IndexList<FirstType, OtherTypes...> > {
constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexList<FirstType, OtherTypes...>()[i] == value);
}
};
template <typename FirstType, typename... OtherTypes>
struct index_statically_eq<const IndexList<FirstType, OtherTypes...> > {
constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexList<FirstType, OtherTypes...>()[i] == value);
}
};
template <typename T>
struct index_statically_ne {
constexpr bool operator() (DenseIndex, DenseIndex) const {
return false;
}
};
template <typename FirstType, typename... OtherTypes>
struct index_statically_ne<IndexList<FirstType, OtherTypes...> > {
constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexList<FirstType, OtherTypes...>()[i] != value);
}
};
template <typename FirstType, typename... OtherTypes>
struct index_statically_ne<const IndexList<FirstType, OtherTypes...> > {
constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexList<FirstType, OtherTypes...>()[i] != value);
}
};
template <typename T>
struct index_statically_gt {
constexpr bool operator() (DenseIndex, DenseIndex) const {
return false;
}
};
template <typename FirstType, typename... OtherTypes>
struct index_statically_gt<IndexList<FirstType, OtherTypes...> > {
constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexList<FirstType, OtherTypes...>()[i] > value);
}
};
template <typename FirstType, typename... OtherTypes>
struct index_statically_gt<const IndexList<FirstType, OtherTypes...> > {
constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexList<FirstType, OtherTypes...>()[i] > value);
}
};
template <typename T>
struct index_statically_lt {
constexpr bool operator() (DenseIndex, DenseIndex) const {
return false;
}
};
template <typename FirstType, typename... OtherTypes>
struct index_statically_lt<IndexList<FirstType, OtherTypes...> > {
constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexList<FirstType, OtherTypes...>()[i] < value);
}
};
template <typename FirstType, typename... OtherTypes>
struct index_statically_lt<const IndexList<FirstType, OtherTypes...> > {
constexpr bool operator() (const DenseIndex i, const DenseIndex value) const {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexList<FirstType, OtherTypes...>()[i] < value);
}
};
} // end namespace internal
} // end namespace Eigen
#else
namespace Eigen {
namespace internal {
// No C++11 support
template <typename T>
struct index_known_statically {
EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex) const{
return false;
}
};
template <typename T>
struct all_indices_known_statically {
EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const {
return false;
}
};
template <typename T>
struct indices_statically_known_to_increase {
EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() () const {
return false;
}
};
template <typename T>
struct index_statically_eq {
EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{
return false;
}
};
template <typename T>
struct index_statically_ne {
EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{
return false;
}
};
template <typename T>
struct index_statically_gt {
EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{
return false;
}
};
template <typename T>
struct index_statically_lt {
EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool operator() (DenseIndex, DenseIndex) const{
return false;
}
};
} // end namespace internal
} // end namespace Eigen
#endif
#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H

View File

@ -0,0 +1,70 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
#include <initializer_list>
namespace Eigen {
/** \class TensorInitializer
* \ingroup CXX11_Tensor_Module
*
* \brief Helper template to initialize Tensors from std::initializer_lists.
*/
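// A minimal usage sketch (assuming the setValues() helper on Tensor that
// routes through this initializer):
//
//   Eigen::Tensor<int, 2> t(2, 3);
//   t.setValues({{1, 2, 3},
//                {4, 5, 6}});   // nesting depth matches the tensor's rank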
namespace internal {
template <typename Derived, int N>
struct Initializer {
typedef std::initializer_list<
typename Initializer<Derived, N - 1>::InitList> InitList;
static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
const InitList& vals) {
int i = 0;
for (auto v : vals) {
(*indices)[traits<Derived>::NumDimensions - N] = i++;
Initializer<Derived, N - 1>::run(tensor, indices, v);
}
}
};
template <typename Derived>
struct Initializer<Derived, 1> {
typedef std::initializer_list<typename traits<Derived>::Scalar> InitList;
static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
const InitList& vals) {
int i = 0;
// There is likely a faster way to do that than iterating.
for (auto v : vals) {
(*indices)[traits<Derived>::NumDimensions - 1] = i++;
tensor.coeffRef(*indices) = v;
}
}
};
template <typename Derived, int N>
void initialize_tensor(TensorEvaluator<Derived, DefaultDevice>& tensor,
const typename Initializer<Derived, traits<Derived>::NumDimensions>::InitList& vals) {
Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions> indices;
Initializer<Derived, traits<Derived>::NumDimensions>::run(tensor, &indices, vals);
}
} // namespace internal
} // namespace Eigen
#endif // EIGEN_HAS_VARIADIC_TEMPLATES
#endif // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H

View File

@ -0,0 +1,86 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
namespace Eigen {
/** \internal
*
* \class TensorIntDiv
* \ingroup CXX11_Tensor_Module
*
* \brief Fast integer division by a constant.
*
* See the paper from Granlund and Montgomery for explanation.
* (at http://dx.doi.org/10.1145/773473.178249)
*
* \sa Tensor
*/
namespace internal {
template <typename T>
struct TensorIntDivisor {
public:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
multiplier = 0;
shift1 = 0;
shift2 = 0;
}
// Must have 1 <= divider <= 2^31-1
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) {
const int N = 32;
eigen_assert(divider > 0);
eigen_assert(static_cast<uint64_t>(divider) <= (static_cast<uint64_t>(1) << (N-1)) - 1);
// fast ln2
#ifndef __CUDA_ARCH__
const int leading_zeros = __builtin_clz(divider);
#else
const int leading_zeros = __clz(divider);
#endif
const int log_div = N - (leading_zeros+1);
multiplier = (static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1;
shift1 = log_div > 1 ? 1 : log_div;
shift2 = log_div > 1 ? log_div-1 : 0;
}
// Must have 0 <= numerator <= 2^32-1
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const {
const int N = 32;
eigen_assert(numerator >= 0);
eigen_assert(numerator <= (1ull<<N) - 1);
uint32_t t1 = (multiplier * numerator) >> 32;
uint32_t t = (static_cast<uint32_t>(numerator) - t1) >> shift1;
return (t1 + t) >> shift2;
}
private:
uint64_t multiplier;
int32_t shift1;
int32_t shift2;
};
template <typename T>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T>& divisor) {
return divisor.divide(numerator);
}
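// A minimal usage sketch: precompute the divisor once, then each division
// costs only a multiply and two shifts via the operator/ above.
//
//   internal::TensorIntDivisor<int> fast_div(7);
//   int q = 100 / fast_div;   // 14
//   eigen_assert(q == 100 / 7);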
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H

View File

@ -0,0 +1,198 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
namespace Eigen {
/** \class TensorLayoutSwap
* \ingroup CXX11_Tensor_Module
*
* \brief Swap the layout from col-major to row-major, or row-major
* to col-major, and invert the order of the dimensions.
*
* Beware: the dimensions are reversed by this operation. If you want to
* preserve the ordering of the dimensions, you need to combine this
* operation with a shuffle.
*
* Example:
* Tensor<float, 2, ColMajor> input(2, 4);
* Tensor<float, 2, RowMajor> output = input.swap_layout();
* eigen_assert(output.dimension(0) == 4);
* eigen_assert(output.dimension(1) == 2);
*
* array<int, 2> shuffle(1, 0);
* output = input.swap_layout().shuffle(shuffle);
* eigen_assert(output.dimension(0) == 2);
* eigen_assert(output.dimension(1) == 4);
*
*/
namespace internal {
template<typename XprType>
struct traits<TensorLayoutSwapOp<XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename packet_traits<Scalar>::type Packet;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = traits<XprType>::NumDimensions;
static const int Layout = (traits<XprType>::Layout == ColMajor) ? RowMajor : ColMajor;
};
template<typename XprType>
struct eval<TensorLayoutSwapOp<XprType>, Eigen::Dense>
{
typedef const TensorLayoutSwapOp<XprType>& type;
};
template<typename XprType>
struct nested<TensorLayoutSwapOp<XprType>, 1, typename eval<TensorLayoutSwapOp<XprType> >::type>
{
typedef TensorLayoutSwapOp<XprType> type;
};
} // end namespace internal
template<typename XprType>
class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested;
typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr)
: m_xpr(expr) {}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other)
{
typedef TensorAssignOp<TensorLayoutSwapOp, const OtherDerived> Assign;
Assign assign(*this, other);
internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
return *this;
}
protected:
typename XprType::Nested m_xpr;
};
// Eval as rvalue
template<typename ArgType, typename Device>
struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
{
typedef TensorLayoutSwapOp<ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false, // to be implemented
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
for(int i = 0; i < NumDims; ++i) {
m_dimensions[i] = m_impl.dimensions()[NumDims-1-i];
}
}
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
return m_impl.evalSubExprsIfNeeded(data);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
return m_impl.coeff(index);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
return m_impl.template packet<LoadMode>(index);
}
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_impl.data(); }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
protected:
TensorEvaluator<ArgType, Device> m_impl;
Dimensions m_dimensions;
};
// Eval as lvalue
template<typename ArgType, typename Device>
struct TensorEvaluator<TensorLayoutSwapOp<ArgType>, Device>
: public TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
{
typedef TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> Base;
typedef TensorLayoutSwapOp<ArgType> XprType;
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false, // to be implemented
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
{
return this->m_impl.coeffRef(index);
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
this->m_impl.template writePacket<StoreMode>(index, x);
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H

View File

@ -0,0 +1,291 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H
#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H
namespace Eigen {
/** \class TensorMap
* \ingroup CXX11_Tensor_Module
*
* \brief A tensor expression mapping an existing array of data.
*
*/
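// A minimal usage sketch (not from the original sources): wrap an existing
// buffer without copying it.
//
//   float data[2 * 3];
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > t(data, 2, 3);
//   t(1, 2) = 4.0f;   // writes straight into data[]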
template<typename PlainObjectType, int Options_> class TensorMap : public TensorBase<TensorMap<PlainObjectType, Options_> >
{
public:
typedef TensorMap<PlainObjectType, Options_> Self;
typedef typename PlainObjectType::Base Base;
typedef typename Eigen::internal::nested<Self>::type Nested;
typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
typedef typename internal::traits<PlainObjectType>::Index Index;
typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
typedef typename internal::packet_traits<Scalar>::type Packet;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef typename Base::CoeffReturnType CoeffReturnType;
/* typedef typename internal::conditional<
bool(internal::is_lvalue<PlainObjectType>::value),
Scalar *,
const Scalar *>::type
PointerType;*/
typedef Scalar* PointerType;
typedef PointerType PointerArgType;
static const int Options = Options_;
static const Index NumIndices = PlainObjectType::NumIndices;
typedef typename PlainObjectType::Dimensions Dimensions;
enum {
IsAligned = ((int(Options_)&Aligned)==Aligned),
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
Layout = PlainObjectType::Layout,
CoordAccess = true,
};
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
// The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
}
#else
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) {
// The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) {
EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) {
EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) {
EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) {
EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
#endif
inline TensorMap(PointerArgType dataPtr, const array<Index, NumIndices>& dimensions)
: m_data(dataPtr), m_dimensions(dimensions)
{ }
template <typename Dimensions>
EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions)
: m_data(dataPtr), m_dimensions(dimensions)
{ }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar* data() { return m_data; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar* data() const { return m_data; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
{
// eigen_assert(checkIndexRange(indices));
if (PlainObjectType::Options&RowMajor) {
const Index index = m_dimensions.IndexOfRowMajor(indices);
return m_data[index];
} else {
const Index index = m_dimensions.IndexOfColMajor(indices);
return m_data[index];
}
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const
{
static_assert(sizeof...(otherIndices) + 1 == NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
if (PlainObjectType::Options&RowMajor) {
const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}});
return m_data[index];
} else {
const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{{firstIndex, otherIndices...}});
return m_data[index];
}
}
#else
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
{
eigen_internal_assert(index >= 0 && index < size());
return m_data[index];
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
{
if (PlainObjectType::Options&RowMajor) {
const Index index = i1 + i0 * m_dimensions[0];
return m_data[index];
} else {
const Index index = i0 + i1 * m_dimensions[0];
return m_data[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
{
if (PlainObjectType::Options&RowMajor) {
const Index index = i2 + m_dimensions[1] * (i1 + m_dimensions[0] * i0);
return m_data[index];
} else {
const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2);
return m_data[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
{
if (PlainObjectType::Options&RowMajor) {
const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
return m_data[index];
} else {
const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3));
return m_data[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
{
if (PlainObjectType::Options&RowMajor) {
const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
return m_data[index];
} else {
const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4)));
return m_data[index];
}
}
#endif
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
{
// eigen_assert(checkIndexRange(indices));
if (PlainObjectType::Options&RowMajor) {
const Index index = m_dimensions.IndexOfRowMajor(indices);
return m_data[index];
} else {
const Index index = m_dimensions.IndexOfColMajor(indices);
return m_data[index];
}
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
{
static_assert(sizeof...(otherIndices) + 1 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
const std::size_t NumDims = sizeof...(otherIndices) + 1;
if (PlainObjectType::Options&RowMajor) {
const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, otherIndices...}});
return m_data[index];
} else {
const Index index = m_dimensions.IndexOfColMajor(array<Index, NumDims>{{firstIndex, otherIndices...}});
return m_data[index];
}
}
#else
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index index)
{
eigen_internal_assert(index >= 0 && index < size());
return m_data[index];
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
{
if (PlainObjectType::Options&RowMajor) {
const Index index = i1 + i0 * m_dimensions[0];
return m_data[index];
} else {
const Index index = i0 + i1 * m_dimensions[0];
return m_data[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
{
if (PlainObjectType::Options&RowMajor) {
const Index index = i2 + m_dimensions[1] * (i1 + m_dimensions[0] * i0);
return m_data[index];
} else {
const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2);
return m_data[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
{
if (PlainObjectType::Options&RowMajor) {
const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
return m_data[index];
} else {
const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3));
return m_data[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
{
if (PlainObjectType::Options&RowMajor) {
const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
return m_data[index];
} else {
const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4)));
return m_data[index];
}
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const Self& other)
{
typedef TensorAssignOp<Self, const Self> Assign;
Assign assign(*this, other);
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
return *this;
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Self& operator=(const OtherDerived& other)
{
typedef TensorAssignOp<Self, const OtherDerived> Assign;
Assign assign(*this, other);
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
return *this;
}
private:
Scalar* m_data;
Dimensions m_dimensions;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H

View File

@ -0,0 +1,600 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
namespace Eigen {
/** \class TensorReshaping
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor reshaping class.
*
*
*/
namespace internal {
template<typename NewDimensions, typename XprType>
struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename packet_traits<Scalar>::type Packet;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = array_size<NewDimensions>::value;
static const int Layout = XprTraits::Layout;
};
template<typename NewDimensions, typename XprType>
struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense>
{
typedef const TensorReshapingOp<NewDimensions, XprType>& type;
};
template<typename NewDimensions, typename XprType>
struct nested<TensorReshapingOp<NewDimensions, XprType>, 1, typename eval<TensorReshapingOp<NewDimensions, XprType> >::type>
{
typedef TensorReshapingOp<NewDimensions, XprType> type;
};
} // end namespace internal
template<typename NewDimensions, typename XprType>
class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorReshapingOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested;
typedef typename Eigen::internal::traits<TensorReshapingOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorReshapingOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims)
: m_xpr(expr), m_dims(dims) {}
EIGEN_DEVICE_FUNC
const NewDimensions& dimensions() const { return m_dims; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other)
{
typedef TensorAssignOp<TensorReshapingOp, const TensorReshapingOp> Assign;
Assign assign(*this, other);
internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
return *this;
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other)
{
typedef TensorAssignOp<TensorReshapingOp, const OtherDerived> Assign;
Assign assign(*this, other);
internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
return *this;
}
protected:
typename XprType::Nested m_xpr;
const NewDimensions m_dims;
};
// Eval as rvalue
template<typename NewDimensions, typename ArgType, typename Device>
struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
{
typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
typedef NewDimensions Dimensions;
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_dimensions(op.dimensions())
{
// The total size of the reshaped tensor must be equal to the total size
// of the input tensor.
eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
}
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
return m_impl.evalSubExprsIfNeeded(data);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
return m_impl.coeff(index);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
return m_impl.template packet<LoadMode>(index);
}
EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_impl.data(); }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
protected:
TensorEvaluator<ArgType, Device> m_impl;
NewDimensions m_dimensions;
};
// Eval as lvalue
template<typename NewDimensions, typename ArgType, typename Device>
struct TensorEvaluator<TensorReshapingOp<NewDimensions, ArgType>, Device>
: public TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
{
typedef TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> Base;
typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
typedef NewDimensions Dimensions;
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
{
return this->m_impl.coeffRef(index);
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
this->m_impl.template writePacket<StoreMode>(index, x);
}
};
/** \class TensorSlicing
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor slicing class.
*
*
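  * A minimal usage sketch (for illustration only; it relies on the slice()
  * method provided by TensorBase):
  * \code
  * Eigen::Tensor<float, 2> input(4, 5);
  * input.setRandom();
  * Eigen::array<Eigen::DenseIndex, 2> offsets;
  * offsets[0] = 1; offsets[1] = 0;
  * Eigen::array<Eigen::DenseIndex, 2> extents;
  * extents[0] = 2; extents[1] = 3;
  * // 2x3 block whose top-left corner is at row 1, column 0 of the input.
  * Eigen::Tensor<float, 2> block(2, 3);
  * block = input.slice(offsets, extents);
  * \endcode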
*/
namespace internal {
template<typename StartIndices, typename Sizes, typename XprType>
struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename packet_traits<Scalar>::type Packet;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = array_size<StartIndices>::value;
static const int Layout = XprTraits::Layout;
};
template<typename StartIndices, typename Sizes, typename XprType>
struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense>
{
typedef const TensorSlicingOp<StartIndices, Sizes, XprType>& type;
};
template<typename StartIndices, typename Sizes, typename XprType>
struct nested<TensorSlicingOp<StartIndices, Sizes, XprType>, 1, typename eval<TensorSlicingOp<StartIndices, Sizes, XprType> >::type>
{
typedef TensorSlicingOp<StartIndices, Sizes, XprType> type;
};
} // end namespace internal
template<typename StartIndices, typename Sizes, typename XprType>
class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> >
{
public:
typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorSlicingOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested;
typedef typename Eigen::internal::traits<TensorSlicingOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorSlicingOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices, const Sizes& sizes)
: m_xpr(expr), m_indices(indices), m_sizes(sizes) {}
EIGEN_DEVICE_FUNC
const StartIndices& startIndices() const { return m_indices; }
EIGEN_DEVICE_FUNC
const Sizes& sizes() const { return m_sizes; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other)
{
typedef TensorAssignOp<TensorSlicingOp, const OtherDerived> Assign;
Assign assign(*this, other);
internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
return *this;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const TensorSlicingOp& other)
{
typedef TensorAssignOp<TensorSlicingOp, const TensorSlicingOp> Assign;
Assign assign(*this, other);
internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
return *this;
}
protected:
typename XprType::Nested m_xpr;
const StartIndices m_indices;
const Sizes m_sizes;
};
// Eval as rvalue
template<typename StartIndices, typename Sizes, typename ArgType, typename Device>
struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
{
typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
static const int NumDims = internal::array_size<Sizes>::value;
enum {
// Alignment can't be guaranteed at compile time since it depends on the
// slice offsets and sizes.
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
{
for (int i = 0; i < internal::array_size<Dimensions>::value; ++i) {
eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]);
}
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
const Sizes& output_dims = op.sizes();
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_inputStrides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
}
m_outputStrides[0] = 1;
m_fastOutputStrides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
}
} else {
m_inputStrides[NumDims-1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
}
m_outputStrides[NumDims-1] = 1;
m_fastOutputStrides[NumDims-1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1];
m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
}
}
}
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
typedef Sizes Dimensions;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
m_impl.evalSubExprsIfNeeded(NULL);
if (internal::is_arithmetic<Scalar>::value && data && m_impl.data()) {
Index contiguous_values = 1;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = 0; i < NumDims; ++i) {
contiguous_values *= dimensions()[i];
if (dimensions()[i] != m_impl.dimensions()[i]) {
break;
}
}
} else {
for (int i = NumDims-1; i >= 0; --i) {
contiguous_values *= dimensions()[i];
if (dimensions()[i] != m_impl.dimensions()[i]) {
break;
}
}
}
// Use memcpy if it's going to be faster than using the regular evaluation.
if (contiguous_values > 2 * m_device.numThreads()) {
Scalar* src = m_impl.data();
for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
Index offset = srcCoeff(i);
m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar));
}
return false;
}
}
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
return m_impl.coeff(srcCoeff(index));
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
Index inputIndices[] = {0, 0};
Index indices[] = {index, index + packetSize - 1};
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx0 = indices[0] / m_fastOutputStrides[i];
const Index idx1 = indices[1] / m_fastOutputStrides[i];
inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i];
inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i];
indices[0] -= idx0 * m_outputStrides[i];
indices[1] -= idx1 * m_outputStrides[i];
}
inputIndices[0] += (indices[0] + m_offsets[0]);
inputIndices[1] += (indices[1] + m_offsets[0]);
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx0 = indices[0] / m_fastOutputStrides[i];
const Index idx1 = indices[1] / m_fastOutputStrides[i];
inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i];
inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i];
indices[0] -= idx0 * m_outputStrides[i];
indices[1] -= idx1 * m_outputStrides[i];
}
inputIndices[0] += (indices[0] + m_offsets[NumDims-1]);
inputIndices[1] += (indices[1] + m_offsets[NumDims-1]);
}
if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
return rslt;
}
else {
typename internal::remove_const<CoeffReturnType>::type values[packetSize];
values[0] = m_impl.coeff(inputIndices[0]);
values[packetSize-1] = m_impl.coeff(inputIndices[1]);
for (int i = 1; i < packetSize-1; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords)
{
array<Index, NumDims> inputCoords;
for (int i = 0; i < NumDims; ++i) {
      inputCoords[i] = coords[i] + this->m_offsets[i];
}
return m_impl.coeff(inputCoords);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const {
Scalar* result = m_impl.data();
if (result) {
Index offset = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = 0; i < NumDims; ++i) {
if (m_dimensions[i] != m_impl.dimensions()[i]) {
offset += m_offsets[i] * m_inputStrides[i];
for (int j = i+1; j < NumDims; ++j) {
if (m_dimensions[j] > 1) {
return NULL;
}
offset += m_offsets[j] * m_inputStrides[j];
}
break;
}
}
} else {
for (int i = NumDims - 1; i >= 0; --i) {
if (m_dimensions[i] != m_impl.dimensions()[i]) {
offset += m_offsets[i] * m_inputStrides[i];
for (int j = i-1; j >= 0; --j) {
if (m_dimensions[j] > 1) {
return NULL;
}
offset += m_offsets[j] * m_inputStrides[j];
}
break;
}
}
}
return result + offset;
}
return NULL;
}
protected:
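  // Maps a linear index in the sliced (output) tensor to the corresponding
  // linear index in the input tensor, taking the slice offsets into account.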
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_fastOutputStrides[i];
inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
inputIndex += (index + m_offsets[0]);
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_fastOutputStrides[i];
inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
inputIndex += (index + m_offsets[NumDims-1]);
}
return inputIndex;
}
array<Index, NumDims> m_outputStrides;
array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
array<Index, NumDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
const Device& m_device;
Dimensions m_dimensions;
const StartIndices m_offsets;
};
// Eval as lvalue
template<typename StartIndices, typename Sizes, typename ArgType, typename Device>
struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
: public TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
{
typedef TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> Base;
typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
static const int NumDims = internal::array_size<Sizes>::value;
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
typedef Sizes Dimensions;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
{
return this->m_impl.coeffRef(this->srcCoeff(index));
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
Index inputIndices[] = {0, 0};
Index indices[] = {index, index + packetSize - 1};
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i];
inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i];
indices[0] -= idx0 * this->m_outputStrides[i];
indices[1] -= idx1 * this->m_outputStrides[i];
}
inputIndices[0] += (indices[0] + this->m_offsets[0]);
inputIndices[1] += (indices[1] + this->m_offsets[0]);
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i];
inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i];
indices[0] -= idx0 * this->m_outputStrides[i];
indices[1] -= idx1 * this->m_outputStrides[i];
}
inputIndices[0] += (indices[0] + this->m_offsets[NumDims-1]);
inputIndices[1] += (indices[1] + this->m_offsets[NumDims-1]);
}
if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
this->m_impl.template writePacket<StoreMode>(inputIndices[0], x);
}
else {
EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
this->m_impl.coeffRef(inputIndices[0]) = values[0];
this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
for (int i = 1; i < packetSize-1; ++i) {
this->coeffRef(index+i) = values[i];
}
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(const array<Index, NumDims>& coords)
{
array<Index, NumDims> inputCoords;
for (int i = 0; i < NumDims; ++i) {
      inputCoords[i] = coords[i] + this->m_offsets[i];
}
return this->m_impl.coeffRef(inputCoords);
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H

View File

@ -0,0 +1,361 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
namespace Eigen {
/** \class TensorPadding
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor padding class.
* At the moment only 0-padding is supported.
*
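  * A minimal usage sketch (for illustration only; it relies on the pad()
  * method provided by TensorBase):
  * \code
  * Eigen::Tensor<float, 2> input(3, 4);
  * input.setRandom();
  * Eigen::array<std::pair<Eigen::DenseIndex, Eigen::DenseIndex>, 2> paddings;
  * paddings[0] = std::make_pair(1, 2);  // 1 zero before and 2 after along dim 0
  * paddings[1] = std::make_pair(0, 1);  // 1 trailing zero along dim 1
  * // The result has dimensions (1+3+2) x (4+0+1) = 6 x 5.
  * Eigen::Tensor<float, 2> padded(6, 5);
  * padded = input.pad(paddings);
  * \endcode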
*/
namespace internal {
template<typename PaddingDimensions, typename XprType>
struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename packet_traits<Scalar>::type Packet;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
};
template<typename PaddingDimensions, typename XprType>
struct eval<TensorPaddingOp<PaddingDimensions, XprType>, Eigen::Dense>
{
typedef const TensorPaddingOp<PaddingDimensions, XprType>& type;
};
template<typename PaddingDimensions, typename XprType>
struct nested<TensorPaddingOp<PaddingDimensions, XprType>, 1, typename eval<TensorPaddingOp<PaddingDimensions, XprType> >::type>
{
typedef TensorPaddingOp<PaddingDimensions, XprType> type;
};
} // end namespace internal
template<typename PaddingDimensions, typename XprType>
class TensorPaddingOp : public TensorBase<TensorPaddingOp<PaddingDimensions, XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorPaddingOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorPaddingOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
typedef typename Eigen::internal::nested<TensorPaddingOp>::type Nested;
typedef typename Eigen::internal::traits<TensorPaddingOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorPaddingOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims)
: m_xpr(expr), m_padding_dims(padding_dims) {}
EIGEN_DEVICE_FUNC
const PaddingDimensions& padding() const { return m_padding_dims; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
protected:
typename XprType::Nested m_xpr;
const PaddingDimensions m_padding_dims;
};
// Eval as rvalue
template<typename PaddingDimensions, typename ArgType, typename Device>
struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device>
{
typedef TensorPaddingOp<PaddingDimensions, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<PaddingDimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = true,
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_padding(op.padding())
{
// Compute dimensions
m_dimensions = m_impl.dimensions();
for (int i = 0; i < NumDims; ++i) {
m_dimensions[i] += m_padding[i].first + m_padding[i].second;
}
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_inputStrides[0] = 1;
m_outputStrides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
}
m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1];
} else {
m_inputStrides[NumDims - 1] = 1;
m_outputStrides[NumDims] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1];
}
m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0];
}
}
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
eigen_assert(index < dimensions().TotalSize());
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStrides[i];
if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
return Scalar(0);
}
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) {
return Scalar(0);
}
inputIndex += (index - m_padding[0].first);
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStrides[i+1];
if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
return Scalar(0);
}
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
index -= idx * m_outputStrides[i+1];
}
if (index < m_padding[NumDims-1].first ||
index >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) {
return Scalar(0);
}
inputIndex += (index - m_padding[NumDims-1].first);
}
return m_impl.coeff(inputIndex);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return packetColMajor(index);
}
return packetRowMajor(index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
{
Index inputIndex;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
const Index idx = coords[0];
if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) {
return Scalar(0);
}
inputIndex = idx - m_padding[0].first;
for (int i = 1; i < NumDims; ++i) {
const Index idx = coords[i];
if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
return Scalar(0);
}
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
}
} else {
const Index idx = coords[NumDims-1];
if (idx < m_padding[NumDims-1].first || idx >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) {
return Scalar(0);
}
inputIndex = idx - m_padding[NumDims-1].first;
for (int i = NumDims - 2; i >= 0; --i) {
const Index idx = coords[i];
if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) {
return Scalar(0);
}
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
}
}
return m_impl.coeff(inputIndex);
}
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
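  // Column-major packet load: returns a zero packet when the requested packet
  // falls entirely inside a padding region, loads directly from the input when
  // it lies entirely between the padding regions, and otherwise falls back to
  // the coefficient-by-coefficient path in packetWithPossibleZero().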
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
const Index initialIndex = index;
Index inputIndex = 0;
for (int i = NumDims - 1; i > 0; --i) {
const Index first = index;
const Index last = index + packetSize - 1;
const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
const Index lastPaddedRight = m_outputStrides[i+1];
if (last < lastPaddedLeft) {
        // all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(Scalar(0));
}
else if (first >= firstPaddedRight && last < lastPaddedRight) {
        // all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(Scalar(0));
}
else if (first >= lastPaddedLeft && last < firstPaddedRight) {
        // all the coefficients are between the two padding zones.
const Index idx = index / m_outputStrides[i];
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
else {
// Every other case
return packetWithPossibleZero(initialIndex);
}
}
const Index last = index + packetSize - 1;
const Index first = index;
const Index lastPaddedLeft = m_padding[0].first;
const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
const Index lastPaddedRight = m_outputStrides[1];
if (last < lastPaddedLeft) {
      // all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(Scalar(0));
}
else if (first >= firstPaddedRight && last < lastPaddedRight) {
      // all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(Scalar(0));
}
else if (first >= lastPaddedLeft && last < firstPaddedRight) {
      // all the coefficients are between the two padding zones.
inputIndex += (index - m_padding[0].first);
return m_impl.template packet<Unaligned>(inputIndex);
}
// Every other case
return packetWithPossibleZero(initialIndex);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
const Index initialIndex = index;
Index inputIndex = 0;
for (int i = 0; i < NumDims - 1; ++i) {
const Index first = index;
const Index last = index + packetSize - 1;
const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
const Index lastPaddedRight = m_outputStrides[i];
if (last < lastPaddedLeft) {
        // all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(Scalar(0));
}
else if (first >= firstPaddedRight && last < lastPaddedRight) {
        // all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(Scalar(0));
}
else if (first >= lastPaddedLeft && last < firstPaddedRight) {
        // all the coefficients are between the two padding zones.
const Index idx = index / m_outputStrides[i+1];
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
index -= idx * m_outputStrides[i+1];
}
else {
// Every other case
return packetWithPossibleZero(initialIndex);
}
}
const Index last = index + packetSize - 1;
const Index first = index;
const Index lastPaddedLeft = m_padding[NumDims-1].first;
const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
const Index lastPaddedRight = m_outputStrides[NumDims-1];
if (last < lastPaddedLeft) {
      // all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(Scalar(0));
}
else if (first >= firstPaddedRight && last < lastPaddedRight) {
      // all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(Scalar(0));
}
else if (first >= lastPaddedLeft && last < firstPaddedRight) {
      // all the coefficients are between the two padding zones.
inputIndex += (index - m_padding[NumDims-1].first);
return m_impl.template packet<Unaligned>(inputIndex);
}
// Every other case
return packetWithPossibleZero(initialIndex);
}
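  // Fallback used when a packet straddles a padding boundary: gather the
  // coefficients one by one (each of them may be a padded zero).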
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
for (int i = 0; i < packetSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
Dimensions m_dimensions;
array<Index, NumDims+1> m_outputStrides;
array<Index, NumDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
PaddingDimensions m_padding;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H

View File

@ -0,0 +1,248 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
namespace Eigen {
/** \class TensorPatch
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor patch class.
*
*
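  * A minimal usage sketch (for illustration only; it relies on the
  * extract_patches() method provided by TensorBase and assumes the default
  * column-major layout):
  * \code
  * Eigen::Tensor<float, 2> input(5, 7);
  * input.setRandom();
  * Eigen::array<Eigen::DenseIndex, 2> patch_dims;
  * patch_dims[0] = 2;
  * patch_dims[1] = 3;
  * // Every 2x3 patch is extracted; the extra last dimension indexes the
  * // (5-2+1) * (7-3+1) = 20 patches.
  * Eigen::Tensor<float, 3> patches(2, 3, 20);
  * patches = input.extract_patches(patch_dims);
  * \endcode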
*/
namespace internal {
template<typename PatchDim, typename XprType>
struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename packet_traits<Scalar>::type Packet;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions + 1;
static const int Layout = XprTraits::Layout;
};
template<typename PatchDim, typename XprType>
struct eval<TensorPatchOp<PatchDim, XprType>, Eigen::Dense>
{
typedef const TensorPatchOp<PatchDim, XprType>& type;
};
template<typename PatchDim, typename XprType>
struct nested<TensorPatchOp<PatchDim, XprType>, 1, typename eval<TensorPatchOp<PatchDim, XprType> >::type>
{
typedef TensorPatchOp<PatchDim, XprType> type;
};
} // end namespace internal
template<typename PatchDim, typename XprType>
class TensorPatchOp : public TensorBase<TensorPatchOp<PatchDim, XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorPatchOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorPatchOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
typedef typename Eigen::internal::nested<TensorPatchOp>::type Nested;
typedef typename Eigen::internal::traits<TensorPatchOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorPatchOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims)
: m_xpr(expr), m_patch_dims(patch_dims) {}
EIGEN_DEVICE_FUNC
const PatchDim& patch_dims() const { return m_patch_dims; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
protected:
typename XprType::Nested m_xpr;
const PatchDim m_patch_dims;
};
// Eval as rvalue
template<typename PatchDim, typename ArgType, typename Device>
struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
{
typedef TensorPatchOp<PatchDim, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = true,
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
// Only column major tensors are supported for now.
EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
Index num_patches = 1;
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
const PatchDim& patch_dims = op.patch_dims();
for (int i = 0; i < NumDims-1; ++i) {
m_dimensions[i] = patch_dims[i];
num_patches *= (input_dims[i] - patch_dims[i] + 1);
}
m_dimensions[NumDims-1] = num_patches;
m_inputStrides[0] = 1;
m_patchStrides[0] = 1;
for (int i = 1; i < NumDims-1; ++i) {
m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
m_patchStrides[i] = m_patchStrides[i-1] * (input_dims[i-1] - patch_dims[i-1] + 1);
}
m_outputStrides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
}
}
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
// Find the location of the first element of the patch.
Index patchIndex = index / m_outputStrides[NumDims - 1];
// Find the offset of the element wrt the location of the first element.
Index patchOffset = index - patchIndex * m_outputStrides[NumDims - 1];
Index inputIndex = 0;
for (int i = NumDims - 2; i > 0; --i) {
const Index patchIdx = patchIndex / m_patchStrides[i];
patchIndex -= patchIdx * m_patchStrides[i];
const Index offsetIdx = patchOffset / m_outputStrides[i];
patchOffset -= offsetIdx * m_outputStrides[i];
inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
}
inputIndex += (patchIndex + patchOffset);
return m_impl.coeff(inputIndex);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
Index indices[2] = {index, index + packetSize - 1};
Index patchIndices[2] = {indices[0] / m_outputStrides[NumDims - 1],
indices[1] / m_outputStrides[NumDims - 1]};
Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[NumDims - 1],
indices[1] - patchIndices[1] * m_outputStrides[NumDims - 1]};
Index inputIndices[2] = {0, 0};
for (int i = NumDims - 2; i > 0; --i) {
const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
patchIndices[1] / m_patchStrides[i]};
patchIndices[0] -= patchIdx[0] * m_patchStrides[i];
patchIndices[1] -= patchIdx[1] * m_patchStrides[i];
const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i],
patchOffsets[1] / m_outputStrides[i]};
patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i];
patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i];
inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i];
inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
}
inputIndices[0] += (patchIndices[0] + patchOffsets[0]);
inputIndices[1] += (patchIndices[1] + patchOffsets[1]);
if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
return rslt;
}
else {
EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
values[0] = m_impl.coeff(inputIndices[0]);
values[packetSize-1] = m_impl.coeff(inputIndices[1]);
for (int i = 1; i < packetSize-1; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const
{
// Location of the first element of the patch.
    Index patchIndex = coords[NumDims - 1];
if (TensorEvaluator<ArgType, Device>::CoordAccess) {
array<Index, NumDims-1> inputCoords;
for (int i = NumDims - 2; i > 0; --i) {
const Index patchIdx = patchIndex / m_patchStrides[i];
patchIndex -= patchIdx * m_patchStrides[i];
const Index offsetIdx = coords[i];
        inputCoords[i] = patchIdx + offsetIdx;
}
inputCoords[0] = (patchIndex + coords[0]);
return m_impl.coeff(inputCoords);
}
else {
Index inputIndex = 0;
for (int i = NumDims - 2; i > 0; --i) {
const Index patchIdx = patchIndex / m_patchStrides[i];
patchIndex -= patchIdx * m_patchStrides[i];
const Index offsetIdx = coords[i];
inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
}
inputIndex += (patchIndex + coords[0]);
return m_impl.coeff(inputIndex);
}
}
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;
array<Index, NumDims-1> m_inputStrides;
array<Index, NumDims-1> m_patchStrides;
TensorEvaluator<ArgType, Device> m_impl;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H

View File

@ -0,0 +1,426 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
namespace Eigen {
/** \class TensorReduction
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor reduction class.
*
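  * A minimal usage sketch (for illustration only; it relies on the sum()
  * reduction provided by TensorBase):
  * \code
  * Eigen::Tensor<float, 3> input(2, 3, 4);
  * input.setRandom();
  * Eigen::array<Eigen::DenseIndex, 2> reduction_axes;
  * reduction_axes[0] = 0;
  * reduction_axes[1] = 2;
  * // Sum over dimensions 0 and 2; only dimension 1 (of size 3) remains.
  * Eigen::Tensor<float, 1> sums(3);
  * sums = input.sum(reduction_axes);
  * \endcode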
*/
namespace internal {
template<typename Op, typename Dims, typename XprType>
struct traits<TensorReductionOp<Op, Dims, XprType> >
: traits<XprType>
{
typedef typename traits<XprType>::Scalar Scalar;
typedef typename internal::packet_traits<Scalar>::type Packet;
typedef typename traits<XprType>::StorageKind StorageKind;
typedef typename traits<XprType>::Index Index;
typedef typename XprType::Nested Nested;
};
template<typename Op, typename Dims, typename XprType>
struct eval<TensorReductionOp<Op, Dims, XprType>, Eigen::Dense>
{
typedef const TensorReductionOp<Op, Dims, XprType>& type;
};
template<typename Op, typename Dims, typename XprType>
struct nested<TensorReductionOp<Op, Dims, XprType>, 1, typename eval<TensorReductionOp<Op, Dims, XprType> >::type>
{
typedef TensorReductionOp<Op, Dims, XprType> type;
};
template <typename ReducedDims, int NumTensorDims, int Layout>
struct are_inner_most_dims {
static const bool value = false;
};
template <typename ReducedDims, int NumTensorDims, int Layout>
struct preserve_inner_most_dims {
static const bool value = false;
};
#ifdef EIGEN_HAS_CONSTEXPR
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
static const bool value = indices_statically_known_to_increase<ReducedDims>()() &&
index_statically_eq<ReducedDims>()(0, 0) &&
index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
};
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
static const bool value = indices_statically_known_to_increase<ReducedDims>()() &&
index_statically_eq<ReducedDims>()(0, NumTensorDims - array_size<ReducedDims>::value) &&
index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
static const bool value = indices_statically_known_to_increase<ReducedDims>()() &&
index_statically_gt<ReducedDims>()(0, 0);
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
static const bool value = indices_statically_known_to_increase<ReducedDims>()() &&
index_statically_lt<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
};
#endif
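// Recursively iterates over all the reduced dimensions (DimIndex counts down
// to 0) and accumulates the matching input coefficients into *accum.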
template <int DimIndex, typename Self, typename Op>
struct GenericDimReducer {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
}
}
};
template <typename Self, typename Op>
struct GenericDimReducer<0, Self, Op> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
for (int j = 0; j < self.m_reducedDims[0]; ++j) {
const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
reducer.reduce(self.m_impl.coeff(input), accum);
}
}
};
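// Reduces numValuesToReduce contiguous coefficients starting at firstIndex.
// The partial specialization below uses packet loads when both the input
// evaluator and the reducer support vectorization.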
template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct InnerMostDimReducer {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
typename Self::CoeffReturnType accum = reducer.initialize();
for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
}
return reducer.finalize(accum);
}
};
template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
typename Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>();
for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p);
}
typename Self::CoeffReturnType accum = reducer.initialize();
for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
}
return reducer.finalizeBoth(accum, p);
}
};
template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct InnerMostDimPreserver {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
eigen_assert(false && "should never be called");
}
};
template <int DimIndex, typename Self, typename Op>
struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
}
}
};
template <typename Self, typename Op>
struct InnerMostDimPreserver<0, Self, Op, true> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
for (int j = 0; j < self.m_reducedDims[0]; ++j) {
const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
}
}
};
} // end namespace internal
template <typename Op, typename Dims, typename XprType>
class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType>, ReadOnlyAccessors> {
public:
typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorReductionOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
{ }
TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const XprType& expression() const { return m_expr; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Dims& dims() const { return m_dims; }
const Op& reducer() const { return m_reducer; }
protected:
typename XprType::Nested m_expr;
const Dims m_dims;
const Op m_reducer;
};
// Eval as rvalue
template<typename Op, typename Dims, typename ArgType, typename Device>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
{
typedef TensorReductionOp<Op, Dims, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
static const int NumReducedDims = internal::array_size<Dims>::value;
static const int NumOutputDims = (NumInputDims==NumReducedDims) ? 1 : NumInputDims - NumReducedDims;
typedef DSizes<Index, NumOutputDims> Dimensions;
typedef typename XprType::Scalar Scalar;
typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> Self;
static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
enum {
IsAligned = false,
PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
};
static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_reducer(op.reducer())
{
EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
YOU_MADE_A_PROGRAMMING_MISTAKE);
// Bitmap indicating if an input dimension is reduced or not.
array<bool, NumInputDims> reduced;
for (int i = 0; i < NumInputDims; ++i) {
reduced[i] = false;
}
for (int i = 0; i < NumReducedDims; ++i) {
eigen_assert(op.dims()[i] >= 0);
eigen_assert(op.dims()[i] < NumInputDims);
reduced[op.dims()[i]] = true;
}
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
int outputIndex = 0;
int reduceIndex = 0;
for (int i = 0; i < NumInputDims; ++i) {
if (reduced[i]) {
m_reducedDims[reduceIndex] = input_dims[i];
++reduceIndex;
} else {
m_dimensions[outputIndex] = input_dims[i];
++outputIndex;
}
}
// Precompute output strides.
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_outputStrides[0] = 1;
for (int i = 1; i < NumOutputDims; ++i) {
m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
}
} else {
m_outputStrides[NumOutputDims - 1] = 1;
for (int i = NumOutputDims - 2; i >= 0; --i) {
m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
}
}
// Precompute input strides.
array<Index, NumInputDims> input_strides;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
input_strides[0] = 1;
for (int i = 1; i < NumInputDims; ++i) {
input_strides[i] = input_strides[i-1] * input_dims[i-1];
}
} else {
input_strides[NumInputDims - 1] = 1;
for (int i = NumInputDims - 2; i >= 0; --i) {
input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
}
}
outputIndex = 0;
reduceIndex = 0;
for (int i = 0; i < NumInputDims; ++i) {
if (reduced[i]) {
m_reducedStrides[reduceIndex] = input_strides[i];
++reduceIndex;
} else {
m_preservedStrides[outputIndex] = input_strides[i];
++outputIndex;
}
}
// Special case for full reductions
if (NumInputDims == NumReducedDims) {
m_dimensions[0] = 1;
m_preservedStrides[0] = internal::array_prod(input_dims);
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
Op reducer(m_reducer);
if (ReducingInnerMostDims) {
const Index num_values_to_reduce =
(static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1];
return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
num_values_to_reduce, reducer);
} else {
typename Self::CoeffReturnType accum = reducer.initialize();
internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
return reducer.finalize(accum);
}
}
// TODO(bsteiner): provide a more efficient implementation.
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
if (ReducingInnerMostDims) {
const Index num_values_to_reduce =
(static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1];
const Index firstIndex = firstInput(index);
for (Index i = 0; i < packetSize; ++i) {
Op reducer(m_reducer);
values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
num_values_to_reduce, reducer);
}
} else if (PreservingInnerMostDims) {
const Index firstIndex = firstInput(index);
const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
      // TBD: extend this to the n innermost dimensions that we preserve.
if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) {
Op reducer(m_reducer);
typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
return reducer.finalizePacket(accum);
} else {
for (int i = 0; i < packetSize; ++i) {
values[i] = coeff(index + i);
}
}
} else {
for (int i = 0; i < packetSize; ++i) {
values[i] = coeff(index + i);
}
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
private:
template <int, typename, typename> friend struct internal::GenericDimReducer;
template <typename, typename, bool> friend struct internal::InnerMostDimReducer;
template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
// Returns the Index in the input tensor of the first value that needs to be
// used to compute the reduction at output index "index".
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
if (ReducingInnerMostDims) {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return index * m_preservedStrides[0];
} else {
return index * m_preservedStrides[NumOutputDims - 1];
}
}
// TBD: optimize the case where we preserve the innermost dimensions.
Index startInput = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumOutputDims - 1; i > 0; --i) {
// This is index_i in the output tensor.
const Index idx = index / m_outputStrides[i];
startInput += idx * m_preservedStrides[i];
index -= idx * m_outputStrides[i];
}
startInput += index * m_preservedStrides[0];
} else {
for (int i = 0; i < NumOutputDims - 1; ++i) {
// This is index_i in the output tensor.
const Index idx = index / m_outputStrides[i];
startInput += idx * m_preservedStrides[i];
index -= idx * m_outputStrides[i];
}
startInput += index * m_preservedStrides[NumOutputDims - 1];
}
return startInput;
}
// Dimensions of the output of the operation.
Dimensions m_dimensions;
// Precomputed strides for the output tensor.
array<Index, NumOutputDims> m_outputStrides;
// Subset of strides of the input tensor for the non-reduced dimensions.
// Indexed by output dimensions.
array<Index, NumOutputDims> m_preservedStrides;
// Subset of strides of the input tensor for the reduced dimensions.
// Indexed by reduced dimensions.
array<Index, NumReducedDims> m_reducedStrides;
// Size of the input dimensions that are reduced.
// Indexed by reduced dimensions.
array<Index, NumReducedDims> m_reducedDims;
// Evaluator for the input expression.
TensorEvaluator<ArgType, Device> m_impl;
// Operation to apply for computing the reduction.
Op m_reducer;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H

View File

@ -0,0 +1,429 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REF_H
#define EIGEN_CXX11_TENSOR_TENSOR_REF_H
namespace Eigen {
namespace internal {
template <typename Dimensions, typename Scalar>
class TensorLazyBaseEvaluator {
public:
TensorLazyBaseEvaluator() : m_refcount(0) { }
virtual ~TensorLazyBaseEvaluator() { }
virtual const Dimensions& dimensions() const = 0;
virtual const Scalar* data() const = 0;
virtual const Scalar coeff(DenseIndex index) const = 0;
virtual Scalar& coeffRef(DenseIndex index) = 0;
void incrRefCount() { ++m_refcount; }
void decrRefCount() { --m_refcount; }
int refCount() const { return m_refcount; }
private:
// No copy, no assignment.
TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other);
TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other);
int m_refcount;
};
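// Scratch storage used by TensorLazyEvaluatorReadOnly::coeffRef() below so it
// can still return a reference after its assertion fires on an rvalue.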
static char dummy[8];
template <typename Dimensions, typename Expr, typename Device>
class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator<Dimensions, typename TensorEvaluator<Expr, Device>::Scalar> {
public:
// typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions;
typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar;
TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device) {
m_dims = m_impl.dimensions();
m_impl.evalSubExprsIfNeeded(NULL);
}
virtual ~TensorLazyEvaluatorReadOnly() {
m_impl.cleanup();
}
virtual const Dimensions& dimensions() const {
return m_dims;
}
virtual const Scalar* data() const {
return m_impl.data();
}
virtual const Scalar coeff(DenseIndex index) const {
return m_impl.coeff(index);
}
virtual Scalar& coeffRef(DenseIndex /*index*/) {
eigen_assert(false && "can't reference the coefficient of an rvalue");
return *reinterpret_cast<Scalar*>(dummy);
};
protected:
TensorEvaluator<Expr, Device> m_impl;
Dimensions m_dims;
};
template <typename Dimensions, typename Expr, typename Device>
class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> {
public:
typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base;
typedef typename Base::Scalar Scalar;
TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) {
}
virtual ~TensorLazyEvaluatorWritable() {
}
virtual Scalar& coeffRef(DenseIndex index) {
return this->m_impl.coeffRef(index);
}
};
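// Picks the writable evaluator when the underlying expression is an lvalue,
// and the read-only evaluator otherwise.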
template <typename Dimensions, typename Expr, typename Device>
class TensorLazyEvaluator : public internal::conditional<bool(internal::is_lvalue<Expr>::value),
TensorLazyEvaluatorWritable<Dimensions, Expr, Device>,
TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type {
public:
typedef typename internal::conditional<bool(internal::is_lvalue<Expr>::value),
TensorLazyEvaluatorWritable<Dimensions, Expr, Device>,
TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type Base;
typedef typename Base::Scalar Scalar;
TensorLazyEvaluator(const Expr& expr, const Device& device) : Base(expr, device) {
}
virtual ~TensorLazyEvaluator() {
}
};
} // namespace internal
/** \class TensorRef
* \ingroup CXX11_Tensor_Module
*
* \brief A reference to a tensor expression
* The expression will be evaluated lazily (as much as possible).
*
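* A minimal usage sketch (illustrative; not part of this commit):
* \code
* Eigen::Tensor<float, 3> input(3, 5, 7);
* input.setRandom();
* // Nothing is evaluated here: coefficients are computed on demand.
* Eigen::TensorRef<Eigen::Tensor<float, 3> > ref = input * 0.5f + input;
* float v = ref(1, 2, 3); // evaluates only this coefficient
* \endcode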
*/
template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef<PlainObjectType> >
{
public:
typedef TensorRef<PlainObjectType> Self;
typedef typename PlainObjectType::Base Base;
typedef typename Eigen::internal::nested<Self>::type Nested;
typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
typedef typename internal::traits<PlainObjectType>::Index Index;
typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
typedef typename internal::packet_traits<Scalar>::type Packet;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef typename Base::CoeffReturnType CoeffReturnType;
typedef Scalar* PointerType;
typedef PointerType PointerArgType;
static const Index NumIndices = PlainObjectType::NumIndices;
typedef typename PlainObjectType::Dimensions Dimensions;
enum {
IsAligned = false,
PacketAccess = false,
Layout = PlainObjectType::Layout,
CoordAccess = false, // to be implemented
};
EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) {
}
template <typename Expression>
EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : m_evaluator(new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice())) {
m_evaluator->incrRefCount();
}
template <typename Expression>
EIGEN_STRONG_INLINE TensorRef& operator = (const Expression& expr) {
unrefEvaluator();
m_evaluator = new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice());
m_evaluator->incrRefCount();
return *this;
}
~TensorRef() {
unrefEvaluator();
}
TensorRef(const TensorRef& other) : m_evaluator(other.m_evaluator) {
eigen_assert(m_evaluator->refCount() > 0);
m_evaluator->incrRefCount();
}
TensorRef& operator = (const TensorRef& other) {
if (this != &other) {
unrefEvaluator();
m_evaluator = other.m_evaluator;
eigen_assert(m_evaluator->refCount() > 0);
m_evaluator->incrRefCount();
}
return *this;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index rank() const { return m_evaluator->dimensions().size(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_evaluator->dimensions(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index size() const { return m_evaluator->dimensions().TotalSize(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar* data() const { return m_evaluator->data(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar operator()(Index index) const
{
return m_evaluator->coeff(index);
}
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const
{
const std::size_t NumIndices = (sizeof...(otherIndices) + 1);
const array<Index, NumIndices> indices{{firstIndex, otherIndices...}};
return coeff(indices);
}
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
{
const std::size_t NumIndices = (sizeof...(otherIndices) + 1);
const array<Index, NumIndices> indices{{firstIndex, otherIndices...}};
return coeffRef(indices);
}
#else
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1) const
{
array<Index, 2> indices;
indices[0] = i0;
indices[1] = i1;
return coeff(indices);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2) const
{
array<Index, 3> indices;
indices[0] = i0;
indices[1] = i1;
indices[2] = i2;
return coeff(indices);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3) const
{
array<Index, 4> indices;
indices[0] = i0;
indices[1] = i1;
indices[2] = i2;
indices[3] = i3;
return coeff(indices);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
{
array<Index, 5> indices;
indices[0] = i0;
indices[1] = i1;
indices[2] = i2;
indices[3] = i3;
indices[4] = i4;
return coeff(indices);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1)
{
array<Index, 2> indices;
indices[0] = i0;
indices[1] = i1;
return coeffRef(indices);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2)
{
array<Index, 3> indices;
indices[0] = i0;
indices[1] = i1;
indices[2] = i2;
return coeffRef(indices);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2, Index i3)
{
array<Index, 4> indices;
indices[0] = i0;
indices[1] = i1;
indices[2] = i2;
indices[3] = i3;
return coeffRef(indices);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2, Index i3, Index i4)
{
array<Index, 5> indices;
indices[0] = i0;
indices[1] = i1;
indices[2] = i2;
indices[3] = i3;
indices[4] = i4;
return coeffRef(indices);
}
#endif
template <std::size_t NumIndices> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar coeff(const array<Index, NumIndices>& indices) const
{
const Dimensions& dims = this->dimensions();
Index index = 0;
if (PlainObjectType::Options & RowMajor) {
index += indices[0];
for (int i = 1; i < NumIndices; ++i) {
index = index * dims[i] + indices[i];
}
} else {
index += indices[NumIndices-1];
for (int i = NumIndices-2; i >= 0; --i) {
index = index * dims[i] + indices[i];
}
}
return m_evaluator->coeff(index);
}
template <std::size_t NumIndices> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
{
const Dimensions& dims = this->dimensions();
Index index = 0;
if (PlainObjectType::Options & RowMajor) {
index += indices[0];
for (int i = 1; i < NumIndices; ++i) {
index = index * dims[i] + indices[i];
}
} else {
index += indices[NumIndices-1];
for (int i = NumIndices-2; i >= 0; --i) {
index = index * dims[i] + indices[i];
}
}
return m_evaluator->coeffRef(index);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
{
return m_evaluator->coeff(index);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
{
return m_evaluator->coeffRef(index);
}
private:
EIGEN_STRONG_INLINE void unrefEvaluator() {
if (m_evaluator) {
m_evaluator->decrRefCount();
if (m_evaluator->refCount() == 0) {
delete m_evaluator;
}
}
}
internal::TensorLazyBaseEvaluator<Dimensions, Scalar>* m_evaluator;
};
// evaluator for rvalues
template<typename Derived, typename Device>
struct TensorEvaluator<const TensorRef<Derived>, Device>
{
typedef typename Derived::Index Index;
typedef typename Derived::Scalar Scalar;
typedef typename Derived::Packet Packet;
typedef typename Derived::Scalar CoeffReturnType;
typedef typename Derived::Packet PacketReturnType;
typedef typename Derived::Dimensions Dimensions;
enum {
IsAligned = false,
PacketAccess = false,
Layout = TensorRef<Derived>::Layout,
CoordAccess = false, // to be implemented
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
: m_ref(m)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
return m_ref.coeff(index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
return m_ref.coeffRef(index);
}
Scalar* data() const { return m_ref.data(); }
protected:
TensorRef<Derived> m_ref;
};
// evaluator for lvalues
template<typename Derived, typename Device>
struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<const TensorRef<Derived>, Device>
{
typedef typename Derived::Index Index;
typedef typename Derived::Scalar Scalar;
typedef typename Derived::Packet Packet;
typedef typename Derived::Scalar CoeffReturnType;
typedef typename Derived::Packet PacketReturnType;
typedef typename Derived::Dimensions Dimensions;
typedef TensorEvaluator<const TensorRef<Derived>, Device> Base;
enum {
IsAligned = false,
PacketAccess = false,
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
return this->m_ref.coeffRef(index);
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_REF_H

View File

@ -0,0 +1,207 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
// Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
namespace Eigen {
/** \class TensorReverse
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor reverse elements class.
*
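* A minimal usage sketch (illustrative; not part of this commit):
* \code
* Eigen::Tensor<float, 2> t(2, 3);
* t.setRandom();
* Eigen::array<bool, 2> rev;
* rev[0] = false; rev[1] = true; // flip only the second dimension
* Eigen::Tensor<float, 2> r = t.reverse(rev);
* // r(i, j) == t(i, 2 - j)
* \endcode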
*/
namespace internal {
template<typename ReverseDimensions, typename XprType>
struct traits<TensorReverseOp<ReverseDimensions,
XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename packet_traits<Scalar>::type Packet;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
};
template<typename ReverseDimensions, typename XprType>
struct eval<TensorReverseOp<ReverseDimensions, XprType>, Eigen::Dense>
{
typedef const TensorReverseOp<ReverseDimensions, XprType>& type;
};
template<typename ReverseDimensions, typename XprType>
struct nested<TensorReverseOp<ReverseDimensions, XprType>, 1,
typename eval<TensorReverseOp<ReverseDimensions, XprType> >::type>
{
typedef TensorReverseOp<ReverseDimensions, XprType> type;
};
} // end namespace internal
template<typename ReverseDimensions, typename XprType>
class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions,
XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorReverseOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested;
typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind
StorageKind;
typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(const XprType& expr,
const ReverseDimensions& reverse_dims)
: m_xpr(expr), m_reverse_dims(reverse_dims) {}
EIGEN_DEVICE_FUNC
const ReverseDimensions& reverse() const { return m_reverse_dims; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
protected:
typename XprType::Nested m_xpr;
const ReverseDimensions m_reverse_dims;
};
// Eval as rvalue
template<typename ReverseDimensions, typename ArgType, typename Device>
struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device>
{
typedef TensorReverseOp<ReverseDimensions, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<ReverseDimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
const Device& device)
: m_impl(op.expression(), device), m_reverse(op.reverse())
{
// Compute strides
m_dimensions = m_impl.dimensions();
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_strides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_strides[i] = m_strides[i-1] * m_dimensions[i-1];
}
} else {
m_strides[NumDims-1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_strides[i] = m_strides[i+1] * m_dimensions[i+1];
}
}
}
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
eigen_assert(index < dimensions().TotalSize());
Index inputIndex = 0;
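// Decompose the linear output index into per-dimension coordinates, flip the
// coordinate along every reversed dimension, and rebuild the linear input
// index.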
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
Index idx = index / m_strides[i];
index -= idx * m_strides[i];
if (m_reverse[i]) {
idx = m_dimensions[i] - idx - 1;
}
inputIndex += idx * m_strides[i];
}
if (m_reverse[0]) {
inputIndex += (m_dimensions[0] - index - 1);
} else {
inputIndex += index;
}
return m_impl.coeff(inputIndex);
} else {
for (int i = 0; i < NumDims - 1; ++i) {
Index idx = index / m_strides[i];
index -= idx * m_strides[i];
if (m_reverse[i]) {
idx = m_dimensions[i] - idx - 1;
}
inputIndex += idx * m_strides[i];
}
if (m_reverse[NumDims-1]) {
inputIndex += (m_dimensions[NumDims-1] - index - 1);
} else {
inputIndex += index;
}
return m_impl.coeff(inputIndex);
}
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
PacketReturnType packet(Index index) const
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
// TODO(ndjaitly): write a better packing routine that uses
// local structure.
EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type
values[packetSize];
for (int i = 0; i < packetSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_strides;
TensorEvaluator<ArgType, Device> m_impl;
ReverseDimensions m_reverse;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H

View File

@ -0,0 +1,259 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
namespace Eigen {
/** \class TensorShuffling
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor shuffling class.
*
*
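* A minimal usage sketch (illustrative; not part of this commit):
* \code
* Eigen::Tensor<float, 3> t(2, 3, 5);
* t.setRandom();
* Eigen::array<ptrdiff_t, 3> shuffle;
* shuffle[0] = 1; shuffle[1] = 2; shuffle[2] = 0;
* Eigen::Tensor<float, 3> s = t.shuffle(shuffle);
* // s has dimensions (3, 5, 2), and s(j, k, i) == t(i, j, k)
* \endcode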
*/
namespace internal {
template<typename Shuffle, typename XprType>
struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename packet_traits<Scalar>::type Packet;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
};
template<typename Shuffle, typename XprType>
struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense>
{
typedef const TensorShufflingOp<Shuffle, XprType>& type;
};
template<typename Shuffle, typename XprType>
struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type>
{
typedef TensorShufflingOp<Shuffle, XprType> type;
};
} // end namespace internal
template<typename Shuffle, typename XprType>
class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
{
public:
typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorShufflingOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shuffle)
: m_xpr(expr), m_shuffle(shuffle) {}
EIGEN_DEVICE_FUNC
const Shuffle& shuffle() const { return m_shuffle; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const TensorShufflingOp& other)
{
typedef TensorAssignOp<TensorShufflingOp, const TensorShufflingOp> Assign;
Assign assign(*this, other);
internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
return *this;
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorShufflingOp& operator = (const OtherDerived& other)
{
typedef TensorAssignOp<TensorShufflingOp, const OtherDerived> Assign;
Assign assign(*this, other);
internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
return *this;
}
protected:
typename XprType::Nested m_xpr;
const Shuffle m_shuffle;
};
// Eval as rvalue
template<typename Shuffle, typename ArgType, typename Device>
struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
{
typedef TensorShufflingOp<Shuffle, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
enum {
IsAligned = false,
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
const Shuffle& shuffle = op.shuffle();
for (int i = 0; i < NumDims; ++i) {
m_dimensions[i] = input_dims[shuffle[i]];
}
array<Index, NumDims> inputStrides;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
inputStrides[0] = 1;
m_outputStrides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
}
} else {
inputStrides[NumDims - 1] = 1;
m_outputStrides[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
}
}
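// Permute the input strides so that output dimension i advances through the
// input along dimension shuffle[i].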
for (int i = 0; i < NumDims; ++i) {
m_inputStrides[i] = inputStrides[shuffle[i]];
}
}
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
return m_impl.coeff(srcCoeff(index));
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
for (int i = 0; i < packetSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStrides[i];
inputIndex += idx * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
return inputIndex + index * m_inputStrides[0];
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStrides[i];
inputIndex += idx * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
return inputIndex + index * m_inputStrides[NumDims - 1];
}
}
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
};
// Eval as lvalue
template<typename Shuffle, typename ArgType, typename Device>
struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
: public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
{
typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base;
typedef TensorShufflingOp<Shuffle, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
enum {
IsAligned = false,
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
{
return this->m_impl.coeffRef(this->srcCoeff(index));
}
template <int StoreMode> EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
for (int i = 0; i < packetSize; ++i) {
this->coeffRef(index+i) = values[i];
}
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H

View File

@ -30,39 +30,70 @@ namespace Eigen {
*
* \sa Tensor
*/
template<typename T, std::size_t NumIndices_, DenseIndex Size, int Options_, typename Dimensions = void> class TensorStorage;
template<typename T, DenseIndex NumIndices_, DenseIndex Size, int Options_, typename Dimensions = void> class TensorStorage;
// Pure fixed-size storage
template<typename T, DenseIndex NumIndices_, DenseIndex Size, int Options_, typename FixedDimensions>
class TensorStorage
{
private:
EIGEN_ALIGN_DEFAULT T m_data[Size];
FixedDimensions m_dimensions;
public:
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorStorage() {
EIGEN_STATIC_ASSERT(Size == FixedDimensions::total_size, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE T *data() { return m_data; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T *data() const { return m_data; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const FixedDimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); }
};
// pure-dynamic, but without specification of all dimensions explicitly
template<typename T, std::size_t NumIndices_, int Options_>
template<typename T, DenseIndex NumIndices_, int Options_>
class TensorStorage<T, NumIndices_, Dynamic, Options_, void>
: public TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type>
{
typedef TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type> Base_;
typedef TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type> Base_;
public:
TensorStorage() = default;
TensorStorage(const TensorStorage<T, NumIndices_, Dynamic, Options_, void>&) = default;
TensorStorage(TensorStorage<T, NumIndices_, Dynamic, Options_, void>&&) = default;
TensorStorage() { }
TensorStorage(const TensorStorage<T, NumIndices_, Dynamic, Options_, void>& other) : Base_(other) { }
TensorStorage(internal::constructor_without_unaligned_array_assert) : Base_(internal::constructor_without_unaligned_array_assert()) {}
TensorStorage(DenseIndex size, const std::array<DenseIndex, NumIndices_>& dimensions) : Base_(size, dimensions) {}
TensorStorage<T, NumIndices_, Dynamic, Options_, void>& operator=(const TensorStorage<T, NumIndices_, Dynamic, Options_, void>&) = default;
TensorStorage(DenseIndex size, const array<DenseIndex, NumIndices_>& dimensions) : Base_(size, dimensions) {}
// TensorStorage<T, NumIndices_, Dynamic, Options_, void>& operator=(const TensorStorage<T, NumIndices_, Dynamic, Options_, void>&) = default;
};
// pure dynamic
template<typename T, std::size_t NumIndices_, int Options_>
template<typename T, DenseIndex NumIndices_, int Options_>
class TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type>
{
T *m_data;
std::array<DenseIndex, NumIndices_> m_dimensions;
DSizes<DenseIndex, NumIndices_> m_dimensions;
typedef TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_numeric_list_repeated<DenseIndex, NumIndices_, Dynamic>::type> Self_;
public:
TensorStorage() : m_data(0), m_dimensions(internal::template repeat<NumIndices_, DenseIndex>(0)) {}
TensorStorage() : m_data(0), m_dimensions() {}
TensorStorage(internal::constructor_without_unaligned_array_assert)
: m_data(0), m_dimensions(internal::template repeat<NumIndices_, DenseIndex>(0)) {}
TensorStorage(DenseIndex size, const std::array<DenseIndex, NumIndices_>& dimensions)
: m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions)
{ EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN }
TensorStorage(const Self_& other)
TensorStorage(DenseIndex size, const array<DenseIndex, NumIndices_>& dimensions)
: m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions)
{ EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN }
TensorStorage(const Self_& other)
: m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(other.m_dimensions)))
, m_dimensions(other.m_dimensions)
{
@ -76,32 +107,19 @@ class TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_nu
}
return *this;
}
TensorStorage(Self_&& other)
: m_data(std::move(other.m_data)), m_dimensions(std::move(other.m_dimensions))
{
other.m_data = nullptr;
}
Self_& operator=(Self_&& other)
{
using std::swap;
swap(m_data, other.m_data);
swap(m_dimensions, other.m_dimensions);
return *this;
}
~TensorStorage() { internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); }
void swap(Self_& other)
{ std::swap(m_data,other.m_data); std::swap(m_dimensions,other.m_dimensions); }
std::array<DenseIndex, NumIndices_> dimensions(void) const {return m_dimensions;}
void conservativeResize(DenseIndex size, const std::array<DenseIndex, NumIndices_>& nbDimensions)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<DenseIndex, NumIndices_>& dimensions() const {return m_dimensions;}
EIGEN_DEVICE_FUNC void resize(DenseIndex size, const array<DenseIndex, NumIndices_>& nbDimensions)
{
m_data = internal::conditional_aligned_realloc_new_auto<T,(Options_&DontAlign)==0>(m_data, size, internal::array_prod(m_dimensions));
m_dimensions = nbDimensions;
}
void resize(DenseIndex size, const std::array<DenseIndex, NumIndices_>& nbDimensions)
{
if(size != internal::array_prod(m_dimensions))
const DenseIndex currentSz = internal::array_prod(m_dimensions);
if(size != currentSz)
{
internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions));
internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, currentSz);
if (size)
m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size);
else
@ -110,16 +128,13 @@ class TensorStorage<T, NumIndices_, Dynamic, Options_, typename internal::gen_nu
}
m_dimensions = nbDimensions;
}
const T *data() const { return m_data; }
T *data() { return m_data; }
};
// TODO: implement fixed-size stuff
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T *data() { return m_data; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T *data() const { return m_data; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex size() const { return m_dimensions.TotalSize(); }
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
/*
* kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
*/

View File

@ -0,0 +1,325 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
#define EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
namespace Eigen {
/** \class TensorStriding
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor striding class.
*
*
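* A minimal usage sketch (illustrative; not part of this commit):
* \code
* Eigen::Tensor<float, 2> t(4, 6);
* t.setRandom();
* Eigen::array<Eigen::DenseIndex, 2> strides;
* strides[0] = 2; strides[1] = 3;
* Eigen::Tensor<float, 2> s = t.stride(strides);
* // s has dimensions (2, 2), and s(i, j) == t(2 * i, 3 * j)
* \endcode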
*/
namespace internal {
template<typename Strides, typename XprType>
struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename packet_traits<Scalar>::type Packet;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
};
template<typename Strides, typename XprType>
struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense>
{
typedef const TensorStridingOp<Strides, XprType>& type;
};
template<typename Strides, typename XprType>
struct nested<TensorStridingOp<Strides, XprType>, 1, typename eval<TensorStridingOp<Strides, XprType> >::type>
{
typedef TensorStridingOp<Strides, XprType> type;
};
} // end namespace internal
template<typename Strides, typename XprType>
class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> >
{
public:
typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar;
typedef typename Eigen::internal::traits<TensorStridingOp>::Packet Packet;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested;
typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims)
: m_xpr(expr), m_dims(dims) {}
EIGEN_DEVICE_FUNC
const Strides& strides() const { return m_dims; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorStridingOp& operator = (const TensorStridingOp& other)
{
typedef TensorAssignOp<TensorStridingOp, const TensorStridingOp> Assign;
Assign assign(*this, other);
internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
return *this;
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorStridingOp& operator = (const OtherDerived& other)
{
typedef TensorAssignOp<TensorStridingOp, const OtherDerived> Assign;
Assign assign(*this, other);
internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
return *this;
}
protected:
typename XprType::Nested m_xpr;
const Strides m_dims;
};
// Eval as rvalue
template<typename Strides, typename ArgType, typename Device>
struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
{
typedef TensorStridingOp<Strides, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
m_dimensions = m_impl.dimensions();
for (int i = 0; i < NumDims; ++i) {
m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
}
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_outputStrides[0] = 1;
m_inputStrides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
m_inputStrides[i-1] *= op.strides()[i-1];
}
m_inputStrides[NumDims-1] *= op.strides()[NumDims-1];
} else { // RowMajor
m_outputStrides[NumDims-1] = 1;
m_inputStrides[NumDims-1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
m_inputStrides[i+1] *= op.strides()[i+1];
}
m_inputStrides[0] *= op.strides()[0];
}
}
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
return m_impl.coeff(srcCoeff(index));
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
Index inputIndices[] = {0, 0};
Index indices[] = {index, index + packetSize - 1};
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx0 = indices[0] / m_outputStrides[i];
const Index idx1 = indices[1] / m_outputStrides[i];
inputIndices[0] += idx0 * m_inputStrides[i];
inputIndices[1] += idx1 * m_inputStrides[i];
indices[0] -= idx0 * m_outputStrides[i];
indices[1] -= idx1 * m_outputStrides[i];
}
inputIndices[0] += indices[0] * m_inputStrides[0];
inputIndices[1] += indices[1] * m_inputStrides[0];
} else { // RowMajor
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx0 = indices[0] / m_outputStrides[i];
const Index idx1 = indices[1] / m_outputStrides[i];
inputIndices[0] += idx0 * m_inputStrides[i];
inputIndices[1] += idx1 * m_inputStrides[i];
indices[0] -= idx0 * m_outputStrides[i];
indices[1] -= idx1 * m_outputStrides[i];
}
inputIndices[0] += indices[0] * m_inputStrides[NumDims-1];
inputIndices[1] += indices[1] * m_inputStrides[NumDims-1];
}
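// When the first and last sources are exactly packetSize-1 apart, the packet
// is contiguous in the input and can be loaded with a single unaligned
// access; otherwise gather the coefficients one by one.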
if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
return rslt;
}
else {
EIGEN_ALIGN_DEFAULT typename internal::remove_const<CoeffReturnType>::type values[packetSize];
values[0] = m_impl.coeff(inputIndices[0]);
values[packetSize-1] = m_impl.coeff(inputIndices[1]);
for (int i = 1; i < packetSize-1; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
}
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStrides[i];
inputIndex += idx * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
inputIndex += index * m_inputStrides[0];
} else { // RowMajor
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStrides[i];
inputIndex += idx * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
inputIndex += index * m_inputStrides[NumDims-1];
}
return inputIndex;
}
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
};
// Eval as lvalue
template<typename Strides, typename ArgType, typename Device>
struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
: public TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
{
typedef TensorStridingOp<Strides, ArgType> XprType;
typedef TensorEvaluator<const XprType, Device> Base;
// typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
// typedef DSizes<Index, NumDims> Dimensions;
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device) { }
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::PacketReturnType PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
{
return this->m_impl.coeffRef(this->srcCoeff(index));
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+packetSize-1 < this->dimensions().TotalSize());
Index inputIndices[] = {0, 0};
Index indices[] = {index, index + packetSize - 1};
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx0 = indices[0] / this->m_outputStrides[i];
const Index idx1 = indices[1] / this->m_outputStrides[i];
inputIndices[0] += idx0 * this->m_inputStrides[i];
inputIndices[1] += idx1 * this->m_inputStrides[i];
indices[0] -= idx0 * this->m_outputStrides[i];
indices[1] -= idx1 * this->m_outputStrides[i];
}
inputIndices[0] += indices[0] * this->m_inputStrides[0];
inputIndices[1] += indices[1] * this->m_inputStrides[0];
} else { // RowMajor
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx0 = indices[0] / this->m_outputStrides[i];
const Index idx1 = indices[1] / this->m_outputStrides[i];
inputIndices[0] += idx0 * this->m_inputStrides[i];
inputIndices[1] += idx1 * this->m_inputStrides[i];
indices[0] -= idx0 * this->m_outputStrides[i];
indices[1] -= idx1 * this->m_outputStrides[i];
}
inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1];
inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1];
}
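// Mirror of the packet() load path: store the whole packet at once when the
// destination is contiguous in the input, otherwise scatter coefficient by
// coefficient.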
if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
this->m_impl.template writePacket<Unaligned>(inputIndices[0], x);
}
else {
EIGEN_ALIGN_DEFAULT Scalar values[packetSize];
internal::pstore<Scalar, PacketReturnType>(values, x);
this->m_impl.coeffRef(inputIndices[0]) = values[0];
this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
for (int i = 1; i < packetSize-1; ++i) {
this->coeffRef(index+i) = values[i];
}
}
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H

View File

@ -0,0 +1,256 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
#define EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
namespace Eigen {
namespace internal {
template<typename Scalar, int Options>
class compute_tensor_flags
{
enum {
is_dynamic_size_storage = 1,
aligned_bit =
(
((Options&DontAlign)==0) && (
#if EIGEN_ALIGN_STATICALLY
(!is_dynamic_size_storage)
#else
0
#endif
||
#if EIGEN_ALIGN
is_dynamic_size_storage
#else
0
#endif
)
) ? AlignedBit : 0,
packet_access_bit = packet_traits<Scalar>::Vectorizable && aligned_bit ? PacketAccessBit : 0
};
public:
enum { ret = packet_access_bit | aligned_bit};
};
template<typename Scalar_, std::size_t NumIndices_, int Options_>
struct traits<Tensor<Scalar_, NumIndices_, Options_> >
{
typedef Scalar_ Scalar;
typedef Dense StorageKind;
typedef DenseIndex Index;
static const int NumDimensions = NumIndices_;
static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
enum {
Options = Options_,
Flags = compute_tensor_flags<Scalar_, Options_>::ret | LvalueBit,
};
};
template<typename Scalar_, typename Dimensions, int Options_>
struct traits<TensorFixedSize<Scalar_, Dimensions, Options_> >
{
typedef Scalar_ Scalar;
typedef Dense StorageKind;
typedef DenseIndex Index;
static const int NumDimensions = array_size<Dimensions>::value;
static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
enum {
Options = Options_,
Flags = compute_tensor_flags<Scalar_, Options_>::ret | LvalueBit,
};
};
template<typename PlainObjectType, int Options_>
struct traits<TensorMap<PlainObjectType, Options_> >
: public traits<PlainObjectType>
{
typedef traits<PlainObjectType> BaseTraits;
typedef typename BaseTraits::Scalar Scalar;
typedef typename BaseTraits::StorageKind StorageKind;
typedef typename BaseTraits::Index Index;
static const int NumDimensions = BaseTraits::NumDimensions;
static const int Layout = BaseTraits::Layout;
enum {
Options = Options_,
Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0),
};
};
template<typename PlainObjectType>
struct traits<TensorRef<PlainObjectType> >
: public traits<PlainObjectType>
{
typedef traits<PlainObjectType> BaseTraits;
typedef typename BaseTraits::Scalar Scalar;
typedef typename BaseTraits::StorageKind StorageKind;
typedef typename BaseTraits::Index Index;
static const int NumDimensions = BaseTraits::NumDimensions;
static const int Layout = BaseTraits::Layout;
enum {
Options = BaseTraits::Options,
Flags = ((BaseTraits::Flags | LvalueBit) & ~AlignedBit) | (Options&Aligned ? AlignedBit : 0),
};
};
template<typename _Scalar, std::size_t NumIndices_, int Options>
struct eval<Tensor<_Scalar, NumIndices_, Options>, Eigen::Dense>
{
typedef const Tensor<_Scalar, NumIndices_, Options>& type;
};
template<typename _Scalar, std::size_t NumIndices_, int Options>
struct eval<const Tensor<_Scalar, NumIndices_, Options>, Eigen::Dense>
{
typedef const Tensor<_Scalar, NumIndices_, Options>& type;
};
template<typename Scalar_, typename Dimensions, int Options>
struct eval<TensorFixedSize<Scalar_, Dimensions, Options>, Eigen::Dense>
{
typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type;
};
template<typename Scalar_, typename Dimensions, int Options>
struct eval<const TensorFixedSize<Scalar_, Dimensions, Options>, Eigen::Dense>
{
typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type;
};
template<typename PlainObjectType, int Options>
struct eval<TensorMap<PlainObjectType, Options>, Eigen::Dense>
{
typedef const TensorMap<PlainObjectType, Options>& type;
};
template<typename PlainObjectType, int Options>
struct eval<const TensorMap<PlainObjectType, Options>, Eigen::Dense>
{
typedef const TensorMap<PlainObjectType, Options>& type;
};
template<typename PlainObjectType>
struct eval<TensorRef<PlainObjectType>, Eigen::Dense>
{
typedef const TensorRef<PlainObjectType>& type;
};
template<typename PlainObjectType>
struct eval<const TensorRef<PlainObjectType>, Eigen::Dense>
{
typedef const TensorRef<PlainObjectType>& type;
};
template <typename Scalar_, std::size_t NumIndices_, int Options_>
struct nested<Tensor<Scalar_, NumIndices_, Options_> >
{
typedef const Tensor<Scalar_, NumIndices_, Options_>& type;
};
template <typename Scalar_, std::size_t NumIndices_, int Options_>
struct nested<const Tensor<Scalar_, NumIndices_, Options_> >
{
typedef const Tensor<Scalar_, NumIndices_, Options_>& type;
};
template <typename Scalar_, typename Dimensions, int Options>
struct nested<TensorFixedSize<Scalar_, Dimensions, Options> >
{
typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type;
};
template <typename Scalar_, typename Dimensions, int Options>
struct nested<const TensorFixedSize<Scalar_, Dimensions, Options> >
{
typedef const TensorFixedSize<Scalar_, Dimensions, Options>& type;
};
template <typename PlainObjectType, int Options>
struct nested<TensorMap<PlainObjectType, Options> >
{
typedef const TensorMap<PlainObjectType, Options>& type;
};
template <typename PlainObjectType, int Options>
struct nested<const TensorMap<PlainObjectType, Options> >
{
typedef const TensorMap<PlainObjectType, Options>& type;
};
template <typename PlainObjectType>
struct nested<TensorRef<PlainObjectType> >
{
typedef const TensorRef<PlainObjectType>& type;
};
template <typename PlainObjectType>
struct nested<const TensorRef<PlainObjectType> >
{
typedef const TensorRef<PlainObjectType>& type;
};
} // end namespace internal
// Convolutional layers take in an input tensor of shape (D, R, C, B), or (D, C,
// R, B), and convolve it with a set of filters, which can also be presented as
// a tensor (D, K, K, M), where M is the number of filters, K is the filter
// size, and each 3-dimensional tensor of size (D, K, K) is a filter. For
// simplicity we assume that we always use square filters (which is usually the
// case in images), hence the two Ks in the tensor dimension. It also takes in
// a few additional parameters:
// Stride (S): The convolution stride is the offset between locations where we
// apply the filters. A larger stride means that the output will be
// spatially smaller.
// Padding (P): The padding we apply to the input tensor along the R and C
// dimensions. This is usually used to make sure that the spatial
// dimensions of the output match our intention.
//
// Two types of padding are often used:
// SAME: The pad value is computed so that the output will have size
// ceil(R/S) and ceil(C/S).
// VALID: no padding is carried out.
// When we do padding, the values at the padded locations are usually zero.
//
// The output dimensions for convolution, when given all the parameters above,
// are as follows:
// When Padding = SAME: the output size is (B, R', C', M), where
// R' = ceil(float(R) / float(S))
// C' = ceil(float(C) / float(S))
// where ceil is the ceiling function. The input tensor is padded with 0 as
// needed. The number of padded rows and columns is computed as:
// Pr = ((R' - 1) * S + K - R) / 2
// Pc = ((C' - 1) * S + K - C) / 2
// When the stride is 1, we have the simplified case R'=R, C'=C, Pr=Pc=(K-1)/2.
// This is where SAME comes from: the output has the same size as the input.
// When Padding = VALID: the output size is computed as
// R' = ceil(float(R - K + 1) / float(S))
// C' = ceil(float(C - K + 1) / float(S))
// and the number of padded rows and columns is computed in the same way as in
// the SAME case.
// When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0,
// Pc=0.
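// Worked example (illustrative, applying the formulas above): R = C = 5,
// K = 3, S = 2.
// SAME: R' = C' = ceil(5/2) = 3, and Pr = Pc = ((3 - 1) * 2 + 3 - 5) / 2 = 1.
// VALID: R' = C' = ceil((5 - 3 + 1) / 2) = 2, with no padding.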
typedef enum {
PADDING_VALID = 1,
PADDING_SAME = 2,
} PaddingType;
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H

View File

@ -50,7 +50,7 @@ if(MPFR_FOUND)
include_directories(${MPFR_INCLUDES} ./mpreal)
ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ")
set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES})
ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" )
# ei_add_test(mpreal_support "" "${EIGEN_MPFR_TEST_LIBRARIES}" )
else()
ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ")
endif()
@ -95,12 +95,50 @@ ei_add_test(minres)
ei_add_test(levenberg_marquardt)
ei_add_test(kronecker_product)
option(EIGEN_TEST_CXX11 "Enable testing of C++11 features (e.g. Tensor module)." OFF)
option(EIGEN_TEST_CXX11 "Enable testing of C++11 features (e.g. Tensor module)." ON)
if(EIGEN_TEST_CXX11)
# FIXME: add C++11 compiler switch in some portable way
# (MSVC doesn't need any for example, so this will
# clash there)
# It should be safe to always run these tests as there is some fallback code for
# older compilers that don't support C++11.
ei_add_test(cxx11_meta "-std=c++0x")
ei_add_test(cxx11_tensor_simple "-std=c++0x")
ei_add_test(cxx11_tensor_symmetry "-std=c++0x")
# ei_add_test(cxx11_tensor_symmetry "-std=c++0x")
ei_add_test(cxx11_tensor_assign "-std=c++0x")
ei_add_test(cxx11_tensor_dimension "-std=c++0x")
ei_add_test(cxx11_tensor_index_list "-std=c++0x")
ei_add_test(cxx11_tensor_comparisons "-std=c++0x")
ei_add_test(cxx11_tensor_contraction "-std=c++0x")
ei_add_test(cxx11_tensor_convolution "-std=c++0x")
ei_add_test(cxx11_tensor_expr "-std=c++0x")
ei_add_test(cxx11_tensor_forced_eval "-std=c++0x")
ei_add_test(cxx11_tensor_fixed_size "-std=c++0x")
ei_add_test(cxx11_tensor_const "-std=c++0x")
ei_add_test(cxx11_tensor_of_const_values "-std=c++0x")
ei_add_test(cxx11_tensor_of_complex "-std=c++0x")
ei_add_test(cxx11_tensor_of_strings "-std=c++0x")
ei_add_test(cxx11_tensor_intdiv "-std=c++0x")
ei_add_test(cxx11_tensor_lvalue "-std=c++0x")
ei_add_test(cxx11_tensor_map "-std=c++0x")
ei_add_test(cxx11_tensor_broadcasting "-std=c++0x")
ei_add_test(cxx11_tensor_chipping "-std=c++0x")
ei_add_test(cxx11_tensor_concatenation "-std=c++0x")
ei_add_test(cxx11_tensor_morphing "-std=c++0x")
ei_add_test(cxx11_tensor_padding "-std=c++0x")
ei_add_test(cxx11_tensor_patch "-std=c++0x")
ei_add_test(cxx11_tensor_image_patch "-std=c++0x")
ei_add_test(cxx11_tensor_reduction "-std=c++0x")
ei_add_test(cxx11_tensor_shuffling "-std=c++0x")
ei_add_test(cxx11_tensor_striding "-std=c++0x")
ei_add_test(cxx11_tensor_thread_pool "-std=c++0x")
ei_add_test(cxx11_tensor_ref "-std=c++0x")
ei_add_test(cxx11_tensor_random "-std=c++0x")
ei_add_test(cxx11_tensor_casts "-std=c++0x")
ei_add_test(cxx11_tensor_reverse "-std=c++0x")
ei_add_test(cxx11_tensor_layout_swap "-std=c++0x")
ei_add_test(cxx11_tensor_io "-std=c++0x")
# These tests needs nvcc
# ei_add_test(cxx11_tensor_device "-std=c++0x")
# ei_add_test(cxx11_tensor_cuda "-std=c++0x")
# ei_add_test(cxx11_tensor_contract_cuda "-std=c++0x")
endif()

View File

@ -0,0 +1,370 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
#include <Eigen/CXX11/Tensor>
using Eigen::Tensor;
using Eigen::RowMajor;
static void test_1d()
{
Tensor<int, 1> vec1(6);
Tensor<int, 1, RowMajor> vec2(6);
vec1(0) = 4; vec2(0) = 0;
vec1(1) = 8; vec2(1) = 1;
vec1(2) = 15; vec2(2) = 2;
vec1(3) = 16; vec2(3) = 3;
vec1(4) = 23; vec2(4) = 4;
vec1(5) = 42; vec2(5) = 5;
int col_major[6];
int row_major[6];
memset(col_major, 0, 6*sizeof(int));
memset(row_major, 0, 6*sizeof(int));
TensorMap<Tensor<int, 1>> vec3(col_major, 6);
TensorMap<Tensor<int, 1, RowMajor>> vec4(row_major, 6);
vec3 = vec1;
vec4 = vec2;
VERIFY_IS_EQUAL(vec3(0), 4);
VERIFY_IS_EQUAL(vec3(1), 8);
VERIFY_IS_EQUAL(vec3(2), 15);
VERIFY_IS_EQUAL(vec3(3), 16);
VERIFY_IS_EQUAL(vec3(4), 23);
VERIFY_IS_EQUAL(vec3(5), 42);
VERIFY_IS_EQUAL(vec4(0), 0);
VERIFY_IS_EQUAL(vec4(1), 1);
VERIFY_IS_EQUAL(vec4(2), 2);
VERIFY_IS_EQUAL(vec4(3), 3);
VERIFY_IS_EQUAL(vec4(4), 4);
VERIFY_IS_EQUAL(vec4(5), 5);
vec1.setZero();
vec2.setZero();
vec1 = vec3;
vec2 = vec4;
VERIFY_IS_EQUAL(vec1(0), 4);
VERIFY_IS_EQUAL(vec1(1), 8);
VERIFY_IS_EQUAL(vec1(2), 15);
VERIFY_IS_EQUAL(vec1(3), 16);
VERIFY_IS_EQUAL(vec1(4), 23);
VERIFY_IS_EQUAL(vec1(5), 42);
VERIFY_IS_EQUAL(vec2(0), 0);
VERIFY_IS_EQUAL(vec2(1), 1);
VERIFY_IS_EQUAL(vec2(2), 2);
VERIFY_IS_EQUAL(vec2(3), 3);
VERIFY_IS_EQUAL(vec2(4), 4);
VERIFY_IS_EQUAL(vec2(5), 5);
}
static void test_2d()
{
Tensor<int, 2> mat1(2,3);
Tensor<int, 2, RowMajor> mat2(2,3);
mat1(0,0) = 0;
mat1(0,1) = 1;
mat1(0,2) = 2;
mat1(1,0) = 3;
mat1(1,1) = 4;
mat1(1,2) = 5;
mat2(0,0) = 0;
mat2(0,1) = 1;
mat2(0,2) = 2;
mat2(1,0) = 3;
mat2(1,1) = 4;
mat2(1,2) = 5;
int col_major[6];
int row_major[6];
memset(col_major, 0, 6*sizeof(int));
memset(row_major, 0, 6*sizeof(int));
TensorMap<Tensor<int, 2>> mat3(row_major, 2, 3);
TensorMap<Tensor<int, 2, RowMajor>> mat4(col_major, 2, 3);
mat3 = mat1;
mat4 = mat2;
VERIFY_IS_EQUAL(mat3(0,0), 0);
VERIFY_IS_EQUAL(mat3(0,1), 1);
VERIFY_IS_EQUAL(mat3(0,2), 2);
VERIFY_IS_EQUAL(mat3(1,0), 3);
VERIFY_IS_EQUAL(mat3(1,1), 4);
VERIFY_IS_EQUAL(mat3(1,2), 5);
VERIFY_IS_EQUAL(mat4(0,0), 0);
VERIFY_IS_EQUAL(mat4(0,1), 1);
VERIFY_IS_EQUAL(mat4(0,2), 2);
VERIFY_IS_EQUAL(mat4(1,0), 3);
VERIFY_IS_EQUAL(mat4(1,1), 4);
VERIFY_IS_EQUAL(mat4(1,2), 5);
mat1.setZero();
mat2.setZero();
mat1 = mat3;
mat2 = mat4;
VERIFY_IS_EQUAL(mat1(0,0), 0);
VERIFY_IS_EQUAL(mat1(0,1), 1);
VERIFY_IS_EQUAL(mat1(0,2), 2);
VERIFY_IS_EQUAL(mat1(1,0), 3);
VERIFY_IS_EQUAL(mat1(1,1), 4);
VERIFY_IS_EQUAL(mat1(1,2), 5);
VERIFY_IS_EQUAL(mat2(0,0), 0);
VERIFY_IS_EQUAL(mat2(0,1), 1);
VERIFY_IS_EQUAL(mat2(0,2), 2);
VERIFY_IS_EQUAL(mat2(1,0), 3);
VERIFY_IS_EQUAL(mat2(1,1), 4);
VERIFY_IS_EQUAL(mat2(1,2), 5);
}
static void test_3d()
{
Tensor<int, 3> mat1(2,3,7);
Tensor<int, 3, RowMajor> mat2(2,3,7);
int val = 0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
mat1(i,j,k) = val;
mat2(i,j,k) = val;
val++;
}
}
}
int col_major[2*3*7];
int row_major[2*3*7];
memset(col_major, 0, 2*3*7*sizeof(int));
memset(row_major, 0, 2*3*7*sizeof(int));
TensorMap<Tensor<int, 3>> mat3(col_major, 2, 3, 7);
TensorMap<Tensor<int, 3, RowMajor>> mat4(row_major, 2, 3, 7);
mat3 = mat1;
mat4 = mat2;
val = 0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
VERIFY_IS_EQUAL(mat3(i,j,k), val);
VERIFY_IS_EQUAL(mat4(i,j,k), val);
val++;
}
}
}
mat1.setZero();
mat2.setZero();
mat1 = mat3;
mat2 = mat4;
val = 0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
VERIFY_IS_EQUAL(mat1(i,j,k), val);
VERIFY_IS_EQUAL(mat2(i,j,k), val);
val++;
}
}
}
}
static void test_same_type()
{
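// The assignments below must reuse the existing storage of each destination:
// the data() pointers captured before the copy are expected to be unchanged
// afterwards, while the coefficients themselves are copied over.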
Tensor<int, 1> orig_tensor(5);
Tensor<int, 1> dest_tensor(5);
orig_tensor.setRandom();
dest_tensor.setRandom();
int* orig_data = orig_tensor.data();
int* dest_data = dest_tensor.data();
dest_tensor = orig_tensor;
VERIFY_IS_EQUAL(orig_tensor.data(), orig_data);
VERIFY_IS_EQUAL(dest_tensor.data(), dest_data);
for (int i = 0; i < 5; ++i) {
VERIFY_IS_EQUAL(dest_tensor(i), orig_tensor(i));
}
TensorFixedSize<int, Sizes<5> > orig_array;
TensorFixedSize<int, Sizes<5> > dest_array;
orig_array.setRandom();
dest_array.setRandom();
orig_data = orig_array.data();
dest_data = dest_array.data();
dest_array = orig_array;
VERIFY_IS_EQUAL(orig_array.data(), orig_data);
VERIFY_IS_EQUAL(dest_array.data(), dest_data);
for (int i = 0; i < 5; ++i) {
VERIFY_IS_EQUAL(dest_array(i), orig_array(i));
}
int orig[5] = {1, 2, 3, 4, 5};
int dest[5] = {6, 7, 8, 9, 10};
TensorMap<Tensor<int, 1> > orig_map(orig, 5);
TensorMap<Tensor<int, 1> > dest_map(dest, 5);
orig_data = orig_map.data();
dest_data = dest_map.data();
dest_map = orig_map;
VERIFY_IS_EQUAL(orig_map.data(), orig_data);
VERIFY_IS_EQUAL(dest_map.data(), dest_data);
for (int i = 0; i < 5; ++i) {
VERIFY_IS_EQUAL(dest[i], i+1);
}
}
static void test_auto_resize()
{
Tensor<int, 1> tensor1;
Tensor<int, 1> tensor2(3);
Tensor<int, 1> tensor3(5);
Tensor<int, 1> tensor4(7);
Tensor<int, 1> new_tensor(5);
new_tensor.setRandom();
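// Assigning new_tensor resizes every destination, including the
// default-constructed tensor1, to new_tensor's size before copying the
// coefficients, as the dimension and value checks below verify.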
tensor1 = tensor2 = tensor3 = tensor4 = new_tensor;
VERIFY_IS_EQUAL(tensor1.dimension(0), new_tensor.dimension(0));
VERIFY_IS_EQUAL(tensor2.dimension(0), new_tensor.dimension(0));
VERIFY_IS_EQUAL(tensor3.dimension(0), new_tensor.dimension(0));
VERIFY_IS_EQUAL(tensor4.dimension(0), new_tensor.dimension(0));
for (int i = 0; i < new_tensor.dimension(0); ++i) {
VERIFY_IS_EQUAL(tensor1(i), new_tensor(i));
VERIFY_IS_EQUAL(tensor2(i), new_tensor(i));
VERIFY_IS_EQUAL(tensor3(i), new_tensor(i));
VERIFY_IS_EQUAL(tensor4(i), new_tensor(i));
}
}
static void test_compound_assign()
{
Tensor<int, 1> start_tensor(10);
Tensor<int, 1> offset_tensor(10);
start_tensor.setRandom();
offset_tensor.setRandom();
Tensor<int, 1> tensor = start_tensor;
tensor += offset_tensor;
for (int i = 0; i < 10; ++i) {
VERIFY_IS_EQUAL(tensor(i), start_tensor(i) + offset_tensor(i));
}
tensor = start_tensor;
tensor -= offset_tensor;
for (int i = 0; i < 10; ++i) {
VERIFY_IS_EQUAL(tensor(i), start_tensor(i) - offset_tensor(i));
}
tensor = start_tensor;
tensor *= offset_tensor;
for (int i = 0; i < 10; ++i) {
VERIFY_IS_EQUAL(tensor(i), start_tensor(i) * offset_tensor(i));
}
tensor = start_tensor;
tensor /= offset_tensor;
for (int i = 0; i < 10; ++i) {
VERIFY_IS_EQUAL(tensor(i), start_tensor(i) / offset_tensor(i));
}
}
static void test_std_initializers_tensor() {
#ifdef EIGEN_HAS_VARIADIC_TEMPLATES
Tensor<int, 1> a(3);
a.setValues({0, 1, 2});
VERIFY_IS_EQUAL(a(0), 0);
VERIFY_IS_EQUAL(a(1), 1);
VERIFY_IS_EQUAL(a(2), 2);
// It fills the top-left slice.
a.setValues({10, 20});
VERIFY_IS_EQUAL(a(0), 10);
VERIFY_IS_EQUAL(a(1), 20);
VERIFY_IS_EQUAL(a(2), 2);
// Chaining.
Tensor<int, 1> a2(3);
a2 = a.setValues({100, 200, 300});
VERIFY_IS_EQUAL(a(0), 100);
VERIFY_IS_EQUAL(a(1), 200);
VERIFY_IS_EQUAL(a(2), 300);
VERIFY_IS_EQUAL(a2(0), 100);
VERIFY_IS_EQUAL(a2(1), 200);
VERIFY_IS_EQUAL(a2(2), 300);
Tensor<int, 2> b(2, 3);
b.setValues({{0, 1, 2}, {3, 4, 5}});
VERIFY_IS_EQUAL(b(0, 0), 0);
VERIFY_IS_EQUAL(b(0, 1), 1);
VERIFY_IS_EQUAL(b(0, 2), 2);
VERIFY_IS_EQUAL(b(1, 0), 3);
VERIFY_IS_EQUAL(b(1, 1), 4);
VERIFY_IS_EQUAL(b(1, 2), 5);
// It fills the top-left slice.
b.setValues({{10, 20}, {30}});
VERIFY_IS_EQUAL(b(0, 0), 10);
VERIFY_IS_EQUAL(b(0, 1), 20);
VERIFY_IS_EQUAL(b(0, 2), 2);
VERIFY_IS_EQUAL(b(1, 0), 30);
VERIFY_IS_EQUAL(b(1, 1), 4);
VERIFY_IS_EQUAL(b(1, 2), 5);
Eigen::Tensor<int, 3> c(3, 2, 4);
c.setValues({{{0, 1, 2, 3}, {4, 5, 6, 7}},
{{10, 11, 12, 13}, {14, 15, 16, 17}},
{{20, 21, 22, 23}, {24, 25, 26, 27}}});
VERIFY_IS_EQUAL(c(0, 0, 0), 0);
VERIFY_IS_EQUAL(c(0, 0, 1), 1);
VERIFY_IS_EQUAL(c(0, 0, 2), 2);
VERIFY_IS_EQUAL(c(0, 0, 3), 3);
VERIFY_IS_EQUAL(c(0, 1, 0), 4);
VERIFY_IS_EQUAL(c(0, 1, 1), 5);
VERIFY_IS_EQUAL(c(0, 1, 2), 6);
VERIFY_IS_EQUAL(c(0, 1, 3), 7);
VERIFY_IS_EQUAL(c(1, 0, 0), 10);
VERIFY_IS_EQUAL(c(1, 0, 1), 11);
VERIFY_IS_EQUAL(c(1, 0, 2), 12);
VERIFY_IS_EQUAL(c(1, 0, 3), 13);
VERIFY_IS_EQUAL(c(1, 1, 0), 14);
VERIFY_IS_EQUAL(c(1, 1, 1), 15);
VERIFY_IS_EQUAL(c(1, 1, 2), 16);
VERIFY_IS_EQUAL(c(1, 1, 3), 17);
VERIFY_IS_EQUAL(c(2, 0, 0), 20);
VERIFY_IS_EQUAL(c(2, 0, 1), 21);
VERIFY_IS_EQUAL(c(2, 0, 2), 22);
VERIFY_IS_EQUAL(c(2, 0, 3), 23);
VERIFY_IS_EQUAL(c(2, 1, 0), 24);
VERIFY_IS_EQUAL(c(2, 1, 1), 25);
VERIFY_IS_EQUAL(c(2, 1, 2), 26);
VERIFY_IS_EQUAL(c(2, 1, 3), 27);
#endif // EIGEN_HAS_VARIADIC_TEMPLATES
}
void test_cxx11_tensor_assign()
{
CALL_SUBTEST(test_1d());
CALL_SUBTEST(test_2d());
CALL_SUBTEST(test_3d());
CALL_SUBTEST(test_same_type());
CALL_SUBTEST(test_auto_resize());
CALL_SUBTEST(test_compound_assign());
CALL_SUBTEST(test_std_initializers_tensor());
}

View File

@ -0,0 +1,194 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
#include <Eigen/CXX11/Tensor>
using Eigen::Tensor;
template <int DataLayout>
static void test_simple_broadcasting()
{
Tensor<float, 4, DataLayout> tensor(2,3,5,7);
tensor.setRandom();
array<ptrdiff_t, 4> broadcasts;
broadcasts[0] = 1;
broadcasts[1] = 1;
broadcasts[2] = 1;
broadcasts[3] = 1;
Tensor<float, 4, DataLayout> no_broadcast;
no_broadcast = tensor.broadcast(broadcasts);
VERIFY_IS_EQUAL(no_broadcast.dimension(0), 2);
VERIFY_IS_EQUAL(no_broadcast.dimension(1), 3);
VERIFY_IS_EQUAL(no_broadcast.dimension(2), 5);
VERIFY_IS_EQUAL(no_broadcast.dimension(3), 7);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(tensor(i,j,k,l), no_broadcast(i,j,k,l));
}
}
}
}
broadcasts[0] = 2;
broadcasts[1] = 3;
broadcasts[2] = 1;
broadcasts[3] = 4;
Tensor<float, 4, DataLayout> broadcast;
broadcast = tensor.broadcast(broadcasts);
VERIFY_IS_EQUAL(broadcast.dimension(0), 4);
VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
VERIFY_IS_EQUAL(broadcast.dimension(2), 5);
VERIFY_IS_EQUAL(broadcast.dimension(3), 28);
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 9; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 28; ++l) {
VERIFY_IS_EQUAL(tensor(i%2,j%3,k%5,l%7), broadcast(i,j,k,l));
}
}
}
}
}
template <int DataLayout>
static void test_vectorized_broadcasting()
{
Tensor<float, 3, DataLayout> tensor(8,3,5);
tensor.setRandom();
array<ptrdiff_t, 3> broadcasts;
broadcasts[0] = 2;
broadcasts[1] = 3;
broadcasts[2] = 4;
Tensor<float, 3, DataLayout> broadcast;
broadcast = tensor.broadcast(broadcasts);
VERIFY_IS_EQUAL(broadcast.dimension(0), 16);
VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
VERIFY_IS_EQUAL(broadcast.dimension(2), 20);
for (int i = 0; i < 16; ++i) {
for (int j = 0; j < 9; ++j) {
for (int k = 0; k < 20; ++k) {
VERIFY_IS_EQUAL(tensor(i%8,j%3,k%5), broadcast(i,j,k));
}
}
}
tensor.resize(11,3,5);
tensor.setRandom();
broadcast = tensor.broadcast(broadcasts);
VERIFY_IS_EQUAL(broadcast.dimension(0), 22);
VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
VERIFY_IS_EQUAL(broadcast.dimension(2), 20);
for (int i = 0; i < 22; ++i) {
for (int j = 0; j < 9; ++j) {
for (int k = 0; k < 20; ++k) {
VERIFY_IS_EQUAL(tensor(i%11,j%3,k%5), broadcast(i,j,k));
}
}
}
}
template <int DataLayout>
static void test_static_broadcasting()
{
Tensor<float, 3, DataLayout> tensor(8,3,5);
tensor.setRandom();
#ifdef EIGEN_HAS_CONSTEXPR
Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> broadcasts;
#else
Eigen::array<int, 3> broadcasts;
broadcasts[0] = 2;
broadcasts[1] = 3;
broadcasts[2] = 4;
#endif
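// When constexpr support is available the broadcast factors are encoded in
// the type via IndexList/type2index; otherwise a runtime Eigen::array is
// used. Either way the broadcast of the 8x3x5 tensor must yield the same
// 16x9x20 result checked below.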
Tensor<float, 3, DataLayout> broadcast;
broadcast = tensor.broadcast(broadcasts);
VERIFY_IS_EQUAL(broadcast.dimension(0), 16);
VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
VERIFY_IS_EQUAL(broadcast.dimension(2), 20);
for (int i = 0; i < 16; ++i) {
for (int j = 0; j < 9; ++j) {
for (int k = 0; k < 20; ++k) {
VERIFY_IS_EQUAL(tensor(i%8,j%3,k%5), broadcast(i,j,k));
}
}
}
tensor.resize(11,3,5);
tensor.setRandom();
broadcast = tensor.broadcast(broadcasts);
VERIFY_IS_EQUAL(broadcast.dimension(0), 22);
VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
VERIFY_IS_EQUAL(broadcast.dimension(2), 20);
for (int i = 0; i < 22; ++i) {
for (int j = 0; j < 9; ++j) {
for (int k = 0; k < 20; ++k) {
VERIFY_IS_EQUAL(tensor(i%11,j%3,k%5), broadcast(i,j,k));
}
}
}
}
template <int DataLayout>
static void test_fixed_size_broadcasting()
{
// Need to add a [] operator to the Size class for this to work
#if 0
Tensor<float, 1, DataLayout> t1(10);
t1.setRandom();
TensorFixedSize<float, Sizes<1>, DataLayout> t2;
t2 = t2.constant(20.0f);
Tensor<float, 1, DataLayout> t3 = t1 + t2.broadcast(Eigen::array<int, 1>{{10}});
for (int i = 0; i < 10; ++i) {
VERIFY_IS_APPROX(t3(i), t1(i) + t2(0));
}
TensorMap<TensorFixedSize<float, Sizes<1>, DataLayout> > t4(t2.data(), {{1}});
Tensor<float, 1, DataLayout> t5 = t1 + t4.broadcast(Eigen::array<int, 1>{{10}});
for (int i = 0; i < 10; ++i) {
VERIFY_IS_APPROX(t5(i), t1(i) + t2(0));
}
#endif
}
void test_cxx11_tensor_broadcasting()
{
CALL_SUBTEST(test_simple_broadcasting<ColMajor>());
CALL_SUBTEST(test_simple_broadcasting<RowMajor>());
CALL_SUBTEST(test_vectorized_broadcasting<ColMajor>());
CALL_SUBTEST(test_vectorized_broadcasting<RowMajor>());
CALL_SUBTEST(test_static_broadcasting<ColMajor>());
CALL_SUBTEST(test_static_broadcasting<RowMajor>());
CALL_SUBTEST(test_fixed_size_broadcasting<ColMajor>());
CALL_SUBTEST(test_fixed_size_broadcasting<RowMajor>());
}

View File

@ -0,0 +1,41 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
#include <Eigen/CXX11/Tensor>
using Eigen::Tensor;
using Eigen::array;
static void test_simple_cast()
{
Tensor<float, 2> ftensor(20,30);
ftensor.setRandom();
Tensor<char, 2> chartensor(20,30);
chartensor.setRandom();
Tensor<std::complex<float>, 2> cplextensor(20,30);
cplextensor.setRandom();
chartensor = ftensor.cast<char>();
cplextensor = ftensor.cast<std::complex<float>>();
for (int i = 0; i < 20; ++i) {
for (int j = 0; j < 30; ++j) {
VERIFY_IS_EQUAL(chartensor(i,j), static_cast<char>(ftensor(i,j)));
VERIFY_IS_EQUAL(cplextensor(i,j), static_cast<std::complex<float>>(ftensor(i,j)));
}
}
}
void test_cxx11_tensor_casts()
{
CALL_SUBTEST(test_simple_cast());
}

View File

@ -0,0 +1,397 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
#include <Eigen/CXX11/Tensor>
using Eigen::Tensor;
template<int DataLayout>
static void test_simple_chip()
{
Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
tensor.setRandom();
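// chip<Dim>(offset) drops dimension Dim of the 2x3x5x7x11 tensor, producing a
// 4-D tensor whose coefficients are read from the original with the chipped
// index held fixed at the given offset; the nested loops below check this for
// every dimension.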
Tensor<float, 4, DataLayout> chip1;
chip1 = tensor.template chip<0>(1);
VERIFY_IS_EQUAL(chip1.dimension(0), 3);
VERIFY_IS_EQUAL(chip1.dimension(1), 5);
VERIFY_IS_EQUAL(chip1.dimension(2), 7);
VERIFY_IS_EQUAL(chip1.dimension(3), 11);
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 5; ++j) {
for (int k = 0; k < 7; ++k) {
for (int l = 0; l < 11; ++l) {
VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1,i,j,k,l));
}
}
}
}
Tensor<float, 4, DataLayout> chip2 = tensor.template chip<1>(1);
VERIFY_IS_EQUAL(chip2.dimension(0), 2);
VERIFY_IS_EQUAL(chip2.dimension(1), 5);
VERIFY_IS_EQUAL(chip2.dimension(2), 7);
VERIFY_IS_EQUAL(chip2.dimension(3), 11);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
for (int l = 0; l < 11; ++l) {
VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
}
}
}
}
Tensor<float, 4, DataLayout> chip3 = tensor.template chip<2>(2);
VERIFY_IS_EQUAL(chip3.dimension(0), 2);
VERIFY_IS_EQUAL(chip3.dimension(1), 3);
VERIFY_IS_EQUAL(chip3.dimension(2), 7);
VERIFY_IS_EQUAL(chip3.dimension(3), 11);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
for (int l = 0; l < 11; ++l) {
VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2,k,l));
}
}
}
}
Tensor<float, 4, DataLayout> chip4(tensor.template chip<3>(5));
VERIFY_IS_EQUAL(chip4.dimension(0), 2);
VERIFY_IS_EQUAL(chip4.dimension(1), 3);
VERIFY_IS_EQUAL(chip4.dimension(2), 5);
VERIFY_IS_EQUAL(chip4.dimension(3), 11);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
}
}
}
}
Tensor<float, 4, DataLayout> chip5(tensor.template chip<4>(7));
VERIFY_IS_EQUAL(chip5.dimension(0), 2);
VERIFY_IS_EQUAL(chip5.dimension(1), 3);
VERIFY_IS_EQUAL(chip5.dimension(2), 5);
VERIFY_IS_EQUAL(chip5.dimension(3), 7);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7));
}
}
}
}
}
template<int DataLayout>
static void test_dynamic_chip()
{
Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
tensor.setRandom();
Tensor<float, 4, DataLayout> chip1;
chip1 = tensor.chip(1, 0);
VERIFY_IS_EQUAL(chip1.dimension(0), 3);
VERIFY_IS_EQUAL(chip1.dimension(1), 5);
VERIFY_IS_EQUAL(chip1.dimension(2), 7);
VERIFY_IS_EQUAL(chip1.dimension(3), 11);
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 5; ++j) {
for (int k = 0; k < 7; ++k) {
for (int l = 0; l < 11; ++l) {
VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1,i,j,k,l));
}
}
}
}
Tensor<float, 4, DataLayout> chip2 = tensor.chip(1, 1);
VERIFY_IS_EQUAL(chip2.dimension(0), 2);
VERIFY_IS_EQUAL(chip2.dimension(1), 5);
VERIFY_IS_EQUAL(chip2.dimension(2), 7);
VERIFY_IS_EQUAL(chip2.dimension(3), 11);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
for (int l = 0; l < 11; ++l) {
VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
}
}
}
}
Tensor<float, 4, DataLayout> chip3 = tensor.chip(2, 2);
VERIFY_IS_EQUAL(chip3.dimension(0), 2);
VERIFY_IS_EQUAL(chip3.dimension(1), 3);
VERIFY_IS_EQUAL(chip3.dimension(2), 7);
VERIFY_IS_EQUAL(chip3.dimension(3), 11);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
for (int l = 0; l < 11; ++l) {
VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2,k,l));
}
}
}
}
Tensor<float, 4, DataLayout> chip4(tensor.chip(5, 3));
VERIFY_IS_EQUAL(chip4.dimension(0), 2);
VERIFY_IS_EQUAL(chip4.dimension(1), 3);
VERIFY_IS_EQUAL(chip4.dimension(2), 5);
VERIFY_IS_EQUAL(chip4.dimension(3), 11);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
}
}
}
}
Tensor<float, 4, DataLayout> chip5(tensor.chip(7, 4));
VERIFY_IS_EQUAL(chip5.dimension(0), 2);
VERIFY_IS_EQUAL(chip5.dimension(1), 3);
VERIFY_IS_EQUAL(chip5.dimension(2), 5);
VERIFY_IS_EQUAL(chip5.dimension(3), 7);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7));
}
}
}
}
}
template<int DataLayout>
static void test_chip_in_expr() {
Tensor<float, 5, DataLayout> input1(2,3,5,7,11);
input1.setRandom();
Tensor<float, 4, DataLayout> input2(3,5,7,11);
input2.setRandom();
Tensor<float, 4, DataLayout> result = input1.template chip<0>(0) + input2;
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 5; ++j) {
for (int k = 0; k < 7; ++k) {
for (int l = 0; l < 11; ++l) {
float expected = input1(0,i,j,k,l) + input2(i,j,k,l);
VERIFY_IS_EQUAL(result(i,j,k,l), expected);
}
}
}
}
Tensor<float, 3, DataLayout> input3(3,7,11);
input3.setRandom();
Tensor<float, 3, DataLayout> result2 = input1.template chip<0>(0).template chip<1>(2) + input3;
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 7; ++j) {
for (int k = 0; k < 11; ++k) {
float expected = input1(0,i,2,j,k) + input3(i,j,k);
VERIFY_IS_EQUAL(result2(i,j,k), expected);
}
}
}
}
template<int DataLayout>
static void test_chip_as_lvalue()
{
Tensor<float, 5, DataLayout> input1(2,3,5,7,11);
input1.setRandom();
Tensor<float, 4, DataLayout> input2(3,5,7,11);
input2.setRandom();
Tensor<float, 5, DataLayout> tensor = input1;
tensor.template chip<0>(1) = input2;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
for (int m = 0; m < 11; ++m) {
if (i != 1) {
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
} else {
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input2(j,k,l,m));
}
}
}
}
}
}
Tensor<float, 4, DataLayout> input3(2,5,7,11);
input3.setRandom();
tensor = input1;
tensor.template chip<1>(1) = input3;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
for (int m = 0; m < 11; ++m) {
if (j != 1) {
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
} else {
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input3(i,k,l,m));
}
}
}
}
}
}
Tensor<float, 4, DataLayout> input4(2,3,7,11);
input4.setRandom();
tensor = input1;
tensor.template chip<2>(3) = input4;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
for (int m = 0; m < 11; ++m) {
if (k != 3) {
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
} else {
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input4(i,j,l,m));
}
}
}
}
}
}
Tensor<float, 4, DataLayout> input5(2,3,5,11);
input5.setRandom();
tensor = input1;
tensor.template chip<3>(4) = input5;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
for (int m = 0; m < 11; ++m) {
if (l != 4) {
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
} else {
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input5(i,j,k,m));
}
}
}
}
}
}
Tensor<float, 4, DataLayout> input6(2,3,5,7);
input6.setRandom();
tensor = input1;
tensor.template chip<4>(5) = input6;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
for (int m = 0; m < 11; ++m) {
if (m != 5) {
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
} else {
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input6(i,j,k,l));
}
}
}
}
}
}
Tensor<float, 5, DataLayout> input7(2,3,5,7,11);
input7.setRandom();
tensor = input1;
tensor.chip(0, 0) = input7.chip(0, 0);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
for (int m = 0; m < 11; ++m) {
if (i != 0) {
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
} else {
VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input7(i,j,k,l,m));
}
}
}
}
}
}
}
template<int DataLayout>
static void test_chip_raw_data()
{
Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
tensor.setRandom();
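// The evaluator of a chip taken along the last dimension exposes a raw data
// pointer; chip_index below maps (i,j,k,l) to the corresponding linear offset
// for each layout. Chips taken along dimensions 0 to 3 do not expose raw
// data, which is why their data() pointers are expected to be null at the end
// of this test.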
typedef TensorEvaluator<decltype(tensor.template chip<4>(3)), DefaultDevice> Evaluator4;
auto chip = Evaluator4(tensor.template chip<4>(3), DefaultDevice());
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
int chip_index;
if (DataLayout == ColMajor) {
chip_index = i + 2 * (j + 3 * (k + 5 * l));
} else {
chip_index = 11 * (l + 7 * (k + 5 * (j + 3 * i)));
}
VERIFY_IS_EQUAL(chip.data()[chip_index], tensor(i,j,k,l,3));
}
}
}
}
typedef TensorEvaluator<decltype(tensor.template chip<0>(0)), DefaultDevice> Evaluator0;
auto chip0 = Evaluator0(tensor.template chip<0>(0), DefaultDevice());
VERIFY_IS_EQUAL(chip0.data(), static_cast<float*>(0));
typedef TensorEvaluator<decltype(tensor.template chip<1>(0)), DefaultDevice> Evaluator1;
auto chip1 = Evaluator1(tensor.template chip<1>(0), DefaultDevice());
VERIFY_IS_EQUAL(chip1.data(), static_cast<float*>(0));
typedef TensorEvaluator<decltype(tensor.template chip<2>(0)), DefaultDevice> Evaluator2;
auto chip2 = Evaluator2(tensor.template chip<2>(0), DefaultDevice());
VERIFY_IS_EQUAL(chip2.data(), static_cast<float*>(0));
typedef TensorEvaluator<decltype(tensor.template chip<3>(0)), DefaultDevice> Evaluator3;
auto chip3 = Evaluator3(tensor.template chip<3>(0), DefaultDevice());
VERIFY_IS_EQUAL(chip3.data(), static_cast<float*>(0));
}
void test_cxx11_tensor_chipping()
{
CALL_SUBTEST(test_simple_chip<ColMajor>());
CALL_SUBTEST(test_simple_chip<RowMajor>());
CALL_SUBTEST(test_dynamic_chip<ColMajor>());
CALL_SUBTEST(test_dynamic_chip<RowMajor>());
CALL_SUBTEST(test_chip_in_expr<ColMajor>());
CALL_SUBTEST(test_chip_in_expr<RowMajor>());
CALL_SUBTEST(test_chip_as_lvalue<ColMajor>());
CALL_SUBTEST(test_chip_as_lvalue<RowMajor>());
CALL_SUBTEST(test_chip_raw_data<ColMajor>());
CALL_SUBTEST(test_chip_raw_data<RowMajor>());
}

View File

@ -0,0 +1,84 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
#include <Eigen/CXX11/Tensor>
using Eigen::Tensor;
using Eigen::RowMajor;
static void test_orderings()
{
Tensor<float, 3> mat1(2,3,7);
Tensor<float, 3> mat2(2,3,7);
Tensor<bool, 3> lt(2,3,7);
Tensor<bool, 3> le(2,3,7);
Tensor<bool, 3> gt(2,3,7);
Tensor<bool, 3> ge(2,3,7);
mat1.setRandom();
mat2.setRandom();
lt = mat1 < mat2;
le = mat1 <= mat2;
gt = mat1 > mat2;
ge = mat1 >= mat2;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
VERIFY_IS_EQUAL(lt(i,j,k), mat1(i,j,k) < mat2(i,j,k));
VERIFY_IS_EQUAL(le(i,j,k), mat1(i,j,k) <= mat2(i,j,k));
VERIFY_IS_EQUAL(gt(i,j,k), mat1(i,j,k) > mat2(i,j,k));
VERIFY_IS_EQUAL(ge(i,j,k), mat1(i,j,k) >= mat2(i,j,k));
}
}
}
}
static void test_equality()
{
Tensor<float, 3> mat1(2,3,7);
Tensor<float, 3> mat2(2,3,7);
mat1.setRandom();
mat2.setRandom();
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
// Make roughly half of the entries equal so that both outcomes of the
// comparisons are exercised.
if (internal::random<bool>()) {
mat2(i,j,k) = mat1(i,j,k);
}
}
}
}
Tensor<bool, 3> eq(2,3,7);
Tensor<bool, 3> ne(2,3,7);
eq = (mat1 == mat2);
ne = (mat1 != mat2);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
VERIFY_IS_EQUAL(eq(i,j,k), mat1(i,j,k) == mat2(i,j,k));
VERIFY_IS_EQUAL(ne(i,j,k), mat1(i,j,k) != mat2(i,j,k));
}
}
}
}
void test_cxx11_tensor_comparisons()
{
CALL_SUBTEST(test_orderings());
CALL_SUBTEST(test_equality());
}

View File

@ -0,0 +1,116 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
#include <Eigen/CXX11/Tensor>
using Eigen::Tensor;
template<int DataLayout>
static void test_dimension_failures()
{
Tensor<int, 3, DataLayout> left(2, 3, 1);
Tensor<int, 3, DataLayout> right(3, 3, 1);
left.setRandom();
right.setRandom();
// Okay; other dimensions are equal.
Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0);
// Dimension mismatches.
VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 1));
VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 2));
// Axis > NumDims or < 0.
VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 3));
VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, -1));
}
template<int DataLayout>
static void test_static_dimension_failure()
{
Tensor<int, 2, DataLayout> left(2, 3);
Tensor<int, 3, DataLayout> right(2, 3, 1);
#ifdef CXX11_TENSOR_CONCATENATION_STATIC_DIMENSION_FAILURE
// Technically compatible, but we statically assert that the inputs have the
// same NumDims.
Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0);
#endif
// This can be worked around in this case.
Tensor<int, 3, DataLayout> concatenation = left
.reshape(Tensor<int, 3>::Dimensions{{2, 3, 1}})
.concatenate(right, 0);
Tensor<int, 2, DataLayout> alternative = left
.concatenate(right.reshape(Tensor<int, 2>::Dimensions{{2, 3}}), 0);
}
template<int DataLayout>
static void test_simple_concatenation()
{
Tensor<int, 3, DataLayout> left(2, 3, 1);
Tensor<int, 3, DataLayout> right(2, 3, 1);
left.setRandom();
right.setRandom();
Tensor<int, 3, DataLayout> concatenation = left.concatenate(right, 0);
VERIFY_IS_EQUAL(concatenation.dimension(0), 4);
VERIFY_IS_EQUAL(concatenation.dimension(1), 3);
VERIFY_IS_EQUAL(concatenation.dimension(2), 1);
for (int j = 0; j < 3; ++j) {
for (int i = 0; i < 2; ++i) {
VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0));
}
for (int i = 2; i < 4; ++i) {
VERIFY_IS_EQUAL(concatenation(i, j, 0), right(i - 2, j, 0));
}
}
concatenation = left.concatenate(right, 1);
VERIFY_IS_EQUAL(concatenation.dimension(0), 2);
VERIFY_IS_EQUAL(concatenation.dimension(1), 6);
VERIFY_IS_EQUAL(concatenation.dimension(2), 1);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0));
}
for (int j = 3; j < 6; ++j) {
VERIFY_IS_EQUAL(concatenation(i, j, 0), right(i, j - 3, 0));
}
}
concatenation = left.concatenate(right, 2);
VERIFY_IS_EQUAL(concatenation.dimension(0), 2);
VERIFY_IS_EQUAL(concatenation.dimension(1), 3);
VERIFY_IS_EQUAL(concatenation.dimension(2), 2);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0));
VERIFY_IS_EQUAL(concatenation(i, j, 1), right(i, j, 0));
}
}
}
// TODO(phli): Add test once we have a real vectorized implementation.
// static void test_vectorized_concatenation() {}
void test_cxx11_tensor_concatenation()
{
CALL_SUBTEST(test_dimension_failures<ColMajor>());
CALL_SUBTEST(test_dimension_failures<RowMajor>());
CALL_SUBTEST(test_static_dimension_failure<ColMajor>());
CALL_SUBTEST(test_static_dimension_failure<RowMajor>());
CALL_SUBTEST(test_simple_concatenation<ColMajor>());
CALL_SUBTEST(test_simple_concatenation<RowMajor>());
// CALL_SUBTEST(test_vectorized_concatenation());
}

View File

@ -0,0 +1,39 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
#include <Eigen/CXX11/Tensor>
using Eigen::Tensor;
static void test_simple_assign()
{
Tensor<int, 3> random(2,3,7);
random.setRandom();
TensorMap<Tensor<const int, 3> > constant(random.data(), 2, 3, 7);
Tensor<int, 3> result(2,3,7);
result = constant;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
VERIFY_IS_EQUAL((result(i,j,k)), random(i,j,k));
}
}
}
}
void test_cxx11_tensor_const()
{
CALL_SUBTEST(test_simple_assign());
}

View File

@ -0,0 +1,121 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;
template<int DataLayout>
static void test_cuda_contraction(int m_size, int k_size, int n_size)
{
cout<<"Calling with ("<<m_size<<","<<k_size<<","<<n_size<<")"<<std::endl;
// With these dimensions, the output has 300 * 140 elements, which is more
// than 30 * 1024, the number of threads in the blocks of a 15 SM GK110 GPU.
Tensor<float, 2, DataLayout> t_left(Eigen::array<int, 2>(m_size, k_size));
Tensor<float, 2, DataLayout> t_right(Eigen::array<int, 2>(k_size, n_size));
Tensor<float, 2, DataLayout> t_result(Eigen::array<int, 2>(m_size, n_size));
Tensor<float, 2, DataLayout> t_result_gpu(Eigen::array<int, 2>(m_size, n_size));
Eigen::array<DimPair, 1> dims(DimPair(1, 0));
t_left.setRandom();
t_right.setRandom();
std::size_t t_left_bytes = t_left.size() * sizeof(float);
std::size_t t_right_bytes = t_right.size() * sizeof(float);
std::size_t t_result_bytes = t_result.size() * sizeof(float);
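// The GPU path mirrors the CPU contraction: stage t_left and t_right in
// device buffers, evaluate the same contraction through TensorMaps on a
// GpuDevice, copy the result back, and compare it coefficient by coefficient
// against the host-side t_result.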
float* d_t_left;
float* d_t_right;
float* d_t_result;
cudaMalloc((void**)(&d_t_left), t_left_bytes);
cudaMalloc((void**)(&d_t_right), t_right_bytes);
cudaMalloc((void**)(&d_t_result), t_result_bytes);
cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
cudaStream_t stream;
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size));
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size));
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size));
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
t_result = t_left.contract(t_right, dims);
cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
if (fabs(t_result.data()[i] - t_result_gpu.data()[i]) >= 1e-4) {
cout << "mismatch detected at index " << i << ": " << t_result.data()[i]
<< " vs " << t_result_gpu.data()[i] << endl;
assert(false);
}
}
cudaFree((void*)d_t_left);
cudaFree((void*)d_t_right);
cudaFree((void*)d_t_result);
}
void test_cxx11_tensor_cuda()
{
cout<<"Calling contraction tests"<<std::endl;
CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, 128));
CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, 128));
for (int k = 32; k < 256; k++) {
CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, k, 128));
CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, k, 128));
}
for (int k = 32; k < 256; k++) {
CALL_SUBTEST(test_cuda_contraction<ColMajor>(128, 128, k));
CALL_SUBTEST(test_cuda_contraction<RowMajor>(128, 128, k));
}
for (int k = 32; k < 256; k++) {
CALL_SUBTEST(test_cuda_contraction<ColMajor>(k, 128, 128));
CALL_SUBTEST(test_cuda_contraction<RowMajor>(k, 128, 128));
}
int m_sizes[] = {31, 39, 63, 64, 65,
127, 129, 255, 257, 511,
512, 513, 1023, 1024, 1025 };
int n_sizes[] = {31, 39, 63, 64, 65,
127, 129, 255, 257, 511,
512, 513, 1023, 1024, 1025 };
int k_sizes[] = { 31, 39, 63, 64, 65,
95, 96, 127, 129, 255,
257, 511, 512, 513, 1023,
1024, 1025};
for (int i = 0; i < 15; i++)
for (int j = 0; j < 15; j++)
for (int k = 0; k < 17; k++) {
CALL_SUBTEST(test_cuda_contraction<ColMajor>(m_sizes[i], n_sizes[j], k_sizes[k]));
CALL_SUBTEST(test_cuda_contraction<RowMajor>(m_sizes[i], n_sizes[j], k_sizes[k]));
}
}

View File

@ -0,0 +1,480 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
#include <Eigen/CXX11/Tensor>
using Eigen::DefaultDevice;
using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;
template<int DataLayout>
static void test_evals()
{
Tensor<float, 2, DataLayout> mat1(2, 3);
Tensor<float, 2, DataLayout> mat2(2, 3);
Tensor<float, 2, DataLayout> mat3(3, 2);
mat1.setRandom();
mat2.setRandom();
mat3.setRandom();
Tensor<float, 2, DataLayout> mat4(3,3);
mat4.setZero();
Eigen::array<DimPair, 1> dims3({{DimPair(0, 0)}});
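// DimPair(0, 0) contracts dimension 0 of mat1 with dimension 0 of mat2; for
// these 2-D tensors this amounts to the matrix product mat1^T * mat2, which
// the VERIFY_IS_APPROX checks below spell out coefficient by coefficient.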
typedef TensorEvaluator<decltype(mat1.contract(mat2, dims3)), DefaultDevice> Evaluator;
Evaluator eval(mat1.contract(mat2, dims3), DefaultDevice());
eval.evalTo(mat4.data());
EIGEN_STATIC_ASSERT(Evaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
VERIFY_IS_EQUAL(eval.dimensions()[0], 3);
VERIFY_IS_EQUAL(eval.dimensions()[1], 3);
VERIFY_IS_APPROX(mat4(0,0), mat1(0,0)*mat2(0,0) + mat1(1,0)*mat2(1,0));
VERIFY_IS_APPROX(mat4(0,1), mat1(0,0)*mat2(0,1) + mat1(1,0)*mat2(1,1));
VERIFY_IS_APPROX(mat4(0,2), mat1(0,0)*mat2(0,2) + mat1(1,0)*mat2(1,2));
VERIFY_IS_APPROX(mat4(1,0), mat1(0,1)*mat2(0,0) + mat1(1,1)*mat2(1,0));
VERIFY_IS_APPROX(mat4(1,1), mat1(0,1)*mat2(0,1) + mat1(1,1)*mat2(1,1));
VERIFY_IS_APPROX(mat4(1,2), mat1(0,1)*mat2(0,2) + mat1(1,1)*mat2(1,2));
VERIFY_IS_APPROX(mat4(2,0), mat1(0,2)*mat2(0,0) + mat1(1,2)*mat2(1,0));
VERIFY_IS_APPROX(mat4(2,1), mat1(0,2)*mat2(0,1) + mat1(1,2)*mat2(1,1));
VERIFY_IS_APPROX(mat4(2,2), mat1(0,2)*mat2(0,2) + mat1(1,2)*mat2(1,2));
Tensor<float, 2, DataLayout> mat5(2,2);
mat5.setZero();
Eigen::array<DimPair, 1> dims4({{DimPair(1, 1)}});
typedef TensorEvaluator<decltype(mat1.contract(mat2, dims4)), DefaultDevice> Evaluator2;
Evaluator2 eval2(mat1.contract(mat2, dims4), DefaultDevice());
eval2.evalTo(mat5.data());
EIGEN_STATIC_ASSERT(Evaluator2::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
VERIFY_IS_EQUAL(eval2.dimensions()[0], 2);
VERIFY_IS_EQUAL(eval2.dimensions()[1], 2);
VERIFY_IS_APPROX(mat5(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(0,1) + mat1(0,2)*mat2(0,2));
VERIFY_IS_APPROX(mat5(0,1), mat1(0,0)*mat2(1,0) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(1,2));
VERIFY_IS_APPROX(mat5(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(0,1) + mat1(1,2)*mat2(0,2));
VERIFY_IS_APPROX(mat5(1,1), mat1(1,0)*mat2(1,0) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(1,2));
Tensor<float, 2, DataLayout> mat6(2,2);
mat6.setZero();
Eigen::array<DimPair, 1> dims6({{DimPair(1, 0)}});
typedef TensorEvaluator<decltype(mat1.contract(mat3, dims6)), DefaultDevice> Evaluator3;
Evaluator3 eval3(mat1.contract(mat3, dims6), DefaultDevice());
eval3.evalTo(mat6.data());
EIGEN_STATIC_ASSERT(Evaluator3::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
VERIFY_IS_EQUAL(eval3.dimensions()[0], 2);
VERIFY_IS_EQUAL(eval3.dimensions()[1], 2);
VERIFY_IS_APPROX(mat6(0,0), mat1(0,0)*mat3(0,0) + mat1(0,1)*mat3(1,0) + mat1(0,2)*mat3(2,0));
VERIFY_IS_APPROX(mat6(0,1), mat1(0,0)*mat3(0,1) + mat1(0,1)*mat3(1,1) + mat1(0,2)*mat3(2,1));
VERIFY_IS_APPROX(mat6(1,0), mat1(1,0)*mat3(0,0) + mat1(1,1)*mat3(1,0) + mat1(1,2)*mat3(2,0));
VERIFY_IS_APPROX(mat6(1,1), mat1(1,0)*mat3(0,1) + mat1(1,1)*mat3(1,1) + mat1(1,2)*mat3(2,1));
}
template<int DataLayout>
static void test_scalar()
{
Tensor<float, 1, DataLayout> vec1({6});
Tensor<float, 1, DataLayout> vec2({6});
vec1.setRandom();
vec2.setRandom();
Tensor<float, 1, DataLayout> scalar(1);
scalar.setZero();
Eigen::array<DimPair, 1> dims({{DimPair(0, 0)}});
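// Contracting two length-6 vectors over their single dimension yields a
// one-element tensor holding their dot product, which is recomputed in
// 'expected' below.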
typedef TensorEvaluator<decltype(vec1.contract(vec2, dims)), DefaultDevice> Evaluator;
Evaluator eval(vec1.contract(vec2, dims), DefaultDevice());
eval.evalTo(scalar.data());
EIGEN_STATIC_ASSERT(Evaluator::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
float expected = 0.0f;
for (int i = 0; i < 6; ++i) {
expected += vec1(i) * vec2(i);
}
VERIFY_IS_APPROX(scalar(0), expected);
}
template<int DataLayout>
static void test_multidims()
{
Tensor<float, 3, DataLayout> mat1(2, 2, 2);
Tensor<float, 4, DataLayout> mat2(2, 2, 2, 2);
mat1.setRandom();
mat2.setRandom();
Tensor<float, 3, DataLayout> mat3(2, 2, 2);
mat3.setZero();
Eigen::array<DimPair, 2> dims({{DimPair(1, 2), DimPair(2, 3)}});
typedef TensorEvaluator<decltype(mat1.contract(mat2, dims)), DefaultDevice> Evaluator;
Evaluator eval(mat1.contract(mat2, dims), DefaultDevice());
eval.evalTo(mat3.data());
EIGEN_STATIC_ASSERT(Evaluator::NumDims==3ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
VERIFY_IS_EQUAL(eval.dimensions()[0], 2);
VERIFY_IS_EQUAL(eval.dimensions()[1], 2);
VERIFY_IS_EQUAL(eval.dimensions()[2], 2);
VERIFY_IS_APPROX(mat3(0,0,0), mat1(0,0,0)*mat2(0,0,0,0) + mat1(0,1,0)*mat2(0,0,1,0) +
mat1(0,0,1)*mat2(0,0,0,1) + mat1(0,1,1)*mat2(0,0,1,1));
VERIFY_IS_APPROX(mat3(0,0,1), mat1(0,0,0)*mat2(0,1,0,0) + mat1(0,1,0)*mat2(0,1,1,0) +
mat1(0,0,1)*mat2(0,1,0,1) + mat1(0,1,1)*mat2(0,1,1,1));
VERIFY_IS_APPROX(mat3(0,1,0), mat1(0,0,0)*mat2(1,0,0,0) + mat1(0,1,0)*mat2(1,0,1,0) +
mat1(0,0,1)*mat2(1,0,0,1) + mat1(0,1,1)*mat2(1,0,1,1));
VERIFY_IS_APPROX(mat3(0,1,1), mat1(0,0,0)*mat2(1,1,0,0) + mat1(0,1,0)*mat2(1,1,1,0) +
mat1(0,0,1)*mat2(1,1,0,1) + mat1(0,1,1)*mat2(1,1,1,1));
VERIFY_IS_APPROX(mat3(1,0,0), mat1(1,0,0)*mat2(0,0,0,0) + mat1(1,1,0)*mat2(0,0,1,0) +
mat1(1,0,1)*mat2(0,0,0,1) + mat1(1,1,1)*mat2(0,0,1,1));
VERIFY_IS_APPROX(mat3(1,0,1), mat1(1,0,0)*mat2(0,1,0,0) + mat1(1,1,0)*mat2(0,1,1,0) +
mat1(1,0,1)*mat2(0,1,0,1) + mat1(1,1,1)*mat2(0,1,1,1));
VERIFY_IS_APPROX(mat3(1,1,0), mat1(1,0,0)*mat2(1,0,0,0) + mat1(1,1,0)*mat2(1,0,1,0) +
mat1(1,0,1)*mat2(1,0,0,1) + mat1(1,1,1)*mat2(1,0,1,1));
VERIFY_IS_APPROX(mat3(1,1,1), mat1(1,0,0)*mat2(1,1,0,0) + mat1(1,1,0)*mat2(1,1,1,0) +
mat1(1,0,1)*mat2(1,1,0,1) + mat1(1,1,1)*mat2(1,1,1,1));
}
template<int DataLayout>
static void test_holes() {
Tensor<float, 4, DataLayout> t1(2, 5, 7, 3);
Tensor<float, 5, DataLayout> t2(2, 7, 11, 13, 3);
t1.setRandom();
t2.setRandom();
Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(3, 4)}});
Tensor<float, 5, DataLayout> result = t1.contract(t2, dims);
VERIFY_IS_EQUAL(result.dimension(0), 5);
VERIFY_IS_EQUAL(result.dimension(1), 7);
VERIFY_IS_EQUAL(result.dimension(2), 7);
VERIFY_IS_EQUAL(result.dimension(3), 11);
VERIFY_IS_EQUAL(result.dimension(4), 13);
for (int i = 0; i < 5; ++i) {
for (int j = 0; j < 5; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 5; ++l) {
for (int m = 0; m < 5; ++m) {
VERIFY_IS_APPROX(result(i, j, k, l, m),
t1(0, i, j, 0) * t2(0, k, l, m, 0) +
t1(1, i, j, 0) * t2(1, k, l, m, 0) +
t1(0, i, j, 1) * t2(0, k, l, m, 1) +
t1(1, i, j, 1) * t2(1, k, l, m, 1) +
t1(0, i, j, 2) * t2(0, k, l, m, 2) +
t1(1, i, j, 2) * t2(1, k, l, m, 2));
}
}
}
}
}
}
template<int DataLayout>
static void test_full_redux()
{
Tensor<float, 2, DataLayout> t1(2, 2);
Tensor<float, 3, DataLayout> t2(2, 2, 2);
t1.setRandom();
t2.setRandom();
Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}});
Tensor<float, 1, DataLayout> result = t1.contract(t2, dims);
VERIFY_IS_EQUAL(result.dimension(0), 2);
VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) + t1(1, 0) * t2(1, 0, 0)
+ t1(0, 1) * t2(0, 1, 0) + t1(1, 1) * t2(1, 1, 0));
VERIFY_IS_APPROX(result(1), t1(0, 0) * t2(0, 0, 1) + t1(1, 0) * t2(1, 0, 1)
+ t1(0, 1) * t2(0, 1, 1) + t1(1, 1) * t2(1, 1, 1));
dims[0] = DimPair(1, 0);
dims[1] = DimPair(2, 1);
result = t2.contract(t1, dims);
VERIFY_IS_EQUAL(result.dimension(0), 2);
VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) + t1(1, 0) * t2(0, 1, 0)
+ t1(0, 1) * t2(0, 0, 1) + t1(1, 1) * t2(0, 1, 1));
VERIFY_IS_APPROX(result(1), t1(0, 0) * t2(1, 0, 0) + t1(1, 0) * t2(1, 1, 0)
+ t1(0, 1) * t2(1, 0, 1) + t1(1, 1) * t2(1, 1, 1));
}
template<int DataLayout>
static void test_contraction_of_contraction()
{
Tensor<float, 2, DataLayout> t1(2, 2);
Tensor<float, 2, DataLayout> t2(2, 2);
Tensor<float, 2, DataLayout> t3(2, 2);
Tensor<float, 2, DataLayout> t4(2, 2);
t1.setRandom();
t2.setRandom();
t3.setRandom();
t4.setRandom();
Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
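// With DimPair(1, 0) each contraction is an ordinary matrix product, so the
// final result equals (t1 * t4) * (t3 - t1 * t2); the Map-based matrix
// expression below recomputes exactly that for comparison.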
auto contract1 = t1.contract(t2, dims);
auto diff = t3 - contract1;
auto contract2 = t1.contract(t4, dims);
Tensor<float, 2, DataLayout> result = contract2.contract(diff, dims);
VERIFY_IS_EQUAL(result.dimension(0), 2);
VERIFY_IS_EQUAL(result.dimension(1), 2);
Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>>
m1(t1.data(), 2, 2), m2(t2.data(), 2, 2), m3(t3.data(), 2, 2),
m4(t4.data(), 2, 2);
Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>
expected = (m1 * m4) * (m3 - m1 * m2);
VERIFY_IS_APPROX(result(0, 0), expected(0, 0));
VERIFY_IS_APPROX(result(0, 1), expected(0, 1));
VERIFY_IS_APPROX(result(1, 0), expected(1, 0));
VERIFY_IS_APPROX(result(1, 1), expected(1, 1));
}
template<int DataLayout>
static void test_expr()
{
Tensor<float, 2, DataLayout> mat1(2, 3);
Tensor<float, 2, DataLayout> mat2(3, 2);
mat1.setRandom();
mat2.setRandom();
Tensor<float, 2, DataLayout> mat3(2,2);
Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
mat3 = mat1.contract(mat2, dims);
VERIFY_IS_APPROX(mat3(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(1,0) + mat1(0,2)*mat2(2,0));
VERIFY_IS_APPROX(mat3(0,1), mat1(0,0)*mat2(0,1) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(2,1));
VERIFY_IS_APPROX(mat3(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(1,0) + mat1(1,2)*mat2(2,0));
VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1));
}
template<int DataLayout>
static void test_out_of_order_contraction()
{
Tensor<float, 3, DataLayout> mat1(2, 2, 2);
Tensor<float, 3, DataLayout> mat2(2, 2, 2);
mat1.setRandom();
mat2.setRandom();
Tensor<float, 2, DataLayout> mat3(2, 2);
Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(0, 2)}});
mat3 = mat1.contract(mat2, dims);
VERIFY_IS_APPROX(mat3(0, 0),
mat1(0,0,0)*mat2(0,0,0) + mat1(1,0,0)*mat2(0,0,1) +
mat1(0,0,1)*mat2(1,0,0) + mat1(1,0,1)*mat2(1,0,1));
VERIFY_IS_APPROX(mat3(1, 0),
mat1(0,1,0)*mat2(0,0,0) + mat1(1,1,0)*mat2(0,0,1) +
mat1(0,1,1)*mat2(1,0,0) + mat1(1,1,1)*mat2(1,0,1));
VERIFY_IS_APPROX(mat3(0, 1),
mat1(0,0,0)*mat2(0,1,0) + mat1(1,0,0)*mat2(0,1,1) +
mat1(0,0,1)*mat2(1,1,0) + mat1(1,0,1)*mat2(1,1,1));
VERIFY_IS_APPROX(mat3(1, 1),
mat1(0,1,0)*mat2(0,1,0) + mat1(1,1,0)*mat2(0,1,1) +
mat1(0,1,1)*mat2(1,1,0) + mat1(1,1,1)*mat2(1,1,1));
Eigen::array<DimPair, 2> dims2({{DimPair(0, 2), DimPair(2, 0)}});
mat3 = mat1.contract(mat2, dims2);
VERIFY_IS_APPROX(mat3(0, 0),
mat1(0,0,0)*mat2(0,0,0) + mat1(1,0,0)*mat2(0,0,1) +
mat1(0,0,1)*mat2(1,0,0) + mat1(1,0,1)*mat2(1,0,1));
VERIFY_IS_APPROX(mat3(1, 0),
mat1(0,1,0)*mat2(0,0,0) + mat1(1,1,0)*mat2(0,0,1) +
mat1(0,1,1)*mat2(1,0,0) + mat1(1,1,1)*mat2(1,0,1));
VERIFY_IS_APPROX(mat3(0, 1),
mat1(0,0,0)*mat2(0,1,0) + mat1(1,0,0)*mat2(0,1,1) +
mat1(0,0,1)*mat2(1,1,0) + mat1(1,0,1)*mat2(1,1,1));
VERIFY_IS_APPROX(mat3(1, 1),
mat1(0,1,0)*mat2(0,1,0) + mat1(1,1,0)*mat2(0,1,1) +
mat1(0,1,1)*mat2(1,1,0) + mat1(1,1,1)*mat2(1,1,1));
}
template<int DataLayout>
static void test_consistency()
{
// this does something like testing (A*B)^T = (B^T * A^T)
Tensor<float, 3, DataLayout> mat1(4, 3, 5);
Tensor<float, 5, DataLayout> mat2(3, 2, 1, 5, 4);
mat1.setRandom();
mat2.setRandom();
Tensor<float, 4, DataLayout> mat3(5, 2, 1, 5);
Tensor<float, 4, DataLayout> mat4(2, 1, 5, 5);
// contract on dimensions of size 4 and 3
Eigen::array<DimPair, 2> dims1({{DimPair(0, 4), DimPair(1, 0)}});
Eigen::array<DimPair, 2> dims2({{DimPair(4, 0), DimPair(0, 1)}});
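// dims2 uses the same dimension pairing as dims1 with the roles of the two
// tensors swapped, so mat4 should contain the same coefficients as mat3 with
// the free dimensions reordered; the layout-dependent index check below
// verifies this.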
mat3 = mat1.contract(mat2, dims1);
mat4 = mat2.contract(mat1, dims2);
// check that these are equal except for ordering of dimensions
if (DataLayout == ColMajor) {
for (size_t i = 0; i < 5; i++) {
for (size_t j = 0; j < 10; j++) {
VERIFY_IS_APPROX(mat3.data()[i + 5 * j], mat4.data()[j + 10 * i]);
}
}
} else {
// Row major
for (size_t i = 0; i < 5; i++) {
for (size_t j = 0; j < 10; j++) {
VERIFY_IS_APPROX(mat3.data()[10 * i + j], mat4.data()[i + 5 * j]);
}
}
}
}
template<int DataLayout>
static void test_large_contraction()
{
Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
t_left.setRandom();
t_right.setRandom();
// Add a little offset so that the results won't be close to zero.
t_left += t_left.constant(1.0f);
t_right += t_right.constant(1.0f);
typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
MapXf m_left(t_left.data(), 1500, 248);
MapXf m_right(t_right.data(), 248, 1400);
Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
// this contraction should be equivalent to a single matrix multiplication
Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
// compute results by separate methods
t_result = t_left.contract(t_right, dims);
m_result = m_left * m_right;
for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
VERIFY(&t_result.data()[i] != &m_result.data()[i]);
VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
}
}
template<int DataLayout>
static void test_matrix_vector()
{
Tensor<float, 2, DataLayout> t_left(30, 50);
Tensor<float, 1, DataLayout> t_right(50);
Tensor<float, 1, DataLayout> t_result(30);
t_left.setRandom();
t_right.setRandom();
typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
MapXf m_left(t_left.data(), 30, 50);
MapXf m_right(t_right.data(), 50, 1);
Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(30, 1);
// this contraction should be equivalent to a single matrix multiplication
Eigen::array<DimPair, 1> dims{{DimPair(1, 0)}};
// compute results by separate methods
t_result = t_left.contract(t_right, dims);
m_result = m_left * m_right;
for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
VERIFY(internal::isApprox(t_result(i), m_result(i, 0), 1));
}
}
template<int DataLayout>
static void test_tensor_vector()
{
Tensor<float, 3, DataLayout> t_left(7, 13, 17);
Tensor<float, 2, DataLayout> t_right(1, 7);
t_left.setRandom();
t_right.setRandom();
typedef typename Tensor<float, 1, DataLayout>::DimensionPair DimensionPair;
Eigen::array<DimensionPair, 1> dim_pair01{{{0, 1}}};
Tensor<float, 3, DataLayout> t_result = t_left.contract(t_right, dim_pair01);
typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
MapXf m_left(t_left.data(), 7, 13*17);
MapXf m_right(t_right.data(), 1, 7);
Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result = m_left.transpose() * m_right.transpose();
for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
VERIFY(internal::isApprox(t_result(i), m_result(i, 0), 1));
}
}
template<int DataLayout>
static void test_small_blocking_factors()
{
Tensor<float, 4, DataLayout> t_left(30, 5, 3, 31);
Tensor<float, 5, DataLayout> t_right(3, 31, 7, 20, 1);
t_left.setRandom();
t_right.setRandom();
// Add a little offset so that the results won't be close to zero.
t_left += t_left.constant(1.0f);
t_right += t_right.constant(1.0f);
// Force the cache sizes, which results in smaller blocking factors.
Eigen::setCpuCacheSizes(896, 1920, 2944);
// this contraction should be equivalent to a single matrix multiplication
Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
Tensor<float, 5, DataLayout> t_result;
t_result = t_left.contract(t_right, dims);
// compute result using a simple eigen matrix product
Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> m_left(t_left.data(), 150, 93);
Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> m_right(t_right.data(), 93, 140);
Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result = m_left * m_right;
for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
}
}
void test_cxx11_tensor_contraction()
{
CALL_SUBTEST(test_evals<ColMajor>());
CALL_SUBTEST(test_evals<RowMajor>());
CALL_SUBTEST(test_scalar<ColMajor>());
CALL_SUBTEST(test_scalar<RowMajor>());
CALL_SUBTEST(test_multidims<ColMajor>());
CALL_SUBTEST(test_multidims<RowMajor>());
CALL_SUBTEST(test_holes<ColMajor>());
CALL_SUBTEST(test_holes<RowMajor>());
CALL_SUBTEST(test_full_redux<ColMajor>());
CALL_SUBTEST(test_full_redux<RowMajor>());
CALL_SUBTEST(test_contraction_of_contraction<ColMajor>());
CALL_SUBTEST(test_contraction_of_contraction<RowMajor>());
CALL_SUBTEST(test_expr<ColMajor>());
CALL_SUBTEST(test_expr<RowMajor>());
CALL_SUBTEST(test_out_of_order_contraction<ColMajor>());
CALL_SUBTEST(test_out_of_order_contraction<RowMajor>());
CALL_SUBTEST(test_consistency<ColMajor>());
CALL_SUBTEST(test_consistency<RowMajor>());
CALL_SUBTEST(test_large_contraction<ColMajor>());
CALL_SUBTEST(test_large_contraction<RowMajor>());
CALL_SUBTEST(test_matrix_vector<ColMajor>());
CALL_SUBTEST(test_matrix_vector<RowMajor>());
CALL_SUBTEST(test_tensor_vector<ColMajor>());
CALL_SUBTEST(test_tensor_vector<RowMajor>());
CALL_SUBTEST(test_small_blocking_factors<ColMajor>());
CALL_SUBTEST(test_small_blocking_factors<RowMajor>());
}

View File

@ -0,0 +1,141 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
#include <Eigen/CXX11/Tensor>
using Eigen::Tensor;
using Eigen::DefaultDevice;
static void test_evals()
{
Tensor<float, 2> input(3, 3);
Tensor<float, 1> kernel(2);
input.setRandom();
kernel.setRandom();
Tensor<float, 2> result(2,3);
result.setZero();
Eigen::array<Tensor<float, 2>::Index, 1> dims3({0});
typedef TensorEvaluator<decltype(input.convolve(kernel, dims3)), DefaultDevice> Evaluator;
Evaluator eval(input.convolve(kernel, dims3), DefaultDevice());
eval.evalTo(result.data());
EIGEN_STATIC_ASSERT(Evaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
VERIFY_IS_EQUAL(eval.dimensions()[0], 2);
VERIFY_IS_EQUAL(eval.dimensions()[1], 3);
VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1)); // index 0
VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1)); // index 2
VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1)); // index 4
VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1)); // index 1
VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1)); // index 3
VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5
}
static void test_expr()
{
Tensor<float, 2> input(3, 3);
Tensor<float, 2> kernel(2, 2);
input.setRandom();
kernel.setRandom();
Tensor<float, 2> result(2,2);
Eigen::array<ptrdiff_t, 2> dims({0, 1});
result = input.convolve(kernel, dims);
VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) +
input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1));
VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) +
input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1));
VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) +
input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1));
VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) +
input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1));
}
static void test_modes() {
Tensor<float, 1> input(3);
Tensor<float, 1> kernel(3);
input(0) = 1.0f;
input(1) = 2.0f;
input(2) = 3.0f;
kernel(0) = 0.5f;
kernel(1) = 1.0f;
kernel(2) = 0.0f;
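// With input = (1, 2, 3) and kernel = (0.5, 1, 0) the expected values follow
// directly: VALID gives 1*0.5 + 2*1 + 3*0 = 2.5; SAME zero-pads the input to
// (0, 1, 2, 3, 0) and gives 1.0, 2.5, 4.0; FULL pads to (0, 0, 1, 2, 3, 0, 0)
// and gives 0.0, 1.0, 2.5, 4.0, 1.5.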
const Eigen::array<ptrdiff_t, 1> dims{{0}};
Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 1> padding;
// Emulate VALID mode (as defined in
// http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
padding[0] = std::make_pair(0, 0);
Tensor<float, 1> valid(1);
valid = input.pad(padding).convolve(kernel, dims);
VERIFY_IS_EQUAL(valid.dimension(0), 1);
VERIFY_IS_APPROX(valid(0), 2.5f);
// Emulate SAME mode (as defined in
// http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
padding[0] = std::make_pair(1, 1);
Tensor<float, 1> same(3);
same = input.pad(padding).convolve(kernel, dims);
VERIFY_IS_EQUAL(same.dimension(0), 3);
VERIFY_IS_APPROX(same(0), 1.0f);
VERIFY_IS_APPROX(same(1), 2.5f);
VERIFY_IS_APPROX(same(2), 4.0f);
// Emulate FULL mode (as defined in
// http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
padding[0] = std::make_pair(2, 2);
Tensor<float, 1> full(5);
full = input.pad(padding).convolve(kernel, dims);
VERIFY_IS_EQUAL(full.dimension(0), 5);
VERIFY_IS_APPROX(full(0), 0.0f);
VERIFY_IS_APPROX(full(1), 1.0f);
VERIFY_IS_APPROX(full(2), 2.5f);
VERIFY_IS_APPROX(full(3), 4.0f);
VERIFY_IS_APPROX(full(4), 1.5f);
}
static void test_strides() {
Tensor<float, 1> input(13);
Tensor<float, 1> kernel(3);
input.setRandom();
kernel.setRandom();
const Eigen::array<ptrdiff_t, 1> dims{{0}};
const Eigen::array<ptrdiff_t, 1> stride_of_3{{3}};
const Eigen::array<ptrdiff_t, 1> stride_of_2{{2}};
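// stride(3) keeps input(0), input(3), input(6), input(9) and input(12); the
// convolution with the size-3 kernel then produces three coefficients, and
// the trailing stride(2) keeps every other one, which matches the two values
// verified below.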
Tensor<float, 1> result;
result = input.stride(stride_of_3).convolve(kernel, dims).stride(stride_of_2);
VERIFY_IS_EQUAL(result.dimension(0), 2);
VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) +
input(6)*kernel(2)));
VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) +
input(12)*kernel(2)));
}
void test_cxx11_tensor_convolution()
{
CALL_SUBTEST(test_evals());
CALL_SUBTEST(test_expr());
CALL_SUBTEST(test_modes());
CALL_SUBTEST(test_strides());
}

View File

@ -0,0 +1,514 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// TODO(mdevin): Free the cuda memory.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
void test_cuda_elementwise_small() {
Tensor<float, 1> in1(Eigen::array<int, 1>(2));
Tensor<float, 1> in2(Eigen::array<int, 1>(2));
Tensor<float, 1> out(Eigen::array<int, 1>(2));
in1.setRandom();
in2.setRandom();
std::size_t in1_bytes = in1.size() * sizeof(float);
std::size_t in2_bytes = in2.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
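// Same pattern as the other CUDA tests: stage the inputs in device memory,
// evaluate gpu_in1 + gpu_in2 through aligned TensorMaps on a GpuDevice, copy
// the sum back asynchronously on the device stream, and check it against the
// host-side addition.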
float* d_in1;
float* d_in2;
float* d_out;
cudaMalloc((void**)(&d_in1), in1_bytes);
cudaMalloc((void**)(&d_in2), in2_bytes);
cudaMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
cudaStream_t stream;
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
d_in1, Eigen::array<int, 1>(2));
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
d_in2, Eigen::array<int, 1>(2));
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
d_out, Eigen::array<int, 1>(2));
gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
gpu_device.stream()) == cudaSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
for (int i = 0; i < 2; ++i) {
VERIFY_IS_APPROX(
out(Eigen::array<int, 1>(i)),
in1(Eigen::array<int, 1>(i)) + in2(Eigen::array<int, 1>(i)));
}
}
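// Every test in this file repeats the same host/device round trip. Below is
// a minimal sketch of that pattern, not part of the test suite; the function
// name is a placeholder, error handling is omitted, and host_out is assumed
// to be pre-sized like host_in.
void cuda_round_trip_sketch(const Tensor<float, 1>& host_in, Tensor<float, 1>& host_out) {
  const std::size_t bytes = host_in.size() * sizeof(float);
  // Allocate device buffers and copy the input over.
  float* d_in;
  float* d_out;
  cudaMalloc((void**)(&d_in), bytes);
  cudaMalloc((void**)(&d_out), bytes);
  cudaMemcpy(d_in, host_in.data(), bytes, cudaMemcpyHostToDevice);
  // Wrap the raw pointers in TensorMaps and evaluate an expression on the GPU.
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  Eigen::GpuDevice gpu_device(&stream);
  Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_in(d_in, host_in.size());
  Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_out(d_out, host_in.size());
  gpu_out.device(gpu_device) = gpu_in + gpu_in;  // any tensor expression
  // Copy the result back and wait for the stream to finish.
  cudaMemcpyAsync(host_out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream());
  cudaStreamSynchronize(gpu_device.stream());
  cudaFree(d_in);
  cudaFree(d_out);
}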
void test_cuda_elementwise()
{
Tensor<float, 3> in1(Eigen::array<int, 3>(72,53,97));
Tensor<float, 3> in2(Eigen::array<int, 3>(72,53,97));
Tensor<float, 3> in3(Eigen::array<int, 3>(72,53,97));
Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97));
in1.setRandom();
in2.setRandom();
in3.setRandom();
std::size_t in1_bytes = in1.size() * sizeof(float);
std::size_t in2_bytes = in2.size() * sizeof(float);
std::size_t in3_bytes = in3.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
float* d_in1;
float* d_in2;
float* d_in3;
float* d_out;
cudaMalloc((void**)(&d_in1), in1_bytes);
cudaMalloc((void**)(&d_in2), in2_bytes);
cudaMalloc((void**)(&d_in3), in3_bytes);
cudaMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice);
cudaStream_t stream;
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<int, 3>(72,53,97));
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<int, 3>(72,53,97));
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<int, 3>(72,53,97));
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97));
gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 53; ++j) {
for (int k = 0; k < 97; ++k) {
VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * in3(Eigen::array<int, 3>(i,j,k)));
}
}
}
}
void test_cuda_reduction()
{
Tensor<float, 4> in1(Eigen::array<int, 4>(72,53,97,113));
Tensor<float, 2> out(Eigen::array<int, 2>(72,97));
in1.setRandom();
std::size_t in1_bytes = in1.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
float* d_in1;
float* d_out;
cudaMalloc((void**)(&d_in1), in1_bytes);
cudaMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
cudaStream_t stream;
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, Eigen::array<int, 4>(72,53,97,113));
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, Eigen::array<int, 2>(72,97));
array<int, 2> reduction_axis;
reduction_axis[0] = 1;
reduction_axis[1] = 3;
gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 97; ++j) {
float expected = 0;
for (int k = 0; k < 53; ++k) {
for (int l = 0; l < 113; ++l) {
expected =
std::max<float>(expected, in1(Eigen::array<int, 4>(i, k, j, l)));
}
}
VERIFY_IS_APPROX(out(Eigen::array<int, 2>(i,j)), expected);
}
}
}
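// The reduced output keeps only the non-reduced axes: reducing the
// (72,53,97,113) input over axes 1 and 3 with maximum() leaves a (72,97)
// tensor, which is exactly what the nested loops above recompute on the host.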
template<int DataLayout>
static void test_cuda_contraction()
{
// with these dimensions, the output has 300 * 140 elements, which is
// more than 30 * 1024, the maximum number of resident threads on a
// 15-SM GK110 GPU
Tensor<float, 4, DataLayout> t_left(Eigen::array<int, 4>(6, 50, 3, 31));
Tensor<float, 5, DataLayout> t_right(Eigen::array<int, 5>(3, 31, 7, 20, 1));
Tensor<float, 5, DataLayout> t_result(Eigen::array<int, 5>(6, 50, 7, 20, 1));
t_left.setRandom();
t_right.setRandom();
std::size_t t_left_bytes = t_left.size() * sizeof(float);
std::size_t t_right_bytes = t_right.size() * sizeof(float);
std::size_t t_result_bytes = t_result.size() * sizeof(float);
float* d_t_left;
float* d_t_right;
float* d_t_result;
cudaMalloc((void**)(&d_t_left), t_left_bytes);
cudaMalloc((void**)(&d_t_right), t_right_bytes);
cudaMalloc((void**)(&d_t_result), t_result_bytes);
cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
cudaStream_t stream;
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> >
gpu_t_left(d_t_left, Eigen::array<int, 4>(6, 50, 3, 31));
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> >
gpu_t_right(d_t_right, Eigen::array<int, 5>(3, 31, 7, 20, 1));
Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> >
gpu_t_result(d_t_result, Eigen::array<int, 5>(6, 50, 7, 20, 1));
typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
MapXf m_left(t_left.data(), 300, 93);
MapXf m_right(t_right.data(), 93, 140);
Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);
typedef Tensor<float, 1>::DimensionPair DimPair;
Eigen::array<DimPair, 2> dims;
dims[0] = DimPair(2, 0);
dims[1] = DimPair(3, 1);
m_result = m_left * m_right;
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
for (size_t i = 0; i < t_result.dimensions().TotalSize(); i++) {
if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4) {
cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << endl;
assert(false);
}
}
}
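// The contraction over dims (2,0) and (3,1) is equivalent to the matrix
// product built above: the left tensor's free axes flatten to 6*50 = 300
// rows and its contracted axes to 3*31 = 93 columns, the right tensor's
// contracted axes supply the matching 93 and its free axes the
// 7*20*1 = 140 columns, so the (6,50,7,20,1) result is compared as a
// 300 x 140 matrix.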
static void test_cuda_convolution_1d()
{
Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137));
Tensor<float, 1> kernel(Eigen::array<int, 1>(4));
Tensor<float, 4> out(Eigen::array<int, 4>(74,34,11,137));
input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random();
std::size_t input_bytes = input.size() * sizeof(float);
std::size_t kernel_bytes = kernel.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
float* d_input;
float* d_kernel;
float* d_out;
cudaMalloc((void**)(&d_input), input_bytes);
cudaMalloc((void**)(&d_kernel), kernel_bytes);
cudaMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
cudaStream_t stream;
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137));
Eigen::TensorMap<Eigen::Tensor<float, 1> > gpu_kernel(d_kernel, Eigen::array<int, 1>(4));
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,34,11,137));
Eigen::array<int, 1> dims(1);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
for (int i = 0; i < 74; ++i) {
for (int j = 0; j < 34; ++j) {
for (int k = 0; k < 11; ++k) {
for (int l = 0; l < 137; ++l) {
const float result = out(Eigen::array<int, 4>(i,j,k,l));
const float expected = input(Eigen::array<int, 4>(i,j+0,k,l)) * kernel(Eigen::array<int, 1>(0)) +
input(Eigen::array<int, 4>(i,j+1,k,l)) * kernel(Eigen::array<int, 1>(1)) +
input(Eigen::array<int, 4>(i,j+2,k,l)) * kernel(Eigen::array<int, 1>(2)) +
input(Eigen::array<int, 4>(i,j+3,k,l)) * kernel(Eigen::array<int, 1>(3));
VERIFY_IS_APPROX(result, expected);
}
}
}
}
}
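// For a convolution without padding, each convolved dimension shrinks by
// kernel_extent - 1: here dimension 1 goes from 37 to 37 - 4 + 1 = 34 while
// the other dimensions are untouched. The 2D and 3D tests below follow the
// same rule along each of their convolved dimensions.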
static void test_cuda_convolution_2d()
{
Tensor<float, 4> input(Eigen::array<int, 4>(74,37,11,137));
Tensor<float, 2> kernel(Eigen::array<int, 2>(3,4));
Tensor<float, 4> out(Eigen::array<int, 4>(74,35,8,137));
input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random();
std::size_t input_bytes = input.size() * sizeof(float);
std::size_t kernel_bytes = kernel.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
float* d_input;
float* d_kernel;
float* d_out;
cudaMalloc((void**)(&d_input), input_bytes);
cudaMalloc((void**)(&d_kernel), kernel_bytes);
cudaMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
cudaStream_t stream;
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_input(d_input, Eigen::array<int, 4>(74,37,11,137));
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_kernel(d_kernel, Eigen::array<int, 2>(3,4));
Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_out(d_out, Eigen::array<int, 4>(74,35,8,137));
Eigen::array<int, 2> dims(1,2);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
for (int i = 0; i < 74; ++i) {
for (int j = 0; j < 35; ++j) {
for (int k = 0; k < 8; ++k) {
for (int l = 0; l < 137; ++l) {
const float result = out(Eigen::array<int, 4>(i,j,k,l));
const float expected = input(Eigen::array<int, 4>(i,j+0,k+0,l)) * kernel(Eigen::array<int, 2>(0,0)) +
input(Eigen::array<int, 4>(i,j+1,k+0,l)) * kernel(Eigen::array<int, 2>(1,0)) +
input(Eigen::array<int, 4>(i,j+2,k+0,l)) * kernel(Eigen::array<int, 2>(2,0)) +
input(Eigen::array<int, 4>(i,j+0,k+1,l)) * kernel(Eigen::array<int, 2>(0,1)) +
input(Eigen::array<int, 4>(i,j+1,k+1,l)) * kernel(Eigen::array<int, 2>(1,1)) +
input(Eigen::array<int, 4>(i,j+2,k+1,l)) * kernel(Eigen::array<int, 2>(2,1)) +
input(Eigen::array<int, 4>(i,j+0,k+2,l)) * kernel(Eigen::array<int, 2>(0,2)) +
input(Eigen::array<int, 4>(i,j+1,k+2,l)) * kernel(Eigen::array<int, 2>(1,2)) +
input(Eigen::array<int, 4>(i,j+2,k+2,l)) * kernel(Eigen::array<int, 2>(2,2)) +
input(Eigen::array<int, 4>(i,j+0,k+3,l)) * kernel(Eigen::array<int, 2>(0,3)) +
input(Eigen::array<int, 4>(i,j+1,k+3,l)) * kernel(Eigen::array<int, 2>(1,3)) +
input(Eigen::array<int, 4>(i,j+2,k+3,l)) * kernel(Eigen::array<int, 2>(2,3));
VERIFY_IS_APPROX(result, expected);
}
}
}
}
}
static void test_cuda_convolution_3d()
{
Tensor<float, 5> input(Eigen::array<int, 5>(74,37,11,137,17));
Tensor<float, 3> kernel(Eigen::array<int, 3>(3,4,2));
Tensor<float, 5> out(Eigen::array<int, 5>(74,35,8,136,17));
input = input.constant(10.0f) + input.random();
kernel = kernel.constant(7.0f) + kernel.random();
std::size_t input_bytes = input.size() * sizeof(float);
std::size_t kernel_bytes = kernel.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
float* d_input;
float* d_kernel;
float* d_out;
cudaMalloc((void**)(&d_input), input_bytes);
cudaMalloc((void**)(&d_kernel), kernel_bytes);
cudaMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
cudaStream_t stream;
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_input(d_input, Eigen::array<int, 5>(74,37,11,137,17));
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_kernel(d_kernel, Eigen::array<int, 3>(3,4,2));
Eigen::TensorMap<Eigen::Tensor<float, 5> > gpu_out(d_out, Eigen::array<int, 5>(74,35,8,136,17));
Eigen::array<int, 3> dims(1,2,3);
gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
for (int i = 0; i < 74; ++i) {
for (int j = 0; j < 35; ++j) {
for (int k = 0; k < 8; ++k) {
for (int l = 0; l < 136; ++l) {
for (int m = 0; m < 17; ++m) {
const float result = out(Eigen::array<int, 5>(i,j,k,l,m));
const float expected = input(Eigen::array<int, 5>(i,j+0,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(0,0,0)) +
input(Eigen::array<int, 5>(i,j+1,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(1,0,0)) +
input(Eigen::array<int, 5>(i,j+2,k+0,l+0,m)) * kernel(Eigen::array<int, 3>(2,0,0)) +
input(Eigen::array<int, 5>(i,j+0,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(0,1,0)) +
input(Eigen::array<int, 5>(i,j+1,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(1,1,0)) +
input(Eigen::array<int, 5>(i,j+2,k+1,l+0,m)) * kernel(Eigen::array<int, 3>(2,1,0)) +
input(Eigen::array<int, 5>(i,j+0,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(0,2,0)) +
input(Eigen::array<int, 5>(i,j+1,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(1,2,0)) +
input(Eigen::array<int, 5>(i,j+2,k+2,l+0,m)) * kernel(Eigen::array<int, 3>(2,2,0)) +
input(Eigen::array<int, 5>(i,j+0,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(0,3,0)) +
input(Eigen::array<int, 5>(i,j+1,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(1,3,0)) +
input(Eigen::array<int, 5>(i,j+2,k+3,l+0,m)) * kernel(Eigen::array<int, 3>(2,3,0)) +
input(Eigen::array<int, 5>(i,j+0,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(0,0,1)) +
input(Eigen::array<int, 5>(i,j+1,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(1,0,1)) +
input(Eigen::array<int, 5>(i,j+2,k+0,l+1,m)) * kernel(Eigen::array<int, 3>(2,0,1)) +
input(Eigen::array<int, 5>(i,j+0,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(0,1,1)) +
input(Eigen::array<int, 5>(i,j+1,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(1,1,1)) +
input(Eigen::array<int, 5>(i,j+2,k+1,l+1,m)) * kernel(Eigen::array<int, 3>(2,1,1)) +
input(Eigen::array<int, 5>(i,j+0,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(0,2,1)) +
input(Eigen::array<int, 5>(i,j+1,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(1,2,1)) +
input(Eigen::array<int, 5>(i,j+2,k+2,l+1,m)) * kernel(Eigen::array<int, 3>(2,2,1)) +
input(Eigen::array<int, 5>(i,j+0,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(0,3,1)) +
input(Eigen::array<int, 5>(i,j+1,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(1,3,1)) +
input(Eigen::array<int, 5>(i,j+2,k+3,l+1,m)) * kernel(Eigen::array<int, 3>(2,3,1));
VERIFY_IS_APPROX(result, expected);
}
}
}
}
}
}
static float* CudaCopyFloat(float* data, int size) {
const int nbytes = size * sizeof(float);
float* result = NULL;
if (cudaMalloc((void**)(&result), nbytes) != cudaSuccess) {
return NULL;
} else {
if (data != NULL) {
cudaMemcpy(result, data, nbytes, cudaMemcpyHostToDevice);
}
return result;
}
}
static void test_cuda_constant_broadcast()
{
cudaStream_t stream;
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Tensor<float, 1> t1(10);
for (int i = 0; i < 10; ++i) {
t1(i) = 10.0f * i;
}
float* t1_cuda = CudaCopyFloat(t1.data(), t1.size());
Eigen::TensorMap<Eigen::Tensor<float, 1> > t1_gpu(t1_cuda, 10);
Tensor<float, 1> t2(1);
t2 = t2.constant(20.0f);
float* t2_cuda = CudaCopyFloat(t2.data(), t2.size());
Eigen::TensorMap<Eigen::TensorFixedSize<float, Sizes<1> > > t2_gpu(t2_cuda, 1);
float* t3_cuda = CudaCopyFloat(NULL, 10);
Eigen::TensorMap<Eigen::Tensor<float, 1> > t3_gpu(t3_cuda, 10);
t3_gpu.device(gpu_device) =
t1_gpu + t2_gpu.broadcast(Eigen::array<int, 1>(10));
Eigen::Tensor<float, 1> t3(10);
cudaMemcpy(t3.data(), t3_gpu.data(), 10 * sizeof(float),
cudaMemcpyDeviceToHost);
for (int i = 0; i < 10; ++i) {
VERIFY_IS_APPROX(t3(i), t1(i) + t2(0));
}
}
void test_cuda_cast()
{
Tensor<double, 3> in(Eigen::array<int, 3>(72,53,97));
Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97));
in.setRandom();
std::size_t in_bytes = in.size() * sizeof(double);
std::size_t out_bytes = out.size() * sizeof(float);
double* d_in;
float* d_out;
cudaMalloc((void**)(&d_in), in_bytes);
cudaMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice);
cudaStream_t stream;
assert(cudaStreamCreate(&stream) == cudaSuccess);
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<double, 3> > gpu_in(d_in, Eigen::array<int, 3>(72,53,97));
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97));
gpu_out.device(gpu_device) = gpu_in.template cast<float>();
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
for (int i = 0; i < 72; ++i) {
for (int j = 0; j < 53; ++j) {
for (int k = 0; k < 97; ++k) {
VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), static_cast<float>(in(Eigen::array<int, 3>(i,j,k))));
}
}
}
}
void test_cxx11_tensor_cuda()
{
CALL_SUBTEST(test_cuda_elementwise_small());
CALL_SUBTEST(test_cuda_elementwise());
CALL_SUBTEST(test_cuda_reduction());
CALL_SUBTEST(test_cuda_contraction<ColMajor>());
CALL_SUBTEST(test_cuda_contraction<RowMajor>());
CALL_SUBTEST(test_cuda_convolution_1d());
CALL_SUBTEST(test_cuda_convolution_2d());
CALL_SUBTEST(test_cuda_convolution_3d());
CALL_SUBTEST(test_cuda_constant_broadcast());
CALL_SUBTEST(test_cuda_cast());
}

View File

@ -0,0 +1,391 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_device
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
using Eigen::RowMajor;
// Context for evaluation on cpu
struct CPUContext {
CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
kernel_1d_(0) = 3.14f;
kernel_1d_(1) = 2.7f;
kernel_2d_(0,0) = 3.14f;
kernel_2d_(1,0) = 2.7f;
kernel_2d_(0,1) = 0.2f;
kernel_2d_(1,1) = 7.0f;
kernel_3d_(0,0,0) = 3.14f;
kernel_3d_(0,1,0) = 2.7f;
kernel_3d_(0,0,1) = 0.2f;
kernel_3d_(0,1,1) = 7.0f;
kernel_3d_(1,0,0) = -1.0f;
kernel_3d_(1,1,0) = -0.3f;
kernel_3d_(1,0,1) = -0.7f;
kernel_3d_(1,1,1) = -0.5f;
}
const Eigen::DefaultDevice& device() const { return cpu_device_; }
const Eigen::Tensor<float, 3>& in1() const { return in1_; }
const Eigen::Tensor<float, 3>& in2() const { return in2_; }
Eigen::Tensor<float, 3>& out() { return out_; }
const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; }
const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; }
const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; }
private:
const Eigen::Tensor<float, 3>& in1_;
const Eigen::Tensor<float, 3>& in2_;
Eigen::Tensor<float, 3>& out_;
Eigen::Tensor<float, 1> kernel_1d_;
Eigen::Tensor<float, 2> kernel_2d_;
Eigen::Tensor<float, 3> kernel_3d_;
Eigen::DefaultDevice cpu_device_;
};
// Context for evaluation on GPU
struct GPUContext {
GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess);
float kernel_1d_val[] = {3.14f, 2.7f};
assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess);
float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess);
float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
assert(cudaStreamCreate(&stream_) == cudaSuccess);
}
~GPUContext() {
assert(cudaFree(kernel_1d_) == cudaSuccess);
assert(cudaFree(kernel_2d_) == cudaSuccess);
assert(cudaFree(kernel_3d_) == cudaSuccess);
assert(cudaStreamDestroy(stream_) == cudaSuccess);
}
const Eigen::GpuDevice& device() const { return gpu_device_; }
const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }
private:
const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
float* kernel_1d_;
float* kernel_2d_;
float* kernel_3d_;
cudaStream_t stream_;
Eigen::GpuDevice gpu_device_;
};
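// Both contexts expose the same interface (device(), in1(), in2(), out()
// and the kernelNd() accessors), so the templated helpers below compile
// unchanged against either Eigen::DefaultDevice or Eigen::GpuDevice; only
// where the data lives and which device is passed to .device() differ.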
// The actual expression to evaluate
template <typename Context>
static void test_contextual_eval(Context* context)
{
context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
}
template <typename Context>
static void test_forced_contextual_eval(Context* context)
{
context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
}
template <typename Context>
static void test_compound_assignment(Context* context)
{
context->out().device(context->device()) = context->in1().constant(2.718f);
context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
}
template <typename Context>
static void test_contraction(Context* context)
{
Eigen::array<std::pair<int, int>, 2> dims;
dims[0] = std::make_pair(1, 1);
dims[1] = std::make_pair(2, 2);
Eigen::array<int, 2> shape(40, 50*70);
Eigen::DSizes<int, 2> indices(0,0);
Eigen::DSizes<int, 2> sizes(40,40);
context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims);
}
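// Contracting the two (40,50,70) inputs over their last two axes yields a
// (40,40) result; the (40,50,70) output is therefore viewed as a
// (40, 50*70) matrix and only its leading (40,40) block is overwritten,
// which is the block the checks in test_cpu() and test_gpu() read back.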
template <typename Context>
static void test_1d_convolution(Context* context)
{
Eigen::DSizes<int, 3> indices(0,0,0);
Eigen::DSizes<int, 3> sizes(40,49,70);
Eigen::array<int, 1> dims(1);
context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
}
template <typename Context>
static void test_2d_convolution(Context* context)
{
Eigen::DSizes<int, 3> indices(0,0,0);
Eigen::DSizes<int, 3> sizes(40,49,69);
Eigen::array<int, 2> dims(1,2);
context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
}
template <typename Context>
static void test_3d_convolution(Context* context)
{
Eigen::DSizes<int, 3> indices(0,0,0);
Eigen::DSizes<int, 3> sizes(39,49,69);
Eigen::array<int, 3> dims(0,1,2);
context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
}
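// Only the top-left corner of the (40,50,70) output is written by these
// helpers: each convolved dimension loses kernel_extent - 1 entries, giving
// slices of (40,49,70), (40,49,69) and (39,49,69) for the 1D, 2D and 3D
// kernels respectively, which is why the verification loops in test_cpu()
// and test_gpu() below stop at those bounds.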
static void test_cpu() {
Eigen::Tensor<float, 3> in1(40,50,70);
Eigen::Tensor<float, 3> in2(40,50,70);
Eigen::Tensor<float, 3> out(40,50,70);
in1 = in1.random() + in1.constant(10.0f);
in2 = in2.random() + in2.constant(10.0f);
CPUContext context(in1, in2, out);
test_contextual_eval(&context);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) {
VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
}
}
}
test_forced_contextual_eval(&context);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) {
VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
}
}
}
test_compound_assignment(&context);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) {
VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
}
}
}
test_contraction(&context);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 40; ++j) {
const float result = out(i,j,0);
float expected = 0;
for (int k = 0; k < 50; ++k) {
for (int l = 0; l < 70; ++l) {
expected += in1(i, k, l) * in2(j, k, l);
}
}
VERIFY_IS_APPROX(expected, result);
}
}
test_1d_convolution(&context);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 70; ++k) {
VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
}
}
}
test_2d_convolution(&context);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 69; ++k) {
const float result = out(i,j,k);
const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
(in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) {
continue;
}
VERIFY_IS_APPROX(expected, result);
}
}
}
test_3d_convolution(&context);
for (int i = 0; i < 39; ++i) {
for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 69; ++k) {
const float result = out(i,j,k);
const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
(in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
if (fabs(expected) < 1e-4 && fabs(result) < 1e-4) {
continue;
}
VERIFY_IS_APPROX(expected, result);
}
}
}
}
static void test_gpu() {
Eigen::Tensor<float, 3> in1(40,50,70);
Eigen::Tensor<float, 3> in2(40,50,70);
Eigen::Tensor<float, 3> out(40,50,70);
in1 = in1.random() + in1.constant(10.0f);
in2 = in2.random() + in2.constant(10.0f);
std::size_t in1_bytes = in1.size() * sizeof(float);
std::size_t in2_bytes = in2.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
float* d_in1;
float* d_in2;
float* d_out;
cudaMalloc((void**)(&d_in1), in1_bytes);
cudaMalloc((void**)(&d_in2), in2_bytes);
cudaMalloc((void**)(&d_out), out_bytes);
cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
GPUContext context(gpu_in1, gpu_in2, gpu_out);
test_contextual_eval(&context);
assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) {
VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
}
}
}
test_forced_contextual_eval(&context);
assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) {
VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
}
}
}
test_compound_assignment(&context);
assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) {
VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
}
}
}
test_contraction(&context);
assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 40; ++j) {
const float result = out(i,j,0);
float expected = 0;
for (int k = 0; k < 50; ++k) {
for (int l = 0; l < 70; ++l) {
expected += in1(i, k, l) * in2(j, k, l);
}
}
VERIFY_IS_APPROX(expected, result);
}
}
test_1d_convolution(&context);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 70; ++k) {
VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
}
}
}
test_2d_convolution(&context);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 69; ++k) {
const float result = out(i,j,k);
const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
VERIFY_IS_APPROX(expected, result);
}
}
}
test_3d_convolution(&context);
assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
for (int i = 0; i < 39; ++i) {
for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 69; ++k) {
const float result = out(i,j,k);
const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
VERIFY_IS_APPROX(expected, result);
}
}
}
}
void test_cxx11_tensor_device()
{
CALL_SUBTEST(test_cpu());
CALL_SUBTEST(test_gpu());
}

View File

@ -0,0 +1,54 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
#include <Eigen/CXX11/Tensor>
using Eigen::Tensor;
static void test_dynamic_size()
{
Eigen::DSizes<int, 3> dimensions(2,3,7);
VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2);
VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3);
VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7);
VERIFY_IS_EQUAL(dimensions.TotalSize(), (size_t)2*3*7);
VERIFY_IS_EQUAL((int)dimensions[0], 2);
VERIFY_IS_EQUAL((int)dimensions[1], 3);
VERIFY_IS_EQUAL((int)dimensions[2], 7);
}
static void test_fixed_size()
{
Eigen::Sizes<2,3,7> dimensions;
VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2);
VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3);
VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7);
VERIFY_IS_EQUAL(dimensions.TotalSize(), (size_t)2*3*7);
}
static void test_match()
{
Eigen::DSizes<int, 3> dyn(2,3,7);
Eigen::Sizes<2,3,7> stat;
VERIFY_IS_EQUAL(Eigen::dimensions_match(dyn, stat), true);
}
void test_cxx11_tensor_dimension()
{
CALL_SUBTEST(test_dynamic_size());
CALL_SUBTEST(test_fixed_size());
CALL_SUBTEST(test_match());
}

View File

@ -0,0 +1,314 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
#include <Eigen/CXX11/Tensor>
using Eigen::Tensor;
using Eigen::RowMajor;
static void test_1d()
{
Tensor<float, 1> vec1({6});
Tensor<float, 1, RowMajor> vec2({6});
vec1(0) = 4.0; vec2(0) = 0.0;
vec1(1) = 8.0; vec2(1) = 1.0;
vec1(2) = 15.0; vec2(2) = 2.0;
vec1(3) = 16.0; vec2(3) = 3.0;
vec1(4) = 23.0; vec2(4) = 4.0;
vec1(5) = 42.0; vec2(5) = 5.0;
float data3[6];
TensorMap<Tensor<float, 1>> vec3(data3, 6);
vec3 = vec1.sqrt();
float data4[6];
TensorMap<Tensor<float, 1, RowMajor>> vec4(data4, 6);
vec4 = vec2.square();
float data5[6];
TensorMap<Tensor<float, 1, RowMajor>> vec5(data5, 6);
vec5 = vec2.cube();
VERIFY_IS_APPROX(vec3(0), sqrtf(4.0));
VERIFY_IS_APPROX(vec3(1), sqrtf(8.0));
VERIFY_IS_APPROX(vec3(2), sqrtf(15.0));
VERIFY_IS_APPROX(vec3(3), sqrtf(16.0));
VERIFY_IS_APPROX(vec3(4), sqrtf(23.0));
VERIFY_IS_APPROX(vec3(5), sqrtf(42.0));
VERIFY_IS_APPROX(vec4(0), 0.0f);
VERIFY_IS_APPROX(vec4(1), 1.0f);
VERIFY_IS_APPROX(vec4(2), 2.0f * 2.0f);
VERIFY_IS_APPROX(vec4(3), 3.0f * 3.0f);
VERIFY_IS_APPROX(vec4(4), 4.0f * 4.0f);
VERIFY_IS_APPROX(vec4(5), 5.0f * 5.0f);
VERIFY_IS_APPROX(vec5(0), 0.0f);
VERIFY_IS_APPROX(vec5(1), 1.0f);
VERIFY_IS_APPROX(vec5(2), 2.0f * 2.0f * 2.0f);
VERIFY_IS_APPROX(vec5(3), 3.0f * 3.0f * 3.0f);
VERIFY_IS_APPROX(vec5(4), 4.0f * 4.0f * 4.0f);
VERIFY_IS_APPROX(vec5(5), 5.0f * 5.0f * 5.0f);
vec3 = vec1 + vec2;
VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f);
VERIFY_IS_APPROX(vec3(1), 8.0f + 1.0f);
VERIFY_IS_APPROX(vec3(2), 15.0f + 2.0f);
VERIFY_IS_APPROX(vec3(3), 16.0f + 3.0f);
VERIFY_IS_APPROX(vec3(4), 23.0f + 4.0f);
VERIFY_IS_APPROX(vec3(5), 42.0f + 5.0f);
}
static void test_2d()
{
float data1[6];
TensorMap<Tensor<float, 2>> mat1(data1, 2, 3);
float data2[6];
TensorMap<Tensor<float, 2, RowMajor>> mat2(data2, 2, 3);
mat1(0,0) = 0.0;
mat1(0,1) = 1.0;
mat1(0,2) = 2.0;
mat1(1,0) = 3.0;
mat1(1,1) = 4.0;
mat1(1,2) = 5.0;
mat2(0,0) = -0.0;
mat2(0,1) = -1.0;
mat2(0,2) = -2.0;
mat2(1,0) = -3.0;
mat2(1,1) = -4.0;
mat2(1,2) = -5.0;
Tensor<float, 2> mat3(2,3);
Tensor<float, 2, RowMajor> mat4(2,3);
mat3 = mat1.abs();
mat4 = mat2.abs();
VERIFY_IS_APPROX(mat3(0,0), 0.0f);
VERIFY_IS_APPROX(mat3(0,1), 1.0f);
VERIFY_IS_APPROX(mat3(0,2), 2.0f);
VERIFY_IS_APPROX(mat3(1,0), 3.0f);
VERIFY_IS_APPROX(mat3(1,1), 4.0f);
VERIFY_IS_APPROX(mat3(1,2), 5.0f);
VERIFY_IS_APPROX(mat4(0,0), 0.0f);
VERIFY_IS_APPROX(mat4(0,1), 1.0f);
VERIFY_IS_APPROX(mat4(0,2), 2.0f);
VERIFY_IS_APPROX(mat4(1,0), 3.0f);
VERIFY_IS_APPROX(mat4(1,1), 4.0f);
VERIFY_IS_APPROX(mat4(1,2), 5.0f);
}
static void test_3d()
{
Tensor<float, 3> mat1(2,3,7);
Tensor<float, 3, RowMajor> mat2(2,3,7);
float val = 1.0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
mat1(i,j,k) = val;
mat2(i,j,k) = val;
val += 1.0;
}
}
}
Tensor<float, 3> mat3(2,3,7);
mat3 = mat1 + mat1;
Tensor<float, 3, RowMajor> mat4(2,3,7);
mat4 = mat2 * 3.14f;
Tensor<float, 3> mat5(2,3,7);
mat5 = mat1.inverse().log();
Tensor<float, 3, RowMajor> mat6(2,3,7);
mat6 = mat2.pow(0.5f) * 3.14f;
Tensor<float, 3> mat7(2,3,7);
mat7 = mat1.cwiseMax(mat5 * 2.0f).exp();
Tensor<float, 3, RowMajor> mat8(2,3,7);
mat8 = (-mat2).exp() * 3.14f;
Tensor<float, 3, RowMajor> mat9(2,3,7);
mat9 = mat2 + 3.14f;
Tensor<float, 3, RowMajor> mat10(2,3,7);
mat10 = mat2 - 3.14f;
Tensor<float, 3, RowMajor> mat11(2,3,7);
mat11 = mat2 / 3.14f;
val = 1.0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
VERIFY_IS_APPROX(mat3(i,j,k), val + val);
VERIFY_IS_APPROX(mat4(i,j,k), val * 3.14f);
VERIFY_IS_APPROX(mat5(i,j,k), logf(1.0f/val));
VERIFY_IS_APPROX(mat6(i,j,k), sqrtf(val) * 3.14f);
VERIFY_IS_APPROX(mat7(i,j,k), expf((std::max)(val, mat5(i,j,k) * 2.0f)));
VERIFY_IS_APPROX(mat8(i,j,k), expf(-val) * 3.14f);
VERIFY_IS_APPROX(mat9(i,j,k), val + 3.14f);
VERIFY_IS_APPROX(mat10(i,j,k), val - 3.14f);
VERIFY_IS_APPROX(mat11(i,j,k), val / 3.14f);
val += 1.0;
}
}
}
}
static void test_constants()
{
Tensor<float, 3> mat1(2,3,7);
Tensor<float, 3> mat2(2,3,7);
Tensor<float, 3> mat3(2,3,7);
float val = 1.0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
mat1(i,j,k) = val;
val += 1.0;
}
}
}
mat2 = mat1.constant(3.14f);
mat3 = mat1.cwiseMax(7.3f).exp();
val = 1.0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
VERIFY_IS_APPROX(mat2(i,j,k), 3.14f);
VERIFY_IS_APPROX(mat3(i,j,k), expf((std::max)(val, 7.3f)));
val += 1.0;
}
}
}
}
static void test_boolean()
{
Tensor<int, 1> vec(6);
std::copy_n(std::begin({0, 1, 2, 3, 4, 5}), 6, vec.data());
// Test ||.
Tensor<bool, 1> bool1 = vec < vec.constant(1) || vec > vec.constant(4);
VERIFY_IS_EQUAL(bool1[0], true);
VERIFY_IS_EQUAL(bool1[1], false);
VERIFY_IS_EQUAL(bool1[2], false);
VERIFY_IS_EQUAL(bool1[3], false);
VERIFY_IS_EQUAL(bool1[4], false);
VERIFY_IS_EQUAL(bool1[5], true);
// Test &&, including cast of operand vec.
Tensor<bool, 1> bool2 = vec.cast<bool>() && vec < vec.constant(4);
VERIFY_IS_EQUAL(bool2[0], false);
VERIFY_IS_EQUAL(bool2[1], true);
VERIFY_IS_EQUAL(bool2[2], true);
VERIFY_IS_EQUAL(bool2[3], true);
VERIFY_IS_EQUAL(bool2[4], false);
VERIFY_IS_EQUAL(bool2[5], false);
// Compilation tests:
// Test Tensor<bool> against results of cast or comparison; verifies that
// CoeffReturnType is set to match Op return type of bool for Unary and Binary
// Ops.
Tensor<bool, 1> bool3 = vec.cast<bool>() && bool2;
bool3 = vec < vec.constant(4) && bool2;
}
static void test_functors()
{
Tensor<float, 3> mat1(2,3,7);
Tensor<float, 3> mat2(2,3,7);
Tensor<float, 3> mat3(2,3,7);
float val = 1.0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
mat1(i,j,k) = val;
val += 1.0;
}
}
}
mat2 = mat1.inverse().unaryExpr(&asinf);
mat3 = mat1.unaryExpr(&tanhf);
val = 1.0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
VERIFY_IS_APPROX(mat2(i,j,k), asinf(1.0f / mat1(i,j,k)));
VERIFY_IS_APPROX(mat3(i,j,k), tanhf(mat1(i,j,k)));
val += 1.0;
}
}
}
}
static void test_type_casting()
{
Tensor<bool, 3> mat1(2,3,7);
Tensor<float, 3> mat2(2,3,7);
Tensor<double, 3> mat3(2,3,7);
mat1.setRandom();
mat2.setRandom();
mat3 = mat1.template cast<double>();
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
VERIFY_IS_APPROX(mat3(i,j,k), mat1(i,j,k) ? 1.0 : 0.0);
}
}
}
mat3 = mat2.template cast<double>();
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
VERIFY_IS_APPROX(mat3(i,j,k), static_cast<double>(mat2(i,j,k)));
}
}
}
}
static void test_select()
{
Tensor<float, 3> selector(2,3,7);
Tensor<float, 3> mat1(2,3,7);
Tensor<float, 3> mat2(2,3,7);
Tensor<float, 3> result(2,3,7);
selector.setRandom();
mat1.setRandom();
mat2.setRandom();
result = (selector > selector.constant(0.5f)).select(mat1, mat2);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
VERIFY_IS_APPROX(result(i,j,k), (selector(i,j,k) > 0.5f) ? mat1(i,j,k) : mat2(i,j,k));
}
}
}
}
void test_cxx11_tensor_expr()
{
CALL_SUBTEST(test_1d());
CALL_SUBTEST(test_2d());
CALL_SUBTEST(test_3d());
CALL_SUBTEST(test_constants());
CALL_SUBTEST(test_boolean());
CALL_SUBTEST(test_functors());
CALL_SUBTEST(test_type_casting());
CALL_SUBTEST(test_select());
}

View File

@ -0,0 +1,198 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
#include <Eigen/CXX11/Tensor>
using Eigen::Tensor;
using Eigen::RowMajor;
static void test_1d()
{
TensorFixedSize<float, Sizes<6> > vec1;
TensorFixedSize<float, Sizes<6>, RowMajor> vec2;
VERIFY_IS_EQUAL((vec1.size()), 6);
// VERIFY_IS_EQUAL((vec1.dimensions()[0]), 6);
// VERIFY_IS_EQUAL((vec1.dimension(0)), 6);
vec1(0) = 4.0; vec2(0) = 0.0;
vec1(1) = 8.0; vec2(1) = 1.0;
vec1(2) = 15.0; vec2(2) = 2.0;
vec1(3) = 16.0; vec2(3) = 3.0;
vec1(4) = 23.0; vec2(4) = 4.0;
vec1(5) = 42.0; vec2(5) = 5.0;
float data3[6];
TensorMap<TensorFixedSize<float, Sizes<6> > > vec3(data3, 6);
vec3 = vec1.sqrt();
float data4[6];
TensorMap<TensorFixedSize<float, Sizes<6>, RowMajor> > vec4(data4, 6);
vec4 = vec2.sqrt();
VERIFY_IS_EQUAL((vec3.size()), 6);
VERIFY_IS_EQUAL(vec3.rank(), 1);
// VERIFY_IS_EQUAL((vec3.dimensions()[0]), 6);
// VERIFY_IS_EQUAL((vec3.dimension(0)), 6);
VERIFY_IS_APPROX(vec3(0), sqrtf(4.0));
VERIFY_IS_APPROX(vec3(1), sqrtf(8.0));
VERIFY_IS_APPROX(vec3(2), sqrtf(15.0));
VERIFY_IS_APPROX(vec3(3), sqrtf(16.0));
VERIFY_IS_APPROX(vec3(4), sqrtf(23.0));
VERIFY_IS_APPROX(vec3(5), sqrtf(42.0));
VERIFY_IS_APPROX(vec4(0), sqrtf(0.0));
VERIFY_IS_APPROX(vec4(1), sqrtf(1.0));
VERIFY_IS_APPROX(vec4(2), sqrtf(2.0));
VERIFY_IS_APPROX(vec4(3), sqrtf(3.0));
VERIFY_IS_APPROX(vec4(4), sqrtf(4.0));
VERIFY_IS_APPROX(vec4(5), sqrtf(5.0));
vec3 = vec1 + vec2;
VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f);
VERIFY_IS_APPROX(vec3(1), 8.0f + 1.0f);
VERIFY_IS_APPROX(vec3(2), 15.0f + 2.0f);
VERIFY_IS_APPROX(vec3(3), 16.0f + 3.0f);
VERIFY_IS_APPROX(vec3(4), 23.0f + 4.0f);
VERIFY_IS_APPROX(vec3(5), 42.0f + 5.0f);
}
static void test_2d()
{
float data1[6];
TensorMap<TensorFixedSize<float, Sizes<2, 3> >> mat1(data1,2,3);
float data2[6];
TensorMap<TensorFixedSize<float, Sizes<2, 3>, RowMajor>> mat2(data2,2,3);
VERIFY_IS_EQUAL((mat1.size()), 2*3);
VERIFY_IS_EQUAL(mat1.rank(), 2);
// VERIFY_IS_EQUAL((mat1.dimension(0)), 2);
// VERIFY_IS_EQUAL((mat1.dimension(1)), 3);
mat1(0,0) = 0.0;
mat1(0,1) = 1.0;
mat1(0,2) = 2.0;
mat1(1,0) = 3.0;
mat1(1,1) = 4.0;
mat1(1,2) = 5.0;
mat2(0,0) = -0.0;
mat2(0,1) = -1.0;
mat2(0,2) = -2.0;
mat2(1,0) = -3.0;
mat2(1,1) = -4.0;
mat2(1,2) = -5.0;
TensorFixedSize<float, Sizes<2, 3>> mat3;
TensorFixedSize<float, Sizes<2, 3>, RowMajor> mat4;
mat3 = mat1.abs();
mat4 = mat2.abs();
VERIFY_IS_EQUAL((mat3.size()), 2*3);
// VERIFY_IS_EQUAL((mat3.dimension(0)), 2);
// VERIFY_IS_EQUAL((mat3.dimension(1)), 3);
VERIFY_IS_APPROX(mat3(0,0), 0.0f);
VERIFY_IS_APPROX(mat3(0,1), 1.0f);
VERIFY_IS_APPROX(mat3(0,2), 2.0f);
VERIFY_IS_APPROX(mat3(1,0), 3.0f);
VERIFY_IS_APPROX(mat3(1,1), 4.0f);
VERIFY_IS_APPROX(mat3(1,2), 5.0f);
VERIFY_IS_APPROX(mat4(0,0), 0.0f);
VERIFY_IS_APPROX(mat4(0,1), 1.0f);
VERIFY_IS_APPROX(mat4(0,2), 2.0f);
VERIFY_IS_APPROX(mat4(1,0), 3.0f);
VERIFY_IS_APPROX(mat4(1,1), 4.0f);
VERIFY_IS_APPROX(mat4(1,2), 5.0f);
}
static void test_3d()
{
TensorFixedSize<float, Sizes<2, 3, 7> > mat1;
TensorFixedSize<float, Sizes<2, 3, 7>, RowMajor> mat2;
VERIFY_IS_EQUAL((mat1.size()), 2*3*7);
VERIFY_IS_EQUAL(mat1.rank(), 3);
// VERIFY_IS_EQUAL((mat1.dimension(0)), 2);
// VERIFY_IS_EQUAL((mat1.dimension(1)), 3);
// VERIFY_IS_EQUAL((mat1.dimension(2)), 7);
float val = 0.0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
mat1(i,j,k) = val;
mat2(i,j,k) = val;
val += 1.0;
}
}
}
TensorFixedSize<float, Sizes<2, 3, 7> > mat3;
mat3 = mat1.sqrt();
TensorFixedSize<float, Sizes<2, 3, 7>, RowMajor> mat4;
mat4 = mat2.sqrt();
VERIFY_IS_EQUAL((mat3.size()), 2*3*7);
// VERIFY_IS_EQUAL((mat3.dimension(0)), 2);
// VERIFY_IS_EQUAL((mat3.dimension(1)), 3);
// VERIFY_IS_EQUAL((mat3.dimension(2)), 7);
val = 0.0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
VERIFY_IS_APPROX(mat3(i,j,k), sqrtf(val));
VERIFY_IS_APPROX(mat4(i,j,k), sqrtf(val));
val += 1.0;
}
}
}
}
static void test_array()
{
TensorFixedSize<float, Sizes<2, 3, 7> > mat1;
float val = 0.0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
mat1(i,j,k) = val;
val += 1.0;
}
}
}
TensorFixedSize<float, Sizes<2, 3, 7> > mat3;
mat3 = mat1.pow(3.5f);
val = 0.0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
VERIFY_IS_APPROX(mat3(i,j,k), powf(val, 3.5f));
val += 1.0;
}
}
}
}
void test_cxx11_tensor_fixed_size()
{
CALL_SUBTEST(test_1d());
CALL_SUBTEST(test_2d());
CALL_SUBTEST(test_3d());
CALL_SUBTEST(test_array());
}

View File

@ -0,0 +1,78 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
#include <Eigen/Core>
#include <Eigen/CXX11/Tensor>
using Eigen::MatrixXf;
using Eigen::Tensor;
static void test_simple()
{
MatrixXf m1(3,3);
MatrixXf m2(3,3);
m1.setRandom();
m2.setRandom();
TensorMap<Tensor<float, 2>> mat1(m1.data(), 3,3);
TensorMap<Tensor<float, 2>> mat2(m2.data(), 3,3);
Tensor<float, 2> mat3(3,3);
mat3 = mat1;
typedef Tensor<float, 1>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
mat3 = mat3.contract(mat2, dims).eval();
VERIFY_IS_APPROX(mat3(0, 0), (m1*m2).eval()(0,0));
VERIFY_IS_APPROX(mat3(0, 1), (m1*m2).eval()(0,1));
VERIFY_IS_APPROX(mat3(0, 2), (m1*m2).eval()(0,2));
VERIFY_IS_APPROX(mat3(1, 0), (m1*m2).eval()(1,0));
VERIFY_IS_APPROX(mat3(1, 1), (m1*m2).eval()(1,1));
VERIFY_IS_APPROX(mat3(1, 2), (m1*m2).eval()(1,2));
VERIFY_IS_APPROX(mat3(2, 0), (m1*m2).eval()(2,0));
VERIFY_IS_APPROX(mat3(2, 1), (m1*m2).eval()(2,1));
VERIFY_IS_APPROX(mat3(2, 2), (m1*m2).eval()(2,2));
}
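// The .eval() above forces the contraction into a temporary before it is
// assigned back to mat3; since mat3 appears on both sides of the
// assignment, evaluating lazily would read coefficients that have already
// been overwritten.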
static void test_const()
{
MatrixXf input(3,3);
input.setRandom();
MatrixXf output = input;
output.rowwise() -= input.colwise().maxCoeff();
Eigen::array<int, 1> depth_dim;
depth_dim[0] = 0;
Tensor<float, 2>::Dimensions dims2d;
dims2d[0] = 1;
dims2d[1] = 3;
Eigen::array<int, 2> bcast;
bcast[0] = 3;
bcast[1] = 1;
const TensorMap<Tensor<const float, 2>> input_tensor(input.data(), 3, 3);
Tensor<float, 2> output_tensor= (input_tensor - input_tensor.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 3; ++j) {
VERIFY_IS_APPROX(output(i, j), output_tensor(i, j));
}
}
}
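// The .eval() after maximum(depth_dim) materializes the per-column maxima
// into a temporary, so the subsequent reshape and broadcast read concrete
// values instead of nesting the reduction inside the broadcast expression.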
void test_cxx11_tensor_forced_eval()
{
CALL_SUBTEST(test_simple());
CALL_SUBTEST(test_const());
}

View File

@ -0,0 +1,476 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
#include <Eigen/CXX11/Tensor>
using Eigen::Tensor;
static void test_simple_patch()
{
Tensor<float, 4> tensor(2,3,5,7);
tensor.setRandom();
Tensor<float, 5> single_pixel_patch;
single_pixel_patch = tensor.extract_image_patches<1, 1>();
VERIFY_IS_EQUAL(single_pixel_patch.dimension(0), 2);
VERIFY_IS_EQUAL(single_pixel_patch.dimension(1), 1);
VERIFY_IS_EQUAL(single_pixel_patch.dimension(2), 1);
VERIFY_IS_EQUAL(single_pixel_patch.dimension(3), 3*5);
VERIFY_IS_EQUAL(single_pixel_patch.dimension(4), 7);
for (int i = 0; i < tensor.size(); ++i) {
if (tensor.data()[i] != single_pixel_patch.data()[i]) {
std::cout << "Mismatch detected at index " << i << " : " << tensor.data()[i] << " vs " << single_pixel_patch.data()[i] << std::endl;
}
VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]);
}
Tensor<float, 5> entire_image_patch;
entire_image_patch = tensor.extract_image_patches<3, 5>();
VERIFY_IS_EQUAL(entire_image_patch.dimension(0), 2);
VERIFY_IS_EQUAL(entire_image_patch.dimension(1), 3);
VERIFY_IS_EQUAL(entire_image_patch.dimension(2), 5);
VERIFY_IS_EQUAL(entire_image_patch.dimension(3), 3*5);
VERIFY_IS_EQUAL(entire_image_patch.dimension(4), 7);
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 5; ++j) {
int patchId = i+3*j;
for (int r = 0; r < 3; ++r) {
for (int c = 0; c < 5; ++c) {
for (int d = 0; d < 2; ++d) {
for (int b = 0; b < 7; ++b) {
float expected = 0.0f;
if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
expected = tensor(d, r-1+i, c-2+j, b);
}
if (entire_image_patch(d, r, c, patchId, b) != expected) {
std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
}
VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId, b), expected);
}
}
}
}
}
}
Tensor<float, 5> twod_patch;
twod_patch = tensor.extract_image_patches<2, 2>();
VERIFY_IS_EQUAL(twod_patch.dimension(0), 2);
VERIFY_IS_EQUAL(twod_patch.dimension(1), 2);
VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5);
VERIFY_IS_EQUAL(twod_patch.dimension(4), 7);
// Based on the calculation described in TensorTraits.h, padding happens to be 0.
int row_padding = 0;
int col_padding = 0;
int stride = 1;
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 5; ++j) {
int patchId = i+3*j;
for (int r = 0; r < 2; ++r) {
for (int c = 0; c < 2; ++c) {
for (int d = 0; d < 2; ++d) {
for (int b = 0; b < 7; ++b) {
float expected = 0.0f;
int row_offset = r*stride + i - row_padding;
int col_offset = c*stride + j - col_padding;
if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor.dimension(1) && col_offset < tensor.dimension(2)) {
expected = tensor(d, row_offset, col_offset, b);
}
if (twod_patch(d, r, c, patchId, b) != expected) {
std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
}
VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId, b), expected);
}
}
}
}
}
}
}
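// extract_image_patches<Rows, Cols>() on a (depth, rows, cols, batch) input
// produces a rank-5 tensor laid out as (depth, patch_rows, patch_cols,
// number of patches, batch), with one patch centered on every input
// position, hence the rows*cols = 3*5 patches checked above.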
// Verifies VALID padding (no padding) with incrementing values.
static void test_patch_padding_valid()
{
int input_depth = 3;
int input_rows = 3;
int input_cols = 3;
int input_batches = 1;
int ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
int stride = 2; // Only a single stride, identical for rows and cols, is supported.
Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
// Initializes tensor with incrementing numbers.
for (int i = 0; i < tensor.size(); ++i) {
tensor.data()[i] = i + 1;
}
Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_VALID);
VERIFY_IS_EQUAL(result.dimension(0), input_depth); // depth
VERIFY_IS_EQUAL(result.dimension(1), ksize); // kernel rows
VERIFY_IS_EQUAL(result.dimension(2), ksize); // kernel cols
VERIFY_IS_EQUAL(result.dimension(3), 1); // number of patches
VERIFY_IS_EQUAL(result.dimension(4), input_batches); // number of batches
// No padding is carried out.
int row_padding = 0;
int col_padding = 0;
for (int i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows
for (int j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols
int patchId = i+input_rows*j;
for (int r = 0; r < ksize; ++r) { // patch rows
for (int c = 0; c < ksize; ++c) { // patch cols
for (int d = 0; d < input_depth; ++d) { // depth
for (int b = 0; b < input_batches; ++b) { // batch
float expected = 0.0f;
int row_offset = r + i - row_padding;
int col_offset = c + j - col_padding;
if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
expected = tensor(d, row_offset, col_offset, b);
}
if (result(d, r, c, patchId, b) != expected) {
std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
}
VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected);
}
}
}
}
}
}
}
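// With PADDING_VALID the number of patches per image is
// ((input_rows - ksize) / stride + 1) * ((input_cols - ksize) / stride + 1):
// for this 3x3 input with ksize 2 and stride 2 that is 1*1 = 1 patch, and
// for the 5x5 input of the next test it is 2*2 = 4, matching the
// dimension(3) checks.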
// Verifies VALID padding (no padding) with the same value.
static void test_patch_padding_valid_same_value()
{
int input_depth = 1;
int input_rows = 5;
int input_cols = 5;
int input_batches = 2;
int ksize = 3; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
int stride = 2; // Only a single stride, identical for rows and cols, is supported.
Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
tensor = tensor.constant(11.0f);
Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_VALID);
VERIFY_IS_EQUAL(result.dimension(0), input_depth); // depth
VERIFY_IS_EQUAL(result.dimension(1), ksize); // kernel rows
VERIFY_IS_EQUAL(result.dimension(2), ksize); // kernel cols
VERIFY_IS_EQUAL(result.dimension(3), 4); // number of patches
VERIFY_IS_EQUAL(result.dimension(4), input_batches); // number of batches
// No padding is carried out.
int row_padding = 0;
int col_padding = 0;
for (int i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows
for (int j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols
int patchId = i+input_rows*j;
for (int r = 0; r < ksize; ++r) { // patch rows
for (int c = 0; c < ksize; ++c) { // patch cols
for (int d = 0; d < input_depth; ++d) { // depth
for (int b = 0; b < input_batches; ++b) { // batch
float expected = 0.0f;
int row_offset = r + i - row_padding;
int col_offset = c + j - col_padding;
if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
expected = tensor(d, row_offset, col_offset, b);
}
if (result(d, r, c, patchId, b) != expected) {
std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
}
VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected);
}
}
}
}
}
}
}
// Verifies SAME padding.
static void test_patch_padding_same()
{
int input_depth = 3;
int input_rows = 4;
int input_cols = 2;
int input_batches = 1;
int ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
int stride = 2; // Only a single stride, identical for rows and cols, is supported.
Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
// Initializes tensor with incrementing numbers.
for (int i = 0; i < tensor.size(); ++i) {
tensor.data()[i] = i + 1;
}
Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME);
VERIFY_IS_EQUAL(result.dimension(0), input_depth); // depth
VERIFY_IS_EQUAL(result.dimension(1), ksize); // kernel rows
VERIFY_IS_EQUAL(result.dimension(2), ksize); // kernel cols
VERIFY_IS_EQUAL(result.dimension(3), 2); // number of patches
VERIFY_IS_EQUAL(result.dimension(4), input_batches); // number of batches
// Based on the calculation described in TensorTraits.h, padding happens to be 0.
int row_padding = 0;
int col_padding = 0;
for (int i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows
for (int j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols
int patchId = i+input_rows*j;
for (int r = 0; r < ksize; ++r) { // patch rows
for (int c = 0; c < ksize; ++c) { // patch cols
for (int d = 0; d < input_depth; ++d) { // depth
for (int b = 0; b < input_batches; ++b) { // batch
float expected = 0.0f;
int row_offset = r*stride + i - row_padding;
int col_offset = c*stride + j - col_padding;
if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
expected = tensor(d, row_offset, col_offset, b);
}
if (result(d, r, c, patchId, b) != expected) {
std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
}
VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected);
}
}
}
}
}
}
}
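// With PADDING_SAME the number of patches per image is
// ceil(input_rows / stride) * ceil(input_cols / stride): for this 4x2 input
// with stride 2 that is 2*1 = 2 patches, matching the dimension(3) check
// above.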
static void test_patch_no_extra_dim()
{
Tensor<float, 3> tensor(2,3,5);
tensor.setRandom();
Tensor<float, 4> single_pixel_patch;
single_pixel_patch = tensor.extract_image_patches<1, 1>();
VERIFY_IS_EQUAL(single_pixel_patch.dimension(0), 2);
VERIFY_IS_EQUAL(single_pixel_patch.dimension(1), 1);
VERIFY_IS_EQUAL(single_pixel_patch.dimension(2), 1);
VERIFY_IS_EQUAL(single_pixel_patch.dimension(3), 3*5);
for (int i = 0; i < tensor.size(); ++i) {
if (tensor.data()[i] != single_pixel_patch.data()[i]) {
std::cout << "Mismatch detected at index " << i << " : " << tensor.data()[i] << " vs " << single_pixel_patch.data()[i] << std::endl;
}
VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]);
}
Tensor<float, 4> entire_image_patch;
entire_image_patch = tensor.extract_image_patches<3, 5>();
VERIFY_IS_EQUAL(entire_image_patch.dimension(0), 2);
VERIFY_IS_EQUAL(entire_image_patch.dimension(1), 3);
VERIFY_IS_EQUAL(entire_image_patch.dimension(2), 5);
VERIFY_IS_EQUAL(entire_image_patch.dimension(3), 3*5);
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 5; ++j) {
int patchId = i+3*j;
for (int r = 0; r < 3; ++r) {
for (int c = 0; c < 5; ++c) {
for (int d = 0; d < 2; ++d) {
float expected = 0.0f;
if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
expected = tensor(d, r-1+i, c-2+j);
}
if (entire_image_patch(d, r, c, patchId) != expected) {
std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
}
VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId), expected);
}
}
}
}
}
Tensor<float, 4> twod_patch;
twod_patch = tensor.extract_image_patches<2, 2>();
VERIFY_IS_EQUAL(twod_patch.dimension(0), 2);
VERIFY_IS_EQUAL(twod_patch.dimension(1), 2);
VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5);
// Based on the calculation described in TensorTraits.h, padding happens to be 0.
int row_padding = 0;
int col_padding = 0;
int stride = 1;
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 5; ++j) {
int patchId = i+3*j;
for (int r = 0; r < 2; ++r) {
for (int c = 0; c < 2; ++c) {
for (int d = 0; d < 2; ++d) {
float expected = 0.0f;
int row_offset = r*stride + i - row_padding;
int col_offset = c*stride + j - col_padding;
if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor.dimension(1) && col_offset < tensor.dimension(2)) {
expected = tensor(d, row_offset, col_offset);
}
if (twod_patch(d, r, c, patchId) != expected) {
std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
}
VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId), expected);
}
}
}
}
}
}
static void test_imagenet_patches()
{
// Test the code on typical configurations used by the 'imagenet' benchmarks at
// https://github.com/soumith/convnet-benchmarks
Tensor<float, 4> l_in(3, 128, 128, 128);
l_in.setRandom();
Tensor<float, 5> l_out = l_in.extract_image_patches(11, 11);
VERIFY_IS_EQUAL(l_out.dimension(0), 3);
VERIFY_IS_EQUAL(l_out.dimension(1), 11);
VERIFY_IS_EQUAL(l_out.dimension(2), 11);
VERIFY_IS_EQUAL(l_out.dimension(3), 128*128);
VERIFY_IS_EQUAL(l_out.dimension(4), 128);
for (int b = 0; b < 128; ++b) {
for (int i = 0; i < 128; ++i) {
for (int j = 0; j < 128; ++j) {
int patchId = i+128*j;
for (int c = 0; c < 11; ++c) {
for (int r = 0; r < 11; ++r) {
for (int d = 0; d < 3; ++d) {
float expected = 0.0f;
if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) {
expected = l_in(d, r-5+i, c-5+j, b);
}
if (l_out(d, r, c, patchId, b) != expected) {
std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
}
VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
}
}
}
}
}
}
l_in.resize(64, 64, 64, 128);
l_in.setRandom();
l_out = l_in.extract_image_patches(9, 9);
VERIFY_IS_EQUAL(l_out.dimension(0), 64);
VERIFY_IS_EQUAL(l_out.dimension(1), 9);
VERIFY_IS_EQUAL(l_out.dimension(2), 9);
VERIFY_IS_EQUAL(l_out.dimension(3), 64*64);
VERIFY_IS_EQUAL(l_out.dimension(4), 128);
for (int b = 0; b < 128; ++b) {
for (int i = 0; i < 64; ++i) {
for (int j = 0; j < 64; ++j) {
int patchId = i+64*j;
for (int c = 0; c < 9; ++c) {
for (int r = 0; r < 9; ++r) {
for (int d = 0; d < 64; ++d) {
float expected = 0.0f;
if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) {
expected = l_in(d, r-4+i, c-4+j, b);
}
if (l_out(d, r, c, patchId, b) != expected) {
std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
}
VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
}
}
}
}
}
}
l_in.resize(128, 16, 16, 128);
l_in.setRandom();
l_out = l_in.extract_image_patches(7, 7);
VERIFY_IS_EQUAL(l_out.dimension(0), 128);
VERIFY_IS_EQUAL(l_out.dimension(1), 7);
VERIFY_IS_EQUAL(l_out.dimension(2), 7);
VERIFY_IS_EQUAL(l_out.dimension(3), 16*16);
VERIFY_IS_EQUAL(l_out.dimension(4), 128);
for (int b = 0; b < 128; ++b) {
for (int i = 0; i < 16; ++i) {
for (int j = 0; j < 16; ++j) {
int patchId = i+16*j;
for (int c = 0; c < 7; ++c) {
for (int r = 0; r < 7; ++r) {
for (int d = 0; d < 128; ++d) {
float expected = 0.0f;
if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) {
expected = l_in(d, r-3+i, c-3+j, b);
}
if (l_out(d, r, c, patchId, b) != expected) {
std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
}
VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
}
}
}
}
}
}
l_in.resize(384, 13, 13, 128);
l_in.setRandom();
l_out = l_in.extract_image_patches(3, 3);
VERIFY_IS_EQUAL(l_out.dimension(0), 384);
VERIFY_IS_EQUAL(l_out.dimension(1), 3);
VERIFY_IS_EQUAL(l_out.dimension(2), 3);
VERIFY_IS_EQUAL(l_out.dimension(3), 13*13);
VERIFY_IS_EQUAL(l_out.dimension(4), 128);
for (int b = 0; b < 128; ++b) {
for (int i = 0; i < 13; ++i) {
for (int j = 0; j < 13; ++j) {
int patchId = i+13*j;
for (int c = 0; c < 3; ++c) {
for (int r = 0; r < 3; ++r) {
for (int d = 0; d < 384; ++d) {
float expected = 0.0f;
if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) {
expected = l_in(d, r-1+i, c-1+j, b);
}
if (l_out(d, r, c, patchId, b) != expected) {
std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
}
VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
}
}
}
}
}
}
}
void test_cxx11_tensor_image_patch()
{
CALL_SUBTEST(test_simple_patch());
CALL_SUBTEST(test_patch_no_extra_dim());
CALL_SUBTEST(test_patch_padding_valid());
CALL_SUBTEST(test_patch_padding_valid_same_value());
CALL_SUBTEST(test_patch_padding_same());
CALL_SUBTEST(test_imagenet_patches());
}
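For readers skimming the diff, the dimension convention these tests verify can be summarised with a minimal usage sketch. It is not part of the test file, and the tensor sizes below are illustrative assumptions; only the `extract_image_patches(rows, cols)` call and the resulting layout are taken from the tests above.
// Illustrative usage of extract_image_patches on a depth-major input (assumed sizes).
#include <Eigen/CXX11/Tensor>
using Eigen::Tensor;
static void image_patch_example()
{
  Tensor<float, 4> input(3, 32, 32, 16);            // (depth, rows, cols, batch)
  input.setRandom();
  Tensor<float, 5> patches = input.extract_image_patches(5, 5);
  // Resulting layout, as checked by the tests above:
  //   dim 0: depth             -> 3
  //   dim 1: patch rows        -> 5
  //   dim 2: patch cols        -> 5
  //   dim 3: number of patches -> 32*32 (one patch anchored on every input pixel)
  //   dim 4: batch             -> 16
}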

View File

@ -0,0 +1,268 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "main.h"
#include <Eigen/CXX11/Tensor>
#ifdef EIGEN_HAS_CONSTEXPR
static void test_static_index_list()
{
Tensor<float, 4> tensor(2,3,5,7);
tensor.setRandom();
constexpr auto reduction_axis = make_index_list(0, 1, 2);
VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0);
VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2);
VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0);
VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1);
VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2);
EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_axis) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::array_get<1>(reduction_axis) == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::array_get<2>(reduction_axis) == 2), YOU_MADE_A_PROGRAMMING_MISTAKE);
Tensor<float, 1> result = tensor.sum(reduction_axis);
for (int i = 0; i < result.size(); ++i) {
float expected = 0.0f;
for (int j = 0; j < 2; ++j) {
for (int k = 0; k < 3; ++k) {
for (int l = 0; l < 5; ++l) {
expected += tensor(j,k,l,i);
}
}
}
VERIFY_IS_APPROX(result(i), expected);
}
}
static void test_type2index_list()
{
Tensor<float, 5> tensor(2,3,5,7,11);
tensor.setRandom();
tensor += tensor.constant(10.0f);
typedef Eigen::IndexList<Eigen::type2index<0>> Dims0;
typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>> Dims1;
typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>, Eigen::type2index<2>> Dims2;
typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>, Eigen::type2index<2>, Eigen::type2index<3>> Dims3;
typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>, Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> Dims4;
#if 0
EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims0>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims1>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims2>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims3>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims4>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
#endif
EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims0, 1, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims1, 2, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims2, 3, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims3, 4, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims4, 5, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims0, 1, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims1, 2, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims2, 3, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims3, 4, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims4, 5, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
const Dims0 reduction_axis0;
Tensor<float, 4> result0 = tensor.sum(reduction_axis0);
for (int m = 0; m < 11; ++m) {
for (int l = 0; l < 7; ++l) {
for (int k = 0; k < 5; ++k) {
for (int j = 0; j < 3; ++j) {
float expected = 0.0f;
for (int i = 0; i < 2; ++i) {
expected += tensor(i,j,k,l,m);
}
VERIFY_IS_APPROX(result0(j,k,l,m), expected);
}
}
}
}
const Dims1 reduction_axis1;
Tensor<float, 3> result1 = tensor.sum(reduction_axis1);
for (int m = 0; m < 11; ++m) {
for (int l = 0; l < 7; ++l) {
for (int k = 0; k < 5; ++k) {
float expected = 0.0f;
for (int j = 0; j < 3; ++j) {
for (int i = 0; i < 2; ++i) {
expected += tensor(i,j,k,l,m);
}
}
VERIFY_IS_APPROX(result1(k,l,m), expected);
}
}
}
const Dims2 reduction_axis2;
Tensor<float, 2> result2 = tensor.sum(reduction_axis2);
for (int m = 0; m < 11; ++m) {
for (int l = 0; l < 7; ++l) {
float expected = 0.0f;
for (int k = 0; k < 5; ++k) {
for (int j = 0; j < 3; ++j) {
for (int i = 0; i < 2; ++i) {
expected += tensor(i,j,k,l,m);
}
}
}
VERIFY_IS_APPROX(result2(l,m), expected);
}
}
const Dims3 reduction_axis3;
Tensor<float, 1> result3 = tensor.sum(reduction_axis3);
for (int m = 0; m < 11; ++m) {
float expected = 0.0f;
for (int l = 0; l < 7; ++l) {
for (int k = 0; k < 5; ++k) {
for (int j = 0; j < 3; ++j) {
for (int i = 0; i < 2; ++i) {
expected += tensor(i,j,k,l,m);
}
}
}
}
VERIFY_IS_APPROX(result3(m), expected);
}
const Dims4 reduction_axis4;
Tensor<float, 1> result4 = tensor.sum(reduction_axis4);
float expected = 0.0f;
for (int m = 0; m < 11; ++m) {
for (int l = 0; l < 7; ++l) {
for (int k = 0; k < 5; ++k) {
for (int j = 0; j < 3; ++j) {
for (int i = 0; i < 2; ++i) {
expected += tensor(i,j,k,l,m);
}
}
}
}
}
VERIFY_IS_APPROX(result4(0), expected);
}
static void test_dynamic_index_list()
{
Tensor<float, 4> tensor(2,3,5,7);
tensor.setRandom();
int dim1 = 2;
int dim2 = 1;
int dim3 = 0;
auto reduction_axis = make_index_list(dim1, dim2, dim3);
VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 2);
VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 0);
VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 2);
VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1);
VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 0);
Tensor<float, 1> result = tensor.sum(reduction_axis);
for (int i = 0; i < result.size(); ++i) {
float expected = 0.0f;
for (int j = 0; j < 2; ++j) {
for (int k = 0; k < 3; ++k) {
for (int l = 0; l < 5; ++l) {
expected += tensor(j,k,l,i);
}
}
}
VERIFY_IS_APPROX(result(i), expected);
}
}
static void test_mixed_index_list()
{
Tensor<float, 4> tensor(2,3,5,7);
tensor.setRandom();
int dim2 = 1;
int dim4 = 3;
auto reduction_axis = make_index_list(0, dim2, 2, dim4);
VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0);
VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2);
VERIFY_IS_EQUAL(internal::array_get<3>(reduction_axis), 3);
VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0);
VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1);
VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2);
VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[3]), 3);
typedef IndexList<type2index<0>, int, type2index<2>, int> ReductionIndices;
ReductionIndices reduction_indices;
reduction_indices.set(1, 1);
reduction_indices.set(3, 3);
EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_indices) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::array_get<2>(reduction_indices) == 2), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::index_known_statically<ReductionIndices>()(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::index_known_statically<ReductionIndices>()(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionIndices>()(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionIndices>()(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
#if 0
EIGEN_STATIC_ASSERT((internal::all_indices_known_statically<ReductionIndices>()() == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<ReductionIndices>()() == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
#endif
typedef IndexList<type2index<0>, type2index<1>, type2index<2>, type2index<3>> ReductionList;
ReductionList reduction_list;
EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>()(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>()(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>()(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>()(3, 3) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
#if 0
EIGEN_STATIC_ASSERT((internal::all_indices_known_statically<ReductionList>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<ReductionList>()() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
#endif
Tensor<float, 1> result1 = tensor.sum(reduction_axis);
Tensor<float, 1> result2 = tensor.sum(reduction_indices);
Tensor<float, 1> result3 = tensor.sum(reduction_list);
float expected = 0.0f;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
expected += tensor(i,j,k,l);
}
}
}
}
VERIFY_IS_APPROX(result1(0), expected);
VERIFY_IS_APPROX(result2(0), expected);
VERIFY_IS_APPROX(result3(0), expected);
}
#endif
void test_cxx11_tensor_index_list()
{
#ifdef EIGEN_HAS_CONSTEXPR
CALL_SUBTEST(test_static_index_list());
CALL_SUBTEST(test_type2index_list());
CALL_SUBTEST(test_dynamic_index_list());
CALL_SUBTEST(test_mixed_index_list());
#endif
}
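As a companion to the tests above, the sketch below shows the IndexList API they exercise: a reduction whose axis list mixes a compile-time `type2index` slot with a run-time `int` slot set via `set(position, value)`. It is not part of the test file, and the tensor sizes are illustrative assumptions.
// Illustrative mixed static/dynamic axis list for a sum reduction (assumed sizes).
#include <Eigen/CXX11/Tensor>
using namespace Eigen;
static void index_list_example()
{
  Tensor<float, 3> t(4, 6, 8);
  t.setRandom();
  // Axis 0 is fixed at compile time; the run-time axis goes into the int slot.
  IndexList<type2index<0>, int> axes;
  axes.set(1, 2);                          // reduce over dimensions 0 and 2
  Tensor<float, 1> summed = t.sum(axes);   // one value per remaining index of dimension 1
}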

Some files were not shown because too many files have changed in this diff