Merged eigen/eigen into default

This commit is contained in:
Benoit Steiner 2016-11-04 18:22:55 -07:00
commit d46a36cc84
23 changed files with 2404 additions and 66 deletions

View File

@ -133,7 +133,6 @@ if(NOT MSVC)
if(COMPILER_SUPPORT_WERROR)
set(CMAKE_REQUIRED_FLAGS "-Werror")
endif()
ei_add_cxx_compiler_flag("-pedantic")
ei_add_cxx_compiler_flag("-Wall")
ei_add_cxx_compiler_flag("-Wextra")
@ -231,6 +230,12 @@ if(NOT MSVC)
message(STATUS "Enabling FMA in tests/examples")
endif()
option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF)
if(EIGEN_TEST_AVX512)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -fabi-version=6 -DEIGEN_ENABLE_AVX512")
message(STATUS "Enabling AVX512 in tests/examples")
endif()
option(EIGEN_TEST_F16C "Enable/Disable F16C in tests/examples" OFF)
if(EIGEN_TEST_F16C)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c")

View File

@ -147,6 +147,15 @@
#ifdef __FMA__
#define EIGEN_VECTORIZE_FMA
#endif
#if defined(__AVX512F__) && defined(EIGEN_ENABLE_AVX512)
#define EIGEN_VECTORIZE_AVX512
#define EIGEN_VECTORIZE_AVX2
#define EIGEN_VECTORIZE_AVX
#define EIGEN_VECTORIZE_FMA
#ifdef __AVX512DQ__
#define EIGEN_VECTORIZE_AVX512DQ
#endif
#endif
// include files
@ -178,7 +187,7 @@
#ifdef EIGEN_VECTORIZE_SSE4_2
#include <nmmintrin.h>
#endif
#ifdef EIGEN_VECTORIZE_AVX
#if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
#include <immintrin.h>
#endif
#endif
@ -278,7 +287,9 @@
namespace Eigen {
inline static const char *SimdInstructionSetsInUse(void) {
#if defined(EIGEN_VECTORIZE_AVX)
#if defined(EIGEN_VECTORIZE_AVX512)
return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_AVX)
return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_SSE4_2)
return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
@ -338,7 +349,12 @@ using std::ptrdiff_t;
#include "src/Core/GenericPacketMath.h"
#include "src/Core/MathFunctionsImpl.h"
#if defined EIGEN_VECTORIZE_AVX
#if defined EIGEN_VECTORIZE_AVX512
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/AVX/PacketMath.h"
#include "src/Core/arch/AVX512/PacketMath.h"
#include "src/Core/arch/AVX512/MathFunctions.h"
#elif defined EIGEN_VECTORIZE_AVX
// Use AVX for floats and doubles, SSE for integers
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/Complex.h"

View File

@ -14,34 +14,40 @@ namespace Eigen {
namespace internal {
template<typename Scalar, typename CholmodType>
void cholmod_configure_matrix(CholmodType& mat)
{
if (internal::is_same<Scalar,float>::value)
{
mat.xtype = CHOLMOD_REAL;
mat.dtype = CHOLMOD_SINGLE;
}
else if (internal::is_same<Scalar,double>::value)
{
template<typename Scalar> struct cholmod_configure_matrix;
template<> struct cholmod_configure_matrix<double> {
template<typename CholmodType>
static void run(CholmodType& mat) {
mat.xtype = CHOLMOD_REAL;
mat.dtype = CHOLMOD_DOUBLE;
}
else if (internal::is_same<Scalar,std::complex<float> >::value)
{
mat.xtype = CHOLMOD_COMPLEX;
mat.dtype = CHOLMOD_SINGLE;
}
else if (internal::is_same<Scalar,std::complex<double> >::value)
{
};
template<> struct cholmod_configure_matrix<std::complex<double> > {
template<typename CholmodType>
static void run(CholmodType& mat) {
mat.xtype = CHOLMOD_COMPLEX;
mat.dtype = CHOLMOD_DOUBLE;
}
else
{
eigen_assert(false && "Scalar type not supported by CHOLMOD");
}
}
};
// Other scalar types are not yet supported by Cholmod
// template<> struct cholmod_configure_matrix<float> {
// template<typename CholmodType>
// static void run(CholmodType& mat) {
// mat.xtype = CHOLMOD_REAL;
// mat.dtype = CHOLMOD_SINGLE;
// }
// };
//
// template<> struct cholmod_configure_matrix<std::complex<float> > {
// template<typename CholmodType>
// static void run(CholmodType& mat) {
// mat.xtype = CHOLMOD_COMPLEX;
// mat.dtype = CHOLMOD_SINGLE;
// }
// };
} // namespace internal
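
The refactoring above replaces a runtime if/else chain over internal::is_same with explicit template specializations, so an unsupported scalar type now fails at compile time instead of tripping a runtime assert. A minimal standalone sketch of the same dispatch pattern (hypothetical names, not Eigen's API):

#include <iostream>

// Primary template intentionally left undefined:
// instantiating it with an unsupported type is a compile error.
template <typename Scalar> struct configure;

// One specialization per supported scalar type.
template <> struct configure<double> {
  static void run() { std::cout << "real, double precision\n"; }
};

int main() {
  configure<double>::run();    // OK: specialization exists
  // configure<float>::run();  // error: incomplete type, caught at compile time
  return 0;
}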
@ -88,7 +94,7 @@ cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_StorageIndex>& mat)
}
// setup res.xtype
internal::cholmod_configure_matrix<_Scalar>(res);
internal::cholmod_configure_matrix<_Scalar>::run(res);
res.stype = 0;
@ -131,7 +137,7 @@ cholmod_dense viewAsCholmod(MatrixBase<Derived>& mat)
res.x = (void*)(mat.derived().data());
res.z = 0;
internal::cholmod_configure_matrix<Scalar>(res);
internal::cholmod_configure_matrix<Scalar>::run(res);
return res;
}
@ -180,14 +186,16 @@ class CholmodBase : public SparseSolverBase<Derived>
CholmodBase()
: m_cholmodFactor(0), m_info(Success), m_factorizationIsOk(false), m_analysisIsOk(false)
{
m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
EIGEN_STATIC_ASSERT((internal::is_same<double,RealScalar>::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY);
m_shiftOffset[0] = m_shiftOffset[1] = 0.0;
cholmod_start(&m_cholmod);
}
explicit CholmodBase(const MatrixType& matrix)
: m_cholmodFactor(0), m_info(Success), m_factorizationIsOk(false), m_analysisIsOk(false)
{
m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
EIGEN_STATIC_ASSERT((internal::is_same<double,RealScalar>::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY);
m_shiftOffset[0] = m_shiftOffset[1] = 0.0;
cholmod_start(&m_cholmod);
compute(matrix);
}
@ -254,7 +262,7 @@ class CholmodBase : public SparseSolverBase<Derived>
eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
cholmod_factorize_p(&A, m_shiftOffset, 0, 0, m_cholmodFactor, &m_cholmod);
// If the factorization failed, minor is the column at which it did. On success minor == n.
this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue);
m_factorizationIsOk = true;
@ -324,7 +332,7 @@ class CholmodBase : public SparseSolverBase<Derived>
*/
Derived& setShift(const RealScalar& offset)
{
m_shiftOffset[0] = offset;
m_shiftOffset[0] = double(offset);
return derived();
}
@ -386,7 +394,7 @@ class CholmodBase : public SparseSolverBase<Derived>
protected:
mutable cholmod_common m_cholmod;
cholmod_factor* m_cholmodFactor;
RealScalar m_shiftOffset[2];
double m_shiftOffset[2];
mutable ComputationInfo m_info;
int m_factorizationIsOk;
int m_analysisIsOk;
@ -410,6 +418,8 @@ class CholmodBase : public SparseSolverBase<Derived>
*
* This class supports all kinds of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non-compressed.
*
* \warning Only double precision real and complex scalar types are supported by Cholmod.
*
* \sa \ref TutorialSparseSolverConcept, class CholmodSupernodalLLT, class SimplicialLLT
*/
template<typename _MatrixType, int _UpLo = Lower>
@ -459,6 +469,8 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
*
* This class supports all kinds of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non-compressed.
*
* \warning Only double precision real and complex scalar types are supported by Cholmod.
*
* \sa \ref TutorialSparseSolverConcept, class CholmodSupernodalLLT, class SimplicialLDLT
*/
template<typename _MatrixType, int _UpLo = Lower>
@ -506,6 +518,8 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
*
* This class supports all kinds of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non-compressed.
*
* \warning Only double precision real and complex scalar types are supported by Cholmod.
*
* \sa \ref TutorialSparseSolverConcept
*/
template<typename _MatrixType, int _UpLo = Lower>
@ -555,6 +569,8 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
*
* This class supports all kinds of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non-compressed.
*
* \warning Only double precision real and complex scalar types are supported by Cholmod.
*
* \sa \ref TutorialSparseSolverConcept
*/
template<typename _MatrixType, int _UpLo = Lower>

View File

@ -329,7 +329,7 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Pack
*/
template<typename Packet> EIGEN_DEVICE_FUNC inline
typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
predux4(const Packet& a)
predux_downto4(const Packet& a)
{ return a; }
/** \internal \returns the product of the elements of \a a*/
@ -558,7 +558,21 @@ pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& th
return ifPacket.select[0] ? thenPacket : elsePacket;
}
/** \internal \returns \a a with last coefficients replaced by the scalar b */
/** \internal \returns \a a with the first coefficient replaced by the scalar b */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pinsertfirst(const Packet& a, typename unpacket_traits<Packet>::type b)
{
// Default implementation based on pblend.
// It must be specialized for higher performance.
Selector<unpacket_traits<Packet>::size> mask;
mask.select[0] = true;
// This for loop should be optimized away by the compiler.
for(Index i=1; i<unpacket_traits<Packet>::size; ++i)
mask.select[i] = false;
return pblend(mask, pset1<Packet>(b), a);
}
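
For reference, the scalar semantics of the generic pinsertfirst above: lane 0 takes b, every other lane keeps its value from a. A plain-array model (illustrative sketch only):

#include <cassert>

// Scalar model of pinsertfirst: the blend mask selects b for lane 0 only.
template <typename T, int N>
void pinsertfirst_ref(T (&a)[N], T b) {
  a[0] = b;  // mask.select[0] == true; all other lanes keep a
}

int main() {
  float a[4] = {1.f, 2.f, 3.f, 4.f};
  pinsertfirst_ref(a, 9.f);
  assert(a[0] == 9.f && a[1] == 2.f && a[3] == 4.f);
  return 0;
}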
/** \internal \returns \a a with the last coefficient replaced by the scalar b */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pinsertlast(const Packet& a, typename unpacket_traits<Packet>::type b)
{

View File

@ -602,7 +602,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
return m_matrix / extendedTo(other.derived());
}
/** \returns an expression where each column of row of the referenced matrix are normalized.
/** \returns an expression where each column (or row) of the referenced matrix is normalized.
* The referenced matrix is \b not modified.
* \sa MatrixBase::normalized(), normalize()
*/

View File

@ -456,6 +456,26 @@ ptranspose(PacketBlock<Packet2cd,2>& kernel) {
kernel.packet[0].v = tmp;
}
template<> EIGEN_STRONG_INLINE Packet4cf pinsertfirst(const Packet4cf& a, std::complex<float> b)
{
return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,1|2));
}
template<> EIGEN_STRONG_INLINE Packet2cd pinsertfirst(const Packet2cd& a, std::complex<double> b)
{
return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,1|2));
}
template<> EIGEN_STRONG_INLINE Packet4cf pinsertlast(const Packet4cf& a, std::complex<float> b)
{
return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,(1<<7)|(1<<6)));
}
template<> EIGEN_STRONG_INLINE Packet2cd pinsertlast(const Packet2cd& a, std::complex<double> b)
{
return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,(1<<3)|(1<<2)));
}
} // end namespace internal
} // end namespace Eigen

View File

@ -48,7 +48,9 @@ template<> struct is_arithmetic<__m256d> { enum { value = true }; };
#define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \
const Packet8i p8i_##NAME = pset1<Packet8i>(X)
// Use the packet_traits defined in AVX512/PacketMath.h instead if we're going
// to leverage AVX512 instructions.
#ifndef EIGEN_VECTORIZE_AVX512
template<> struct packet_traits<float> : default_packet_traits
{
typedef Packet8f type;
@ -93,6 +95,7 @@ template<> struct packet_traits<double> : default_packet_traits
HasCeil = 1
};
};
#endif
template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
template<> struct scalar_div_cost<double,true> { enum { value = 16 }; };
@ -304,9 +307,11 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
pstore(to, pa);
}
#ifndef EIGEN_VECTORIZE_AVX512
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
#endif
template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
return _mm_cvtss_f32(_mm256_castps256_ps128(a));
@ -400,7 +405,7 @@ template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
return pfirst(_mm256_hadd_pd(tmp0,tmp0));
}
template<> EIGEN_STRONG_INLINE Packet4f predux4<Packet8f>(const Packet8f& a)
template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
{
return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
}
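
The AVX specialization above folds an 8-lane packet into a 4-lane one by adding the two 128-bit halves, so lane i of the result holds a[i] + a[i+4]. A scalar model of that reduction (illustrative only):

#include <cassert>

// Scalar model of predux_downto4 on an 8-lane packet:
// lane i of the result holds in[i] + in[i+4].
void predux_downto4_ref(const float in[8], float out[4]) {
  for (int i = 0; i < 4; ++i) out[i] = in[i] + in[i + 4];
}

int main() {
  float in[8] = {1, 2, 3, 4, 10, 20, 30, 40};
  float out[4];
  predux_downto4_ref(in, out);
  assert(out[0] == 11 && out[1] == 22 && out[2] == 33 && out[3] == 44);
  return 0;
}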
@ -604,6 +609,16 @@ template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, cons
return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
}
template<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b)
{
return _mm256_blend_ps(a,pset1<Packet8f>(b),1);
}
template<> EIGEN_STRONG_INLINE Packet4d pinsertfirst(const Packet4d& a, double b)
{
return _mm256_blend_pd(a,pset1<Packet4d>(b),1);
}
template<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b)
{
return _mm256_blend_ps(a,pset1<Packet8f>(b),(1<<7));

View File

@ -0,0 +1,396 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Pedro Gonnet (pedro.gonnet@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
#define THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
namespace Eigen {
namespace internal {
// Disable the code for older versions of gcc that don't support many of the required avx512 intrinsics.
#if EIGEN_GNUC_AT_LEAST(5, 3)
#define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \
const Packet16f p16f_##NAME = pset1<Packet16f>(X)
#define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \
const Packet16f p16f_##NAME = (__m512)pset1<Packet16i>(X)
#define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \
const Packet8d p8d_##NAME = pset1<Packet8d>(X)
#define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
// Natural logarithm
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C = log(2)
// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
// be easily approximated by a polynomial centered on m=1 for stability.
#if defined(EIGEN_VECTORIZE_AVX512DQ)
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
plog<Packet16f>(const Packet16f& _x) {
Packet16f x = _x;
_EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
_EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
_EIGEN_DECLARE_CONST_Packet16f(126f, 126.0f);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inv_mant_mask, ~0x7f800000);
// The smallest non-denormalized float number.
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
// Polynomial coefficients.
_EIGEN_DECLARE_CONST_Packet16f(cephes_SQRTHF, 0.707106781186547524f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p0, 7.0376836292E-2f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p1, -1.1514610310E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p2, 1.1676998740E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p3, -1.2420140846E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p4, +1.4249322787E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p5, -1.6668057665E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p6, +2.0000714765E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p7, -2.4999993993E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_p8, +3.3333331174E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_q1, -2.12194440e-4f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f);
// invalid_mask is set to true when x is NaN or negative.
__mmask16 invalid_mask =
_mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
__mmask16 iszero_mask =
_mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_UQ);
// Truncate input values to the minimum positive normal.
x = pmax(x, p16f_min_norm_pos);
// Extract the shifted exponents.
Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32((__m512i)x, 23));
Packet16f e = _mm512_sub_ps(emm0, p16f_126f);
// Set the exponent to -1, i.e. x is in the range [0.5,1).
x = _mm512_and_ps(x, p16f_inv_mant_mask);
x = _mm512_or_ps(x, p16f_half);
// part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
// and shift by -1. The values are then centered around 0, which improves
// the stability of the polynomial evaluation.
// if( x < SQRTHF ) {
// e -= 1;
// x = x + x - 1.0;
// } else { x = x - 1.0; }
__mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ);
Packet16f tmp = _mm512_mask_blend_ps(mask, x, _mm512_setzero_ps());
x = psub(x, p16f_1);
e = psub(e, _mm512_mask_blend_ps(mask, p16f_1, _mm512_setzero_ps()));
x = padd(x, tmp);
Packet16f x2 = pmul(x, x);
Packet16f x3 = pmul(x2, x);
// Evaluate the polynomial approximant of degree 8 in three parts, probably
// to improve instruction-level parallelism.
Packet16f y, y1, y2;
y = pmadd(p16f_cephes_log_p0, x, p16f_cephes_log_p1);
y1 = pmadd(p16f_cephes_log_p3, x, p16f_cephes_log_p4);
y2 = pmadd(p16f_cephes_log_p6, x, p16f_cephes_log_p7);
y = pmadd(y, x, p16f_cephes_log_p2);
y1 = pmadd(y1, x, p16f_cephes_log_p5);
y2 = pmadd(y2, x, p16f_cephes_log_p8);
y = pmadd(y, x3, y1);
y = pmadd(y, x3, y2);
y = pmul(y, x3);
// Add the logarithm of the exponent back to the result of the interpolation.
y1 = pmul(e, p16f_cephes_log_q1);
tmp = pmul(x2, p16f_half);
y = padd(y, y1);
x = psub(x, tmp);
y2 = pmul(e, p16f_cephes_log_q2);
x = padd(x, y);
x = padd(x, y2);
// Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
return _mm512_mask_blend_ps(iszero_mask, p16f_minus_inf,
_mm512_mask_blend_ps(invalid_mask, p16f_nan, x));
}
#endif
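
The routine above follows the classic Cephes recipe: split x into 2^e * m with m folded into [sqrt(1/2), sqrt(2)), then log(x) = e*log(2) + log(m), with log(m) evaluated by the degree-8 polynomial. A scalar sketch of just the range reduction, with std::log1p standing in for the polynomial (an illustration, not the packet code):

#include <cmath>
#include <cstdio>

// Scalar model of the range reduction used by plog above.
double log_range_reduced(double x) {
  int e;
  double m = std::frexp(x, &e);       // x = m * 2^e, with m in [0.5, 1)
  if (m < 0.7071067811865476) {       // m < sqrt(1/2): fold into [sqrt(1/2), sqrt(2))
    m += m;
    e -= 1;
  }
  // log(x) = e*log(2) + log(m); log1p(m - 1) stands in for the polynomial.
  return e * 0.6931471805599453 + std::log1p(m - 1.0);
}

int main() {
  std::printf("%.17g vs %.17g\n", log_range_reduced(10.0), std::log(10.0));
  return 0;
}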
// Exponential function. Works by writing "x = m*log(2) + r" where
// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
pexp<Packet16f>(const Packet16f& _x) {
_EIGEN_DECLARE_CONST_Packet16f(1, 1.0f);
_EIGEN_DECLARE_CONST_Packet16f(half, 0.5f);
_EIGEN_DECLARE_CONST_Packet16f(127, 127.0f);
_EIGEN_DECLARE_CONST_Packet16f(exp_hi, 88.3762626647950f);
_EIGEN_DECLARE_CONST_Packet16f(exp_lo, -88.3762626647949f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_LOG2EF, 1.44269504088896341f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p0, 1.9875691500E-4f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p1, 1.3981999507E-3f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p2, 8.3334519073E-3f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p3, 4.1665795894E-2f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p4, 1.6666665459E-1f);
_EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p5, 5.0000001201E-1f);
// Clamp x.
Packet16f x = pmax(pmin(_x, p16f_exp_hi), p16f_exp_lo);
// Express exp(x) as exp(m*ln(2) + r), start by extracting
// m = floor(x/ln(2) + 0.5).
Packet16f m = _mm512_floor_ps(pmadd(x, p16f_cephes_LOG2EF, p16f_half));
// Get r = x - m*ln(2). Note that we can do this without losing more than one
// ulp precision due to the FMA instruction.
_EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f);
Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x);
Packet16f r2 = pmul(r, r);
// TODO(gonnet): Split into odd/even polynomials and try to exploit
// instruction-level parallelism.
Packet16f y = p16f_cephes_exp_p0;
y = pmadd(y, r, p16f_cephes_exp_p1);
y = pmadd(y, r, p16f_cephes_exp_p2);
y = pmadd(y, r, p16f_cephes_exp_p3);
y = pmadd(y, r, p16f_cephes_exp_p4);
y = pmadd(y, r, p16f_cephes_exp_p5);
y = pmadd(y, r2, r);
y = padd(y, p16f_1);
// Build emm0 = 2^m.
Packet16i emm0 = _mm512_cvttps_epi32(padd(m, p16f_127));
emm0 = _mm512_slli_epi32(emm0, 23);
// Return 2^m * exp(r).
return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x);
}
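
In scalar form the same reduction reads m = floor(x/log(2) + 1/2) and r = x - m*log(2) (one FMA, |r| <= log(2)/2), after which exp(x) = 2^m * exp(r). A minimal sketch with std::exp standing in for the degree-6 polynomial (illustrative only):

#include <cmath>
#include <cstdio>

// Scalar model of the range reduction used by pexp above.
float exp_range_reduced(float x) {
  float m = std::floor(x * 1.44269504088896341f + 0.5f);  // m = floor(x/ln(2) + 1/2)
  float r = std::fmaf(m, -0.6931471805599453f, x);        // r = x - m*ln(2), one FMA
  // exp(x) = 2^m * exp(r); std::exp stands in for the polynomial.
  return std::ldexp(std::exp(r), static_cast<int>(m));
}

int main() {
  std::printf("%g vs %g\n", exp_range_reduced(3.5f), std::exp(3.5f));
  return 0;
}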
/*template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
pexp<Packet8d>(const Packet8d& _x) {
Packet8d x = _x;
_EIGEN_DECLARE_CONST_Packet8d(1, 1.0);
_EIGEN_DECLARE_CONST_Packet8d(2, 2.0);
_EIGEN_DECLARE_CONST_Packet8d(exp_hi, 709.437);
_EIGEN_DECLARE_CONST_Packet8d(exp_lo, -709.436139303);
_EIGEN_DECLARE_CONST_Packet8d(cephes_LOG2EF, 1.4426950408889634073599);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p0, 1.26177193074810590878e-4);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p1, 3.02994407707441961300e-2);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p2, 9.99999999999999999910e-1);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q0, 3.00198505138664455042e-6);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q1, 2.52448340349684104192e-3);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q2, 2.27265548208155028766e-1);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q3, 2.00000000000000000009e0);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C1, 0.693145751953125);
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C2, 1.42860682030941723212e-6);
// clamp x
x = pmax(pmin(x, p8d_exp_hi), p8d_exp_lo);
// Express exp(x) as exp(g + n*log(2)).
const Packet8d n =
_mm512_mul_round_pd(p8d_cephes_LOG2EF, x, _MM_FROUND_TO_NEAREST_INT);
// Get the remainder modulo log(2), i.e. the "g" described above. Subtract
// n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
// digits right.
const Packet8d nC1 = pmul(n, p8d_cephes_exp_C1);
const Packet8d nC2 = pmul(n, p8d_cephes_exp_C2);
x = psub(x, nC1);
x = psub(x, nC2);
const Packet8d x2 = pmul(x, x);
// Evaluate the numerator polynomial of the rational interpolant.
Packet8d px = p8d_cephes_exp_p0;
px = pmadd(px, x2, p8d_cephes_exp_p1);
px = pmadd(px, x2, p8d_cephes_exp_p2);
px = pmul(px, x);
// Evaluate the denominator polynomial of the rational interpolant.
Packet8d qx = p8d_cephes_exp_q0;
qx = pmadd(qx, x2, p8d_cephes_exp_q1);
qx = pmadd(qx, x2, p8d_cephes_exp_q2);
qx = pmadd(qx, x2, p8d_cephes_exp_q3);
// I don't really get this bit, copied from the SSE2 routines, so...
// TODO(gonnet): Figure out what is going on here, perhaps find a better
// rational interpolant?
x = _mm512_div_pd(px, psub(qx, px));
x = pmadd(p8d_2, x, p8d_1);
// Build e=2^n.
const Packet8d e = _mm512_castsi512_pd(_mm512_slli_epi64(
_mm512_add_epi64(_mm512_cvtpd_epi64(n), _mm512_set1_epi64(1023)), 52));
// Construct the result 2^n * exp(g) = e * x. The max is used to catch
// non-finite values in the input.
return pmax(pmul(x, e), _x);
}*/
// Functions for sqrt.
// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
// exact solution. The main advantage of this approach is not just speed, but
// also the fact that it can be inlined and pipelined with other computations,
// further reducing its effective latency.
#if EIGEN_FAST_MATH
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
psqrt<Packet16f>(const Packet16f& _x) {
_EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
_EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
Packet16f neg_half = pmul(_x, p16f_minus_half);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x),
_mm512_setzero_ps());
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
// Multiply the original _x by its reciprocal square root to extract the
// square root.
return pmul(_x, x);
}
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
psqrt<Packet8d>(const Packet8d& _x) {
_EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
_EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
Packet8d neg_half = pmul(_x, p8d_minus_half);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x),
_mm512_setzero_pd());
// Do a first step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Do a second step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Multiply the original _x by its reciprocal square root to extract the
// square root.
return pmul(_x, x);
}
#else
template <>
EIGEN_STRONG_INLINE Packet16f psqrt<Packet16f>(const Packet16f& x) {
return _mm512_sqrt_ps(x);
}
template <>
EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) {
return _mm512_sqrt_pd(x);
}
#endif
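
Both EIGEN_FAST_MATH paths above (and the rsqrt ones below) rely on the same refinement: starting from a rough estimate y of 1/sqrt(a), one Newton-Raphson step y' = y*(1.5 - 0.5*a*y*y) roughly doubles the number of correct bits, which is why floats take one step after _mm512_rsqrt14 and doubles take two. A scalar illustration:

#include <cmath>
#include <cstdio>

// One Newton-Raphson step for the reciprocal square root:
// y_{n+1} = y_n * (1.5 - 0.5 * a * y_n * y_n)
double rsqrt_newton_step(double a, double y) {
  return y * (1.5 - 0.5 * a * y * y);
}

int main() {
  double a = 2.0;
  double y = 0.7;                  // crude initial estimate of 1/sqrt(2)
  y = rsqrt_newton_step(a, y);     // first step (enough for float precision)
  y = rsqrt_newton_step(a, y);     // second step (doubles need two)
  std::printf("rsqrt: %.12f (exact %.12f), sqrt: %.12f\n",
              y, 1.0 / std::sqrt(a), a * y);
  return 0;
}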
// Functions for rsqrt.
// Almost identical to the sqrt routine, just leave out the last multiplication
// and fill in NaN/Inf where needed. Note that this function only exists as an
// iterative version for doubles since there is no instruction for directly
// computing the reciprocal square root in AVX-512.
#ifdef EIGEN_FAST_MATH
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
prsqrt<Packet16f>(const Packet16f& _x) {
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
_EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
_EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
Packet16f neg_half = pmul(_x, p16f_minus_half);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(),
_mm512_rsqrt14_ps(_x));
// Fill in NaNs and Infs for the negative/zero entries.
__mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
Packet16f infs_and_nans = _mm512_mask_blend_ps(
neg_mask, p16f_nan,
_mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps()));
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
// Insert NaNs and Infs in all the right places.
return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x);
}
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
prsqrt<Packet8d>(const Packet8d& _x) {
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff1000000000000LL);
_EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
_EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
Packet8d neg_half = pmul(_x, p8d_minus_half);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(),
_mm512_rsqrt14_pd(_x));
// Fill in NaNs and Infs for the negative/zero entries.
__mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
Packet8d infs_and_nans = _mm512_mask_blend_pd(
neg_mask, p8d_nan,
_mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd()));
// Do a first step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Do a second step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Insert NaNs and Infs in all the right places.
return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x);
}
#else
template <>
EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
return _mm512_rsqrt28_ps(x);
}
#endif
#endif
} // end namespace internal
} // end namespace Eigen
#endif // THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_

File diff suppressed because it is too large

View File

@ -333,6 +333,374 @@ template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
#endif
#elif defined EIGEN_VECTORIZE_AVX512
typedef struct {
__m256i x;
} Packet16h;
template<> struct is_arithmetic<Packet16h> { enum { value = true }; };
template <>
struct packet_traits<half> : default_packet_traits {
typedef Packet16h type;
// There is no half-size packet for Packet16h.
typedef Packet16h half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 16,
HasHalfPacket = 0,
HasAdd = 0,
HasSub = 0,
HasMul = 0,
HasNegate = 0,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasConj = 0,
HasSetLinear = 0,
HasDiv = 0,
HasSqrt = 0,
HasRsqrt = 0,
HasExp = 0,
HasLog = 0,
HasBlend = 0
};
};
template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; };
template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
Packet16h result;
result.x = _mm256_set1_epi16(from.x);
return result;
}
template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from.x, 0)));
}
template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
Packet16h result;
result.x = _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
return result;
}
template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
Packet16h result;
result.x = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
return result;
}
template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
_mm256_store_si256((__m256i*)to, from.x);
}
template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
_mm256_storeu_si256((__m256i*)to, from.x);
}
template<> EIGEN_STRONG_INLINE Packet16h
ploadquad(const Eigen::half* from) {
Packet16h result;
unsigned short a = from[0].x;
unsigned short b = from[1].x;
unsigned short c = from[2].x;
unsigned short d = from[3].x;
result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
return result;
}
EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
#ifdef EIGEN_HAS_FP16_C
return _mm512_cvtph_ps(a.x);
#else
EIGEN_ALIGN64 half aux[16];
pstore(aux, a);
float f0(aux[0]);
float f1(aux[1]);
float f2(aux[2]);
float f3(aux[3]);
float f4(aux[4]);
float f5(aux[5]);
float f6(aux[6]);
float f7(aux[7]);
float f8(aux[8]);
float f9(aux[9]);
float fa(aux[10]);
float fb(aux[11]);
float fc(aux[12]);
float fd(aux[13]);
float fe(aux[14]);
float ff(aux[15]);
return _mm512_set_ps(
ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
#endif
}
EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
#ifdef EIGEN_HAS_FP16_C
Packet16h result;
result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
return result;
#else
EIGEN_ALIGN64 float aux[16];
pstore(aux, a);
half h0(aux[0]);
half h1(aux[1]);
half h2(aux[2]);
half h3(aux[3]);
half h4(aux[4]);
half h5(aux[5]);
half h6(aux[6]);
half h7(aux[7]);
half h8(aux[8]);
half h9(aux[9]);
half ha(aux[10]);
half hb(aux[11]);
half hc(aux[12]);
half hd(aux[13]);
half he(aux[14]);
half hf(aux[15]);
Packet16h result;
result.x = _mm256_set_epi16(
hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
return result;
#endif
}
template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a);
Packet16f bf = half2float(b);
Packet16f rf = padd(af, bf);
return float2half(rf);
}
template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a);
Packet16f bf = half2float(b);
Packet16f rf = pmul(af, bf);
return float2half(rf);
}
template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
Packet16f from_float = half2float(from);
return half(predux(from_float));
}
template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
{
Packet16h result;
result.x = _mm256_set_epi16(
from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
return result;
}
template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride)
{
EIGEN_ALIGN64 half aux[16];
pstore(aux, from);
to[stride*0].x = aux[0].x;
to[stride*1].x = aux[1].x;
to[stride*2].x = aux[2].x;
to[stride*3].x = aux[3].x;
to[stride*4].x = aux[4].x;
to[stride*5].x = aux[5].x;
to[stride*6].x = aux[6].x;
to[stride*7].x = aux[7].x;
to[stride*8].x = aux[8].x;
to[stride*9].x = aux[9].x;
to[stride*10].x = aux[10].x;
to[stride*11].x = aux[11].x;
to[stride*12].x = aux[12].x;
to[stride*13].x = aux[13].x;
to[stride*14].x = aux[14].x;
to[stride*15].x = aux[15].x;
}
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet16h,16>& kernel) {
__m256i a = kernel.packet[0].x;
__m256i b = kernel.packet[1].x;
__m256i c = kernel.packet[2].x;
__m256i d = kernel.packet[3].x;
__m256i e = kernel.packet[4].x;
__m256i f = kernel.packet[5].x;
__m256i g = kernel.packet[6].x;
__m256i h = kernel.packet[7].x;
__m256i i = kernel.packet[8].x;
__m256i j = kernel.packet[9].x;
__m256i k = kernel.packet[10].x;
__m256i l = kernel.packet[11].x;
__m256i m = kernel.packet[12].x;
__m256i n = kernel.packet[13].x;
__m256i o = kernel.packet[14].x;
__m256i p = kernel.packet[15].x;
__m256i ab_07 = _mm256_unpacklo_epi16(a, b);
__m256i cd_07 = _mm256_unpacklo_epi16(c, d);
__m256i ef_07 = _mm256_unpacklo_epi16(e, f);
__m256i gh_07 = _mm256_unpacklo_epi16(g, h);
__m256i ij_07 = _mm256_unpacklo_epi16(i, j);
__m256i kl_07 = _mm256_unpacklo_epi16(k, l);
__m256i mn_07 = _mm256_unpacklo_epi16(m, n);
__m256i op_07 = _mm256_unpacklo_epi16(o, p);
__m256i ab_8f = _mm256_unpackhi_epi16(a, b);
__m256i cd_8f = _mm256_unpackhi_epi16(c, d);
__m256i ef_8f = _mm256_unpackhi_epi16(e, f);
__m256i gh_8f = _mm256_unpackhi_epi16(g, h);
__m256i ij_8f = _mm256_unpackhi_epi16(i, j);
__m256i kl_8f = _mm256_unpackhi_epi16(k, l);
__m256i mn_8f = _mm256_unpackhi_epi16(m, n);
__m256i op_8f = _mm256_unpackhi_epi16(o, p);
__m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
__m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
__m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
__m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
__m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
__m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
__m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
__m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
__m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
__m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
__m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
__m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
__m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
__m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
__m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
__m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
__m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
__m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
__m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
__m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
__m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
__m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
__m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
__m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
__m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
__m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
__m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
__m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
__m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
__m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
__m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
__m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
// NOTE: no unpacklo/hi instr in this case, so using permute instr.
__m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
__m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
__m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
__m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
__m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
__m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
__m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
__m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
__m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
__m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
__m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
__m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
__m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
__m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
__m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
__m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
kernel.packet[0].x = a_p_0;
kernel.packet[1].x = a_p_1;
kernel.packet[2].x = a_p_2;
kernel.packet[3].x = a_p_3;
kernel.packet[4].x = a_p_4;
kernel.packet[5].x = a_p_5;
kernel.packet[6].x = a_p_6;
kernel.packet[7].x = a_p_7;
kernel.packet[8].x = a_p_8;
kernel.packet[9].x = a_p_9;
kernel.packet[10].x = a_p_a;
kernel.packet[11].x = a_p_b;
kernel.packet[12].x = a_p_c;
kernel.packet[13].x = a_p_d;
kernel.packet[14].x = a_p_e;
kernel.packet[15].x = a_p_f;
}
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet16h,8>& kernel) {
EIGEN_ALIGN64 half in[8][16];
pstore<half>(in[0], kernel.packet[0]);
pstore<half>(in[1], kernel.packet[1]);
pstore<half>(in[2], kernel.packet[2]);
pstore<half>(in[3], kernel.packet[3]);
pstore<half>(in[4], kernel.packet[4]);
pstore<half>(in[5], kernel.packet[5]);
pstore<half>(in[6], kernel.packet[6]);
pstore<half>(in[7], kernel.packet[7]);
EIGEN_ALIGN64 half out[8][16];
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
out[i][j] = in[j][2*i];
}
for (int j = 0; j < 8; ++j) {
out[i][j+8] = in[j][2*i+1];
}
}
kernel.packet[0] = pload<Packet16h>(out[0]);
kernel.packet[1] = pload<Packet16h>(out[1]);
kernel.packet[2] = pload<Packet16h>(out[2]);
kernel.packet[3] = pload<Packet16h>(out[3]);
kernel.packet[4] = pload<Packet16h>(out[4]);
kernel.packet[5] = pload<Packet16h>(out[5]);
kernel.packet[6] = pload<Packet16h>(out[6]);
kernel.packet[7] = pload<Packet16h>(out[7]);
}
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet16h,4>& kernel) {
EIGEN_ALIGN64 half in[4][16];
pstore<half>(in[0], kernel.packet[0]);
pstore<half>(in[1], kernel.packet[1]);
pstore<half>(in[2], kernel.packet[2]);
pstore<half>(in[3], kernel.packet[3]);
EIGEN_ALIGN64 half out[4][16];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
out[i][j] = in[j][4*i];
}
for (int j = 0; j < 4; ++j) {
out[i][j+4] = in[j][4*i+1];
}
for (int j = 0; j < 4; ++j) {
out[i][j+8] = in[j][4*i+2];
}
for (int j = 0; j < 4; ++j) {
out[i][j+12] = in[j][4*i+3];
}
}
kernel.packet[0] = pload<Packet16h>(out[0]);
kernel.packet[1] = pload<Packet16h>(out[1]);
kernel.packet[2] = pload<Packet16h>(out[2]);
kernel.packet[3] = pload<Packet16h>(out[3]);
}
#elif defined EIGEN_VECTORIZE_AVX
typedef struct {

View File

@ -100,6 +100,33 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(cons
return __floats2half2_rn(a.x, a.y);
}
#elif defined EIGEN_VECTORIZE_AVX512
template <>
struct type_casting_traits<half, float> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
return half2float(a);
}
template <>
struct type_casting_traits<float, half> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template<> EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
return float2half(a);
}
#elif defined EIGEN_VECTORIZE_AVX
template <>

View File

@ -476,6 +476,26 @@ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, co
return Packet2cf(_mm_castpd_ps(result));
}
template<> EIGEN_STRONG_INLINE Packet2cf pinsertfirst(const Packet2cf& a, std::complex<float> b)
{
return Packet2cf(_mm_loadl_pi(a.v, reinterpret_cast<const __m64*>(&b)));
}
template<> EIGEN_STRONG_INLINE Packet1cd pinsertfirst(const Packet1cd&, std::complex<double> b)
{
return pset1<Packet1cd>(b);
}
template<> EIGEN_STRONG_INLINE Packet2cf pinsertlast(const Packet2cf& a, std::complex<float> b)
{
return Packet2cf(_mm_loadh_pi(a.v, reinterpret_cast<const __m64*>(&b)));
}
template<> EIGEN_STRONG_INLINE Packet1cd pinsertlast(const Packet1cd&, std::complex<double> b)
{
return pset1<Packet1cd>(b);
}
} // end namespace internal
} // end namespace Eigen

View File

@ -818,6 +818,24 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons
#endif
}
template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_blend_ps(a,pset1<Packet4f>(b),1);
#else
return _mm_move_ss(a, _mm_load_ss(&b));
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_blend_pd(a,pset1<Packet2d>(b),1);
#else
return _mm_move_sd(a, _mm_load_sd(&b));
#endif
}
template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1

View File

@ -43,12 +43,17 @@ template <typename Scalar, typename Packet>
struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
{
linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)), m_interPacket(plset<Packet>(0))
m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
m_interPacket(plset<Packet>(0)),
m_flip(std::abs(high)<std::abs(low))
{}
template<typename IndexType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const {
return (i==m_size1)? m_high : (m_low + i*m_step);
if(m_flip)
return (i==0)? m_low : (m_high - (m_size1-i)*m_step);
else
return (i==m_size1)? m_high : (m_low + i*m_step);
}
template<typename IndexType>
@ -56,11 +61,22 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
{
// Principle:
// [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
Packet pi = padd(pset1<Packet>(Scalar(i)),m_interPacket);
Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi));
if(i==m_size1-unpacket_traits<Packet>::size+1)
res = pinsertlast(res, m_high);
return res;
if(m_flip)
{
Packet pi = padd(pset1<Packet>(Scalar(i-m_size1)),m_interPacket);
Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi));
if(i==0)
res = pinsertfirst(res, m_low);
return res;
}
else
{
Packet pi = padd(pset1<Packet>(Scalar(i)),m_interPacket);
Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi));
if(i==m_size1-unpacket_traits<Packet>::size+1)
res = pinsertlast(res, m_high);
return res;
}
}
const Scalar m_low;
@ -68,6 +84,7 @@ struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
const Index m_size1;
const Scalar m_step;
const Packet m_interPacket;
const bool m_flip;
};
template <typename Scalar, typename Packet>
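
The new m_flip path evaluates the sequence from the endpoint of larger magnitude: when |low| >> |high|, computing low + i*step near the end loses all relative accuracy in the small values, whereas anchoring at high and stepping backwards keeps them exact. A standalone scalar illustration of the two evaluation orders:

#include <cstdio>

int main() {
  const int n = 100;
  const float low = -1e10f, high = 1.0f;           // |low| >> |high|
  const float step = (high - low) / float(n - 1);
  // Forward order, low + i*step: at i == n-1 the tiny high drowns in rounding.
  float fwd = low + float(n - 1) * step;
  // Flipped order, high - (n-1-i)*step: at i == n-1 this is exactly high.
  float flip = high - 0.0f * step;
  std::printf("forward: %g  flipped: %g  exact: %g\n", fwd, flip, high);
  return 0;
}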

View File

@ -580,7 +580,7 @@ DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Pack
}
template<typename Packet>
const DoublePacket<Packet>& predux4(const DoublePacket<Packet> &a)
const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet> &a)
{
return a;
}
@ -1581,10 +1581,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
if(SwappedTraits::LhsProgress==8)
{
// Special case where we have to first reduce the accumulation register C0
typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
@ -1596,13 +1596,13 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
SRhsPacketHalf b0;
straits.loadLhsUnaligned(blB, a0);
straits.loadRhs(blA, b0);
SAccPacketHalf c0 = predux4(C0);
SAccPacketHalf c0 = predux_downto4(C0);
straits.madd(a0,b0,c0,b0);
straits.acc(c0, alphav, R);
}
else
{
straits.acc(predux4(C0), alphav, R);
straits.acc(predux_downto4(C0), alphav, R);
}
res.scatterPacket(i, j2, R);
}

View File

@ -13,7 +13,7 @@
#define EIGEN_WORLD_VERSION 3
#define EIGEN_MAJOR_VERSION 2
#define EIGEN_MINOR_VERSION 94
#define EIGEN_MINOR_VERSION 95
#define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
(EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
@ -659,6 +659,9 @@ namespace Eigen {
// If the user explicitly disable vectorization, then we also disable alignment
#if defined(EIGEN_DONT_VECTORIZE)
#define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
#elif defined(EIGEN_VECTORIZE_AVX512)
// 64 bytes static alignment is preferred only if really required
#define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
#elif defined(__AVX__)
// 32 bytes static alignment is preferred only if really required
#define EIGEN_IDEAL_MAX_ALIGN_BYTES 32

View File

@ -100,7 +100,8 @@
MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY,
THIS_TYPE_IS_NOT_SUPPORTED,
STORAGE_KIND_MUST_MATCH,
STORAGE_INDEX_MUST_MATCH
STORAGE_INDEX_MUST_MATCH,
CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY
};
};

View File

@ -114,7 +114,9 @@ template<typename MatrixType,int _Direction> class Homogeneous
/** \geometry_module \ingroup Geometry_Module
*
* \return an expression of the equivalent homogeneous vector
* \returns a vector expression that is one longer than the vector argument, with the value 1 symbolically appended as the last coefficient.
*
* This can be used to convert affine coordinates to homogeneous coordinates.
*
* \only_for_vectors
*
@ -133,7 +135,9 @@ MatrixBase<Derived>::homogeneous() const
/** \geometry_module \ingroup Geometry_Module
*
* \returns a matrix expression of homogeneous column (or row) vectors
* \returns an expression where the value 1 is symbolically appended as the final coefficient to each column (or row) of the matrix.
*
* This can be used to convert affine coordinates to homogeneous coordinates.
*
* Example: \include VectorwiseOp_homogeneous.cpp
* Output: \verbinclude VectorwiseOp_homogeneous.out
@ -148,7 +152,16 @@ VectorwiseOp<ExpressionType,Direction>::homogeneous() const
/** \geometry_module \ingroup Geometry_Module
*
* \returns an expression of the homogeneous normalized vector of \c *this
* \brief homogeneous normalization
*
* \returns a vector expression of the N-1 first coefficients of \c *this divided by that last coefficient.
*
* This can be used to convert homogeneous coordinates to affine coordinates.
*
* It is essentially a shortcut for:
* \code
this->head(this->size()-1)/this->coeff(this->size()-1);
\endcode
*
* Example: \include MatrixBase_hnormalized.cpp
* Output: \verbinclude MatrixBase_hnormalized.out
@ -166,7 +179,13 @@ MatrixBase<Derived>::hnormalized() const
/** \geometry_module \ingroup Geometry_Module
*
* \returns an expression of the homogeneous normalized vector of \c *this
* \brief column or row-wise homogeneous normalization
*
* \returns an expression of the first N-1 coefficients of each column (or row) of \c *this divided by the last coefficient of each column (or row).
*
* This can be used to convert homogeneous coordinates to affine coordinates.
*
* It is conceptually equivalent to applying MatrixBase::hnormalized() to each column (or row) of \c *this.
*
* Example: \include DirectionWise_hnormalized.cpp
* Output: \verbinclude DirectionWise_hnormalized.out

View File

@ -418,12 +418,18 @@ macro(ei_testing_print_summary)
message(STATUS "AVX: Using architecture defaults")
endif()
if(EIGEN_TEST_FMA)
message(STATUS "FMA: ON")
else()
message(STATUS "FMA: Using architecture defaults")
endif()
if(EIGEN_TEST_AVX512)
message(STATUS "AVX512: ON")
else()
message(STATUS "AVX512: Using architecture defaults")
endif()
if(EIGEN_TEST_ALTIVEC)
message(STATUS "Altivec: ON")
else()

View File

@ -33,10 +33,38 @@ bool equalsIdentity(const MatrixType& A)
}
template<typename VectorType>
void check_extremity_accuracy(const VectorType &v, const typename VectorType::Scalar &low, const typename VectorType::Scalar &high)
{
typedef typename VectorType::Scalar Scalar;
typedef typename VectorType::RealScalar RealScalar;
RealScalar prec = internal::is_same<RealScalar,float>::value ? NumTraits<RealScalar>::dummy_precision()*10 : NumTraits<RealScalar>::dummy_precision()/10;
Index size = v.size();
if(size<20)
return;
for (int i=0; i<size; ++i)
{
if(i<5 || i>size-6)
{
Scalar ref = (low*RealScalar(size-i-1))/RealScalar(size-1) + (high*RealScalar(i))/RealScalar(size-1);
if(std::abs(ref)>1)
{
if(!internal::isApprox(v(i), ref, prec))
std::cout << v(i) << " != " << ref << " ; relative error: " << std::abs((v(i)-ref)/ref) << " ; required precision: " << prec << " ; range: " << low << "," << high << " ; i: " << i << "\n";
VERIFY(internal::isApprox(v(i), (low*RealScalar(size-i-1))/RealScalar(size-1) + (high*RealScalar(i))/RealScalar(size-1), prec));
}
}
}
}
template<typename VectorType>
void testVectorType(const VectorType& base)
{
typedef typename VectorType::Scalar Scalar;
typedef typename VectorType::RealScalar RealScalar;
const Index size = base.size();
@ -47,6 +75,9 @@ void testVectorType(const VectorType& base)
// check low==high
if(internal::random<float>(0.f,1.f)<0.05f)
low = high;
// check abs(low) >> abs(high)
else if(size>2 && std::numeric_limits<RealScalar>::max_exponent10>0 && internal::random<float>(0.f,1.f)<0.1f)
low = -internal::random<Scalar>(1,2) * RealScalar(std::pow(RealScalar(10),std::numeric_limits<RealScalar>::max_exponent10/2));
const Scalar step = ((size == 1) ? 1 : (high-low)/(size-1));
@ -60,6 +91,8 @@ void testVectorType(const VectorType& base)
for (int i=0; i<size; ++i)
n(i) = low+i*step;
VERIFY_IS_APPROX(m,n);
CALL_SUBTEST( check_extremity_accuracy(m, low, high) );
}
if((!NumTraits<Scalar>::IsInteger) || ((high-low)>=size && (Index(high-low)%(size-1))==0) || (Index(high-low+1)<size && (size%Index(high-low+1))==0))
@ -79,6 +112,8 @@ void testVectorType(const VectorType& base)
VERIFY( internal::isApprox(m(m.size()-1),high) );
VERIFY( size==1 || internal::isApprox(m(0),low) );
VERIFY_IS_EQUAL(m(m.size()-1) , high);
if(!NumTraits<Scalar>::IsInteger)
CALL_SUBTEST( check_extremity_accuracy(m, low, high) );
}
VERIFY( m(m.size()-1) <= high );
@ -154,10 +189,10 @@ void test_nullary()
CALL_SUBTEST_3( testMatrixType(MatrixXf(internal::random<int>(1,300),internal::random<int>(1,300))) );
for(int i = 0; i < g_repeat*10; i++) {
CALL_SUBTEST_4( testVectorType(VectorXd(internal::random<int>(1,300))) );
CALL_SUBTEST_4( testVectorType(VectorXd(internal::random<int>(1,30000))) );
CALL_SUBTEST_5( testVectorType(Vector4d()) ); // regression test for bug 232
CALL_SUBTEST_6( testVectorType(Vector3d()) );
CALL_SUBTEST_7( testVectorType(VectorXf(internal::random<int>(1,300))) );
CALL_SUBTEST_7( testVectorType(VectorXf(internal::random<int>(1,30000))) );
CALL_SUBTEST_8( testVectorType(Vector3f()) );
CALL_SUBTEST_8( testVectorType(Vector4f()) );
CALL_SUBTEST_8( testVectorType(Matrix<float,8,1>()) );

View File

@ -16,6 +16,12 @@
#endif
// using namespace Eigen;
#ifdef EIGEN_VECTORIZE_SSE
const bool g_vectorize_sse = true;
#else
const bool g_vectorize_sse = false;
#endif
namespace Eigen {
namespace internal {
template<typename T> T negate(const T& x) { return -x; }
@ -152,6 +158,14 @@ template<typename Scalar> void packetmath()
else if (offset==5) internal::palign<5>(packets[0], packets[1]);
else if (offset==6) internal::palign<6>(packets[0], packets[1]);
else if (offset==7) internal::palign<7>(packets[0], packets[1]);
else if (offset==8) internal::palign<8>(packets[0], packets[1]);
else if (offset==9) internal::palign<9>(packets[0], packets[1]);
else if (offset==10) internal::palign<10>(packets[0], packets[1]);
else if (offset==11) internal::palign<11>(packets[0], packets[1]);
else if (offset==12) internal::palign<12>(packets[0], packets[1]);
else if (offset==13) internal::palign<13>(packets[0], packets[1]);
else if (offset==14) internal::palign<14>(packets[0], packets[1]);
else if (offset==15) internal::palign<15>(packets[0], packets[1]);
internal::pstore(data2, packets[0]);
for (int i=0; i<PacketSize; ++i)
@ -238,8 +252,8 @@ template<typename Scalar> void packetmath()
ref[i] = 0;
for (int i=0; i<PacketSize; ++i)
ref[i%4] += data1[i];
internal::pstore(data2, internal::predux4(internal::pload<Packet>(data1)));
VERIFY(areApprox(ref, data2, PacketSize>4?PacketSize/2:PacketSize) && "internal::predux4");
internal::pstore(data2, internal::predux_downto4(internal::pload<Packet>(data1)));
VERIFY(areApprox(ref, data2, PacketSize>4?PacketSize/2:PacketSize) && "internal::predux_downto4");
}
ref[0] = 1;
@ -290,7 +304,17 @@ template<typename Scalar> void packetmath()
}
}
if (PacketTraits::HasBlend) {
if (PacketTraits::HasBlend || g_vectorize_sse) {
// pinsertfirst
for (int i=0; i<PacketSize; ++i)
ref[i] = data1[i];
Scalar s = internal::random<Scalar>();
ref[0] = s;
internal::pstore(data2, internal::pinsertfirst(internal::pload<Packet>(data1),s));
VERIFY(areApprox(ref, data2, PacketSize) && "internal::pinsertfirst");
}
if (PacketTraits::HasBlend || g_vectorize_sse) {
// pinsertlast
for (int i=0; i<PacketSize; ++i)
ref[i] = data1[i];

View File

@ -10,6 +10,8 @@
#ifndef EIGEN_SPECIALFUNCTIONS_MODULE
#define EIGEN_SPECIALFUNCTIONS_MODULE
#include <math.h>
#include "../../Eigen/Core"
#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"

View File

@ -121,7 +121,7 @@ template <>
struct lgamma_impl<float> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE float run(float x) {
#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE))
#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
int signgam;
return ::lgammaf_r(x, &signgam);
#else
@ -134,7 +134,7 @@ template <>
struct lgamma_impl<double> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE double run(double x) {
#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE))
#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
int signgam;
return ::lgamma_r(x, &signgam);
#else