* patch from Konstantinos Margaritis: bugfix in Altivec version of ei_pdiv

and various cleaning in Altivec code. Altivec vectorization have been re-enabled
  in CoreDeclaration
* added copy constructors in non empty functors because I observed weird behavior with
  std::complex<>
This commit is contained in:
Gael Guennebaud 2008-08-25 16:22:56 +00:00
parent 7ce70e1437
commit 8f9d30cb20
4 changed files with 56 additions and 61 deletions

View File

@ -23,15 +23,12 @@
#ifdef __SSSE3__ #ifdef __SSSE3__
#include <tmmintrin.h> #include <tmmintrin.h>
#endif #endif
/*** Disable AltiVec code for now as it's out of date
#elif (defined __ALTIVEC__) #elif (defined __ALTIVEC__)
#define EIGEN_VECTORIZE #define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_ALTIVEC #define EIGEN_VECTORIZE_ALTIVEC
#include <altivec.h> #include <altivec.h>
// We _need_ to #undef bool as it's defined in <altivec.h> for some reason. // We _need_ to #undef bool as it's defined in <altivec.h> for some reason.
#undef bool #undef bool
***/
#endif #endif
#endif #endif

View File

@ -36,6 +36,8 @@
template<typename Scalar> template<typename Scalar>
struct ei_scalar_add_op { struct ei_scalar_add_op {
typedef typename ei_packet_traits<Scalar>::type PacketScalar; typedef typename ei_packet_traits<Scalar>::type PacketScalar;
// FIXME default copy constructors seems bugged with std::complex<>
inline ei_scalar_add_op(const ei_scalar_add_op& other) : m_other(other.m_other) { }
inline ei_scalar_add_op(const Scalar& other) : m_other(other) { } inline ei_scalar_add_op(const Scalar& other) : m_other(other) { }
inline Scalar operator() (const Scalar& a) const { return a + m_other; } inline Scalar operator() (const Scalar& a) const { return a + m_other; }
inline const PacketScalar packetOp(const PacketScalar& a) const inline const PacketScalar packetOp(const PacketScalar& a) const
@ -131,6 +133,8 @@ struct ei_functor_traits<ei_scalar_sin_op<Scalar> >
*/ */
template<typename Scalar> template<typename Scalar>
struct ei_scalar_pow_op { struct ei_scalar_pow_op {
// FIXME default copy constructors seems bugged with std::complex<>
inline ei_scalar_pow_op(const ei_scalar_pow_op& other) : m_exponent(other.m_exponent) { }
inline ei_scalar_pow_op(const Scalar& exponent) : m_exponent(exponent) {} inline ei_scalar_pow_op(const Scalar& exponent) : m_exponent(exponent) {}
inline Scalar operator() (const Scalar& a) const { return ei_pow(a, m_exponent); } inline Scalar operator() (const Scalar& a) const { return ei_pow(a, m_exponent); }
const Scalar m_exponent; const Scalar m_exponent;

View File

@ -143,8 +143,6 @@ struct ei_functor_traits<ei_scalar_quotient_op<Scalar> > {
PacketAccess = ei_packet_traits<Scalar>::size>1 PacketAccess = ei_packet_traits<Scalar>::size>1
#if (defined EIGEN_VECTORIZE_SSE) #if (defined EIGEN_VECTORIZE_SSE)
&& NumTraits<Scalar>::HasFloatingPoint && NumTraits<Scalar>::HasFloatingPoint
#elif (defined EIGEN_VECTORIZE_ALTIVEC)
&& 0
#endif #endif
}; };
}; };
@ -260,6 +258,8 @@ struct ei_functor_traits<ei_scalar_real_op<Scalar> >
template<typename Scalar> template<typename Scalar>
struct ei_scalar_multiple_op { struct ei_scalar_multiple_op {
typedef typename ei_packet_traits<Scalar>::type PacketScalar; typedef typename ei_packet_traits<Scalar>::type PacketScalar;
// FIXME default copy constructors seems bugged with std::complex<>
inline ei_scalar_multiple_op(const ei_scalar_multiple_op& other) : m_other(other.m_other) { }
inline ei_scalar_multiple_op(const Scalar& other) : m_other(other) { } inline ei_scalar_multiple_op(const Scalar& other) : m_other(other) { }
inline Scalar operator() (const Scalar& a) const { return a * m_other; } inline Scalar operator() (const Scalar& a) const { return a * m_other; }
inline const PacketScalar packetOp(const PacketScalar& a) const inline const PacketScalar packetOp(const PacketScalar& a) const
@ -273,6 +273,8 @@ struct ei_functor_traits<ei_scalar_multiple_op<Scalar> >
template<typename Scalar, bool HasFloatingPoint> template<typename Scalar, bool HasFloatingPoint>
struct ei_scalar_quotient1_impl { struct ei_scalar_quotient1_impl {
typedef typename ei_packet_traits<Scalar>::type PacketScalar; typedef typename ei_packet_traits<Scalar>::type PacketScalar;
// FIXME default copy constructors seems bugged with std::complex<>
inline ei_scalar_quotient1_impl(const ei_scalar_quotient1_impl& other) : m_other(other.m_other) { }
inline ei_scalar_quotient1_impl(const Scalar& other) : m_other(static_cast<Scalar>(1) / other) {} inline ei_scalar_quotient1_impl(const Scalar& other) : m_other(static_cast<Scalar>(1) / other) {}
inline Scalar operator() (const Scalar& a) const { return a * m_other; } inline Scalar operator() (const Scalar& a) const { return a * m_other; }
inline const PacketScalar packetOp(const PacketScalar& a) const inline const PacketScalar packetOp(const PacketScalar& a) const
@ -285,10 +287,11 @@ struct ei_functor_traits<ei_scalar_quotient1_impl<Scalar,true> >
template<typename Scalar> template<typename Scalar>
struct ei_scalar_quotient1_impl<Scalar,false> { struct ei_scalar_quotient1_impl<Scalar,false> {
// FIXME default copy constructors seems bugged with std::complex<>
inline ei_scalar_quotient1_impl(const ei_scalar_quotient1_impl& other) : m_other(other.m_other) { }
inline ei_scalar_quotient1_impl(const Scalar& other) : m_other(other) {} inline ei_scalar_quotient1_impl(const Scalar& other) : m_other(other) {}
inline Scalar operator() (const Scalar& a) const { return a / m_other; } inline Scalar operator() (const Scalar& a) const { return a / m_other; }
const Scalar m_other; const Scalar m_other;
enum { Cost = 2 * NumTraits<Scalar>::MulCost };
}; };
template<typename Scalar> template<typename Scalar>
struct ei_functor_traits<ei_scalar_quotient1_impl<Scalar,false> > struct ei_functor_traits<ei_scalar_quotient1_impl<Scalar,false> >

View File

@ -2,7 +2,6 @@
// for linear algebra. Eigen itself is part of the KDE project. // for linear algebra. Eigen itself is part of the KDE project.
// //
// Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr> // Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr>
// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
// //
// Eigen is free software; you can redistribute it and/or // Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public // modify it under the terms of the GNU Lesser General Public
@ -35,11 +34,16 @@ typedef vector int v4i;
typedef vector unsigned int v4ui; typedef vector unsigned int v4ui;
typedef vector __bool int v4bi; typedef vector __bool int v4bi;
static const v4i v0i = vec_splat_s32(0); // We don't want to write the same code all the time, but we need to reuse the constants
static const v4i v1i = vec_splat_s32(1); // and it doesn't really work to declare them global, so we define macros instead
static const v4i v16i_ = vec_splat_s32(-16);
static const v4f v0f = (v4f) v0i; #define USE_CONST_v0i const v4i v0i = vec_splat_s32(0)
static const v4f v1f = (v4f) v1i; #define USE_CONST_v1i const v4i v1i = vec_splat_s32(1)
#define USE_CONST_v16i_ const v4i v16i_ = vec_splat_s32(-16)
#define USE_CONST_v0f USE_CONST_v0i; const v4f v0f = (v4f) v0i;
#define USE_CONST_v1f USE_CONST_v1i; const v4f v1f = vec_ctf(v1i, 0);
#define USE_CONST_v1i_ const v4ui v1i_ = vec_splat_u32(-1);
#define USE_CONST_v0f_ USE_CONST_v1i_; const v4f v0f_ = (v4f) vec_sl(v1i_, v1i_);
template<> struct ei_packet_traits<float> { typedef v4f type; enum {size=4}; }; template<> struct ei_packet_traits<float> { typedef v4f type; enum {size=4}; };
template<> struct ei_packet_traits<int> { typedef v4i type; enum {size=4}; }; template<> struct ei_packet_traits<int> { typedef v4i type; enum {size=4}; };
@ -54,7 +58,7 @@ std::ostream & operator <<(std::ostream & s, const v4f & v)
float n[4]; float n[4];
} vt; } vt;
vt.v = v; vt.v = v;
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl; s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
return s; return s;
} }
@ -65,7 +69,7 @@ std::ostream & operator <<(std::ostream & s, const v4i & v)
int n[4]; int n[4];
} vt; } vt;
vt.v = v; vt.v = v;
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl; s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
return s; return s;
} }
@ -76,7 +80,7 @@ std::ostream & operator <<(std::ostream & s, const v4ui & v)
unsigned int n[4]; unsigned int n[4];
} vt; } vt;
vt.v = v; vt.v = v;
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl; s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
return s; return s;
} }
@ -87,7 +91,7 @@ std::ostream & operator <<(std::ostream & s, const v4bi & v)
unsigned int n[4]; unsigned int n[4];
} vt; } vt;
vt.v = v; vt.v = v;
s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl; s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
return s; return s;
} }
@ -97,71 +101,57 @@ template<> inline v4i ei_padd(const v4i& a, const v4i& b) { return vec_add(
template<> inline v4f ei_psub(const v4f& a, const v4f& b) { return vec_sub(a,b); } template<> inline v4f ei_psub(const v4f& a, const v4f& b) { return vec_sub(a,b); }
template<> inline v4i ei_psub(const v4i& a, const v4i& b) { return vec_sub(a,b); } template<> inline v4i ei_psub(const v4i& a, const v4i& b) { return vec_sub(a,b); }
template<> inline v4f ei_pmul(const v4f& a, const v4f& b) { return vec_madd(a,b, v0f); } template<> inline v4f ei_pmul(const v4f& a, const v4f& b) { USE_CONST_v0f; return vec_madd(a,b, v0f); }
template<> inline v4i ei_pmul(const v4i& a, const v4i& b) template<> inline v4i ei_pmul(const v4i& a, const v4i& b)
{ {
// Taken from http://developer.apple.com/hardwaredrivers/ve/algorithms.html#Multiply32 // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
//Set up constants //Set up constants, variables
v4i bswap, low_prod, high_prod, prod, prod_, a_, b_, a1, b1; v4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
v4i v1_, v0_, v1sel; USE_CONST_v0i;
v1_ = (v4i) vec_splat_u32(-1); USE_CONST_v1i;
v0_ = (v4i) vec_sl((v4ui) v1_, (v4ui) v1_); USE_CONST_v16i_;
// std::cout << "a: " << a << std::endl; // Get the absolute values
// std::cout << "b: " << b << std::endl; a1 = vec_abs(a);
a_ = vec_sub(v0i, a); b1 = vec_abs(b);
b_ = vec_sub(v0i, b);
a1 = vec_max(a, a_);
b1 = vec_max(b, b_);
v4bi sgn = (v4bi) vec_cmplt(vec_and(vec_xor(a, b), v0_), v0i);
// std::cout << "a: " << a1 << std::endl; // Get the signs using xor
// std::cout << "b: " << b1 << std::endl; v4bi sgn = (v4bi) vec_cmplt(vec_xor(a, b), v0i);
// std::cout << "a_: " << a_ << std::endl;
// std::cout << "b_: " << b_ << std::endl;
// std::cout << "sgn(a*b): " << std::hex << sgn << std::dec << std::endl;
// Do real work // Do the multiplication for the asbolute values.
bswap = (v4i) vec_rl((v4ui) b1, (v4ui) v16i_ ); bswap = (v4i) vec_rl((v4ui) b1, (v4ui) v16i_ );
low_prod = vec_mulo((vector short)a1, (vector short)b1); low_prod = vec_mulo((vector short)a1, (vector short)b1);
high_prod = vec_msum((vector short)a1, (vector short)bswap, v0i); high_prod = vec_msum((vector short)a1, (vector short)bswap, v0i);
high_prod = (v4i) vec_sl((v4ui) high_prod, (v4ui) v16i_); high_prod = (v4i) vec_sl((v4ui) high_prod, (v4ui) v16i_);
prod = vec_add( low_prod, high_prod ); prod = vec_add( low_prod, high_prod );
// std::cout << "prod: " << prod << std::endl;
// std::cout << "v0i: " << v0i << std::endl; // NOR the product and select only the negative elements according to the sign mask
prod_ = vec_nor(prod, prod); prod_ = vec_nor(prod, prod);
prod_ = vec_sel(v0i, prod_, sgn); prod_ = vec_sel(v0i, prod_, sgn);
// Add 1 to the result to get the negative numbers
v1sel = vec_sel(v0i, v1i, sgn); v1sel = vec_sel(v0i, v1i, sgn);
// std::cout << "prod_: " << prod_ << std::endl;
// std::cout << "v1sel: " << v1sel << std::endl;
prod_ = vec_add(prod_, v1sel); prod_ = vec_add(prod_, v1sel);
// std::cout << "prod_: " << prod_ << std::endl;
// Merge the results back to the final vector.
prod = vec_sel(prod, prod_, sgn); prod = vec_sel(prod, prod_, sgn);
// std::cout << "final prod: " << prod << std::endl;
return prod; return prod;
} }
template<> inline v4f ei_pdiv(const v4f& a, const v4f& b) { template<> inline v4f ei_pdiv(const v4f& a, const v4f& b) {
std::cout << "a: " << a << std::endl; v4f t, y_0, y_1, res;
std::cout << "b: " << b << std::endl; USE_CONST_v0f;
USE_CONST_v1f;
// Altivec does not offer a divide instruction, we have to do a reciprocal approximation // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
v4f y = vec_re(b); y_0 = vec_re(b);
std::cout << "1/b: " << y << std::endl;
// Set up some constants for inverse reciprocals // Do one Newton-Raphson iteration to get the needed accuracy
vector unsigned int v1_; t = vec_nmsub(y_0, b, v1f);
v4f v0_, t; y_1 = vec_madd(y_0, t, y_0);
v1_ = vec_splat_u32(-1);
std::cout << "-1: " << (v4i) v1_ << std::endl; res = vec_madd(a, y_1, v0f);
v0_ = (v4f) vec_sl(v1_, v1_);
std::cout << "-0.0: " << v0_ << std::endl;
// Do a Newton-Raphson iteration to get the needed accuracy
t = vec_nmsub(y, b, v1f);
std::cout << "t: " << t << std::endl;
y = vec_nmsub(t, y, v0_);
v4f res = vec_madd(a, y, v0_);
std::cout << "res: " << res << std::endl;
return res; return res;
} }
@ -336,6 +326,7 @@ inline v4i ei_preduxp(const v4i* vecs)
inline int ei_predux(const v4i& a) inline int ei_predux(const v4i& a)
{ {
USE_CONST_v0i;
v4i sum; v4i sum;
sum = vec_sums(a, v0i); sum = vec_sums(a, v0i);
sum = vec_sld(sum, v0i, 12); sum = vec_sld(sum, v0i, 12);