merged incoming udpates

2025-09-26 16:26:48 +08:00 · 2014-03-20 22:11:13 +08:00 · 2014-03-20 22:11:13 +08:00 · e3fb190edf
commit e3fb190edf
parent cfd3d6ce9c c39a3fa7a1
4 changed files with 93 additions and 111 deletions
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@ -110,7 +110,20 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { re
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set_epi32(from,from,from,from); }
 #else
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set1_ps(from); }
+
+// GCC generates a shufps instruction for set1_ps instead of the more efficient pshufd instruction.
+// However, with AVX, we want it to generate a vbroadcastss.
+// Moreover, we cannot use intrinsics here because then gcc generates crappy code in some cases (see bug 203)
+#if (defined __GNUC__) && (!defined __INTEL_COMPILER) && (!defined __clang__) && (!defined __AVX__)
+  template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
+    Packet4f res;
+    asm("pshufd $0, %[a], %[b]" : [b] "=x" (res) : [a] "x" (from));
+    return res;
+  }
+#else
+  template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set_ps1(from); }
+#endif
+
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set1_epi32(from); }
 #endif
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@ -700,98 +700,42 @@ template<typename T> class aligned_stack_memory_handler
 * \sa \ref TopicStlContainers.
 */
 template<class T>
-class aligned_allocator
+class aligned_allocator : public std::allocator<T>
 {
 public:
-    typedef size_t    size_type;
-    typedef std::ptrdiff_t difference_type;
-    typedef T*        pointer;
-    typedef const T*  const_pointer;
-    typedef T&        reference;
-    typedef const T&  const_reference;
-    typedef T         value_type;
+  typedef size_t          size_type;
+  typedef std::ptrdiff_t  difference_type;
+  typedef T*              pointer;
+  typedef const T*        const_pointer;
+  typedef T&              reference;
+  typedef const T&        const_reference;
+  typedef T               value_type;

-    template<class U>
-    struct rebind
-    {
-        typedef aligned_allocator<U> other;
-    };
+  template<class U>
+  struct rebind
+  {
+    typedef aligned_allocator<U> other;
+  };

-    pointer address( reference value ) const
-    {
-        return &value;
-    }
+  aligned_allocator() : std::allocator<T>() {}

-    const_pointer address( const_reference value ) const
-    {
-        return &value;
-    }
+  aligned_allocator(const aligned_allocator& other) : std::allocator<T>(other) {}

-    aligned_allocator()
-    {
-    }
+  template<class U>
+  aligned_allocator(const aligned_allocator<U>& other) : std::allocator<T>(other) {}

-    aligned_allocator( const aligned_allocator& )
-    {
-    }
+  ~aligned_allocator() {}

-    template<class U>
-    aligned_allocator( const aligned_allocator<U>& )
-    {
-    }
+  pointer allocate(size_type num, const void* /*hint*/ = 0)
+  {
+    internal::check_size_for_overflow<T>(num);
+    return static_cast<pointer>( internal::aligned_malloc(num * sizeof(T)) );
+  }

-    ~aligned_allocator()
-    {
-    }
-
-    size_type max_size() const
-    {
-        return (std::numeric_limits<size_type>::max)();
-    }
-
-    pointer allocate( size_type num, const void* hint = 0 )
-    {
-        EIGEN_UNUSED_VARIABLE(hint);
-        internal::check_size_for_overflow<T>(num);
-        return static_cast<pointer>( internal::aligned_malloc( num * sizeof(T) ) );
-    }
-
-    void construct( pointer p, const T& value )
-    {
-        ::new( p ) T( value );
-    }
-
-#if (__cplusplus >= 201103L)
-    template <typename U, typename... Args>
-    void construct( U* u, Args&&... args)
-    {
-        ::new( static_cast<void*>(u) ) U( std::forward<Args>( args )... );
-    }
-#endif
-
-    void destroy( pointer p )
-    {
-        p->~T();
-    }
-
-#if (__cplusplus >= 201103L)
-    template <typename U>
-    void destroy( U* u )
-    {
-        u->~U();
-    }
-#endif
-
-    void deallocate( pointer p, size_type /*num*/ )
-    {
-        internal::aligned_free( p );
-    }
-
-    bool operator!=(const aligned_allocator<T>& ) const
-    { return false; }
-
-    bool operator==(const aligned_allocator<T>& ) const
-    { return true; }
+  void deallocate(pointer p, size_type /*num*/)
+  {
+    internal::aligned_free(p);
+  }
 };

 //---------- Cache sizes ----------
--- a/unsupported/Eigen/src/IterativeSolvers/MINRES.h
+++ b/unsupported/Eigen/src/IterativeSolvers/MINRES.h
@ -37,22 +37,31 @@ namespace Eigen {
            typedef typename Dest::Scalar Scalar;
            typedef Matrix<Scalar,Dynamic,1> VectorType;

+            // Check for zero rhs
+            const RealScalar rhsNorm2(rhs.squaredNorm());
+            if(rhsNorm2 == 0)
+            {
+                x.setZero();
+                iters = 0;
+                tol_error = 0;
+                return;
+            }
+            
            // initialize
            const int maxIters(iters);  // initialize maxIters to iters
            const int N(mat.cols());    // the size of the matrix
-            const RealScalar rhsNorm2(rhs.squaredNorm());
            const RealScalar threshold2(tol_error*tol_error*rhsNorm2); // convergence threshold (compared to residualNorm2)
            
            // Initialize preconditioned Lanczos
-//            VectorType v_old(N); // will be initialized inside loop
+            VectorType v_old(N); // will be initialized inside loop
            VectorType v( VectorType::Zero(N) ); //initialize v
            VectorType v_new(rhs-mat*x); //initialize v_new
            RealScalar residualNorm2(v_new.squaredNorm());
-//            VectorType w(N); // will be initialized inside loop
+            VectorType w(N); // will be initialized inside loop
            VectorType w_new(precond.solve(v_new)); // initialize w_new
 //            RealScalar beta; // will be initialized inside loop
            RealScalar beta_new2(v_new.dot(w_new));
-            eigen_assert(beta_new2 >= 0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
+            eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
            RealScalar beta_new(sqrt(beta_new2));
            const RealScalar beta_one(beta_new);
            v_new /= beta_new;
@ -62,14 +71,14 @@ namespace Eigen {
            RealScalar c_old(1.0);
            RealScalar s(0.0); // the sine of the Givens rotation
            RealScalar s_old(0.0); // the sine of the Givens rotation
-//            VectorType p_oold(N); // will be initialized in loop
+            VectorType p_oold(N); // will be initialized in loop
            VectorType p_old(VectorType::Zero(N)); // initialize p_old=0
            VectorType p(p_old); // initialize p=0
            RealScalar eta(1.0);
                        
            iters = 0; // reset iters
-            while ( iters < maxIters ){
-                
+            while ( iters < maxIters )
+            {
                // Preconditioned Lanczos
                /* Note that there are 4 variants on the Lanczos algorithm. These are
                 * described in Paige, C. C. (1972). Computational variants of
@ -81,17 +90,17 @@ namespace Eigen {
                 * A. Greenbaum, Iterative Methods for Solving Linear Systems, SIAM (1987).
                 */
                const RealScalar beta(beta_new);
-//                v_old = v; // update: at first time step, this makes v_old = 0 so value of beta doesn't matter
-                const VectorType v_old(v); // NOT SURE IF CREATING v_old EVERY ITERATION IS EFFICIENT
+                v_old = v; // update: at first time step, this makes v_old = 0 so value of beta doesn't matter
+//                const VectorType v_old(v); // NOT SURE IF CREATING v_old EVERY ITERATION IS EFFICIENT
                v = v_new; // update
-//                w = w_new; // update
-                const VectorType w(w_new); // NOT SURE IF CREATING w EVERY ITERATION IS EFFICIENT
+                w = w_new; // update
+//                const VectorType w(w_new); // NOT SURE IF CREATING w EVERY ITERATION IS EFFICIENT
                v_new.noalias() = mat*w - beta*v_old; // compute v_new
                const RealScalar alpha = v_new.dot(w);
                v_new -= alpha*v; // overwrite v_new
                w_new = precond.solve(v_new); // overwrite w_new
                beta_new2 = v_new.dot(w_new); // compute beta_new
-                eigen_assert(beta_new2 >= 0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
+                eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
                beta_new = sqrt(beta_new2); // compute beta_new
                v_new /= beta_new; // overwrite v_new for next iteration
                w_new /= beta_new; // overwrite w_new for next iteration
@ -107,28 +116,34 @@ namespace Eigen {
                s=beta_new/r1; // new sine
                
                // Update solution
-//                p_oold = p_old;
-                const VectorType p_oold(p_old); // NOT SURE IF CREATING p_oold EVERY ITERATION IS EFFICIENT
+                p_oold = p_old;
+//                const VectorType p_oold(p_old); // NOT SURE IF CREATING p_oold EVERY ITERATION IS EFFICIENT
                p_old = p;
                p.noalias()=(w-r2*p_old-r3*p_oold) /r1; // IS NOALIAS REQUIRED?
                x += beta_one*c*eta*p;
+                
+                /* Update the squared residual. Note that this is the estimated residual.
+                The real residual |Ax-b|^2 may be slightly larger */
                residualNorm2 *= s*s;
                
-                if ( residualNorm2 < threshold2){
+                if ( residualNorm2 < threshold2)
+                {
                    break;
                }
                
                eta=-s*eta; // update eta
                iters++; // increment iteration number (for output purposes)
            }
-            tol_error = std::sqrt(residualNorm2 / rhsNorm2); // return error. Note that this is the estimated error. The real error |Ax-b|/|b| may be slightly larger 
+            
+            /* Compute error. Note that this is the estimated error. The real 
+             error |Ax-b|/|b| may be slightly larger */
+            tol_error = std::sqrt(residualNorm2 / rhsNorm2);
        }
        
    }
    
    template< typename _MatrixType, int _UpLo=Lower,
    typename _Preconditioner = IdentityPreconditioner>
-//    typename _Preconditioner = IdentityPreconditioner<typename _MatrixType::Scalar> > // preconditioner must be positive definite
    class MINRES;
    
    namespace internal {
--- a/unsupported/test/minres.cpp
+++ b/unsupported/test/minres.cpp
@ -1,8 +1,8 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
 // Copyright (C) 2012 Giacomo Po <gpo@ucla.edu>
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@ -14,19 +14,29 @@

 template<typename T> void test_minres_T()
 {
-  MINRES<SparseMatrix<T>, Lower, DiagonalPreconditioner<T> > minres_colmajor_diag;
-  MINRES<SparseMatrix<T>, Lower, IdentityPreconditioner    > minres_colmajor_I;
-//  MINRES<SparseMatrix<T>, Lower, IncompleteLUT<T> >           minres_colmajor_ilut;
-  //minres<SparseMatrix<T>, SSORPreconditioner<T> >     minres_colmajor_ssor;
+  // Identity preconditioner
+  MINRES<SparseMatrix<T>, Lower, IdentityPreconditioner    > minres_colmajor_lower_I;
+  MINRES<SparseMatrix<T>, Upper, IdentityPreconditioner    > minres_colmajor_upper_I;
+
+  // Diagonal preconditioner
+  MINRES<SparseMatrix<T>, Lower, DiagonalPreconditioner<T> > minres_colmajor_lower_diag;
+  MINRES<SparseMatrix<T>, Upper, DiagonalPreconditioner<T> > minres_colmajor_upper_diag;
+  
+  // call tests for SPD matrix
+  CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_I) );
+  CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_upper_I) );
+    
+  CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_diag)  );
+  CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_upper_diag)  );
+    
+  // TO DO: symmetric semi-definite matrix
+  // TO DO: symmetric indefinite matrix

-  CALL_SUBTEST( check_sparse_square_solving(minres_colmajor_diag)  );
-  CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_I) );
- // CALL_SUBTEST( check_sparse_square_solving(minres_colmajor_ilut)     );
-  //CALL_SUBTEST( check_sparse_square_solving(minres_colmajor_ssor)     );
 }

 void test_minres()
 {
  CALL_SUBTEST_1(test_minres_T<double>());
-//  CALL_SUBTEST_2(test_minres_T<std::complex<double> >());
+//  CALL_SUBTEST_2(test_minres_T<std::compex<double> >());
+
 }