From 7ed9441ea472031bb9357b5bec80151cae7ed2cb Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 18 Feb 2014 18:06:44 -0800
Subject: [PATCH] Reverted the definition of the EIGEN_ALIGN to its former
 meaning (i.e. a boolean)
 Created a new EIGEN_ALIGN_BYTES define to encode how the data should be
 aligned
 Fixed a few remaining alignment issues exposed when the Eigen code is
 compiled with avx enabled.
 Created a new EIGEN_ALIGN_DEFAULT define, which is set to the minimum
 alignment value required for the chosen instruction set. Use this value
 instead of EIGEN_ALIGN32 to preserve the existing alignment on
 SSE/Altivec/Neon.

---
 Eigen/src/Core/Block.h                        |  2 +-
 Eigen/src/Core/DenseStorage.h                 |  4 +--
 Eigen/src/Core/GeneralProduct.h               |  2 +-
 Eigen/src/Core/Map.h                          |  2 +-
 Eigen/src/Core/MapBase.h                      |  2 +-
 Eigen/src/Core/products/GeneralMatrixMatrix.h |  6 ++--
 Eigen/src/Core/products/GeneralMatrixVector.h |  4 +--
 Eigen/src/Core/util/Macros.h                  | 18 ++++++++---
 Eigen/src/Core/util/Memory.h                  | 32 +++++++++----------
 Eigen/src/Core/util/XprHelper.h               |  2 +-
 test/geo_parametrizedline.cpp                 |  6 ++--
 test/mapped_matrix.cpp                        |  6 ++--
 test/packetmath.cpp                           | 28 ++++++++--------
 13 files changed, 61 insertions(+), 53 deletions(-)

diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h
index 31cd5c72c..e948e14aa 100644
--- a/Eigen/src/Core/Block.h
+++ b/Eigen/src/Core/Block.h
@@ -83,7 +83,7 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprType>
     MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
                        && (InnerStrideAtCompileTime == 1)
                         ? PacketAccessBit : 0,
-    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0,
+    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) ? AlignedBit : 0,
     FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
     FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
     FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h
index 2342b08a1..7264b44c7 100644
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@@ -40,7 +40,7 @@ void check_static_allocation_size()
   */
 template
 struct plain_array
 {
@@ -81,7 +81,7 @@ struct plain_array
 #endif
 
 template
-struct plain_array
+struct plain_array
 {
   EIGEN_USER_ALIGN32 T array[Size];
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h
index e3a165ac6..adda6f784 100644
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@@ -397,7 +397,7 @@ struct gemv_static_vector_if
   internal::plain_array m_data;
   EIGEN_STRONG_INLINE Scalar* data() {
     return ForceAlignment
-            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(15))) + 16)
+            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES)
             : m_data.array;
   }
 #endif
diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h
index 8ea13cfb7..c75a5e95f 100644
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@@ -88,7 +88,7 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
          && ( bool(IsDynamicSize)
             || HasNoOuterStride
             || ( OuterStrideAtCompileTime!=Dynamic
-                  && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%16)==0 ) ),
+                  && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%EIGEN_ALIGN_BYTES)==0 ) ),
     Flags0 = TraitsBase::Flags & (~NestByRefBit),
     Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
     Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h
index ffa1371c2..a45a0b374 100644
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@@ -164,7 +164,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
       EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits<Derived>::Flags&PacketAccessBit,
                                         internal::inner_stride_at_compile_time<Derived>::ret==1),
                           PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
-      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % 16) == 0)
+      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % EIGEN_ALIGN_BYTES) == 0)
                    && "data is not aligned");
     }
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index eb399a824..3dfd239c1 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -286,9 +286,9 @@ class gemm_blocking_space
(tmp0);
       const LhsScalar* lhs0 = lhs + i*lhsStride;
       // process first unaligned result's coeffs
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 787f800b8..d6d5bfa23 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -66,17 +66,22 @@
   #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
 #endif
 
+// Defined the boundary (in bytes) on which the data needs to be aligned. Note
+// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
+// aligned at all regardless of the value of this #define.
+#define EIGEN_ALIGN_BYTES 16
+
 #ifdef EIGEN_DONT_ALIGN
   #ifndef EIGEN_DONT_ALIGN_STATICALLY
     #define EIGEN_DONT_ALIGN_STATICALLY
   #endif
   #define EIGEN_ALIGN 0
-#else
-  #if !defined(EIGEN_DONT_VECTORIZE) && defined(__AVX__)
-    #define EIGEN_ALIGN 32
-  #else
-    #define EIGEN_ALIGN 16
+#elif !defined(EIGEN_DONT_VECTORIZE)
+  #if defined(__AVX__)
+    #undef EIGEN_ALIGN_BYTES
+    #define EIGEN_ALIGN_BYTES 32
   #endif
+  #define EIGEN_ALIGN 1
 #endif
 
 // EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable
@@ -286,15 +291,18 @@
 #define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
 #define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
+#define EIGEN_ALIGN_DEFAULT EIGEN_ALIGN_TO_BOUNDARY(EIGEN_ALIGN_BYTES)
 
 #if EIGEN_ALIGN_STATICALLY
 #define EIGEN_USER_ALIGN_TO_BOUNDARY(n) EIGEN_ALIGN_TO_BOUNDARY(n)
 #define EIGEN_USER_ALIGN16 EIGEN_ALIGN16
 #define EIGEN_USER_ALIGN32 EIGEN_ALIGN32
+#define EIGEN_USER_ALIGN_DEFAULT EIGEN_ALIGN_DEFAULT
 #else
 #define EIGEN_USER_ALIGN_TO_BOUNDARY(n)
 #define EIGEN_USER_ALIGN16
 #define EIGEN_USER_ALIGN32
+#define EIGEN_USER_ALIGN_DEFAULT
 #endif
 
 #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 76bdb6cfc..2f2398bbf 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -32,7 +32,7 @@
 // page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed
 // quite safe, at least within the context of glibc, to equate 64-bit with LP64.
 #if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \
- && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && (EIGEN_ALIGN == 16)
+ && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && (EIGEN_ALIGN_BYTES == 16)
   #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1
 #else
   #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0
@@ -42,14 +42,14 @@
 // See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup
 // FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures
 // See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup
-#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) && (EIGEN_ALIGN == 16)
+#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) && (EIGEN_ALIGN_BYTES == 16)
   #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1
 #else
   #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0
 #endif
 
-#if (defined(__APPLE__) && (EIGEN_ALIGN == 16)) \
- || (defined(_WIN64) && (EIGEN_ALIGN == 16)) \
+#if (defined(__APPLE__) && (EIGEN_ALIGN_BYTES == 16)) \
+ || (defined(_WIN64) && (EIGEN_ALIGN_BYTES == 16)) \
  || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \
  || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED
   #define EIGEN_MALLOC_ALREADY_ALIGNED 1
@@ -105,9 +105,9 @@ inline void throw_std_bad_alloc()
   */
 inline void* handmade_aligned_malloc(std::size_t size)
 {
-  void *original = std::malloc(size+EIGEN_ALIGN);
+  void *original = std::malloc(size+EIGEN_ALIGN_BYTES);
   if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN-1))) + EIGEN_ALIGN);
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES);
   *(reinterpret_cast<void**>(aligned) - 1) = original;
   return aligned;
 }
@@ -128,9 +128,9 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = 0)
   if (ptr == 0) return handmade_aligned_malloc(size);
   void *original = *(reinterpret_cast<void**>(ptr) - 1);
   std::ptrdiff_t previous_offset = static_cast<char*>(ptr)-static_cast<char*>(original);
-  original = std::realloc(original,size+EIGEN_ALIGN);
+  original = std::realloc(original,size+EIGEN_ALIGN_BYTES);
   if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN-1))) + EIGEN_ALIGN);
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES);
   void *previous_aligned = static_cast<char*>(original)+previous_offset;
   if(aligned!=previous_aligned)
     std::memmove(aligned, previous_aligned, size);
@@ -221,11 +221,11 @@ inline void* aligned_malloc(size_t size)
   #elif EIGEN_MALLOC_ALREADY_ALIGNED
     result = std::malloc(size);
   #elif EIGEN_HAS_POSIX_MEMALIGN
-    if(posix_memalign(&result, EIGEN_ALIGN, size)) result = 0;
+    if(posix_memalign(&result, EIGEN_ALIGN_BYTES, size)) result = 0;
   #elif EIGEN_HAS_MM_MALLOC
-    result = _mm_malloc(size, EIGEN_ALIGN);
+    result = _mm_malloc(size, EIGEN_ALIGN_BYTES);
   #elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
-    result = _aligned_malloc(size, EIGEN_ALIGN);
+    result = _aligned_malloc(size, EIGEN_ALIGN_BYTES);
   #else
     result = handmade_aligned_malloc(size);
   #endif
@@ -275,12 +275,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
   // implements _mm_malloc/_mm_free based on the corresponding _aligned_
   // functions. This may not always be the case and we just try to be safe.
   #if defined(_MSC_VER) && defined(_mm_free)
-    result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN);
+    result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES);
   #else
     result = generic_aligned_realloc(ptr,new_size,old_size);
   #endif
 #elif defined(_MSC_VER)
-  result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN);
+  result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN_BYTES);
 #else
   result = handmade_aligned_realloc(ptr,new_size,old_size);
 #endif
@@ -608,8 +608,8 @@ template<typename T> class aligned_stack_memory_handler
   */
 #ifdef EIGEN_ALLOCA
   // The native alloca() that comes with llvm aligns buffer on 16 bytes even when AVX is enabled.
-  #if defined(__arm__) || EIGEN_ALIGN > 16
-    #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+EIGEN_ALIGN)) & ~(size_t(EIGEN_ALIGN-1))) + EIGEN_ALIGN)
+  #if defined(__arm__) || EIGEN_ALIGN_BYTES > 16
+    #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+EIGEN_ALIGN_BYTES)) & ~(size_t(EIGEN_ALIGN_BYTES-1))) + EIGEN_ALIGN_BYTES)
   #else
     #define EIGEN_ALIGNED_ALLOCA EIGEN_ALLOCA
   #endif
@@ -679,7 +679,7 @@ template<typename T> class aligned_stack_memory_handler
 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
 
 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_ALIGN==0)))
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_ALIGN_BYTES==0)))
 
 /****************************************************************************/
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index 195d9e2e1..a08538aff 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -136,7 +136,7 @@ class compute_matrix_flags
       ((Options&DontAlign)==0)
       && (
 #if EIGEN_ALIGN_STATICALLY
-          ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % 16) == 0))
+          ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0))
 #else
           0
 #endif
diff --git a/test/geo_parametrizedline.cpp b/test/geo_parametrizedline.cpp
index f0462d40a..5a72b3575 100644
--- a/test/geo_parametrizedline.cpp
+++ b/test/geo_parametrizedline.cpp
@@ -66,9 +66,9 @@ template<typename Scalar> void parametrizedline_alignment()
   typedef ParametrizedLine<Scalar,4,AutoAlign> Line4a;
   typedef ParametrizedLine<Scalar,4,DontAlign> Line4u;
 
-  EIGEN_ALIGN16 Scalar array1[8];
-  EIGEN_ALIGN16 Scalar array2[8];
-  EIGEN_ALIGN16 Scalar array3[8+1];
+  EIGEN_ALIGN_DEFAULT Scalar array1[8];
+  EIGEN_ALIGN_DEFAULT Scalar array2[8];
+  EIGEN_ALIGN_DEFAULT Scalar array3[8+1];
   Scalar* array3u = array3+1;
 
   Line4a *p1 = ::new(reinterpret_cast<void*>(array1)) Line4a;
diff --git a/test/mapped_matrix.cpp b/test/mapped_matrix.cpp
index c18e687a5..5eba3ecb3 100644
--- a/test/mapped_matrix.cpp
+++ b/test/mapped_matrix.cpp
@@ -26,7 +26,7 @@ template<typename VectorType> void map_class_vector(const VectorType& m)
   Scalar* array1 = internal::aligned_new<Scalar>(size);
   Scalar* array2 = internal::aligned_new<Scalar>(size);
   Scalar* array3 = new Scalar[size+1];
-  Scalar* array3unaligned = size_t(array3)%16 == 0 ? array3+1 : array3;
+  Scalar* array3unaligned = size_t(array3)%EIGEN_ALIGN_BYTES == 0 ? array3+1 : array3;
   Scalar array4[EIGEN_TESTMAP_MAX_SIZE];
 
   Map<VectorType, Aligned>(array1, size) = VectorType::Random(size);
@@ -64,7 +64,7 @@ template<typename MatrixType> void map_class_matrix(const MatrixType& m)
   for(int i = 0; i < size; i++) array2[i] = Scalar(1);
   Scalar* array3 = new Scalar[size+1];
   for(int i = 0; i < size+1; i++) array3[i] = Scalar(1);
-  Scalar* array3unaligned = size_t(array3)%16 == 0 ? array3+1 : array3;
+  Scalar* array3unaligned = size_t(array3)%EIGEN_ALIGN_BYTES == 0 ? array3+1 : array3;
   Map<MatrixType, Aligned>(array1, rows, cols) = MatrixType::Ones(rows,cols);
   Map<MatrixType>(array2, rows, cols) = Map<MatrixType>(array1, rows, cols);
   Map<MatrixType>(array3unaligned, rows, cols) = Map<MatrixType>(array1, rows, cols);
@@ -90,7 +90,7 @@ template<typename VectorType> void map_static_methods(const VectorType& m)
   Scalar* array1 = internal::aligned_new<Scalar>(size);
   Scalar* array2 = internal::aligned_new<Scalar>(size);
   Scalar* array3 = new Scalar[size+1];
-  Scalar* array3unaligned = size_t(array3)%16 == 0 ? array3+1 : array3;
+  Scalar* array3unaligned = size_t(array3)%EIGEN_ALIGN_BYTES == 0 ? array3+1 : array3;
 
   VectorType::MapAligned(array1, size) = VectorType::Random(size);
   VectorType::Map(array2, size) = VectorType::Map(array1, size);
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index d7c336c22..5a680d1ee 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -106,10 +106,10 @@ template<typename Scalar> void packetmath()
   const int max_size = PacketSize > 4 ? PacketSize : 4;
   const int size = PacketSize*max_size;
-  EIGEN_ALIGN32 Scalar data1[size];
-  EIGEN_ALIGN32 Scalar data2[size];
-  EIGEN_ALIGN32 Packet packets[PacketSize*2];
-  EIGEN_ALIGN32 Scalar ref[size];
+  EIGEN_ALIGN_DEFAULT Scalar data1[size];
+  EIGEN_ALIGN_DEFAULT Scalar data2[size];
+  EIGEN_ALIGN_DEFAULT Packet packets[PacketSize*2];
+  EIGEN_ALIGN_DEFAULT Scalar ref[size];
   RealScalar refvalue = 0;
   for (int i=0; i
 void packetmath_real()
   const int PacketSize = internal::packet_traits<Scalar>::size;
   const int size = PacketSize*4;
-  EIGEN_ALIGN32 Scalar data1[internal::packet_traits<Scalar>::size*4];
-  EIGEN_ALIGN32 Scalar data2[internal::packet_traits<Scalar>::size*4];
-  EIGEN_ALIGN32 Scalar ref[internal::packet_traits<Scalar>::size*4];
+  EIGEN_ALIGN_DEFAULT Scalar data1[internal::packet_traits<Scalar>::size*4];
+  EIGEN_ALIGN_DEFAULT Scalar data2[internal::packet_traits<Scalar>::size*4];
+  EIGEN_ALIGN_DEFAULT Scalar ref[internal::packet_traits<Scalar>::size*4];
   for (int i=0; i
 void packetmath_notcomplex()
   typedef typename internal::packet_traits<Scalar>::type Packet;
   const int PacketSize = internal::packet_traits<Scalar>::size;
-  EIGEN_ALIGN32 Scalar data1[internal::packet_traits<Scalar>::size*4];
-  EIGEN_ALIGN32 Scalar data2[internal::packet_traits<Scalar>::size*4];
-  EIGEN_ALIGN32 Scalar ref[internal::packet_traits<Scalar>::size*4];
+  EIGEN_ALIGN_DEFAULT Scalar data1[internal::packet_traits<Scalar>::size*4];
+  EIGEN_ALIGN_DEFAULT Scalar data2[internal::packet_traits<Scalar>::size*4];
+  EIGEN_ALIGN_DEFAULT Scalar ref[internal::packet_traits<Scalar>::size*4];
 
   Array::Map(data1, internal::packet_traits<Scalar>::size*4).setRandom();
@@ -322,10 +322,10 @@ template<typename Scalar> void packetmath_complex()
   const int PacketSize = internal::packet_traits<Scalar>::size;
   const int size = PacketSize*4;
-  EIGEN_ALIGN32 Scalar data1[PacketSize*4];
-  EIGEN_ALIGN32 Scalar data2[PacketSize*4];
-  EIGEN_ALIGN32 Scalar ref[PacketSize*4];
-  EIGEN_ALIGN32 Scalar pval[PacketSize*4];
+  EIGEN_ALIGN_DEFAULT Scalar data1[PacketSize*4];
+  EIGEN_ALIGN_DEFAULT Scalar data2[PacketSize*4];
+  EIGEN_ALIGN_DEFAULT Scalar ref[PacketSize*4];
+  EIGEN_ALIGN_DEFAULT Scalar pval[PacketSize*4];
   for (int i=0; i
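
The handmade_aligned_malloc, handmade_aligned_realloc, EIGEN_ALIGNED_ALLOCA and gemv_static_vector_if changes above all rely on the same over-allocate-and-round-up idiom, now parameterized by EIGEN_ALIGN_BYTES instead of a hard-coded 16. Below is a minimal standalone sketch of that idiom, assuming only that the boundary is a power of two and that a pointer fits in std::size_t (as Eigen's own helpers assume); the names kAlignBytes, toy_aligned_malloc and toy_aligned_free are illustrative and not part of the patch.

#include <cassert>
#include <cstddef>
#include <cstdlib>

// Stand-in for EIGEN_ALIGN_BYTES (16 for SSE/Altivec/Neon, 32 for AVX).
static const std::size_t kAlignBytes = 32;

// Allocate size bytes aligned on kAlignBytes; the pointer returned by
// std::malloc is stashed just below the aligned address so it can be freed.
void* toy_aligned_malloc(std::size_t size)
{
  void* original = std::malloc(size + kAlignBytes);
  if (original == 0) return 0;
  // Round down to the previous multiple of kAlignBytes, then step up one full
  // boundary: the result stays inside the over-allocated block, and because
  // malloc already returns at least pointer-aligned memory, the gap below the
  // aligned address is large enough to hold the original pointer.
  void* aligned = reinterpret_cast<void*>(
      (reinterpret_cast<std::size_t>(original) & ~(kAlignBytes - 1)) + kAlignBytes);
  *(reinterpret_cast<void**>(aligned) - 1) = original;
  return aligned;
}

void toy_aligned_free(void* ptr)
{
  if (ptr) std::free(*(reinterpret_cast<void**>(ptr) - 1));
}

int main()
{
  void* p = toy_aligned_malloc(1000);
  assert(reinterpret_cast<std::size_t>(p) % kAlignBytes == 0);
  toy_aligned_free(p);
  return 0;
}

With AVX enabled the boundary simply becomes 32, which is why the patch threads EIGEN_ALIGN_BYTES through every spot that previously hard-coded 16 and introduces EIGEN_ALIGN_DEFAULT for stack arrays that used EIGEN_ALIGN16 or EIGEN_ALIGN32 before.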