diff --git a/.hgignore b/.hgignore index dcd9f4431..ebbf746bf 100644 --- a/.hgignore +++ b/.hgignore @@ -13,7 +13,7 @@ core core.* *.bak *~ -build* +*build* *.moc.* *.moc ui_* diff --git a/CMakeLists.txt b/CMakeLists.txt index fe4227cbb..6d74709a3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,7 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR}) message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there. You may need to remove CMakeCache.txt. ") endif() + # Alias Eigen_*_DIR to Eigen3_*_DIR: set(Eigen_SOURCE_DIR ${Eigen3_SOURCE_DIR}) @@ -41,10 +42,13 @@ string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen_minor_ set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}") set(EIGEN_VERSION_NUMBER ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION}) -# if the mercurial program is absent, this will leave the EIGEN_HG_CHANGESET string empty, -# but won't stop CMake. -execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT) -execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT) +# if we are in a mercurial clone +if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.hg) + # if the mercurial program is absent, this will leave the EIGEN_HG_CHANGESET string empty, + # but won't stop CMake. + execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT) + execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT) +endif() # if this is the default (aka development) branch, extract the mercurial changeset number from the hg tip output... if(EIGEN_BRANCH_OUTPUT MATCHES "default") @@ -104,7 +108,7 @@ if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows) option(EIGEN_BUILD_PKGCONFIG "Build pkg-config .pc file for Eigen" ON) endif() -set(CMAKE_INCLUDE_CURRENT_DIR ON) +set(CMAKE_INCLUDE_CURRENT_DIR OFF) option(EIGEN_SPLIT_LARGE_TESTS "Split large tests into smaller executables" ON) @@ -153,11 +157,7 @@ if(NOT MSVC) ei_add_cxx_compiler_flag("-Wdouble-promotion") # ei_add_cxx_compiler_flag("-Wconversion") - # -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6 - # if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0")) - if(NOT CMAKE_COMPILER_IS_GNUCXX) - ei_add_cxx_compiler_flag("-Wshadow") - endif() + ei_add_cxx_compiler_flag("-Wshadow") ei_add_cxx_compiler_flag("-Wno-psabi") ei_add_cxx_compiler_flag("-Wno-variadic-macros") @@ -232,7 +232,10 @@ if(NOT MSVC) option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF) if(EIGEN_TEST_AVX512) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -fabi-version=6 -DEIGEN_ENABLE_AVX512") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -DEIGEN_ENABLE_AVX512") + if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6") + endif() message(STATUS "Enabling AVX512 in tests/examples") endif() @@ -254,6 +257,12 @@ if(NOT MSVC) message(STATUS "Enabling VSX in tests/examples") endif() + option(EIGEN_TEST_MSA "Enable/Disable MSA in tests/examples" OFF) + if(EIGEN_TEST_MSA) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmsa") + message(STATUS "Enabling MSA in tests/examples") + endif() + option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF) if(EIGEN_TEST_NEON) if(EIGEN_TEST_FMA) @@ -271,12 +280,18 @@ if(NOT MSVC) message(STATUS "Enabling NEON in tests/examples") endif() -
option(EIGEN_TEST_ZVECTOR "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF) - if(EIGEN_TEST_ZVECTOR) + option(EIGEN_TEST_Z13 "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF) + if(EIGEN_TEST_Z13) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z13 -mzvector") message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples") endif() + option(EIGEN_TEST_Z14 "Enable/Disable S390X(zEC14) ZVECTOR in tests/examples" OFF) + if(EIGEN_TEST_Z14) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z14 -mzvector") + message(STATUS "Enabling S390X(zEC14) ZVECTOR in tests/examples") + endif() + check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP) if(COMPILER_SUPPORT_OPENMP) option(EIGEN_TEST_OPENMP "Enable/Disable OpenMP in tests/examples" OFF) @@ -363,7 +378,7 @@ option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tens set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code") -include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # Backward compatibility support for EIGEN_INCLUDE_INSTALL_DIR if(EIGEN_INCLUDE_INSTALL_DIR) @@ -437,10 +452,17 @@ endif() # add SYCL option(EIGEN_TEST_SYCL "Add Sycl support." OFF) +option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation (ComputeCPP by default)." OFF) if(EIGEN_TEST_SYCL) set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}") - include(FindComputeCpp) -endif() + if(EIGEN_SYCL_TRISYCL) + message(STATUS "Using triSYCL") + include(FindTriSYCL) + else(EIGEN_SYCL_TRISYCL) + message(STATUS "Using ComputeCPP SYCL") + include(FindComputeCpp) + endif(EIGEN_SYCL_TRISYCL) +endif(EIGEN_TEST_SYCL) add_subdirectory(unsupported) @@ -516,6 +538,7 @@ if (NOT CMAKE_VERSION VERSION_LESS 3.0) # Imported target support add_library (eigen INTERFACE) + add_library (Eigen3::Eigen ALIAS eigen) target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS}) target_include_directories (eigen INTERFACE diff --git a/CTestConfig.cmake b/CTestConfig.cmake index 4c0027824..8b4cd798e 100644 --- a/CTestConfig.cmake +++ b/CTestConfig.cmake @@ -11,7 +11,7 @@ set(CTEST_DROP_METHOD "http") set(CTEST_DROP_SITE "manao.inria.fr") set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen") set(CTEST_DROP_SITE_CDASH TRUE) -set(CTEST_PROJECT_SUBPROJECTS -Official -Unsupported -) +#set(CTEST_PROJECT_SUBPROJECTS +#Official +#Unsupported +#) diff --git a/CTestCustom.cmake.in b/CTestCustom.cmake.in index 9fed9d327..89e487f05 100644 --- a/CTestCustom.cmake.in +++ b/CTestCustom.cmake.in @@ -1,3 +1,4 @@ set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "2000") set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "2000") +list(APPEND CTEST_CUSTOM_ERROR_EXCEPTION @EIGEN_CTEST_ERROR_EXCEPTION@) diff --git a/Eigen/Cholesky b/Eigen/Cholesky index 369d1f5ec..1332b540d 100644 --- a/Eigen/Cholesky +++ b/Eigen/Cholesky @@ -9,6 +9,7 @@ #define EIGEN_CHOLESKY_MODULE_H #include "Core" +#include "Jacobi" #include "src/Core/util/DisableStupidWarnings.h" @@ -31,7 +32,11 @@ #include "src/Cholesky/LLT.h" #include "src/Cholesky/LDLT.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/Cholesky/LLT_LAPACKE.h" #endif diff --git a/Eigen/Core b/Eigen/Core index 884546a2b..7347a2480 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -14,61 +14,26 @@ // first thing Eigen does: stop the compiler from committing suicide #include
"src/Core/util/DisableStupidWarnings.h" -// Handle NVCC/CUDA/SYCL -#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__) - // Do not try asserts on CUDA and SYCL! - #ifndef EIGEN_NO_DEBUG - #define EIGEN_NO_DEBUG - #endif +// then include this file where all our macros are defined. It's really important to do it first because +// it's where we do all the compiler/OS/arch detections and define most defaults. +#include "src/Core/util/Macros.h" - #ifdef EIGEN_INTERNAL_DEBUGGING - #undef EIGEN_INTERNAL_DEBUGGING - #endif +// This detects SSE/AVX/NEON/etc. and configure alignment settings +#include "src/Core/util/ConfigureVectorization.h" - #ifdef EIGEN_EXCEPTIONS - #undef EIGEN_EXCEPTIONS - #endif - - // All functions callable from CUDA code must be qualified with __device__ - #ifdef __CUDACC__ - // Do not try to vectorize on CUDA and SYCL! - #ifndef EIGEN_DONT_VECTORIZE - #define EIGEN_DONT_VECTORIZE - #endif - - #define EIGEN_DEVICE_FUNC __host__ __device__ - // We need math_functions.hpp to ensure that that EIGEN_USING_STD_MATH macro - // works properly on the device side - #include - #else - #define EIGEN_DEVICE_FUNC - #endif -#else - #define EIGEN_DEVICE_FUNC +// We need cuda_runtime.h/hip_runtime.h to ensure that +// the EIGEN_USING_STD_MATH macro works properly on the device side +#if defined(EIGEN_CUDACC) + #include +#elif defined(EIGEN_HIPCC) + #include #endif -// When compiling CUDA device code with NVCC, pull in math functions from the -// global namespace. In host mode, and when device doee with clang, use the -// std versions. -#if defined(__CUDA_ARCH__) && defined(__NVCC__) - #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC; -#else - #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC; -#endif - -#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) - #define EIGEN_EXCEPTIONS -#endif #ifdef EIGEN_EXCEPTIONS #include #endif -// then include this file where all our macros are defined. It's really important to do it first because -// it's where we do all the alignment settings (platform detection and honoring the user's will if he -// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization. -#include "src/Core/util/Macros.h" - // Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3) // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details. #if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6) @@ -81,169 +46,9 @@ // and inclusion of their respective header files #include "src/Core/util/MKL_support.h" -// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into -// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks -#if EIGEN_MAX_ALIGN_BYTES==0 - #ifndef EIGEN_DONT_VECTORIZE - #define EIGEN_DONT_VECTORIZE - #endif -#endif -#if EIGEN_COMP_MSVC - #include // for _aligned_malloc -- need it regardless of whether vectorization is enabled - #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later - // Remember that usage of defined() in a #define is undefined by the standard. - // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP. 
- #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64 - #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER - #endif - #endif -#else - // Remember that usage of defined() in a #define is undefined by the standard - #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) ) - #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC - #endif -#endif - -#ifndef EIGEN_DONT_VECTORIZE - - #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER) - - // Defines symbols for compile-time detection of which instructions are - // used. - // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_SSE - #define EIGEN_VECTORIZE_SSE2 - - // Detect sse3/ssse3/sse4: - // gcc and icc defines __SSE3__, ... - // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you - // want to force the use of those instructions with msvc. - #ifdef __SSE3__ - #define EIGEN_VECTORIZE_SSE3 - #endif - #ifdef __SSSE3__ - #define EIGEN_VECTORIZE_SSSE3 - #endif - #ifdef __SSE4_1__ - #define EIGEN_VECTORIZE_SSE4_1 - #endif - #ifdef __SSE4_2__ - #define EIGEN_VECTORIZE_SSE4_2 - #endif - #ifdef __AVX__ - #define EIGEN_VECTORIZE_AVX - #define EIGEN_VECTORIZE_SSE3 - #define EIGEN_VECTORIZE_SSSE3 - #define EIGEN_VECTORIZE_SSE4_1 - #define EIGEN_VECTORIZE_SSE4_2 - #endif - #ifdef __AVX2__ - #define EIGEN_VECTORIZE_AVX2 - #define EIGEN_VECTORIZE_AVX - #define EIGEN_VECTORIZE_SSE3 - #define EIGEN_VECTORIZE_SSSE3 - #define EIGEN_VECTORIZE_SSE4_1 - #define EIGEN_VECTORIZE_SSE4_2 - #endif - #ifdef __FMA__ - #define EIGEN_VECTORIZE_FMA - #endif - #if defined(__AVX512F__) - #define EIGEN_VECTORIZE_AVX512 - #define EIGEN_VECTORIZE_AVX2 - #define EIGEN_VECTORIZE_AVX - #define EIGEN_VECTORIZE_FMA - #define EIGEN_VECTORIZE_SSE3 - #define EIGEN_VECTORIZE_SSSE3 - #define EIGEN_VECTORIZE_SSE4_1 - #define EIGEN_VECTORIZE_SSE4_2 - #ifdef __AVX512DQ__ - #define EIGEN_VECTORIZE_AVX512DQ - #endif - #endif - - // include files - - // This extern "C" works around a MINGW-w64 compilation issue - // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354 - // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do). - // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations - // with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know; - // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too. - // notice that since these are C headers, the extern "C" is theoretically needed anyways. - extern "C" { - // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly. - // Doing so triggers some issues with ICC. 
However old gcc versions seems to not have this file, thus: - #if EIGEN_COMP_ICC >= 1110 - #include - #else - #include - #include - #include - #ifdef EIGEN_VECTORIZE_SSE3 - #include - #endif - #ifdef EIGEN_VECTORIZE_SSSE3 - #include - #endif - #ifdef EIGEN_VECTORIZE_SSE4_1 - #include - #endif - #ifdef EIGEN_VECTORIZE_SSE4_2 - #include - #endif - #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512) - #include - #endif - #endif - } // end extern "C" - #elif defined __VSX__ - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_VSX - #include - // We need to #undef all these ugly tokens defined in - // => use __vector instead of vector - #undef bool - #undef vector - #undef pixel - #elif defined __ALTIVEC__ - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_ALTIVEC - #include - // We need to #undef all these ugly tokens defined in - // => use __vector instead of vector - #undef bool - #undef vector - #undef pixel - #elif (defined __ARM_NEON) || (defined __ARM_NEON__) - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_NEON - #include - #elif (defined __s390x__ && defined __VEC__) - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_ZVECTOR - #include - #endif -#endif - -#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG) - // We can use the optimized fp16 to float and float to fp16 conversion routines - #define EIGEN_HAS_FP16_C -#endif - -#if defined __CUDACC__ - #define EIGEN_VECTORIZE_CUDA - #include - #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 - #define EIGEN_HAS_CUDA_FP16 - #endif -#endif - -#if defined EIGEN_HAS_CUDA_FP16 - #include - #include +#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) + #define EIGEN_HAS_GPU_FP16 #endif #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) @@ -275,6 +80,10 @@ // for min/max: #include +#if EIGEN_HAS_CXX11 +#include +#endif + // for std::is_nothrow_move_assignable #ifdef EIGEN_INCLUDE_TYPE_TRAITS #include @@ -299,38 +108,6 @@ #include #endif -/** \brief Namespace containing all symbols from the %Eigen library. 
*/ -namespace Eigen { - -inline static const char *SimdInstructionSetsInUse(void) { -#if defined(EIGEN_VECTORIZE_AVX512) - return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; -#elif defined(EIGEN_VECTORIZE_AVX) - return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; -#elif defined(EIGEN_VECTORIZE_SSE4_2) - return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; -#elif defined(EIGEN_VECTORIZE_SSE4_1) - return "SSE, SSE2, SSE3, SSSE3, SSE4.1"; -#elif defined(EIGEN_VECTORIZE_SSSE3) - return "SSE, SSE2, SSE3, SSSE3"; -#elif defined(EIGEN_VECTORIZE_SSE3) - return "SSE, SSE2, SSE3"; -#elif defined(EIGEN_VECTORIZE_SSE2) - return "SSE, SSE2"; -#elif defined(EIGEN_VECTORIZE_ALTIVEC) - return "AltiVec"; -#elif defined(EIGEN_VECTORIZE_VSX) - return "VSX"; -#elif defined(EIGEN_VECTORIZE_NEON) - return "ARM NEON"; -#elif defined(EIGEN_VECTORIZE_ZVECTOR) - return "S390X ZVECTOR"; -#else - return "None"; -#endif -} - -} // end namespace Eigen #if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT // This will generate an error message: @@ -339,7 +116,7 @@ inline static const char *SimdInstructionSetsInUse(void) { namespace Eigen { -// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to +// we use size_t frequently and we'll never remember to prepend it with std:: every time just to // ensure QNX/QCC support using std::size_t; // gcc 4.6.0 wants std:: for ptrdiff_t @@ -366,11 +143,11 @@ using std::ptrdiff_t; #include "src/Core/util/IntegralConstant.h" #include "src/Core/util/SymbolicIndex.h" - #include "src/Core/NumTraits.h" #include "src/Core/MathFunctions.h" #include "src/Core/GenericPacketMath.h" #include "src/Core/MathFunctionsImpl.h" +#include "src/Core/arch/Default/ConjHelper.h" #if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/SSE/PacketMath.h" @@ -388,6 +165,7 @@ using std::ptrdiff_t; #include "src/Core/arch/AVX/MathFunctions.h" #include "src/Core/arch/AVX/Complex.h" #include "src/Core/arch/AVX/TypeCasting.h" + #include "src/Core/arch/SSE/TypeCasting.h" #elif defined EIGEN_VECTORIZE_SSE #include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/MathFunctions.h" @@ -401,22 +179,33 @@ using std::ptrdiff_t; #include "src/Core/arch/NEON/PacketMath.h" #include "src/Core/arch/NEON/MathFunctions.h" #include "src/Core/arch/NEON/Complex.h" + #include "src/Core/arch/NEON/TypeCasting.h" #elif defined EIGEN_VECTORIZE_ZVECTOR #include "src/Core/arch/ZVector/PacketMath.h" #include "src/Core/arch/ZVector/MathFunctions.h" #include "src/Core/arch/ZVector/Complex.h" +#elif defined EIGEN_VECTORIZE_MSA + #include "src/Core/arch/MSA/PacketMath.h" + #include "src/Core/arch/MSA/MathFunctions.h" + #include "src/Core/arch/MSA/Complex.h" #endif // Half float support -#include "src/Core/arch/CUDA/Half.h" -#include "src/Core/arch/CUDA/PacketMathHalf.h" -#include "src/Core/arch/CUDA/TypeCasting.h" +#include "src/Core/arch/GPU/Half.h" +#include "src/Core/arch/GPU/PacketMathHalf.h" +#include "src/Core/arch/GPU/TypeCasting.h" -#if defined EIGEN_VECTORIZE_CUDA - #include "src/Core/arch/CUDA/PacketMath.h" - #include "src/Core/arch/CUDA/MathFunctions.h" +#if defined EIGEN_VECTORIZE_GPU + #include "src/Core/arch/GPU/PacketMath.h" + #include "src/Core/arch/GPU/MathFunctions.h" #endif +#if defined EIGEN_VECTORIZE_SYCL + #include "src/Core/arch/SYCL/InteropHeaders.h" + #include 
"src/Core/arch/SYCL/PacketMath.h" + #include "src/Core/arch/SYCL/MathFunctions.h" + #include "src/Core/arch/SYCL/TypeCasting.h" +#endif #include "src/Core/arch/Default/Settings.h" #include "src/Core/functors/TernaryFunctors.h" @@ -428,7 +217,9 @@ using std::ptrdiff_t; // Specialized functors to enable the processing of complex numbers // on CUDA devices +#ifdef EIGEN_CUDACC #include "src/Core/arch/CUDA/Complex.h" +#endif #include "src/Core/util/IndexedViewHelper.h" #include "src/Core/util/ReshapedHelper.h" diff --git a/Eigen/Eigenvalues b/Eigen/Eigenvalues index 009e529e1..7d6ac787b 100644 --- a/Eigen/Eigenvalues +++ b/Eigen/Eigenvalues @@ -10,14 +10,14 @@ #include "Core" -#include "src/Core/util/DisableStupidWarnings.h" - #include "Cholesky" #include "Jacobi" #include "Householder" #include "LU" #include "Geometry" +#include "src/Core/util/DisableStupidWarnings.h" + /** \defgroup Eigenvalues_Module Eigenvalues module * * @@ -45,7 +45,11 @@ #include "src/Eigenvalues/GeneralizedEigenSolver.h" #include "src/Eigenvalues/MatrixBaseEigenvalues.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/Eigenvalues/RealSchur_LAPACKE.h" #include "src/Eigenvalues/ComplexSchur_LAPACKE.h" #include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h" diff --git a/Eigen/Geometry b/Eigen/Geometry index 131a4edfc..04aa316cb 100644 --- a/Eigen/Geometry +++ b/Eigen/Geometry @@ -10,12 +10,12 @@ #include "Core" -#include "src/Core/util/DisableStupidWarnings.h" - #include "SVD" #include "LU" #include +#include "src/Core/util/DisableStupidWarnings.h" + /** \defgroup Geometry_Module Geometry module * * This module provides support for: diff --git a/Eigen/KLUSupport b/Eigen/KLUSupport new file mode 100644 index 000000000..b23d90535 --- /dev/null +++ b/Eigen/KLUSupport @@ -0,0 +1,41 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_KLUSUPPORT_MODULE_H +#define EIGEN_KLUSUPPORT_MODULE_H + +#include + +#include + +extern "C" { +#include +#include + } + +/** \ingroup Support_modules + * \defgroup KLUSupport_Module KLUSupport module + * + * This module provides an interface to the KLU library which is part of the suitesparse package. + * It provides the following factorization class: + * - class KLU: a sparse LU factorization, well-suited for circuit simulation. + * + * \code + * #include + * \endcode + * + * In order to use this module, the klu and btf headers must be accessible from the include paths, and your binary must be linked to the klu library and its dependencies. + * The dependencies depend on how umfpack has been compiled. + * For a cmake based project, you can use our FindKLU.cmake module to help you in this task. 
+ * + */ + +#include "src/KLUSupport/KLUSupport.h" + +#include <Eigen/src/Core/util/ReenableStupidWarnings.h> + +#endif // EIGEN_KLUSUPPORT_MODULE_H diff --git a/Eigen/LU b/Eigen/LU index 6f6c55629..6418a86e1 100644 --- a/Eigen/LU +++ b/Eigen/LU @@ -28,7 +28,11 @@ #include "src/LU/FullPivLU.h" #include "src/LU/PartialPivLU.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/LU/PartialPivLU_LAPACKE.h" #endif #include "src/LU/Determinant.h" diff --git a/Eigen/PaStiXSupport b/Eigen/PaStiXSupport index de3a63b4d..234619acc 100644 --- a/Eigen/PaStiXSupport +++ b/Eigen/PaStiXSupport @@ -36,6 +36,7 @@ extern "C" { * \endcode * * In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be linked to the PaSTiX library and its dependencies. + * This wrapper requires PaStiX version 5.x compiled without MPI support. * The dependencies depend on how PaSTiX has been compiled. * For a cmake based project, you can use our FindPaSTiX.cmake module to help you in this task. * diff --git a/Eigen/PardisoSupport b/Eigen/PardisoSupport old mode 100755 new mode 100644 diff --git a/Eigen/QR b/Eigen/QR index 80838e3bd..1be1863a1 100644 --- a/Eigen/QR +++ b/Eigen/QR @@ -10,12 +10,12 @@ #include "Core" -#include "src/Core/util/DisableStupidWarnings.h" - #include "Cholesky" #include "Jacobi" #include "Householder" +#include "src/Core/util/DisableStupidWarnings.h" + /** \defgroup QR_Module QR module * * @@ -36,7 +36,11 @@ #include "src/QR/ColPivHouseholderQR.h" #include "src/QR/CompleteOrthogonalDecomposition.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/QR/HouseholderQR_LAPACKE.h" #include "src/QR/ColPivHouseholderQR_LAPACKE.h" #endif diff --git a/Eigen/QtAlignedMalloc b/Eigen/QtAlignedMalloc index c6571f129..4f07df02a 100644 --- a/Eigen/QtAlignedMalloc +++ b/Eigen/QtAlignedMalloc @@ -27,7 +27,7 @@ void qFree(void *ptr) void *qRealloc(void *ptr, std::size_t size) { void* newPtr = Eigen::internal::aligned_malloc(size); - memcpy(newPtr, ptr, size); + std::memcpy(newPtr, ptr, size); Eigen::internal::aligned_free(ptr); return newPtr; } diff --git a/Eigen/SVD b/Eigen/SVD index 86143c23d..5d0e75f7f 100644 --- a/Eigen/SVD +++ b/Eigen/SVD @@ -37,7 +37,11 @@ #include "src/SVD/JacobiSVD.h" #include "src/SVD/BDCSVD.h" #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT) +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/SVD/JacobiSVD_LAPACKE.h" #endif diff --git a/Eigen/SparseLU b/Eigen/SparseLU index 38b38b531..37c4a5c5a 100644 --- a/Eigen/SparseLU +++ b/Eigen/SparseLU @@ -23,6 +23,8 @@ // Ordering interface #include "OrderingMethods" +#include "src/Core/util/DisableStupidWarnings.h" + #include "src/SparseLU/SparseLU_gemm_kernel.h" #include "src/SparseLU/SparseLU_Structs.h" @@ -43,4 +45,6 @@ #include "src/SparseLU/SparseLU_Utils.h" #include "src/SparseLU/SparseLU.h" +#include "src/Core/util/ReenableStupidWarnings.h" + #endif // EIGEN_SPARSELU_MODULE_H diff --git a/Eigen/SparseQR b/Eigen/SparseQR index a6f3b7f7d..f5fc5fa7f 100644 --- a/Eigen/SparseQR +++ b/Eigen/SparseQR @@ -28,7 +28,6 @@ * */ -#include "OrderingMethods" #include "src/SparseCore/SparseColEtree.h" #include "src/SparseQR/SparseQR.h" diff --git a/Eigen/src/Cholesky/LDLT.h b/Eigen/src/Cholesky/LDLT.h index fcee7b2e3..2dfeac333 100644 --- a/Eigen/src/Cholesky/LDLT.h +++ b/Eigen/src/Cholesky/LDLT.h @@ -247,8 +247,8 @@
template class LDLT /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, - * \c NumericalIssue if the matrix.appears to be negative. + * \returns \c Success if computation was successful, + * \c NumericalIssue if the factorization failed because of a zero pivot. */ ComputationInfo info() const { @@ -258,7 +258,6 @@ template class LDLT #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif @@ -376,6 +375,8 @@ template<> struct ldlt_inplace if((rs>0) && pivot_is_valid) A21 /= realAkk; + else if(rs>0) + ret = ret && (A21.array()==Scalar(0)).all(); if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed else if(!pivot_is_valid) found_zero_pivot = true; @@ -568,13 +569,14 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons // more precisely, use pseudo-inverse of D (see bug 241) using std::abs; const typename Diagonal::RealReturnType vecD(vectorD()); - // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon - // as motivated by LAPACK's xGELSS: + // In some previous versions, tolerance was set to the max of 1/highest (or rather numeric_limits::min()) + // and the maximal diagonal entry * epsilon as motivated by LAPACK's xGELSS: // RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits::epsilon(),RealScalar(1) / NumTraits::highest()); // However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest // diagonal element is not well justified and leads to numerical issues in some cases. // Moreover, Lapack's xSYTRS routines use 0 for the tolerance. - RealScalar tolerance = RealScalar(1) / NumTraits::highest(); + // Using numeric_limits::min() gives us more robustness to denormals. + RealScalar tolerance = (std::numeric_limits::min)(); for (Index i = 0; i < vecD.size(); ++i) { diff --git a/Eigen/src/Cholesky/LLT.h b/Eigen/src/Cholesky/LLT.h index 87ca8d423..868766365 100644 --- a/Eigen/src/Cholesky/LLT.h +++ b/Eigen/src/Cholesky/LLT.h @@ -24,7 +24,7 @@ template struct LLT_Traits; * * \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. - * The other triangular part won't be read. + * The other triangular part won't be read. * * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite * matrix A such that A = LL^* = U^*U, where L is lower triangular. @@ -41,14 +41,18 @@ template struct LLT_Traits; * Example: \include LLT_example.cpp * Output: \verbinclude LLT_example.out * + * \b Performance: for best performance, it is recommended to use a column-major storage format + * with the Lower triangular part (the default), or, equivalently, a row-major storage format + * with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization + * step, and rank-updates can be up to 3 times slower. + * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. * + * Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered. + * Therefore, the strict lower part does not have to store correct values. 
+ * + * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT */ - /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH) - * Note that during the decomposition, only the upper triangular part of A is considered. Therefore, - * the strict lower part does not have to store correct values. - */ template class LLT { public: @@ -96,7 +100,7 @@ template class LLT compute(matrix.derived()); } - /** \brief Constructs a LDLT factorization from a given matrix + /** \brief Constructs an LLT factorization from a given matrix * * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when * \c MatrixType is a Eigen::Ref. @@ -146,7 +150,7 @@ template class LLT } template - void solveInPlace(MatrixBase &bAndX) const; + void solveInPlace(const MatrixBase &bAndX) const; template LLT& compute(const EigenBase& matrix); @@ -176,8 +180,8 @@ template class LLT /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, - * \c NumericalIssue if the matrix.appears to be negative. + * \returns \c Success if computation was successful, + * \c NumericalIssue if the matrix appears not to be positive definite. */ ComputationInfo info() const { @@ -196,11 +200,10 @@ template class LLT inline Index cols() const { return m_matrix.cols(); } template - LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1); + LLT & rankUpdate(const VectorType& vec, const RealScalar& sigma = 1); #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif @@ -425,7 +428,8 @@ LLT& LLT::compute(const EigenBase eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); m_matrix.resize(size, size); - m_matrix = a.derived(); + if (!internal::is_same_dense(m_matrix, a.derived())) + m_matrix = a.derived(); // Compute matrix L1 norm = max abs column sum. m_l1_norm = RealScalar(0); @@ -454,7 +458,7 @@ LLT& LLT::compute(const EigenBase */ template template -LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma) +LLT<_MatrixType,_UpLo> & LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType); eigen_assert(v.size()==m_matrix.cols()); @@ -485,11 +489,14 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const * * This version avoids a copy when the right hand side matrix b is not needed anymore. * + * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here. + * This function will const_cast it, so constness isn't honored here.
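The two LLT API changes in the hunks above — rankUpdate() now returning LLT& and solveInPlace() taking a const reference — are easiest to see in a short usage sketch. The matrix values below are invented purely for illustration:

#include <Eigen/Cholesky>
#include <iostream>

int main() {
  Eigen::MatrixXd A(2, 2);
  A << 4, 1,
       1, 3;                              // symmetric positive definite
  Eigen::LLT<Eigen::MatrixXd> llt(A);     // uses the Lower triangular part by default

  Eigen::VectorXd v(2), b(2);
  v << 0.5, -0.5;
  b << 1, 2;

  // rankUpdate() returns LLT&, so updates can now be chained:
  llt.rankUpdate(v).rankUpdate(v, -1);    // update with v*v^T, then downdate again

  // solveInPlace() takes a const&, so it also accepts temporary expressions
  // such as a column block; constness is cast away internally (see \warning).
  llt.solveInPlace(b);                    // b is overwritten with the solution
  Eigen::MatrixXd B = Eigen::MatrixXd::Ones(2, 3);
  llt.solveInPlace(B.col(0));
  std::cout << b.transpose() << std::endl;
}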
+ * * \sa LLT::solve(), MatrixBase::llt() */ template template -void LLT::solveInPlace(MatrixBase &bAndX) const +void LLT::solveInPlace(const MatrixBase &bAndX) const { eigen_assert(m_isInitialized && "LLT is not initialized."); eigen_assert(m_matrix.rows()==bAndX.rows()); diff --git a/Eigen/src/CholmodSupport/CholmodSupport.h b/Eigen/src/CholmodSupport/CholmodSupport.h index 61faf43ba..adaf52858 100644 --- a/Eigen/src/CholmodSupport/CholmodSupport.h +++ b/Eigen/src/CholmodSupport/CholmodSupport.h @@ -10,7 +10,7 @@ #ifndef EIGEN_CHOLMODSUPPORT_H #define EIGEN_CHOLMODSUPPORT_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -79,12 +79,12 @@ cholmod_sparse viewAsCholmod(Ref > res.dtype = 0; res.stype = -1; - + if (internal::is_same<_StorageIndex,int>::value) { res.itype = CHOLMOD_INT; } - else if (internal::is_same<_StorageIndex,long>::value) + else if (internal::is_same<_StorageIndex,SuiteSparse_long>::value) { res.itype = CHOLMOD_LONG; } @@ -95,9 +95,9 @@ cholmod_sparse viewAsCholmod(Ref > // setup res.xtype internal::cholmod_configure_matrix<_Scalar>::run(res); - + res.stype = 0; - + return res; } @@ -121,7 +121,7 @@ template cholmod_sparse viewAsCholmod(const SparseSelfAdjointView, UpLo>& mat) { cholmod_sparse res = viewAsCholmod(Ref >(mat.matrix().const_cast_derived())); - + if(UpLo==Upper) res.stype = 1; if(UpLo==Lower) res.stype = -1; // swap stype for rowmajor matrices (only works for real matrices) @@ -167,12 +167,12 @@ namespace internal { // template specializations for int and long that call the correct cholmod method #define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \ - template ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \ - template<> ret cm_ ## name (cholmod_common &Common) { return cholmod_l_ ## name (&Common); } + template inline ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \ + template<> inline ret cm_ ## name (cholmod_common &Common) { return cholmod_l_ ## name (&Common); } #define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \ - template ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \ - template<> ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); } + template inline ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \ + template<> inline ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); } EIGEN_CHOLMOD_SPECIALIZE0(int, start) EIGEN_CHOLMOD_SPECIALIZE0(int, finish) @@ -183,16 +183,16 @@ EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A) EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A) -template cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); } -template<> cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); } +template inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); } +template<> inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); } -template cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); } -template<> 
cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); } +template inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); } +template<> inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); } template -int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, &Common); } +inline int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, &Common); } template<> -int cm_factorize_p (cholmod_sparse* A, double beta[2], long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); } +inline int cm_factorize_p (cholmod_sparse* A, double beta[2], SuiteSparse_long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); } #undef EIGEN_CHOLMOD_SPECIALIZE0 #undef EIGEN_CHOLMOD_SPECIALIZE1 @@ -254,10 +254,10 @@ class CholmodBase : public SparseSolverBase internal::cm_free_factor(m_cholmodFactor, m_cholmod); internal::cm_finish(m_cholmod); } - + inline StorageIndex cols() const { return internal::convert_index(m_cholmodFactor->n); } inline StorageIndex rows() const { return internal::convert_index(m_cholmodFactor->n); } - + /** \brief Reports whether previous computation was successful. * * \returns \c Success if computation was successful, @@ -276,11 +276,11 @@ class CholmodBase : public SparseSolverBase factorize(matrix); return derived(); } - + /** Performs a symbolic decomposition on the sparsity pattern of \a matrix. * * This function is particularly useful when solving for several problems having the same structure. - * + * * \sa factorize() */ void analyzePattern(const MatrixType& matrix) @@ -292,13 +292,13 @@ class CholmodBase : public SparseSolverBase } cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView()); m_cholmodFactor = internal::cm_analyze(A, m_cholmod); - + this->m_isInitialized = true; this->m_info = Success; m_analysisIsOk = true; m_factorizationIsOk = false; } - + /** Performs a numeric decomposition of \a matrix * * The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed. @@ -315,11 +315,11 @@ class CholmodBase : public SparseSolverBase this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue); m_factorizationIsOk = true; } - + /** Returns a reference to the Cholmod's configuration structure to get a full control over the performed operations. * See the Cholmod user guide for details. */ cholmod_common& cholmod() { return m_cholmod; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN /** \internal */ template @@ -329,7 +329,7 @@ class CholmodBase : public SparseSolverBase const Index size = m_cholmodFactor->n; EIGEN_UNUSED_VARIABLE(size); eigen_assert(size==b.rows()); - + // Cholmod needs column-major storage without inner-stride, which corresponds to the default behavior of Ref. 
Ref > b_ref(b.derived()); @@ -345,7 +345,7 @@ class CholmodBase : public SparseSolverBase dest = Matrix::Map(reinterpret_cast(x_cd->x),b.rows(),b.cols()); internal::cm_free_dense(x_cd, m_cholmod); } - + /** \internal */ template void _solve_impl(const SparseMatrixBase &b, SparseMatrixBase &dest) const @@ -370,8 +370,8 @@ class CholmodBase : public SparseSolverBase internal::cm_free_sparse(x_cs, m_cholmod); } #endif // EIGEN_PARSED_BY_DOXYGEN - - + + /** Sets the shift parameter that will be used to adjust the diagonal coefficients during the numerical factorization. * * During the numerical factorization, an offset term is added to the diagonal coefficients:\n @@ -386,7 +386,7 @@ class CholmodBase : public SparseSolverBase m_shiftOffset[0] = double(offset); return derived(); } - + /** \returns the determinant of the underlying matrix from the current factorization */ Scalar determinant() const { @@ -441,7 +441,7 @@ class CholmodBase : public SparseSolverBase template void dumpMemory(Stream& /*s*/) {} - + protected: mutable cholmod_common m_cholmod; cholmod_factor* m_cholmodFactor; @@ -478,11 +478,11 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl { typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT> Base; using Base::m_cholmod; - + public: - + typedef _MatrixType MatrixType; - + CholmodSimplicialLLT() : Base() { init(); } CholmodSimplicialLLT(const MatrixType& matrix) : Base() @@ -529,11 +529,11 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp { typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT> Base; using Base::m_cholmod; - + public: - + typedef _MatrixType MatrixType; - + CholmodSimplicialLDLT() : Base() { init(); } CholmodSimplicialLDLT(const MatrixType& matrix) : Base() @@ -578,11 +578,11 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper { typedef CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT> Base; using Base::m_cholmod; - + public: - + typedef _MatrixType MatrixType; - + CholmodSupernodalLLT() : Base() { init(); } CholmodSupernodalLLT(const MatrixType& matrix) : Base() @@ -629,11 +629,11 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom { typedef CholmodBase<_MatrixType, _UpLo, CholmodDecomposition> Base; using Base::m_cholmod; - + public: - + typedef _MatrixType MatrixType; - + CholmodDecomposition() : Base() { init(); } CholmodDecomposition(const MatrixType& matrix) : Base() @@ -643,7 +643,7 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom } ~CholmodDecomposition() {} - + void setMode(CholmodMode mode) { switch(mode) diff --git a/Eigen/src/Core/ArithmeticSequence.h b/Eigen/src/Core/ArithmeticSequence.h index ada1571f1..db6da0001 100644 --- a/Eigen/src/Core/ArithmeticSequence.h +++ b/Eigen/src/Core/ArithmeticSequence.h @@ -29,17 +29,17 @@ template struct aseq_negate > { template<> struct aseq_negate > {}; template::value, - bool SizeIsSymbolic =Symbolic::is_symbolic::value> + bool FirstIsSymbolic=symbolic::is_symbolic::value, + bool SizeIsSymbolic =symbolic::is_symbolic::value> struct aseq_reverse_first_type { typedef Index type; }; template struct aseq_reverse_first_type { - typedef Symbolic::AddExpr > >, - Symbolic::ValueExpr > + typedef symbolic::AddExpr > >, + symbolic::ValueExpr > > type; }; @@ -56,14 +56,14 @@ struct aseq_reverse_first_type_aux struct aseq_reverse_first_type { typedef typename aseq_reverse_first_type_aux::type Aux; - typedef Symbolic::AddExpr > type; + typedef 
symbolic::AddExpr > type; }; template struct aseq_reverse_first_type { - typedef Symbolic::AddExpr > >, - Symbolic::ValueExpr >, - Symbolic::ValueExpr<> > type; + typedef symbolic::AddExpr > >, + symbolic::ValueExpr >, + symbolic::ValueExpr<> > type; }; #endif @@ -225,10 +225,11 @@ auto seq(FirstType f, LastType l, IncrType incr) -typename internal::cleanup_index_type::type(f)+CleanedIncrType(incr)) / CleanedIncrType(incr), CleanedIncrType(incr)); } -#else + +#else // EIGEN_HAS_CXX11 template -typename internal::enable_if::value || Symbolic::is_symbolic::value), +typename internal::enable_if::value || symbolic::is_symbolic::value), ArithmeticSequence::type,Index> >::type seq(FirstType f, LastType l) { @@ -237,35 +238,35 @@ seq(FirstType f, LastType l) } template -typename internal::enable_if::value, - ArithmeticSequence,Symbolic::ValueExpr<> >, - Symbolic::ValueExpr > > > >::type -seq(const Symbolic::BaseExpr &f, LastType l) +typename internal::enable_if::value, + ArithmeticSequence,symbolic::ValueExpr<> >, + symbolic::ValueExpr > > > >::type +seq(const symbolic::BaseExpr &f, LastType l) { return seqN(f.derived(),(typename internal::cleanup_index_type::type(l)-f.derived()+fix<1>())); } template -typename internal::enable_if::value, +typename internal::enable_if::value, ArithmeticSequence::type, - Symbolic::AddExpr >, - Symbolic::ValueExpr > > > >::type -seq(FirstType f, const Symbolic::BaseExpr &l) + symbolic::AddExpr >, + symbolic::ValueExpr > > > >::type +seq(FirstType f, const symbolic::BaseExpr &l) { return seqN(typename internal::cleanup_index_type::type(f),(l.derived()-typename internal::cleanup_index_type::type(f)+fix<1>())); } template ArithmeticSequence >,Symbolic::ValueExpr > > > -seq(const Symbolic::BaseExpr &f, const Symbolic::BaseExpr &l) + symbolic::AddExpr >,symbolic::ValueExpr > > > +seq(const symbolic::BaseExpr &f, const symbolic::BaseExpr &l) { return seqN(f.derived(),(l.derived()-f.derived()+fix<1>())); } template -typename internal::enable_if::value || Symbolic::is_symbolic::value), +typename internal::enable_if::value || symbolic::is_symbolic::value), ArithmeticSequence::type,Index,typename internal::cleanup_seq_incr::type> >::type seq(FirstType f, LastType l, IncrType incr) { @@ -275,27 +276,27 @@ seq(FirstType f, LastType l, IncrType incr) } template -typename internal::enable_if::value, +typename internal::enable_if::value, ArithmeticSequence, - Symbolic::ValueExpr<> >, - Symbolic::ValueExpr::type> >, - Symbolic::ValueExpr::type> >, + symbolic::QuotientExpr, + symbolic::ValueExpr<> >, + symbolic::ValueExpr::type> >, + symbolic::ValueExpr::type> >, typename internal::cleanup_seq_incr::type> >::type -seq(const Symbolic::BaseExpr &f, LastType l, IncrType incr) +seq(const symbolic::BaseExpr &f, LastType l, IncrType incr) { typedef typename internal::cleanup_seq_incr::type CleanedIncrType; return seqN(f.derived(),(typename internal::cleanup_index_type::type(l)-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr); } template -typename internal::enable_if::value, +typename internal::enable_if::value, ArithmeticSequence::type, - Symbolic::QuotientExpr >, - Symbolic::ValueExpr::type> >, - Symbolic::ValueExpr::type> >, + symbolic::QuotientExpr >, + symbolic::ValueExpr::type> >, + symbolic::ValueExpr::type> >, typename internal::cleanup_seq_incr::type> >::type -seq(FirstType f, const Symbolic::BaseExpr &l, IncrType incr) +seq(FirstType f, const symbolic::BaseExpr &l, IncrType incr) { typedef typename internal::cleanup_seq_incr::type CleanedIncrType; return 
seqN(typename internal::cleanup_index_type::type(f), @@ -304,26 +305,55 @@ seq(FirstType f, const Symbolic::BaseExpr &l, IncrType incr) template ArithmeticSequence >, - Symbolic::ValueExpr::type> >, - Symbolic::ValueExpr::type> >, + symbolic::QuotientExpr >, + symbolic::ValueExpr::type> >, + symbolic::ValueExpr::type> >, typename internal::cleanup_seq_incr::type> -seq(const Symbolic::BaseExpr &f, const Symbolic::BaseExpr &l, IncrType incr) +seq(const symbolic::BaseExpr &f, const symbolic::BaseExpr &l, IncrType incr) { typedef typename internal::cleanup_seq_incr::type CleanedIncrType; return seqN(f.derived(),(l.derived()-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr); } -#endif +#endif // EIGEN_HAS_CXX11 #endif // EIGEN_PARSED_BY_DOXYGEN + +#if EIGEN_HAS_CXX11 +/** \cpp11 + * \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr. + * + * It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode + * + * \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */ +template +auto lastN(SizeType size, IncrType incr) +-> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr)) +{ + return seqN(Eigen::last-(size-fix<1>())*incr, size, incr); +} + +/** \cpp11 + * \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment. + * + * It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode + * + * \sa lastN(SizeType,IncrType, seqN(FirstType,SizeType), seq(FirstType,LastType) */ +template +auto lastN(SizeType size) +-> decltype(seqN(Eigen::last+fix<1>()-size, size)) +{ + return seqN(Eigen::last+fix<1>()-size, size); +} +#endif + namespace internal { // Convert a symbolic span into a usable one (i.e., remove last/end "keywords") template struct make_size_type { - typedef typename internal::conditional::value, Index, T>::type type; + typedef typename internal::conditional::value, Index, T>::type type; }; template @@ -345,6 +375,39 @@ struct get_compile_time_incr > { } // end namespace internal +/** \namespace Eigen::indexing + * \ingroup Core_Module + * + * The sole purpose of this namespace is to be able to import all functions + * and symbols that are expected to be used within operator() for indexing + * and slicing. If you already imported the whole Eigen namespace: + * \code using namespace Eigen; \endcode + * then you are already all set. 
Otherwise, if you don't want/cannot import + * the whole Eigen namespace, the following line: + * \code using namespace Eigen::indexing; \endcode + * is equivalent to: + * \code + using Eigen::all; + using Eigen::seq; + using Eigen::seqN; + using Eigen::lastN; // c++11 only + using Eigen::last; + using Eigen::lastp1; + using Eigen::fix; + \endcode + */ +namespace indexing { + using Eigen::all; + using Eigen::seq; + using Eigen::seqN; + #if EIGEN_HAS_CXX11 + using Eigen::lastN; + #endif + using Eigen::last; + using Eigen::lastp1; + using Eigen::fix; +} + } // end namespace Eigen #endif // EIGEN_ARITHMETIC_SEQUENCE_H diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h index 0d34269fd..e10020d4f 100644 --- a/Eigen/src/Core/Array.h +++ b/Eigen/src/Core/Array.h @@ -231,10 +231,16 @@ class Array : Base(other) { } + private: + struct PrivateType {}; + public: + /** \sa MatrixBase::operator=(const EigenBase&) */ template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Array(const EigenBase &other) + EIGEN_STRONG_INLINE Array(const EigenBase &other, + typename internal::enable_if::value, + PrivateType>::type = PrivateType()) : Base(other.derived()) { } diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h index af5fb2566..9da960f08 100644 --- a/Eigen/src/Core/ArrayBase.h +++ b/Eigen/src/Core/ArrayBase.h @@ -175,7 +175,7 @@ template class ArrayBase */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator-=(const ArrayBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); @@ -188,7 +188,7 @@ ArrayBase::operator-=(const ArrayBase &other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator+=(const ArrayBase& other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -201,7 +201,7 @@ ArrayBase::operator+=(const ArrayBase& other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator*=(const ArrayBase& other) { call_assignment(derived(), other.derived(), internal::mul_assign_op()); @@ -214,7 +214,7 @@ ArrayBase::operator*=(const ArrayBase& other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator/=(const ArrayBase& other) { call_assignment(derived(), other.derived(), internal::div_assign_op()); diff --git a/Eigen/src/Core/ArrayWrapper.h b/Eigen/src/Core/ArrayWrapper.h index a04521a16..757b31825 100644 --- a/Eigen/src/Core/ArrayWrapper.h +++ b/Eigen/src/Core/ArrayWrapper.h @@ -32,7 +32,8 @@ struct traits > // Let's remove NestByRefBit enum { Flags0 = traits::type >::Flags, - Flags = Flags0 & ~NestByRefBit + LvalueBitFlag = is_lvalue::value ? LvalueBit : 0, + Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag }; }; } @@ -89,8 +90,8 @@ class ArrayWrapper : public ArrayBase > EIGEN_DEVICE_FUNC inline void evalTo(Dest& dst) const { dst = m_expression; } - const typename internal::remove_all::type& EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& nestedExpression() const { return m_expression; @@ -129,7 +130,8 @@ struct traits > // Let's remove NestByRefBit enum { Flags0 = traits::type >::Flags, - Flags = Flags0 & ~NestByRefBit + LvalueBitFlag = is_lvalue::value ? 
LvalueBit : 0, + Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag }; }; } diff --git a/Eigen/src/Core/Assign.h b/Eigen/src/Core/Assign.h index 53806ba33..655412efd 100644 --- a/Eigen/src/Core/Assign.h +++ b/Eigen/src/Core/Assign.h @@ -16,7 +16,7 @@ namespace Eigen { template template -EIGEN_STRONG_INLINE Derived& DenseBase +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase ::lazyAssign(const DenseBase& other) { enum{ diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index b0ec7b7ca..362d905d2 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -39,7 +39,7 @@ public: enum { DstAlignment = DstEvaluator::Alignment, SrcAlignment = SrcEvaluator::Alignment, - DstHasDirectAccess = DstFlags & DirectAccessBit, + DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit, JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment) }; @@ -83,7 +83,7 @@ private: && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0 && (EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment)>=int(InnerRequiredAlignment)), MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit), - MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess + MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess) && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic), /* If the destination isn't aligned, we have to do runtime checks and we don't unroll, so it's only good for large enough sizes. */ @@ -97,7 +97,7 @@ private: public: enum { - Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal) + Traversal = (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal) : int(MayInnerVectorize) ? int(InnerVectorizedTraversal) : int(MayLinearVectorize) ? int(LinearVectorizedTraversal) : int(MaySliceVectorize) ? int(SliceVectorizedTraversal) @@ -756,7 +756,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType // AssignmentKind must define a Kind typedef. template struct AssignmentKind; -// Assignement kind defined in this file: +// Assignment kind defined in this file: struct Dense2Dense {}; struct EigenBase2EigenBase {}; @@ -899,7 +899,7 @@ struct Assignment src.evalTo(dst); } - // NOTE The following two functions are templated to avoid their instanciation if not needed + // NOTE The following two functions are templated to avoid their instantiation if not needed // This is needed because some expressions supports evalTo only and/or have 'void' as scalar type. 
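Stepping back to the ArithmeticSequence hunks a few chunks above: the new lastN() overloads and the Eigen::indexing namespace are easiest to grasp in use. A minimal sketch, assuming C++11 and this development branch's indexed-view operator() (vector contents invented):

#include <Eigen/Dense>
#include <iostream>

int main() {
  using namespace Eigen::indexing;   // imports seq, seqN, lastN, last, lastp1, fix, all
  Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(10, 0, 9);

  std::cout << v(lastN(3)).transpose() << "\n";      // last three coefficients: 7 8 9
  std::cout << v(lastN(4, 2)).transpose() << "\n";   // last four with stride 2: 3 5 7 9
  std::cout << v(seq(2, last)).transpose() << "\n";  // from index 2 to the end
}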
template EIGEN_DEVICE_FUNC diff --git a/Eigen/src/Core/Assign_MKL.h b/Eigen/src/Core/Assign_MKL.h index 6c2ab9264..6866095bf 100755 --- a/Eigen/src/Core/Assign_MKL.h +++ b/Eigen/src/Core/Assign_MKL.h @@ -84,7 +84,8 @@ class vml_assign_traits struct Assignment, SrcXprNested>, assign_op, \ Dense2Dense, typename enable_if::EnableVml>::type> { \ typedef CwiseUnaryOp, SrcXprNested> SrcXprType; \ - static void run(DstXprType &dst, const SrcXprType &src, const assign_op &/*func*/) { \ + static void run(DstXprType &dst, const SrcXprType &src, const assign_op &func) { \ + resize_if_allowed(dst, src, func); \ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ if(vml_assign_traits::Traversal==LinearTraversal) { \ VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \ @@ -144,7 +145,8 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _) Dense2Dense, typename enable_if::EnableVml>::type> { \ typedef CwiseBinaryOp, SrcXprNested, \ const CwiseNullaryOp,Plain> > SrcXprType; \ - static void run(DstXprType &dst, const SrcXprType &src, const assign_op &/*func*/) { \ + static void run(DstXprType &dst, const SrcXprType &src, const assign_op &func) { \ + resize_if_allowed(dst, src, func); \ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ VMLTYPE exponent = reinterpret_cast(src.rhs().functor().m_other); \ if(vml_assign_traits::Traversal==LinearTraversal) \ diff --git a/Eigen/src/Core/BooleanRedux.h b/Eigen/src/Core/BooleanRedux.h index ed607d5d8..ccf519067 100644 --- a/Eigen/src/Core/BooleanRedux.h +++ b/Eigen/src/Core/BooleanRedux.h @@ -76,7 +76,7 @@ struct any_unroller * \sa any(), Cwise::operator<() */ template -inline bool DenseBase::all() const +EIGEN_DEVICE_FUNC inline bool DenseBase::all() const { typedef internal::evaluator Evaluator; enum { @@ -100,7 +100,7 @@ inline bool DenseBase::all() const * \sa all() */ template -inline bool DenseBase::any() const +EIGEN_DEVICE_FUNC inline bool DenseBase::any() const { typedef internal::evaluator Evaluator; enum { @@ -124,7 +124,7 @@ inline bool DenseBase::any() const * \sa all(), any() */ template -inline Eigen::Index DenseBase::count() const +EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase::count() const { return derived().template cast().template cast().sum(); } diff --git a/Eigen/src/Core/CommaInitializer.h b/Eigen/src/Core/CommaInitializer.h index d218e9814..35fdbb819 100644 --- a/Eigen/src/Core/CommaInitializer.h +++ b/Eigen/src/Core/CommaInitializer.h @@ -141,7 +141,7 @@ struct CommaInitializer * \sa CommaInitializer::finished(), class CommaInitializer */ template -inline CommaInitializer DenseBase::operator<< (const Scalar& s) +EIGEN_DEVICE_FUNC inline CommaInitializer DenseBase::operator<< (const Scalar& s) { return CommaInitializer(*static_cast(this), s); } @@ -149,7 +149,7 @@ inline CommaInitializer DenseBase::operator<< (const Scalar& s /** \sa operator<<(const Scalar&) */ template template -inline CommaInitializer +EIGEN_DEVICE_FUNC inline CommaInitializer DenseBase::operator<<(const DenseBase& other) { return CommaInitializer(*static_cast(this), other); diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 412f5a661..264446f65 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -134,19 +134,21 @@ private: // this helper permits to completely eliminate m_outerStride if it is known at compiletime. 
template class plainobjectbase_evaluator_data { public: - plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr) + EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr) { - EIGEN_ONLY_USED_FOR_DEBUG(outerStride); +#ifndef EIGEN_INTERNAL_DEBUGGING + EIGEN_UNUSED_VARIABLE(outerStride); +#endif eigen_internal_assert(outerStride==OuterStride); } - Index outerStride() const { return OuterStride; } + EIGEN_DEVICE_FUNC Index outerStride() const { return OuterStride; } const Scalar *data; }; template class plainobjectbase_evaluator_data { public: - plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {} - Index outerStride() const { return m_outerStride; } + EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {} + EIGEN_DEVICE_FUNC Index outerStride() const { return m_outerStride; } const Scalar *data; protected: Index m_outerStride; @@ -1034,7 +1036,7 @@ struct evaluator > OuterStrideAtCompileTime = HasSameStorageOrderAsArgType ? int(outer_stride_at_compile_time::ret) : int(inner_stride_at_compile_time::ret), - MaskPacketAccessBit = (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0, + MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0, FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator::Flags&LinearAccessBit))) ? LinearAccessBit : 0, FlagsRowMajorBit = XprType::Flags&RowMajorBit, @@ -1044,7 +1046,9 @@ struct evaluator > Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit, PacketAlignment = unpacket_traits::alignment, - Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0, + Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) + && (OuterStrideAtCompileTime!=0) + && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0, Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, Alignment0) }; typedef block_evaluator block_evaluator_type; @@ -1075,14 +1079,16 @@ struct unary_evaluator, IndexBa EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block) : m_argImpl(block.nestedExpression()), m_startRow(block.startRow()), - m_startCol(block.startCol()) + m_startCol(block.startCol()), + m_linear_offset(ForwardLinearAccess?(ArgType::IsRowMajor ? block.startRow()*block.nestedExpression().cols() + block.startCol() : block.startCol()*block.nestedExpression().rows() + block.startRow()):0) { } typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; enum { - RowsAtCompileTime = XprType::RowsAtCompileTime + RowsAtCompileTime = XprType::RowsAtCompileTime, + ForwardLinearAccess = (InnerPanel || int(XprType::IsRowMajor)==int(ArgType::IsRowMajor)) && bool(evaluator::Flags&LinearAccessBit) }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -1094,7 +1100,10 @@ struct unary_evaluator, IndexBa EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); + if (ForwardLinearAccess) + return m_argImpl.coeff(m_linear_offset.value() + index); + else + return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? 
index : 0); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -1106,7 +1115,10 @@ struct unary_evaluator, IndexBa EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); + if (ForwardLinearAccess) + return m_argImpl.coeffRef(m_linear_offset.value() + index); + else + return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); } template @@ -1120,8 +1132,11 @@ struct unary_evaluator, IndexBa EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return packet(RowsAtCompileTime == 1 ? 0 : index, - RowsAtCompileTime == 1 ? index : 0); + if (ForwardLinearAccess) + return m_argImpl.template packet(m_linear_offset.value() + index); + else + return packet(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? index : 0); } template @@ -1135,15 +1150,19 @@ struct unary_evaluator, IndexBa EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { - return writePacket(RowsAtCompileTime == 1 ? 0 : index, - RowsAtCompileTime == 1 ? index : 0, - x); + if (ForwardLinearAccess) + return m_argImpl.template writePacket(m_linear_offset.value() + index, x); + else + return writePacket(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? index : 0, + x); } protected: evaluator m_argImpl; const variable_if_dynamic m_startRow; const variable_if_dynamic m_startCol; + const variable_if_dynamic m_linear_offset; }; // TODO: This evaluator does not actually use the child evaluator; diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h index a36765e39..bf2632d9e 100644 --- a/Eigen/src/Core/CwiseBinaryOp.h +++ b/Eigen/src/Core/CwiseBinaryOp.h @@ -158,7 +158,7 @@ public: */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & MatrixBase::operator-=(const MatrixBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); @@ -171,7 +171,7 @@ MatrixBase::operator-=(const MatrixBase &other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & MatrixBase::operator+=(const MatrixBase& other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -181,4 +181,3 @@ MatrixBase::operator+=(const MatrixBase& other) } // end namespace Eigen #endif // EIGEN_CWISE_BINARY_OP_H - diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index dd498f758..b1923da0f 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h @@ -105,7 +105,7 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp template -EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> DenseBase::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func) { return CwiseNullaryOp(rows, cols, func); @@ -131,7 +131,7 @@ DenseBase::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f */ template template -EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -150,7 +150,7 @@ DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) */ template template -EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> DenseBase::NullaryExpr(const 
CustomNullaryOp& func) { return CwiseNullaryOp(RowsAtCompileTime, ColsAtCompileTime, func); @@ -170,7 +170,7 @@ DenseBase::NullaryExpr(const CustomNullaryOp& func) * \sa class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Constant(Index rows, Index cols, const Scalar& value) { return DenseBase::NullaryExpr(rows, cols, internal::scalar_constant_op(value)); @@ -192,7 +192,7 @@ DenseBase::Constant(Index rows, Index cols, const Scalar& value) * \sa class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Constant(Index size, const Scalar& value) { return DenseBase::NullaryExpr(size, internal::scalar_constant_op(value)); @@ -208,7 +208,7 @@ DenseBase::Constant(Index size, const Scalar& value) * \sa class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Constant(const Scalar& value) { EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) @@ -220,7 +220,7 @@ DenseBase::Constant(const Scalar& value) * \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&) */ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -232,7 +232,7 @@ DenseBase::LinSpaced(Sequential_t, Index size, const Scalar& low, const * \sa LinSpaced(Scalar,Scalar) */ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -264,7 +264,7 @@ DenseBase::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig * \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Index size, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -276,7 +276,7 @@ DenseBase::LinSpaced(Index size, const Scalar& low, const Scalar& high) * Special version for fixed size types which does not require the size parameter. 
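A short usage sketch for the nullary expressions annotated in this file; with the EIGEN_DEVICE_FUNC markers added above, the intent (as I read the hunks) is that the same calls also become usable from device code in a CUDA/HIP build:

#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(5, 0.0, 1.0); // 0 0.25 0.5 0.75 1
  Eigen::Vector3d c = Eigen::Vector3d::Constant(2.5);          // 2.5 2.5 2.5
  std::cout << v.transpose() << "\n" << c.transpose() << "\n";
}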
*/ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -286,7 +286,7 @@ DenseBase::LinSpaced(const Scalar& low, const Scalar& high) /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */ template -bool DenseBase::isApproxToConstant +EIGEN_DEVICE_FUNC bool DenseBase::isApproxToConstant (const Scalar& val, const RealScalar& prec) const { typename internal::nested_eval::type self(derived()); @@ -301,7 +301,7 @@ bool DenseBase::isApproxToConstant * * \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */ template -bool DenseBase::isConstant +EIGEN_DEVICE_FUNC bool DenseBase::isConstant (const Scalar& val, const RealScalar& prec) const { return isApproxToConstant(val, prec); @@ -312,7 +312,7 @@ bool DenseBase::isConstant * \sa setConstant(), Constant(), class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) { setConstant(val); } @@ -322,7 +322,7 @@ EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes() */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) { return derived() = Constant(rows(), cols(), val); } @@ -337,7 +337,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&) */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setConstant(Index size, const Scalar& val) { resize(size); @@ -356,7 +356,7 @@ PlainObjectBase::setConstant(Index size, const Scalar& val) * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&) */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setConstant(Index rows, Index cols, const Scalar& val) { resize(rows, cols); @@ -380,7 +380,7 @@ PlainObjectBase::setConstant(Index rows, Index cols, const Scalar& val) * \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op(low,high,newSize)); @@ -400,7 +400,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, con * \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(const Scalar& low, const Scalar& high) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(const Scalar& low, const Scalar& 
high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return setLinSpaced(size(), low, high); @@ -423,7 +423,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(const Scalar& low, * \sa Zero(), Zero(Index) */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Zero(Index rows, Index cols) { return Constant(rows, cols, Scalar(0)); @@ -446,7 +446,7 @@ DenseBase::Zero(Index rows, Index cols) * \sa Zero(), Zero(Index,Index) */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Zero(Index size) { return Constant(size, Scalar(0)); @@ -463,7 +463,7 @@ DenseBase::Zero(Index size) * \sa Zero(Index), Zero(Index,Index) */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Zero() { return Constant(Scalar(0)); @@ -478,7 +478,7 @@ DenseBase::Zero() * \sa class CwiseNullaryOp, Zero() */ template -bool DenseBase::isZero(const RealScalar& prec) const +EIGEN_DEVICE_FUNC bool DenseBase::isZero(const RealScalar& prec) const { typename internal::nested_eval::type self(derived()); for(Index j = 0; j < cols(); ++j) @@ -496,7 +496,7 @@ bool DenseBase::isZero(const RealScalar& prec) const * \sa class CwiseNullaryOp, Zero() */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setZero() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setZero() { return setConstant(Scalar(0)); } @@ -511,7 +511,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setZero() * \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setZero(Index newSize) { resize(newSize); @@ -529,7 +529,7 @@ PlainObjectBase::setZero(Index newSize) * \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setZero(Index rows, Index cols) { resize(rows, cols); @@ -553,7 +553,7 @@ PlainObjectBase::setZero(Index rows, Index cols) * \sa Ones(), Ones(Index), isOnes(), class Ones */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Ones(Index rows, Index cols) { return Constant(rows, cols, Scalar(1)); @@ -576,7 +576,7 @@ DenseBase::Ones(Index rows, Index cols) * \sa Ones(), Ones(Index,Index), isOnes(), class Ones */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Ones(Index newSize) { return Constant(newSize, Scalar(1)); @@ -593,7 +593,7 @@ DenseBase::Ones(Index newSize) * \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Ones() { return Constant(Scalar(1)); @@ -608,7 +608,7 @@ DenseBase::Ones() * \sa class CwiseNullaryOp, Ones() */ template -bool DenseBase::isOnes +EIGEN_DEVICE_FUNC bool DenseBase::isOnes (const RealScalar& prec) const { return isApproxToConstant(Scalar(1), prec); @@ -622,7 +622,7 @@ bool DenseBase::isOnes * 
\sa class CwiseNullaryOp, Ones() */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setOnes() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setOnes() { return setConstant(Scalar(1)); } @@ -637,7 +637,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setOnes() * \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setOnes(Index newSize) { resize(newSize); @@ -655,7 +655,7 @@ PlainObjectBase::setOnes(Index newSize) * \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setOnes(Index rows, Index cols) { resize(rows, cols); @@ -679,7 +679,7 @@ PlainObjectBase::setOnes(Index rows, Index cols) * \sa Identity(), setIdentity(), isIdentity() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType MatrixBase::Identity(Index rows, Index cols) { return DenseBase::NullaryExpr(rows, cols, internal::scalar_identity_op()); @@ -696,7 +696,7 @@ MatrixBase::Identity(Index rows, Index cols) * \sa Identity(Index,Index), setIdentity(), isIdentity() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType MatrixBase::Identity() { EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) @@ -771,7 +771,7 @@ struct setIdentity_impl * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity() */ template -EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity() { return internal::setIdentity_impl::run(derived()); } @@ -787,7 +787,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity() * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity() */ template -EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity(Index rows, Index cols) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity(Index rows, Index cols) { derived().resize(rows, cols); return setIdentity(); @@ -800,7 +800,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity(Index rows, Index * \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index newSize, Index i) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index newSize, Index i) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i); @@ -815,7 +815,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index i) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index i) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return BasisReturnType(SquareMatrixType::Identity(),i); @@ -828,7 +828,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), 
MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitX() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitX() { return Derived::Unit(0); } /** \returns an expression of the Y axis unit vector (0,1{,0}^*) @@ -838,7 +838,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitY() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitY() { return Derived::Unit(1); } /** \returns an expression of the Z axis unit vector (0,0,1{,0}^*) @@ -848,7 +848,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitZ() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitZ() { return Derived::Unit(2); } /** \returns an expression of the W axis unit vector (0,0,0,1) @@ -858,9 +858,45 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitW() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitW() { return Derived::Unit(3); } +/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector + * + * \param i index of the unique coefficient to be set to 1 + * + * \only_for_vectors + * + * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index) + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setUnit(Index i) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + eigen_assert(i +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setUnit(Index newSize, Index i) +{ + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + eigen_assert(i class DenseBase * we are dealing with a column-vector (if there is only one column) or with * a row-vector (if there is only one row). */ + NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2, + /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors, + * and 2 for matrices. + */ + Flags = internal::traits::Flags, /**< This stores expression \ref flags flags which may or may not be inherited by new expressions * constructed from this one. See the \ref flags "list of flags". @@ -296,7 +301,7 @@ template class DenseBase EIGEN_DEVICE_FUNC Derived& operator=(const ReturnByValue& func); - /** \ínternal + /** \internal * Copies \a other into *this without evaluating other. \returns a reference to *this. * \deprecated */ template @@ -395,7 +400,7 @@ template class DenseBase * Notice that in the case of a plain matrix or vector (not an expression) this function just returns * a const reference, in order to avoid a useless copy. * - * \warning Be carefull with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink. 
+ * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink. */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvalReturnType eval() const @@ -484,9 +489,9 @@ template class DenseBase return derived().coeff(0,0); } - bool all() const; - bool any() const; - Index count() const; + EIGEN_DEVICE_FUNC bool all() const; + EIGEN_DEVICE_FUNC bool any() const; + EIGEN_DEVICE_FUNC Index count() const; typedef VectorwiseOp RowwiseReturnType; typedef const VectorwiseOp ConstRowwiseReturnType; diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index 7958feeb9..3c02a1025 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -61,7 +61,7 @@ struct plain_array #if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT) #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) #elif EIGEN_GNUC_AT_LEAST(4,7) - // GCC 4.7 is too aggressive in its optimizations and remove the alignement test based on the fact the array is declared to be aligned. + // GCC 4.7 is too aggressive in its optimizations and remove the alignment test based on the fact the array is declared to be aligned. // See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900 // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined: template @@ -207,7 +207,9 @@ template class DenseSt EIGEN_UNUSED_VARIABLE(rows); EIGEN_UNUSED_VARIABLE(cols); } - EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); } + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { + numext::swap(m_data, other.m_data); + } EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;} EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;} EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {} @@ -267,7 +269,11 @@ template class DenseStorage class DenseStorage class DenseStorage class DenseStorage(m_data, m_rows*m_cols); } EIGEN_DEVICE_FUNC void swap(DenseStorage& other) - { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); } + { + numext::swap(m_data,other.m_data); + numext::swap(m_rows,other.m_rows); + numext::swap(m_cols,other.m_cols); + } EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;} EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;} void conservativeResize(Index size, Index rows, Index cols) @@ -459,14 +475,16 @@ template class DenseStorage(m_data, _Rows*m_cols); } - EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); } + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { + numext::swap(m_data,other.m_data); + numext::swap(m_cols,other.m_cols); + } EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;} EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;} EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols) @@ -533,14 +551,16 @@ template class DenseStorage(m_data, _Cols*m_rows); } - EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); } + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { + numext::swap(m_data,other.m_data); + numext::swap(m_rows,other.m_rows); + } EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;} EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;} void conservativeResize(Index size, Index rows, Index) diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h index 49e711257..563135fb2 100644 --- 
a/Eigen/src/Core/Diagonal.h +++ b/Eigen/src/Core/Diagonal.h @@ -70,7 +70,10 @@ template class Diagonal EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal) EIGEN_DEVICE_FUNC - explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {} + explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) + { + eigen_assert( a_index <= m_matrix.cols() && -a_index <= m_matrix.rows() ); + } EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal) @@ -184,7 +187,7 @@ template class Diagonal * * \sa class Diagonal */ template -inline typename MatrixBase::DiagonalReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalReturnType MatrixBase::diagonal() { return DiagonalReturnType(derived()); @@ -192,7 +195,7 @@ MatrixBase::diagonal() /** This is the const version of diagonal(). */ template -inline typename MatrixBase::ConstDiagonalReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalReturnType MatrixBase::diagonal() const { return ConstDiagonalReturnType(derived()); @@ -210,7 +213,7 @@ MatrixBase::diagonal() const * * \sa MatrixBase::diagonal(), class Diagonal */ template -inline typename MatrixBase::DiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalDynamicIndexReturnType MatrixBase::diagonal(Index index) { return DiagonalDynamicIndexReturnType(derived(), index); @@ -218,7 +221,7 @@ MatrixBase::diagonal(Index index) /** This is the const version of diagonal(Index). */ template -inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType MatrixBase::diagonal(Index index) const { return ConstDiagonalDynamicIndexReturnType(derived(), index); @@ -237,6 +240,7 @@ MatrixBase::diagonal(Index index) const * \sa MatrixBase::diagonal(), class Diagonal */ template template +EIGEN_DEVICE_FUNC inline typename MatrixBase::template DiagonalIndexReturnType::Type MatrixBase::diagonal() { @@ -246,6 +250,7 @@ MatrixBase::diagonal() /** This is the const version of diagonal(). 
*/ template template +EIGEN_DEVICE_FUNC inline typename MatrixBase::template ConstDiagonalIndexReturnType::Type MatrixBase::diagonal() const { diff --git a/Eigen/src/Core/DiagonalMatrix.h b/Eigen/src/Core/DiagonalMatrix.h index ecfdce8ef..4e8297ee6 100644 --- a/Eigen/src/Core/DiagonalMatrix.h +++ b/Eigen/src/Core/DiagonalMatrix.h @@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase EIGEN_DEVICE_FUNC DenseMatrixType toDenseMatrix() const { return derived(); } - + EIGEN_DEVICE_FUNC inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); } EIGEN_DEVICE_FUNC @@ -273,7 +273,7 @@ class DiagonalWrapper * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal() **/ template -inline const DiagonalWrapper +EIGEN_DEVICE_FUNC inline const DiagonalWrapper MatrixBase::asDiagonal() const { return DiagonalWrapper(derived()); diff --git a/Eigen/src/Core/DiagonalProduct.h b/Eigen/src/Core/DiagonalProduct.h index d372b938f..7911d1cd1 100644 --- a/Eigen/src/Core/DiagonalProduct.h +++ b/Eigen/src/Core/DiagonalProduct.h @@ -17,7 +17,7 @@ namespace Eigen { */ template template -inline const Product +EIGEN_DEVICE_FUNC inline const Product MatrixBase::operator*(const DiagonalBase &a_diagonal) const { return Product(derived(),a_diagonal.derived()); diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index 06ef18b8b..11da432b2 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -31,7 +31,8 @@ struct dot_nocheck typedef scalar_conj_product_op::Scalar,typename traits::Scalar> conj_prod; typedef typename conj_prod::result_type ResScalar; EIGEN_DEVICE_FUNC - static inline ResScalar run(const MatrixBase& a, const MatrixBase& b) + EIGEN_STRONG_INLINE + static ResScalar run(const MatrixBase& a, const MatrixBase& b) { return a.template binaryExpr(b).sum(); } @@ -43,7 +44,8 @@ struct dot_nocheck typedef scalar_conj_product_op::Scalar,typename traits::Scalar> conj_prod; typedef typename conj_prod::result_type ResScalar; EIGEN_DEVICE_FUNC - static inline ResScalar run(const MatrixBase& a, const MatrixBase& b) + EIGEN_STRONG_INLINE + static ResScalar run(const MatrixBase& a, const MatrixBase& b) { return a.transpose().template binaryExpr(b).sum(); } @@ -65,6 +67,7 @@ struct dot_nocheck template template EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE typename ScalarBinaryOpTraits::Scalar,typename internal::traits::Scalar>::ReturnType MatrixBase::dot(const MatrixBase& other) const { @@ -90,7 +93,7 @@ MatrixBase::dot(const MatrixBase& other) const * \sa dot(), norm(), lpNorm() */ template -EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::squaredNorm() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::squaredNorm() const { return numext::real((*this).cwiseAbs2().sum()); } @@ -102,7 +105,7 @@ EIGEN_STRONG_INLINE typename NumTraits::Scala * \sa lpNorm(), dot(), squaredNorm() */ template -inline typename NumTraits::Scalar>::Real MatrixBase::norm() const +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::norm() const { return numext::sqrt(squaredNorm()); } @@ -117,7 +120,7 @@ inline typename NumTraits::Scalar>::Real Matr * \sa norm(), normalize() */ template -inline const typename MatrixBase::PlainObject +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject MatrixBase::normalized() const { typedef typename internal::nested_eval::type _Nested; @@ -139,7 +142,7 @@ MatrixBase::normalized() const * \sa norm(), normalized() */ template -inline void MatrixBase::normalize() 
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase::normalize() { RealScalar z = squaredNorm(); // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU @@ -160,7 +163,7 @@ inline void MatrixBase::normalize() * \sa stableNorm(), stableNormalize(), normalized() */ template -inline const typename MatrixBase::PlainObject +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject MatrixBase::stableNormalized() const { typedef typename internal::nested_eval::type _Nested; @@ -185,7 +188,7 @@ MatrixBase::stableNormalized() const * \sa stableNorm(), stableNormalized(), normalize() */ template -inline void MatrixBase::stableNormalize() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase::stableNormalize() { RealScalar w = cwiseAbs().maxCoeff(); RealScalar z = (derived()/w).squaredNorm(); @@ -257,9 +260,9 @@ struct lpNorm_selector template template #ifndef EIGEN_PARSED_BY_DOXYGEN -inline typename NumTraits::Scalar>::Real +EIGEN_DEVICE_FUNC inline typename NumTraits::Scalar>::Real #else -MatrixBase::RealScalar +EIGEN_DEVICE_FUNC MatrixBase::RealScalar #endif MatrixBase::lpNorm() const { diff --git a/Eigen/src/Core/EigenBase.h b/Eigen/src/Core/EigenBase.h index f76995af9..b195506a9 100644 --- a/Eigen/src/Core/EigenBase.h +++ b/Eigen/src/Core/EigenBase.h @@ -14,6 +14,7 @@ namespace Eigen { /** \class EigenBase + * \ingroup Core_Module * * Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T). * @@ -128,6 +129,7 @@ template struct EigenBase */ template template +EIGEN_DEVICE_FUNC Derived& DenseBase::operator=(const EigenBase &other) { call_assignment(derived(), other.derived()); @@ -136,6 +138,7 @@ Derived& DenseBase::operator=(const EigenBase &other) template template +EIGEN_DEVICE_FUNC Derived& DenseBase::operator+=(const EigenBase &other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -144,6 +147,7 @@ Derived& DenseBase::operator+=(const EigenBase &other) template template +EIGEN_DEVICE_FUNC Derived& DenseBase::operator-=(const EigenBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); diff --git a/Eigen/src/Core/Fuzzy.h b/Eigen/src/Core/Fuzzy.h index 3e403a09d..43aa49b2b 100644 --- a/Eigen/src/Core/Fuzzy.h +++ b/Eigen/src/Core/Fuzzy.h @@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector */ template template -bool DenseBase::isApprox( +EIGEN_DEVICE_FUNC bool DenseBase::isApprox( const DenseBase& other, const RealScalar& prec ) const @@ -122,7 +122,7 @@ bool DenseBase::isApprox( * \sa isApprox(), isMuchSmallerThan(const DenseBase&, RealScalar) const */ template -bool DenseBase::isMuchSmallerThan( +EIGEN_DEVICE_FUNC bool DenseBase::isMuchSmallerThan( const typename NumTraits::Real& other, const RealScalar& prec ) const @@ -142,7 +142,7 @@ bool DenseBase::isMuchSmallerThan( */ template template -bool DenseBase::isMuchSmallerThan( +EIGEN_DEVICE_FUNC bool DenseBase::isMuchSmallerThan( const DenseBase& other, const RealScalar& prec ) const diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 0f16cd8e3..43f3b84c8 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -18,18 +18,33 @@ enum { Small = 3 }; +// Define the threshold value to fallback from the generic matrix-matrix product +// implementation (heavy) to the lightweight coeff-based product one. +// See generic_product_impl +// in products/GeneralMatrixMatrix.h for more details. 
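Since the default below is wrapped in #ifndef, a client can tune the GEMM-to-coefficient-based fallback by defining the macro before including Eigen. A sketch (32 is an arbitrary illustrative value, not a recommendation):

#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 32   // must precede the Eigen headers
#include <Eigen/Core>

int main() {
  Eigen::MatrixXf a = Eigen::MatrixXf::Random(8, 8), b = Eigen::MatrixXf::Random(8, 8);
  Eigen::MatrixXf c = a * b;   // small product: eligible for the lightweight path
  return c.size() == 64 ? 0 : 1;
}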
+// TODO This threshold should also be used in the compile-time selector below. +#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD +// This default value has been obtained on a Haswell architecture. +#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20 +#endif + namespace internal { template struct product_type_selector; template struct product_size_category { - enum { is_large = MaxSize == Dynamic || - Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || - (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), - value = is_large ? Large - : Size == 1 ? 1 - : Small + enum { + #ifndef EIGEN_GPU_COMPILE_PHASE + is_large = MaxSize == Dynamic || + Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || + (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), + #else + is_large = 0, + #endif + value = is_large ? Large + : Size == 1 ? 1 + : Small }; }; @@ -148,13 +163,13 @@ template struct gemv_static_vect template struct gemv_static_vector_if { - EIGEN_STRONG_INLINE Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; } }; template struct gemv_static_vector_if { - EIGEN_STRONG_INLINE Scalar* data() { return 0; } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; } }; template @@ -379,10 +394,9 @@ template<> struct gemv_dense_selector * * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*() */ -#ifndef __CUDACC__ - template template +EIGEN_DEVICE_FUNC inline const Product MatrixBase::operator*(const MatrixBase &other) const { @@ -412,8 +426,6 @@ MatrixBase::operator*(const MatrixBase &other) const return Product(derived(), other.derived()); } -#endif // __CUDACC__ - /** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation. * * The returned product will behave like any other expressions: the coefficients of the product will be @@ -428,7 +440,7 @@ MatrixBase::operator*(const MatrixBase &other) const template template const Product -MatrixBase::lazyProduct(const MatrixBase &other) const +EIGEN_DEVICE_FUNC MatrixBase::lazyProduct(const MatrixBase &other) const { enum { ProductIsValid = Derived::ColsAtCompileTime==Dynamic diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index ac5552d3e..b67c41d8a 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -82,7 +82,11 @@ struct default_packet_traits HasPolygamma = 0, HasErf = 0, HasErfc = 0, + HasI0e = 0, + HasI1e = 0, HasIGamma = 0, + HasIGammaDerA = 0, + HasGammaSampleDerAlpha = 0, HasIGammac = 0, HasBetaInc = 0, @@ -231,7 +235,7 @@ pload1(const typename unpacket_traits::type *a) { return pset1( * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]} * Currently, this function is only used for scalar * complex products. */ -template EIGEN_DEVICE_FUNC inline Packet +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet ploaddup(const typename unpacket_traits::type* from) { return *from; } /** \internal \returns a packet with elements of \a *from quadrupled. @@ -279,7 +283,7 @@ inline void pbroadcast2(const typename unpacket_traits::type *a, } /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). 
*/ -template inline Packet +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet plset(const typename unpacket_traits::type& a) { return a; } /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */ @@ -299,7 +303,9 @@ template EIGEN_DEVICE_FUNC inline void pstoreu /** \internal tries to do cache prefetching of \a addr */ template EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) { -#ifdef __CUDA_ARCH__ +#if defined(EIGEN_HIP_DEVICE_COMPILE) + // do nothing +#elif defined(EIGEN_CUDA_ARCH) #if defined(__LP64__) // 64-bit pointer operand constraint for inlined asm asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr)); @@ -324,13 +330,13 @@ preduxp(const Packet* vecs) { return vecs[0]; } template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux(const Packet& a) { return a; } -/** \internal \returns the sum of the elements of \a a by block of 4 elements. +/** \internal \returns the sum of the elements of upper and lower half of \a a if \a a is larger than 4. * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7} * For packet-size smaller or equal to 4, this boils down to a noop. */ template EIGEN_DEVICE_FUNC inline typename conditional<(unpacket_traits::size%8)==0,typename unpacket_traits::half,Packet>::type -predux_downto4(const Packet& a) +predux_half_dowto4(const Packet& a) { return a; } /** \internal \returns the product of the elements of \a a*/ @@ -487,7 +493,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro * by the current computation. */ template -inline Packet ploadt_ro(const typename unpacket_traits::type* from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits::type* from) { return ploadt(from); } @@ -526,7 +532,7 @@ inline void palign(PacketType& first, const PacketType& second) ***************************************************************************/ // Eigen+CUDA does not support complexes. 
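The hunk just below replaces the raw __CUDACC__ test with EIGEN_GPUCC, which, as I read the surrounding changes, is meant to be set when either the CUDA or the HIP compiler drives the build, so one guard covers both. A standalone sketch of that consolidation (MY_GPUCC is a stand-in name, not the real macro):

#include <cstdio>

#if defined(__CUDACC__) || defined(__HIPCC__)
  #define MY_GPUCC   // stand-in for EIGEN_GPUCC
#endif

int main() {
#if !defined(MY_GPUCC)
  std::printf("host compiler: std::complex packet specializations enabled\n");
#else
  std::printf("GPU compiler: complex packet specializations skipped\n");
#endif
}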
-#ifndef __CUDACC__ +#if !defined(EIGEN_GPUCC) template<> inline std::complex pmul(const std::complex& a, const std::complex& b) { return std::complex(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } diff --git a/Eigen/src/Core/GlobalFunctions.h b/Eigen/src/Core/GlobalFunctions.h index 12828a7c3..563df6e84 100644 --- a/Eigen/src/Core/GlobalFunctions.h +++ b/Eigen/src/Core/GlobalFunctions.h @@ -66,6 +66,7 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh) + EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf) @@ -89,7 +90,7 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign) - + /** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent. * * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar). 
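A usage sketch for the pow() overloads documented here and rewritten in the hunk below; the promote_scalar_arg machinery should let the exponent's scalar type differ from the array's as long as it is convertible (assuming the promoted overloads behave like the plain ones):

#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::ArrayXd a = Eigen::ArrayXd::LinSpaced(4, 1.0, 4.0);
  std::cout << Eigen::pow(a, 2)   << "\n"; // int exponent promoted to double
  std::cout << Eigen::pow(2.0, a) << "\n"; // scalar base, array of exponents
}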
@@ -103,17 +104,18 @@ namespace Eigen inline const CwiseBinaryOp,Derived,Constant > pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent); #else - template - inline typename internal::enable_if< !(internal::is_same::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent), - const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,ScalarExponent,pow) >::type - pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent) { - return x.derived().pow(exponent); - } - - template - inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename Derived::Scalar,pow) - pow(const Eigen::ArrayBase& x, const typename Derived::Scalar& exponent) { - return x.derived().pow(exponent); + template + EIGEN_DEVICE_FUNC inline + EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE( + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg::type,pow)) + pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent) + { + typedef typename internal::promote_scalar_arg::type PromotedExponent; + return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedExponent,pow)(x.derived(), + typename internal::plain_constant_type::type(x.derived().rows(), x.derived().cols(), internal::scalar_constant_op(exponent))); } #endif @@ -123,21 +125,21 @@ namespace Eigen * * Example: \include Cwise_array_power_array.cpp * Output: \verbinclude Cwise_array_power_array.out - * + * * \sa ArrayBase::pow() * * \relates ArrayBase */ template inline const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> - pow(const Eigen::ArrayBase& x, const Eigen::ArrayBase& exponents) + pow(const Eigen::ArrayBase& x, const Eigen::ArrayBase& exponents) { return Eigen::CwiseBinaryOp, const Derived, const ExponentDerived>( x.derived(), exponents.derived() ); } - + /** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents. * * This function computes the coefficient-wise power between a scalar and an array of exponents. 
@@ -146,7 +148,7 @@ namespace Eigen * * Example: \include Cwise_scalar_power_array.cpp * Output: \verbinclude Cwise_scalar_power_array.out - * + * * \sa ArrayBase::pow() * * \relates ArrayBase @@ -156,21 +158,17 @@ namespace Eigen inline const CwiseBinaryOp,Constant,Derived> pow(const Scalar& x,const Eigen::ArrayBase& x); #else - template - inline typename internal::enable_if< !(internal::is_same::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar), - const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow) >::type - pow(const Scalar& x, const Eigen::ArrayBase& exponents) - { - return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow)( - typename internal::plain_constant_type::type(exponents.rows(), exponents.cols(), x), exponents.derived() ); - } - - template - inline const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow) - pow(const typename Derived::Scalar& x, const Eigen::ArrayBase& exponents) - { - return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)( - typename internal::plain_constant_type::type(exponents.rows(), exponents.cols(), x), exponents.derived() ); + template + EIGEN_DEVICE_FUNC inline + EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE( + const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg::type,Derived,pow)) + pow(const Scalar& x, const Eigen::ArrayBase& exponents) { + typedef typename internal::promote_scalar_arg::type PromotedScalar; + return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedScalar,Derived,pow)( + typename internal::plain_constant_type::type(exponents.derived().rows(), exponents.derived().cols(), internal::scalar_constant_op(x)), exponents.derived()); } #endif diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 06d196702..c437f1a92 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -20,11 +20,17 @@ struct traits > { typedef traits TraitsBase; enum { + PlainObjectTypeInnerSize = ((traits::Flags&RowMajorBit)==RowMajorBit) + ? PlainObjectType::ColsAtCompileTime + : PlainObjectType::RowsAtCompileTime, + InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 ? int(PlainObjectType::InnerStrideAtCompileTime) : int(StrideType::InnerStrideAtCompileTime), OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0 - ? int(PlainObjectType::OuterStrideAtCompileTime) + ? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic + ? Dynamic + : int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize)) : int(StrideType::OuterStrideAtCompileTime), Alignment = int(MapOptions)&int(AlignedMask), Flags0 = TraitsBase::Flags & (~NestByRefBit), @@ -108,9 +114,10 @@ template class Ma inline Index outerStride() const { return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() - : IsVectorAtCompileTime ? this->size() - : int(Flags)&RowMajorBit ? this->cols() - : this->rows(); + : internal::traits::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits::OuterStrideAtCompileTime) + : IsVectorAtCompileTime ? (this->size() * innerStride()) + : int(Flags)&RowMajorBit ? (this->cols() * innerStride()) + : (this->rows() * innerStride()); } /** Constructor in the fixed-size case. 
diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index 020f939ad..668922ffc 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -43,6 +43,7 @@ template class MapBase enum { RowsAtCompileTime = internal::traits::RowsAtCompileTime, ColsAtCompileTime = internal::traits::ColsAtCompileTime, + InnerStrideAtCompileTime = internal::traits::InnerStrideAtCompileTime, SizeAtCompileTime = Base::SizeAtCompileTime }; @@ -187,8 +188,11 @@ template class MapBase void checkSanity(typename internal::enable_if<(internal::traits::Alignment>0),void*>::type = 0) const { #if EIGEN_MAX_ALIGN_BYTES>0 + // innerStride() is not set yet when this function is called, so we optimistically assume the lowest plausible value: + const Index minInnerStride = InnerStrideAtCompileTime == Dynamic ? 1 : Index(InnerStrideAtCompileTime); + EIGEN_ONLY_USED_FOR_DEBUG(minInnerStride); eigen_assert(( ((internal::UIntPtr(m_data) % internal::traits::Alignment) == 0) - || (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits::Alignment ) && "data is not aligned"); + || (cols() * rows() * minInnerStride * sizeof(Scalar)) < internal::traits::Alignment ) && "data is not aligned"); #endif } diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 7a6b999af..72116e144 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -96,7 +96,7 @@ struct real_default_impl template struct real_impl : real_default_impl {}; -#ifdef __CUDA_ARCH__ +#if defined(EIGEN_GPU_COMPILE_PHASE) template struct real_impl > { @@ -144,7 +144,7 @@ struct imag_default_impl template struct imag_impl : imag_default_impl {}; -#ifdef __CUDA_ARCH__ +#if defined(EIGEN_GPU_COMPILE_PHASE) template struct imag_impl > { @@ -238,7 +238,7 @@ struct imag_ref_retval ****************************************************************************/ template::IsComplex> -struct conj_impl +struct conj_default_impl { EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) @@ -248,7 +248,7 @@ struct conj_impl }; template -struct conj_impl +struct conj_default_impl { EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) @@ -258,6 +258,20 @@ struct conj_impl } }; +template struct conj_impl : conj_default_impl {}; + +#if defined(EIGEN_GPU_COMPILE_PHASE) +template +struct conj_impl > +{ + EIGEN_DEVICE_FUNC + static inline std::complex run(const std::complex& x) + { + return std::complex(x.real(), -x.imag()); + } +}; +#endif + template struct conj_retval { @@ -347,31 +361,7 @@ struct norm1_retval * Implementation of hypot * ****************************************************************************/ -template -struct hypot_impl -{ - typedef typename NumTraits::Real RealScalar; - static inline RealScalar run(const Scalar& x, const Scalar& y) - { - EIGEN_USING_STD_MATH(abs); - EIGEN_USING_STD_MATH(sqrt); - RealScalar _x = abs(x); - RealScalar _y = abs(y); - Scalar p, qp; - if(_x>_y) - { - p = _x; - qp = _y / p; - } - else - { - p = _y; - qp = _x / p; - } - if(p==RealScalar(0)) return RealScalar(0); - return p * sqrt(RealScalar(1) + qp*qp); - } -}; +template struct hypot_impl; template struct hypot_retval @@ -445,7 +435,12 @@ struct round_retval struct arg_impl { static inline Scalar run(const Scalar& x) { + #if defined(EIGEN_HIP_DEVICE_COMPILE) + // HIP does not seem to have a native device side implementation for the math routine "arg" + using std::arg; + #else EIGEN_USING_STD_MATH(arg); + #endif return arg(x); } }; @@ -497,11 +492,11 @@ namespace std_fallback { 
EIGEN_USING_STD_MATH(exp); Scalar u = exp(x); - if (u == Scalar(1)) { + if (numext::equal_strict(u, Scalar(1))) { return x; } Scalar um1 = u - RealScalar(1); - if (um1 == Scalar(-1)) { + if (numext::equal_strict(um1, Scalar(-1))) { return RealScalar(-1); } @@ -512,7 +507,7 @@ namespace std_fallback { template struct expm1_impl { - static inline Scalar run(const Scalar& x) + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) #if EIGEN_HAS_CXX11_MATH @@ -543,13 +538,13 @@ namespace std_fallback { typedef typename NumTraits::Real RealScalar; EIGEN_USING_STD_MATH(log); Scalar x1p = RealScalar(1) + x; - return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) ); + return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) ); } } template struct log1p_impl { - static inline Scalar run(const Scalar& x) + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) #if EIGEN_HAS_CXX11_MATH @@ -689,20 +684,27 @@ struct random_default_impl { static inline Scalar run(const Scalar& x, const Scalar& y) { - typedef typename conditional::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX; - if(y=x the result converted to an unsigned long is still correct. - std::size_t range = ScalarX(y)-ScalarX(x); - std::size_t offset = 0; - // rejection sampling - std::size_t divisor = 1; - std::size_t multiplier = 1; - if(range::type ScalarU; + // ScalarX is the widest of ScalarU and unsigned int. + // We'll deal only with ScalarX and unsigned int below thus avoiding signed + // types and arithmetic and signed overflows (which are undefined behavior). + typedef typename conditional<(ScalarU(-1) > unsigned(-1)), ScalarU, unsigned>::type ScalarX; + // The following difference doesn't overflow, provided our integer types are two's + // complement and have the same number of padding bits in signed and unsigned variants. + // This is the case in most modern implementations of C++. + ScalarX range = ScalarX(y) - ScalarX(x); + ScalarX offset = 0; + ScalarX divisor = 1; + ScalarX multiplier = 1; + const unsigned rand_max = RAND_MAX; + if (range <= rand_max) divisor = (rand_max + 1) / (range + 1); + else multiplier = 1 + range / (rand_max + 1); + // Rejection sampling. do { - offset = (std::size_t(std::rand()) * multiplier) / divisor; + offset = (unsigned(std::rand()) * multiplier) / divisor; } while (offset > range); return Scalar(ScalarX(x) + offset); } @@ -749,7 +751,7 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random() return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(); } -// Implementatin of is* functions +// Implementation of is* functions // std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang. 
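Standalone sketch of the rejection-sampling scheme that the random_default_impl rewrite above adopts (this mirrors, but is not, the Eigen code): all arithmetic happens in an unsigned type, so y - x cannot trigger signed-overflow undefined behavior.

#include <cstdio>
#include <cstdlib>

int random_in_range(int x, int y) {            // assumes x <= y
  const unsigned rand_max = RAND_MAX;
  unsigned range = unsigned(y) - unsigned(x);  // wraps, never UB
  unsigned divisor = 1, multiplier = 1;
  if (range <= rand_max) divisor = (rand_max + 1u) / (range + 1u);
  else                   multiplier = 1u + range / (rand_max + 1u);
  unsigned offset;
  do {                                         // rejection sampling
    offset = (unsigned(std::rand()) * multiplier) / divisor;
  } while (offset > range);
  return int(unsigned(x) + offset);            // back into [x, y]
}

int main() {
  for (int i = 0; i < 5; ++i) std::printf("%d ", random_in_range(-3, 3));
  std::printf("\n");
}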
#if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG) @@ -778,7 +780,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isfinite_impl(const T& x) { - #ifdef __CUDA_ARCH__ + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isfinite)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isfinite; @@ -793,7 +795,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isinf_impl(const T& x) { - #ifdef __CUDA_ARCH__ + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isinf)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isinf; @@ -808,7 +810,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isnan_impl(const T& x) { - #ifdef __CUDA_ARCH__ + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isnan)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isnan; @@ -874,7 +876,7 @@ template T generic_fast_tanh_float(const T& a_x); namespace numext { -#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__) +#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) @@ -890,84 +892,6 @@ EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) EIGEN_USING_STD_MATH(max); return max EIGEN_NOT_A_MACRO (x,y); } - - -#elif defined(__SYCL_DEVICE_ONLY__) -template -EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) -{ - - return y < x ? y : x; -} - -template -EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) -{ - - return x < y ? y : x; -} - -EIGEN_ALWAYS_INLINE int mini(const int& x, const int& y) -{ - return cl::sycl::min(x,y); -} - -EIGEN_ALWAYS_INLINE int maxi(const int& x, const int& y) -{ - return cl::sycl::max(x,y); -} - -EIGEN_ALWAYS_INLINE unsigned int mini(const unsigned int& x, const unsigned int& y) -{ - return cl::sycl::min(x,y); -} - -EIGEN_ALWAYS_INLINE unsigned int maxi(const unsigned int& x, const unsigned int& y) -{ - return cl::sycl::max(x,y); -} - -EIGEN_ALWAYS_INLINE long mini(const long & x, const long & y) -{ - return cl::sycl::min(x,y); -} - -EIGEN_ALWAYS_INLINE long maxi(const long & x, const long & y) -{ - return cl::sycl::max(x,y); -} - -EIGEN_ALWAYS_INLINE unsigned long mini(const unsigned long& x, const unsigned long& y) -{ - return cl::sycl::min(x,y); -} - -EIGEN_ALWAYS_INLINE unsigned long maxi(const unsigned long& x, const unsigned long& y) -{ - return cl::sycl::max(x,y); -} - - -EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y) -{ - return cl::sycl::fmin(x,y); -} - -EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y) -{ - return cl::sycl::fmax(x,y); -} - -EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y) -{ - return cl::sycl::fmin(x,y); -} - -EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y) -{ - return cl::sycl::fmax(x,y); -} - #else template EIGEN_DEVICE_FUNC @@ -981,6 +905,24 @@ EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y) { return fminf(x, y); } +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y) +{ + return fmin(x, y); +} +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y) +{ +#if defined(EIGEN_HIPCC) + // no "fminl" on HIP yet + return (x < y) ? 
x : y; +#else + return fminl(x, y); +#endif +} + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) @@ -993,7 +935,93 @@ EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y) { return fmaxf(x, y); } +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y) +{ + return fmax(x, y); +} +template<> +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y) +{ +#if defined(EIGEN_HIPCC) + // no "fmaxl" on HIP yet + return (x > y) ? x : y; +#else + return fmaxl(x, y); #endif +} +#endif + +#if defined(__SYCL_DEVICE_ONLY__) + + +#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long) +#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long) +#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong) +#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong) +#define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) +#define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC) \ + SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \ + SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) +#define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \ + SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC,cl::sycl::cl_double) +#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \ + SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC,cl::sycl::cl_double) +#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(NAME, FUNC, RET_TYPE) \ + SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float) \ + SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double) + +#define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \ +template<> \ + EIGEN_DEVICE_FUNC \ + EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \ + return cl::sycl::FUNC(x); \ + } + +#define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) \ + SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE) + +#define SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2) \ + template<> \ + EIGEN_DEVICE_FUNC \ + EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x, const ARG_TYPE2& y) { \ + return cl::sycl::FUNC(x, y); \ + } + +#define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \ + 
SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE) + +#define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) \ + SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE) + +SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min) +SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin) +SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max) +SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax) + +#endif // defined(__SYCL_DEVICE_ONLY__) template @@ -1059,6 +1087,9 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x) return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x); } +EIGEN_DEVICE_FUNC +inline bool abs2(bool x) { return x; } + template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x) @@ -1073,6 +1104,10 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y); } +#if defined(__SYCL_DEVICE_ONLY__) + SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot) +#endif // defined(__SYCL_DEVICE_ONLY__) + template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x) @@ -1081,11 +1116,10 @@ inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x) } #if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); } -EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); } -#endif // defined(__SYCL_DEVICE_ONLY__) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p) +#endif //defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log1p(const float &x) { return ::log1pf(x); } @@ -1101,8 +1135,7 @@ inline typename internal::pow_impl::result_type pow(const Scala } #if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float pow(float x, float y) { return cl::sycl::pow(x, y); } -EIGEN_ALWAYS_INLINE double pow(double x, double y) { return cl::sycl::pow(x, y); } +SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow) #endif // defined(__SYCL_DEVICE_ONLY__) template EIGEN_DEVICE_FUNC bool (isnan) (const T &x) { return internal::isnan_impl(x); } @@ -1110,12 +1143,9 @@ template EIGEN_DEVICE_FUNC bool (isinf) (const T &x) { return inte template EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); } #if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float isnan(float x) { return cl::sycl::isnan(x); } -EIGEN_ALWAYS_INLINE double isnan(double x) { return cl::sycl::isnan(x); } -EIGEN_ALWAYS_INLINE float isinf(float x) { return cl::sycl::isinf(x); } -EIGEN_ALWAYS_INLINE double isinf(double x) { return cl::sycl::isinf(x); } -EIGEN_ALWAYS_INLINE float isfinite(float x) { return cl::sycl::isfinite(x); } -EIGEN_ALWAYS_INLINE double isfinite(double x) { return cl::sycl::isfinite(x); } +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool) #endif // defined(__SYCL_DEVICE_ONLY__) template @@ -1126,8 +1156,7 @@ inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x) } #if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float round(float x) { return cl::sycl::round(x); } -EIGEN_ALWAYS_INLINE double round(double x) { return cl::sycl::round(x); } +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round) #endif // defined(__SYCL_DEVICE_ONLY__) template @@ -1139,11 +1168,10 @@ T (floor)(const T& x) } #if defined(__SYCL_DEVICE_ONLY__) 
-EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); } -EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); } +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor) #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float &x) { return ::floorf(x); } @@ -1160,11 +1188,10 @@ T (ceil)(const T& x) } #if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); } -EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); } +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil) #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float ceil(const float &x) { return ::ceilf(x); } @@ -1205,8 +1232,7 @@ T sqrt(const T &x) } #if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float sqrt(float x) { return cl::sycl::sqrt(x); } -EIGEN_ALWAYS_INLINE double sqrt(double x) { return cl::sycl::sqrt(x); } +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt) #endif // defined(__SYCL_DEVICE_ONLY__) template @@ -1217,12 +1243,11 @@ T log(const T &x) { } #if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float log(float x) { return cl::sycl::log(x); } -EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); } +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log) #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float &x) { return ::logf(x); } @@ -1232,17 +1257,25 @@ double log(const double &x) { return ::log(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -typename NumTraits::Real abs(const T &x) { +typename internal::enable_if::IsSigned || NumTraits::IsComplex,typename NumTraits::Real>::type +abs(const T &x) { EIGEN_USING_STD_MATH(abs); return abs(x); } +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +typename internal::enable_if::IsSigned || NumTraits::IsComplex),typename NumTraits::Real>::type +abs(const T &x) { + return x; +} + #if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); } -EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); } +SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs) #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float &x) { return ::fabsf(x); } @@ -1268,16 +1301,31 @@ T exp(const T &x) { } #if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); } -EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); } +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp) #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float &x) { return ::expf(x); } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double exp(const double &x) { return ::exp(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +std::complex exp(const std::complex& x) { + float com = ::expf(x.real()); + float res_real = com * ::cosf(x.imag()); + float res_imag = com * ::sinf(x.imag()); + return std::complex(res_real, res_imag); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +std::complex exp(const std::complex& x) { + double com = ::exp(x.real()); + double res_real = com * ::cos(x.imag()); + 
double res_imag = com * ::sin(x.imag()); + return std::complex(res_real, res_imag); +} #endif template @@ -1288,11 +1336,10 @@ inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar) expm1(const Scalar& x) } #if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); } -EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); } +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1) #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float expm1(const float &x) { return ::expm1f(x); } @@ -1308,11 +1355,10 @@ T cos(const T &x) { } #if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); } -EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); } +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos,cos) #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cos(const float &x) { return ::cosf(x); } @@ -1328,11 +1374,10 @@ T sin(const T &x) { } #if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); } -EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); } +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin) #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sin(const float &x) { return ::sinf(x); } @@ -1348,11 +1393,10 @@ T tan(const T &x) { } #if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); } -EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); } +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan) #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float &x) { return ::tanf(x); } @@ -1367,12 +1411,21 @@ T acos(const T &x) { return acos(x); } +#if EIGEN_HAS_CXX11_MATH +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T acosh(const T &x) { + EIGEN_USING_STD_MATH(acosh); + return acosh(x); +} +#endif + #if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float acos(float x) { return cl::sycl::acos(x); } -EIGEN_ALWAYS_INLINE double acos(double x) { return cl::sycl::acos(x); } +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh) #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float acos(const float &x) { return ::acosf(x); } @@ -1387,12 +1440,21 @@ T asin(const T &x) { return asin(x); } +#if EIGEN_HAS_CXX11_MATH +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T asinh(const T &x) { + EIGEN_USING_STD_MATH(asinh); + return asinh(x); +} +#endif + #if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float asin(float x) { return cl::sycl::asin(x); } -EIGEN_ALWAYS_INLINE double asin(double x) { return cl::sycl::asin(x); } +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh) #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float asin(const float &x) { return ::asinf(x); } @@ -1407,12 +1469,21 @@ T atan(const T &x) { return atan(x); } +#if EIGEN_HAS_CXX11_MATH +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T atanh(const T &x) { + EIGEN_USING_STD_MATH(atanh); + return atanh(x); +} 
+#endif + #if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float atan(float x) { return cl::sycl::atan(x); } -EIGEN_ALWAYS_INLINE double atan(double x) { return cl::sycl::atan(x); } +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh) #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float atan(const float &x) { return ::atanf(x); } @@ -1429,11 +1500,10 @@ T cosh(const T &x) { } #if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); } -EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); } +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh) #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cosh(const float &x) { return ::coshf(x); } @@ -1449,11 +1519,10 @@ T sinh(const T &x) { } #if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); } -EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); } +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, sinh) #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sinh(const float &x) { return ::sinhf(x); } @@ -1468,15 +1537,16 @@ T tanh(const T &x) { return tanh(x); } -#if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); } -EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); } -#elif (!defined(__CUDACC__)) && EIGEN_FAST_MATH +#if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && (!defined(__SYCL_DEVICE_ONLY__)) EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(float x) { return internal::generic_fast_tanh_float(x); } #endif -#ifdef __CUDACC__ +#if defined(__SYCL_DEVICE_ONLY__) +SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh) +#endif // defined(__SYCL_DEVICE_ONLY__) + +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(const float &x) { return ::tanhf(x); } @@ -1492,11 +1562,10 @@ T fmod(const T& a, const T& b) { } #if defined(__SYCL_DEVICE_ONLY__) -EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y); } -EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); } +SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod) #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef __CUDACC__ +#if defined(EIGEN_GPUCC) template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float fmod(const float& a, const float& b) { @@ -1510,6 +1579,23 @@ double fmod(const double& a, const double& b) { } #endif +#if defined(__SYCL_DEVICE_ONLY__) +#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY +#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY +#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY +#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY +#undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY +#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY +#undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY +#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY +#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE +#undef SYCL_SPECIALIZE_GEN_UNARY_FUNC +#undef SYCL_SPECIALIZE_UNARY_FUNC +#undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC +#undef SYCL_SPECIALIZE_GEN2_BINARY_FUNC +#undef SYCL_SPECIALIZE_BINARY_FUNC +#endif // defined(__SYCL_DEVICE_ONLY__) + } // end namespace numext namespace internal { diff --git a/Eigen/src/Core/MathFunctionsImpl.h 
b/Eigen/src/Core/MathFunctionsImpl.h index ae1386b4c..a23e93ccb 100644 --- a/Eigen/src/Core/MathFunctionsImpl.h +++ b/Eigen/src/Core/MathFunctionsImpl.h @@ -66,6 +66,30 @@ T generic_fast_tanh_float(const T& a_x) return pdiv(p, q); } +template<typename RealScalar> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y) +{ + EIGEN_USING_STD_MATH(sqrt); + RealScalar p, qp; + p = numext::maxi(x,y); + if(p==RealScalar(0)) return RealScalar(0); + qp = numext::mini(y,x) / p; + return p * sqrt(RealScalar(1) + qp*qp); +} + +template<typename Scalar> +struct hypot_impl +{ + typedef typename NumTraits<Scalar>::Real RealScalar; + static EIGEN_DEVICE_FUNC + inline RealScalar run(const Scalar& x, const Scalar& y) + { + EIGEN_USING_STD_MATH(abs); + return positive_real_hypot<RealScalar>(abs(x), abs(y)); + } +}; + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 675c94e12..6046c8bae 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -160,20 +160,11 @@ template<typename Derived> class MatrixBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const MatrixBase<OtherDerived>& other); -#ifdef __CUDACC__ template<typename OtherDerived> EIGEN_DEVICE_FUNC - const Product<Derived,OtherDerived,LazyProduct> - operator*(const MatrixBase<OtherDerived> &other) const - { return this->lazyProduct(other); } -#else - - template<typename OtherDerived> const Product<Derived,OtherDerived> operator*(const MatrixBase<OtherDerived> &other) const; -#endif - template<typename OtherDerived> EIGEN_DEVICE_FUNC const Product<Derived,OtherDerived,LazyProduct> @@ -277,6 +268,8 @@ template<typename Derived> class MatrixBase Derived& setIdentity(); EIGEN_DEVICE_FUNC Derived& setIdentity(Index rows, Index cols); + EIGEN_DEVICE_FUNC Derived& setUnit(Index i); + EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i); bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const; bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const; @@ -294,7 +287,7 @@ template<typename Derived> class MatrixBase * fuzzy comparison such as isApprox() * \sa isApprox(), operator!= */ template<typename OtherDerived> - inline bool operator==(const MatrixBase<OtherDerived>& other) const + EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase<OtherDerived>& other) const { return cwiseEqual(other).all(); } /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other. @@ -302,10 +295,10 @@ template<typename Derived> class MatrixBase * fuzzy comparison such as isApprox() * \sa isApprox(), operator== */ template<typename OtherDerived> - inline bool operator!=(const MatrixBase<OtherDerived>& other) const + EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const { return cwiseNotEqual(other).any(); } - NoAlias<Derived,Eigen::MatrixBase> noalias(); + NoAlias<Derived,Eigen::MatrixBase> EIGEN_DEVICE_FUNC noalias(); // TODO forceAlignedAccess is temporarily disabled // Need to find a nicer workaround.
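The positive_real_hypot helper introduced above avoids the overflow and underflow of the naive sqrt(x*x + y*y) by factoring the larger magnitude out of the square root, so the squared term never exceeds one. The same trick in isolation (a minimal sketch with an invented name, not Eigen's template version):

#include <algorithm>
#include <cmath>
#include <cstdio>

// Overflow-safe hypot: scale by the larger magnitude before squaring.
double safe_hypot(double x, double y)
{
  x = std::abs(x);
  y = std::abs(y);
  const double p = std::max(x, y);
  if (p == 0.0) return 0.0;             // both inputs are zero; also avoids 0/0
  const double q = std::min(x, y) / p;  // q <= 1, so q*q cannot overflow
  return p * std::sqrt(1.0 + q * q);
}

int main()
{
  // The naive formula squares 3e200 to infinity; the rescaled one is fine.
  std::printf("%g\n", safe_hypot(3e200, 4e200));  // prints 5e+200
}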
@@ -335,6 +328,7 @@ template class MatrixBase inline const PartialPivLU lu() const; + EIGEN_DEVICE_FUNC inline const Inverse inverse() const; template @@ -344,12 +338,15 @@ template class MatrixBase bool& invertible, const RealScalar& absDeterminantThreshold = NumTraits::dummy_precision() ) const; + template inline void computeInverseWithCheck( ResultType& inverse, bool& invertible, const RealScalar& absDeterminantThreshold = NumTraits::dummy_precision() ) const; + + EIGEN_DEVICE_FUNC Scalar determinant() const; /////////// Cholesky module /////////// @@ -421,15 +418,19 @@ template class MatrixBase ////////// Householder module /////////// + EIGEN_DEVICE_FUNC void makeHouseholderInPlace(Scalar& tau, RealScalar& beta); template + EIGEN_DEVICE_FUNC void makeHouseholder(EssentialPart& essential, Scalar& tau, RealScalar& beta) const; template + EIGEN_DEVICE_FUNC void applyHouseholderOnTheLeft(const EssentialPart& essential, const Scalar& tau, Scalar* workspace); template + EIGEN_DEVICE_FUNC void applyHouseholderOnTheRight(const EssentialPart& essential, const Scalar& tau, Scalar* workspace); @@ -437,8 +438,10 @@ template class MatrixBase ///////// Jacobi module ///////// template + EIGEN_DEVICE_FUNC void applyOnTheLeft(Index p, Index q, const JacobiRotation& j); template + EIGEN_DEVICE_FUNC void applyOnTheRight(Index p, Index q, const JacobiRotation& j); ///////// SparseCore module ///////// diff --git a/Eigen/src/Core/NestByValue.h b/Eigen/src/Core/NestByValue.h index 13adf070e..01cf192e9 100644 --- a/Eigen/src/Core/NestByValue.h +++ b/Eigen/src/Core/NestByValue.h @@ -67,25 +67,25 @@ template class NestByValue } template - inline const PacketScalar packet(Index row, Index col) const + EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index row, Index col) const { return m_expression.template packet(row, col); } template - inline void writePacket(Index row, Index col, const PacketScalar& x) + EIGEN_DEVICE_FUNC inline void writePacket(Index row, Index col, const PacketScalar& x) { m_expression.const_cast_derived().template writePacket(row, col, x); } template - inline const PacketScalar packet(Index index) const + EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index index) const { return m_expression.template packet(index); } template - inline void writePacket(Index index, const PacketScalar& x) + EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& x) { m_expression.const_cast_derived().template writePacket(index, x); } @@ -99,7 +99,7 @@ template class NestByValue /** \returns an expression of the temporary version of *this. */ template -inline const NestByValue +EIGEN_DEVICE_FUNC inline const NestByValue DenseBase::nestByValue() const { return NestByValue(derived()); diff --git a/Eigen/src/Core/NoAlias.h b/Eigen/src/Core/NoAlias.h index 33908010b..570283d90 100644 --- a/Eigen/src/Core/NoAlias.h +++ b/Eigen/src/Core/NoAlias.h @@ -33,6 +33,7 @@ class NoAlias public: typedef typename ExpressionType::Scalar Scalar; + EIGEN_DEVICE_FUNC explicit NoAlias(ExpressionType& expression) : m_expression(expression) {} template @@ -74,10 +75,10 @@ class NoAlias * * More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag. * Currently, even though several expressions may alias, only product - * expressions have this flag. Therefore, noalias() is only usefull when + * expressions have this flag. Therefore, noalias() is only useful when * the source expression contains a matrix product. 
* - * Here are some examples where noalias is usefull: + * Here are some examples where noalias is useful: * \code * D.noalias() = A * B; * D.noalias() += A.transpose() * B; @@ -98,7 +99,7 @@ class NoAlias * \sa class NoAlias */ template -NoAlias MatrixBase::noalias() +NoAlias EIGEN_DEVICE_FUNC MatrixBase::noalias() { return NoAlias(derived()); } diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index aebc0c259..b053cff07 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -21,12 +21,14 @@ template< typename T, bool is_integer = NumTraits::IsInteger> struct default_digits10_impl { + EIGEN_DEVICE_FUNC static int run() { return std::numeric_limits::digits10; } }; template struct default_digits10_impl // Floating point { + EIGEN_DEVICE_FUNC static int run() { using std::log10; using std::ceil; @@ -38,6 +40,38 @@ struct default_digits10_impl // Floating point template struct default_digits10_impl // Integer { + EIGEN_DEVICE_FUNC + static int run() { return 0; } +}; + + +// default implementation of digits(), based on numeric_limits if specialized, +// 0 for integer types, and log2(epsilon()) otherwise. +template< typename T, + bool use_numeric_limits = std::numeric_limits::is_specialized, + bool is_integer = NumTraits::IsInteger> +struct default_digits_impl +{ + EIGEN_DEVICE_FUNC + static int run() { return std::numeric_limits::digits; } +}; + +template +struct default_digits_impl // Floating point +{ + EIGEN_DEVICE_FUNC + static int run() { + using std::log; + using std::ceil; + typedef typename NumTraits::Real Real; + return int(ceil(-log(NumTraits::epsilon())/log(static_cast(2)))); + } +}; + +template +struct default_digits_impl // Integer +{ + EIGEN_DEVICE_FUNC static int run() { return 0; } }; @@ -118,6 +152,12 @@ template struct GenericNumTraits return internal::default_digits10_impl::run(); } + EIGEN_DEVICE_FUNC + static inline int digits() + { + return internal::default_digits_impl::run(); + } + EIGEN_DEVICE_FUNC static inline Real dummy_precision() { @@ -215,6 +255,8 @@ struct NumTraits > static inline RealScalar epsilon() { return NumTraits::epsilon(); } EIGEN_DEVICE_FUNC static inline RealScalar dummy_precision() { return NumTraits::dummy_precision(); } + + static inline int digits10() { return NumTraits::digits10(); } }; template<> struct NumTraits diff --git a/Eigen/src/Core/PermutationMatrix.h b/Eigen/src/Core/PermutationMatrix.h index b1fb455b9..acd085301 100644 --- a/Eigen/src/Core/PermutationMatrix.h +++ b/Eigen/src/Core/PermutationMatrix.h @@ -99,13 +99,13 @@ class PermutationBase : public EigenBase #endif /** \returns the number of rows */ - inline Index rows() const { return Index(indices().size()); } + inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); } /** \returns the number of columns */ - inline Index cols() const { return Index(indices().size()); } + inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); } /** \returns the size of a side of the respective square matrix, i.e., the number of indices */ - inline Index size() const { return Index(indices().size()); } + inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); } #ifndef EIGEN_PARSED_BY_DOXYGEN template diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index 77f4f6066..da329fd4f 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h @@ -577,6 +577,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type * while the 
AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned * \a data pointers. * + * Here is an example using strides: + * \include Matrix_Map_stride.cpp + * Output: \verbinclude Matrix_Map_stride.out + * * \see class Map */ //@{ @@ -776,7 +780,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type resize(size); } - // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitely converted) + // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted) template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if::value,T>::type* = 0) @@ -917,13 +921,19 @@ namespace internal { template struct conservative_resize_like_impl { + #if EIGEN_HAS_TYPE_TRAITS + static const bool IsRelocatable = std::is_trivially_copyable::value; + #else + static const bool IsRelocatable = !NumTraits::RequireInitialization; + #endif static void run(DenseBase& _this, Index rows, Index cols) { if (_this.rows() == rows && _this.cols() == cols) return; EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived) - if ( ( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows - (!Derived::IsRowMajor && _this.rows() == rows) ) // column-major and we change only the number of columns + if ( IsRelocatable + && (( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows + (!Derived::IsRowMajor && _this.rows() == rows) )) // column-major and we change only the number of columns { internal::check_rows_cols_for_overflow::run(rows, cols); _this.derived().m_storage.conservativeResize(rows*cols,rows,cols); @@ -951,8 +961,9 @@ struct conservative_resize_like_impl EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived) EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived) - if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows - (!Derived::IsRowMajor && _this.rows() == other.rows()) ) // column-major and we change only the number of columns + if ( IsRelocatable && + (( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows + (!Derived::IsRowMajor && _this.rows() == other.rows()) )) // column-major and we change only the number of columns { const Index new_rows = other.rows() - _this.rows(); const Index new_cols = other.cols() - _this.cols(); @@ -980,13 +991,18 @@ template struct conservative_resize_like_impl : conservative_resize_like_impl { - using conservative_resize_like_impl::run; + typedef conservative_resize_like_impl Base; + using Base::run; + using Base::IsRelocatable; static void run(DenseBase& _this, Index size) { const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size; const Index new_cols = Derived::RowsAtCompileTime==1 ? size : 1; - _this.derived().m_storage.conservativeResize(size,new_rows,new_cols); + if(IsRelocatable) + _this.derived().m_storage.conservativeResize(size,new_rows,new_cols); + else + Base::run(_this.derived(), new_rows, new_cols); } static void run(DenseBase& _this, const DenseBase& other) @@ -997,7 +1013,10 @@ struct conservative_resize_like_impl const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows(); const Index new_cols = Derived::RowsAtCompileTime==1 ? 
other.cols() : 1; - _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols); + if(IsRelocatable) + _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols); + else + Base::run(_this.derived(), new_rows, new_cols); if (num_new_elements > 0) _this.tail(num_new_elements) = other.tail(num_new_elements); diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index ae0c94b38..70790dbd4 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -97,8 +97,8 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option, && "if you wanted a coeff-wise or a dot product use the respective explicit functions"); } - EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); } EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; } EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; } @@ -116,7 +116,7 @@ class dense_product_base : public internal::dense_xpr_base >::type {}; -/** Convertion to scalar for inner-products */ +/** Conversion to scalar for inner-products */ template class dense_product_base : public internal::dense_xpr_base >::type @@ -127,7 +127,7 @@ public: using Base::derived; typedef typename Base::Scalar Scalar; - operator const Scalar() const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const { return internal::evaluator(derived()).coeff(0,0); } @@ -162,7 +162,7 @@ class ProductImpl public: - EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const { EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS); eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) ); @@ -170,7 +170,7 @@ class ProductImpl return internal::evaluator(derived()).coeff(row,col); } - EIGEN_DEVICE_FUNC Scalar coeff(Index i) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index i) const { EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS); eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) ); diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 583b7f59e..2787987e7 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -20,7 +20,7 @@ namespace internal { /** \internal * Evaluator of a product expression. * Since products require special treatments to handle all possible cases, - * we simply deffer the evaluation logic to a product_evaluator class + * we simply defer the evaluation logic to a product_evaluator class * which offers more partial specialization possibilities. 
* * \sa class product_evaluator @@ -32,7 +32,7 @@ struct evaluator > typedef Product XprType; typedef product_evaluator Base; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {} }; // Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B" @@ -55,7 +55,7 @@ struct evaluator, const Product > XprType; typedef evaluator > Base; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs()) {} }; @@ -68,7 +68,7 @@ struct evaluator, DiagIndex> > typedef Diagonal, DiagIndex> XprType; typedef evaluator, DiagIndex> > Base; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(Diagonal, DiagIndex>( Product(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()), xpr.index() )) @@ -128,7 +128,7 @@ protected: PlainObject m_result; }; -// The following three shortcuts are enabled only if the scalar types match excatly. +// The following three shortcuts are enabled only if the scalar types match exactly. // TODO: we could enable them for different scalar types when the product is not vectorized. // Dense = Product @@ -137,7 +137,7 @@ struct Assignment, internal::assign_op::type> { typedef Product SrcXprType; - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { Index dstRows = src.rows(); @@ -155,7 +155,7 @@ struct Assignment, internal::add_assign_op< typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> { typedef Product SrcXprType; - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -170,7 +170,7 @@ struct Assignment, internal::sub_assign_op< typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> { typedef Product SrcXprType; - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -190,7 +190,7 @@ struct Assignment, const CwiseNullaryOp,Plain>, const Product > SrcXprType; - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func) { call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func); @@ -207,11 +207,17 @@ struct evaluator_assume_aliasing +struct evaluator_assume_aliasing::Scalar>, const OtherXpr, + const Product >, DenseShape > { + static const bool value = true; +}; + template struct assignment_from_xpr_op_product { template - static EIGEN_STRONG_INLINE + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/) { call_assignment_no_alias(dst, src.lhs(), Func1()); @@ -240,19 +246,19 @@ template struct generic_product_impl { template - static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) = 
(lhs.transpose().cwiseProduct(rhs)).sum(); } template - static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum(); } template - static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); } }; @@ -263,10 +269,10 @@ struct generic_product_impl // Column major result template -void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) +void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) { evaluator rhsEval(rhs); - typename nested_eval::type actual_lhs(lhs); + ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs); // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored // FIXME not very good if rhs is real and lhs complex while alpha is real too const Index cols = dst.cols(); @@ -276,10 +282,10 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const // Row major result template -void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) +void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) { evaluator lhsEval(lhs); - typename nested_eval::type actual_rhs(rhs); + ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs); // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored // FIXME not very good if lhs is real and rhs complex while alpha is real too const Index rows = dst.rows(); @@ -294,37 +300,37 @@ struct generic_product_impl typedef typename Product::Scalar Scalar; // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose - struct set { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } }; - struct add { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } }; - struct sub { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } }; + struct set { template EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } }; + struct add { template EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } }; + struct sub { template EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } }; struct adds { Scalar m_scale; explicit adds(const Scalar& s) : m_scale(s) {} - template void operator()(const Dst& dst, const Src& src) const { + template void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += m_scale * src; } }; template - static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major()); } template - static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void 
addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>()); } template<typename Dst> - static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>()); } template<typename Dst> - static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>()); } @@ -339,19 +345,19 @@ struct generic_product_impl_base typedef typename Product<Lhs,Rhs>::Scalar Scalar; template<typename Dst> - static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); } template<typename Dst> - static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); } template<typename Dst> - static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); } template<typename Dst> - static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); } }; @@ -367,7 +373,7 @@ struct generic_product_impl typedef typename internal::remove_all<typename internal::conditional<int(Side)==OnTheRight,LhsNested,RhsNested>::type>::type MatrixType; template<typename Dest> - static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { LhsNested actual_lhs(lhs); RhsNested actual_rhs(rhs); @@ -384,26 +390,52 @@ struct generic_product_impl typedef typename Product<Lhs,Rhs>::Scalar Scalar; template<typename Dst> - static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // Same as: dst.noalias() = lhs.lazyProduct(rhs); // but easier on the compiler side call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>()); } - + template<typename Dst> - static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // dst.noalias() += lhs.lazyProduct(rhs); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>()); } template<typename Dst> - static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // dst.noalias() -= lhs.lazyProduct(rhs); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>()); } + + // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor: + // dst {,+,-}= s * (A.lazyProduct(B)) + // This is a huge benefit for heap-allocated matrix types as it saves one costly allocation. + // For them, this strategy is also faster than simply bypassing the heap allocation through + // stack allocation. + // For fixed-size matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower, + // and the behavior also depends a lot on the compiler... so let's be conservative and enable them for dynamic-size only, + // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h + template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>, + const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func) + { + call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func); + } + + // Here, we always have LhsT==Lhs, but we need to make it a template type to make the above + // overload more specialized. + template<typename Dst, typename LhsT, typename Func> + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func) + { + call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func); + } + // template<typename Dst> // static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) @@ -735,7 +767,8 @@ struct generic_product_impl typedef typename Product<Lhs,Rhs>::Scalar Scalar; template<typename Dest> - static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC + void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { selfadjoint_product_impl::run(dst, lhs.nestedExpression(), rhs, alpha); } @@ -779,7 +812,11 @@ public: _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0, Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? 
PacketAccessBit : 0), - Alignment = evaluator::Alignment + Alignment = evaluator::Alignment, + + AsScalarProduct = (DiagonalType::SizeAtCompileTime==1) + || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft) + || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight) }; diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag) @@ -791,7 +828,10 @@ public: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const { - return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx); + if(AsScalarProduct) + return m_diagImpl.coeff(0) * m_matImpl.coeff(idx); + else + return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx); } protected: @@ -845,7 +885,7 @@ struct product_evaluator, ProductTag, DiagonalSha return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col); } -#ifndef __CUDACC__ +#ifndef EIGEN_GPUCC template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { @@ -889,7 +929,7 @@ struct product_evaluator, ProductTag, DenseShape, return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col); } -#ifndef __CUDACC__ +#ifndef EIGEN_GPUCC template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { diff --git a/Eigen/src/Core/Random.h b/Eigen/src/Core/Random.h index 6faf789c7..486e9ed52 100644 --- a/Eigen/src/Core/Random.h +++ b/Eigen/src/Core/Random.h @@ -128,7 +128,7 @@ DenseBase::Random() * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index) */ template -inline Derived& DenseBase::setRandom() +EIGEN_DEVICE_FUNC inline Derived& DenseBase::setRandom() { return *this = Random(rows(), cols()); } diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index b6e8f8887..e449ef3ac 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -23,22 +23,22 @@ namespace internal { * Part 1 : the logic deciding a strategy for vectorization and unrolling ***************************************************************************/ -template +template struct redux_traits { public: - typedef typename find_best_packet::type PacketType; + typedef typename find_best_packet::type PacketType; enum { PacketSize = unpacket_traits::size, - InnerMaxSize = int(Derived::IsRowMajor) - ? Derived::MaxColsAtCompileTime - : Derived::MaxRowsAtCompileTime + InnerMaxSize = int(Evaluator::IsRowMajor) + ? Evaluator::MaxColsAtCompileTime + : Evaluator::MaxRowsAtCompileTime }; enum { - MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit) + MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit) && (functor_traits::PacketAccess), - MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit), + MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit), MaySliceVectorize = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize }; @@ -51,8 +51,8 @@ public: public: enum { - Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost - : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits::Cost, + Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost + : Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits::Cost, UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 
1 : int(PacketSize)) }; @@ -64,9 +64,9 @@ public: #ifdef EIGEN_DEBUG_ASSIGN static void debug() { - std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl; + std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl; std::cerr.setf(std::ios::hex, std::ios::basefield); - EIGEN_DEBUG_VAR(Derived::Flags) + EIGEN_DEBUG_VAR(Evaluator::Flags) std::cerr.unsetf(std::ios::hex); EIGEN_DEBUG_VAR(InnerMaxSize) EIGEN_DEBUG_VAR(PacketSize) @@ -87,88 +87,88 @@ public: /*** no vectorization ***/ -template +template struct redux_novec_unroller { enum { HalfLength = Length/2 }; - typedef typename Derived::Scalar Scalar; + typedef typename Evaluator::Scalar Scalar; EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) + static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func) { - return func(redux_novec_unroller::run(mat,func), - redux_novec_unroller::run(mat,func)); + return func(redux_novec_unroller::run(eval,func), + redux_novec_unroller::run(eval,func)); } }; -template -struct redux_novec_unroller +template +struct redux_novec_unroller { enum { - outer = Start / Derived::InnerSizeAtCompileTime, - inner = Start % Derived::InnerSizeAtCompileTime + outer = Start / Evaluator::InnerSizeAtCompileTime, + inner = Start % Evaluator::InnerSizeAtCompileTime }; - typedef typename Derived::Scalar Scalar; + typedef typename Evaluator::Scalar Scalar; EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&) + static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&) { - return mat.coeffByOuterInner(outer, inner); + return eval.coeffByOuterInner(outer, inner); } }; // This is actually dead code and will never be called. It is required // to prevent false warnings regarding failed inlining though // for 0 length run() will never be called at all. 
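The redux_novec_unroller above, now parameterized by the evaluator instead of the expression type, is a compile-time divide-and-conquer: a reduction over Length coefficients becomes two half-length reductions combined by the functor, fully resolved during instantiation. The recursion pattern reduced to a toy sum (illustrative only; Eigen's version is generic over the reduction functor):

#include <iostream>

// Compile-time unrolled reduction in the style of redux_novec_unroller:
// split [Start, Start+Length) in halves and recurse; the recursion is
// resolved at compile time, leaving straight-line additions.
template<int Start, int Length>
struct unroll_sum
{
  static double run(const double* data)
  {
    return unroll_sum<Start, Length/2>::run(data)
         + unroll_sum<Start + Length/2, Length - Length/2>::run(data);
  }
};

template<int Start>
struct unroll_sum<Start, 1>  // base case: a single coefficient
{
  static double run(const double* data) { return data[Start]; }
};

template<int Start>
struct unroll_sum<Start, 0>  // never called at runtime; kept so zero-length
{                            // instantiations compile, as the comment above notes
  static double run(const double*) { return 0.0; }
};

int main()
{
  const double v[4] = {1.0, 2.0, 3.0, 4.0};
  std::cout << unroll_sum<0, 4>::run(v) << '\n';  // prints 10
}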
-template -struct redux_novec_unroller +template +struct redux_novec_unroller { - typedef typename Derived::Scalar Scalar; + typedef typename Evaluator::Scalar Scalar; EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); } + static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); } }; /*** vectorization ***/ -template +template struct redux_vec_unroller { enum { - PacketSize = redux_traits::PacketSize, + PacketSize = redux_traits::PacketSize, HalfLength = Length/2 }; - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; + typedef typename Evaluator::Scalar Scalar; + typedef typename redux_traits::PacketType PacketScalar; - static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func) + static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func& func) { return func.packetOp( - redux_vec_unroller::run(mat,func), - redux_vec_unroller::run(mat,func) ); + redux_vec_unroller::run(eval,func), + redux_vec_unroller::run(eval,func) ); } }; -template -struct redux_vec_unroller +template +struct redux_vec_unroller { enum { - index = Start * redux_traits::PacketSize, - outer = index / int(Derived::InnerSizeAtCompileTime), - inner = index % int(Derived::InnerSizeAtCompileTime), - alignment = Derived::Alignment + index = Start * redux_traits::PacketSize, + outer = index / int(Evaluator::InnerSizeAtCompileTime), + inner = index % int(Evaluator::InnerSizeAtCompileTime), + alignment = Evaluator::Alignment }; - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; + typedef typename Evaluator::Scalar Scalar; + typedef typename redux_traits::PacketType PacketScalar; - static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&) + static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func&) { - return mat.template packetByOuterInner(outer, inner); + return eval.template packetByOuterInner(outer, inner); } }; @@ -176,53 +176,65 @@ struct redux_vec_unroller * Part 3 : implementation of all cases ***************************************************************************/ -template::Traversal, - int Unrolling = redux_traits::Unrolling +template::Traversal, + int Unrolling = redux_traits::Unrolling > struct redux_impl; -template -struct redux_impl +template +struct redux_impl { - typedef typename Derived::Scalar Scalar; - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) + typedef typename Evaluator::Scalar Scalar; + + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE + Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr) { - eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); + eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix"); Scalar res; - res = mat.coeffByOuterInner(0, 0); - for(Index i = 1; i < mat.innerSize(); ++i) - res = func(res, mat.coeffByOuterInner(0, i)); - for(Index i = 1; i < mat.outerSize(); ++i) - for(Index j = 0; j < mat.innerSize(); ++j) - res = func(res, mat.coeffByOuterInner(i, j)); + res = eval.coeffByOuterInner(0, 0); + for(Index i = 1; i < xpr.innerSize(); ++i) + res = func(res, eval.coeffByOuterInner(0, i)); + for(Index i = 1; i < xpr.outerSize(); ++i) + for(Index j = 0; j < xpr.innerSize(); ++j) + res = func(res, eval.coeffByOuterInner(i, j)); return res; } }; -template -struct redux_impl - : public 
redux_novec_unroller -{}; - -template -struct redux_impl +template +struct redux_impl + : redux_novec_unroller { - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; - - static Scalar run(const Derived &mat, const Func& func) + typedef redux_novec_unroller Base; + typedef typename Evaluator::Scalar Scalar; + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE + Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/) { - const Index size = mat.size(); + return Base::run(eval,func); + } +}; + +template +struct redux_impl +{ + typedef typename Evaluator::Scalar Scalar; + typedef typename redux_traits::PacketType PacketScalar; + + template + static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr) + { + const Index size = xpr.size(); - const Index packetSize = redux_traits::PacketSize; + const Index packetSize = redux_traits::PacketSize; const int packetAlignment = unpacket_traits::alignment; enum { - alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned), - alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment) + alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned), + alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment) }; - const Index alignedStart = internal::first_default_aligned(mat.nestedExpression()); + const Index alignedStart = internal::first_default_aligned(xpr); const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize); const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize); const Index alignedEnd2 = alignedStart + alignedSize2; @@ -230,34 +242,34 @@ struct redux_impl Scalar res; if(alignedSize) { - PacketScalar packet_res0 = mat.template packet(alignedStart); + PacketScalar packet_res0 = eval.template packet(alignedStart); if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop { - PacketScalar packet_res1 = mat.template packet(alignedStart+packetSize); + PacketScalar packet_res1 = eval.template packet(alignedStart+packetSize); for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize) { - packet_res0 = func.packetOp(packet_res0, mat.template packet(index)); - packet_res1 = func.packetOp(packet_res1, mat.template packet(index+packetSize)); + packet_res0 = func.packetOp(packet_res0, eval.template packet(index)); + packet_res1 = func.packetOp(packet_res1, eval.template packet(index+packetSize)); } packet_res0 = func.packetOp(packet_res0,packet_res1); if(alignedEnd>alignedEnd2) - packet_res0 = func.packetOp(packet_res0, mat.template packet(alignedEnd2)); + packet_res0 = func.packetOp(packet_res0, eval.template packet(alignedEnd2)); } res = func.predux(packet_res0); for(Index index = 0; index < alignedStart; ++index) - res = func(res,mat.coeff(index)); + res = func(res,eval.coeff(index)); for(Index index = alignedEnd; index < size; ++index) - res = func(res,mat.coeff(index)); + res = func(res,eval.coeff(index)); } else // too small to vectorize anything. // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize. 
{ - res = mat.coeff(0); + res = eval.coeff(0); for(Index index = 1; index < size; ++index) - res = func(res,mat.coeff(index)); + res = func(res,eval.coeff(index)); } return res; @@ -265,130 +277,106 @@ struct redux_impl }; // NOTE: for SliceVectorizedTraversal we simply bypass unrolling -template -struct redux_impl +template +struct redux_impl { - typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketType; + typedef typename Evaluator::Scalar Scalar; + typedef typename redux_traits::PacketType PacketType; - EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func) + template + EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr) { - eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); - const Index innerSize = mat.innerSize(); - const Index outerSize = mat.outerSize(); + eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix"); + const Index innerSize = xpr.innerSize(); + const Index outerSize = xpr.outerSize(); enum { - packetSize = redux_traits::PacketSize + packetSize = redux_traits::PacketSize }; const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize; Scalar res; if(packetedInnerSize) { - PacketType packet_res = mat.template packet(0,0); + PacketType packet_res = eval.template packet(0,0); for(Index j=0; j(j,i)); + packet_res = func.packetOp(packet_res, eval.template packetByOuterInner(j,i)); res = func.predux(packet_res); for(Index j=0; j::run(mat, func); + res = redux_impl::run(eval, func, xpr); } return res; } }; -template -struct redux_impl +template +struct redux_impl { - typedef typename Derived::Scalar Scalar; + typedef typename Evaluator::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; + typedef typename redux_traits::PacketType PacketScalar; enum { - PacketSize = redux_traits::PacketSize, - Size = Derived::SizeAtCompileTime, + PacketSize = redux_traits::PacketSize, + Size = Evaluator::SizeAtCompileTime, VectorizedSize = (Size / PacketSize) * PacketSize }; - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) + + template + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE + Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr) { - eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); + EIGEN_ONLY_USED_FOR_DEBUG(xpr) + eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix"); if (VectorizedSize > 0) { - Scalar res = func.predux(redux_vec_unroller::run(mat,func)); + Scalar res = func.predux(redux_vec_unroller::run(eval,func)); if (VectorizedSize != Size) - res = func(res,redux_novec_unroller::run(mat,func)); + res = func(res,redux_novec_unroller::run(eval,func)); return res; } else { - return redux_novec_unroller::run(mat,func); + return redux_novec_unroller::run(eval,func); } } }; // evaluator adaptor template -class redux_evaluator +class redux_evaluator : public internal::evaluator<_XprType> { + typedef internal::evaluator<_XprType> Base; public: typedef _XprType XprType; - EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {} + EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : Base(xpr) {} typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; - typedef typename XprType::PacketReturnType PacketReturnType; enum { MaxRowsAtCompileTime = 
XprType::MaxRowsAtCompileTime, MaxColsAtCompileTime = XprType::MaxColsAtCompileTime, // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator - Flags = evaluator::Flags & ~DirectAccessBit, + Flags = Base::Flags & ~DirectAccessBit, IsRowMajor = XprType::IsRowMajor, SizeAtCompileTime = XprType::SizeAtCompileTime, - InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime, - CoeffReadCost = evaluator::CoeffReadCost, - Alignment = evaluator::Alignment + InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime }; - EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); } - EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); } - EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); } - EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); } - - EIGEN_DEVICE_FUNC - CoeffReturnType coeff(Index row, Index col) const - { return m_evaluator.coeff(row, col); } - - EIGEN_DEVICE_FUNC - CoeffReturnType coeff(Index index) const - { return m_evaluator.coeff(index); } - - template - PacketType packet(Index row, Index col) const - { return m_evaluator.template packet(row, col); } - - template - PacketType packet(Index index) const - { return m_evaluator.template packet(index); } - EIGEN_DEVICE_FUNC CoeffReturnType coeffByOuterInner(Index outer, Index inner) const - { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } + { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } template PacketType packetByOuterInner(Index outer, Index inner) const - { return m_evaluator.template packet(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } + { return Base::template packet(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } - const XprType & nestedExpression() const { return m_xpr; } - -protected: - internal::evaluator m_evaluator; - const XprType &m_xpr; }; } // end namespace internal @@ -407,7 +395,7 @@ protected: */ template template -typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::redux(const Func& func) const { eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); @@ -415,14 +403,16 @@ DenseBase::redux(const Func& func) const typedef typename internal::redux_evaluator ThisEvaluator; ThisEvaluator thisEval(derived()); - return internal::redux_impl::run(thisEval, func); + // The initial expression is passed to the reducer as an additional argument instead of + // passing it as a member of redux_evaluator to help + return internal::redux_impl::run(thisEval, func, derived()); } /** \returns the minimum of all coefficients of \c *this. * \warning the result is undefined if \c *this contains NaN. */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::minCoeff() const { return derived().redux(Eigen::internal::scalar_min_op()); @@ -432,7 +422,7 @@ DenseBase::minCoeff() const * \warning the result is undefined if \c *this contains NaN. 
*/ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::maxCoeff() const { return derived().redux(Eigen::internal::scalar_max_op()); @@ -445,7 +435,7 @@ DenseBase::maxCoeff() const * \sa trace(), prod(), mean() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::sum() const { if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) @@ -458,7 +448,7 @@ DenseBase::sum() const * \sa trace(), prod(), sum() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::mean() const { #ifdef __INTEL_COMPILER @@ -479,7 +469,7 @@ DenseBase::mean() const * \sa sum(), mean(), trace() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::prod() const { if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) @@ -494,7 +484,7 @@ DenseBase::prod() const * \sa diagonal(), sum() */ template -EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar MatrixBase::trace() const { return derived().diagonal().sum(); diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h index abb1e5121..ac9502bc4 100644 --- a/Eigen/src/Core/Ref.h +++ b/Eigen/src/Core/Ref.h @@ -95,6 +95,8 @@ protected: template EIGEN_DEVICE_FUNC void construct(Expression& expr) { + EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression); + if(PlainObjectType::RowsAtCompileTime==1) { eigen_assert(expr.rows()==1 || expr.cols()==1); diff --git a/Eigen/src/Core/Replicate.h b/Eigen/src/Core/Replicate.h index 9960ef884..0b2d6d743 100644 --- a/Eigen/src/Core/Replicate.h +++ b/Eigen/src/Core/Replicate.h @@ -115,7 +115,7 @@ template class Replicate */ template template -const Replicate +EIGEN_DEVICE_FUNC const Replicate DenseBase::replicate() const { return Replicate(derived()); @@ -130,7 +130,7 @@ DenseBase::replicate() const * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate */ template -const typename VectorwiseOp::ReplicateReturnType +EIGEN_DEVICE_FUNC const typename VectorwiseOp::ReplicateReturnType VectorwiseOp::replicate(Index factor) const { return typename VectorwiseOp::ReplicateReturnType diff --git a/Eigen/src/Core/ReturnByValue.h b/Eigen/src/Core/ReturnByValue.h index c44b7673b..11dc86d07 100644 --- a/Eigen/src/Core/ReturnByValue.h +++ b/Eigen/src/Core/ReturnByValue.h @@ -79,7 +79,7 @@ template class ReturnByValue template template -Derived& DenseBase::operator=(const ReturnByValue& other) +EIGEN_DEVICE_FUNC Derived& DenseBase::operator=(const ReturnByValue& other) { other.evalTo(derived()); return derived(); diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h index 0640cda2a..8b6b3ab03 100644 --- a/Eigen/src/Core/Reverse.h +++ b/Eigen/src/Core/Reverse.h @@ -114,7 +114,7 @@ template class Reverse * */ template -inline typename DenseBase::ReverseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::ReverseReturnType DenseBase::reverse() { return ReverseReturnType(derived()); @@ -136,7 +136,7 @@ DenseBase::reverse() * * \sa VectorwiseOp::reverseInPlace(), reverse() */ template -inline void DenseBase::reverseInPlace() +EIGEN_DEVICE_FUNC inline void DenseBase::reverseInPlace() { if(cols()>rows()) { @@ -201,7 +201,7 @@ struct 
vectorwise_reverse_inplace_impl * * \sa DenseBase::reverseInPlace(), reverse() */ template -void VectorwiseOp::reverseInPlace() +EIGEN_DEVICE_FUNC void VectorwiseOp::reverseInPlace() { internal::vectorwise_reverse_inplace_impl::run(_expression().const_cast_derived()); } diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h index 504c98f0e..2cf3fa1ef 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h @@ -71,7 +71,9 @@ template class SelfAdjointView EIGEN_DEVICE_FUNC explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix) - {} + { + EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY); + } EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); } @@ -189,7 +191,7 @@ template class SelfAdjointView TriangularView >::type(tmp2); } - typedef SelfAdjointView ConjugateReturnType; + typedef SelfAdjointView ConjugateReturnType; /** \sa MatrixBase::conjugate() const */ EIGEN_DEVICE_FUNC inline const ConjugateReturnType conjugate() const @@ -322,7 +324,7 @@ public: /** This is the const version of MatrixBase::selfadjointView() */ template template -typename MatrixBase::template ConstSelfAdjointViewReturnType::Type +EIGEN_DEVICE_FUNC typename MatrixBase::template ConstSelfAdjointViewReturnType::Type MatrixBase::selfadjointView() const { return typename ConstSelfAdjointViewReturnType::Type(derived()); @@ -339,7 +341,7 @@ MatrixBase::selfadjointView() const */ template template -typename MatrixBase::template SelfAdjointViewReturnType::Type +EIGEN_DEVICE_FUNC typename MatrixBase::template SelfAdjointViewReturnType::Type MatrixBase::selfadjointView() { return typename SelfAdjointViewReturnType::Type(derived()); diff --git a/Eigen/src/Core/SelfCwiseBinaryOp.h b/Eigen/src/Core/SelfCwiseBinaryOp.h index 719ed72a5..7c89c2e23 100644 --- a/Eigen/src/Core/SelfCwiseBinaryOp.h +++ b/Eigen/src/Core/SelfCwiseBinaryOp.h @@ -15,33 +15,29 @@ namespace Eigen { // TODO generalize the scalar type of 'other' template -EIGEN_STRONG_INLINE Derived& DenseBase::operator*=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::operator*=(const Scalar& other) { - typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op()); return derived(); } template -EIGEN_STRONG_INLINE Derived& ArrayBase::operator+=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase::operator+=(const Scalar& other) { - typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op()); return derived(); } template -EIGEN_STRONG_INLINE Derived& ArrayBase::operator-=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase::operator-=(const Scalar& other) { - typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op()); return derived(); } template -EIGEN_STRONG_INLINE Derived& DenseBase::operator/=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::operator/=(const Scalar& other) { - typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op()); return derived(); } diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h 
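The `EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper, ...)` added to the `SelfAdjointView` constructor above turns a silent misuse into a compile error. A minimal re-creation of that guard outside Eigen, with hypothetical names and mode values:

```cpp
// Hypothetical stand-in for the guard added to SelfAdjointView: a
// self-adjoint view only makes sense for the plain Lower or Upper mode,
// so composite modes must be rejected at compile time.
enum ModeFlags { Lower = 0x1, Upper = 0x2, UnitDiag = 0x4 };

template <int Mode>
struct SelfAdjointLikeView {
  static_assert(Mode == Lower || Mode == Upper,
                "self-adjoint views accept the Upper or Lower mode only");
};

SelfAdjointLikeView<Lower> ok;                    // compiles
// SelfAdjointLikeView<Lower | UnitDiag> nope;    // fails the static_assert
```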
index 960a58597..2bf940a26 100644 --- a/Eigen/src/Core/Solve.h +++ b/Eigen/src/Core/Solve.h @@ -34,12 +34,12 @@ template struct s template struct solve_traits { - typedef Matrix PlainObject; + RhsType::MaxColsAtCompileTime>::type PlainObject; }; template @@ -181,7 +181,7 @@ struct Assignment { #ifndef EIGEN_PARSED_BY_DOXYGEN template template -void TriangularViewImpl::solveInPlace(const MatrixBase& _other) const +EIGEN_DEVICE_FUNC void TriangularViewImpl::solveInPlace(const MatrixBase& _other) const { OtherDerived& other = _other.const_cast_derived(); eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) ); diff --git a/Eigen/src/Core/SolverBase.h b/Eigen/src/Core/SolverBase.h index 8a4adc229..702a5485c 100644 --- a/Eigen/src/Core/SolverBase.h +++ b/Eigen/src/Core/SolverBase.h @@ -56,7 +56,8 @@ class SolverBase : public EigenBase MaxSizeAtCompileTime = (internal::size_at_compile_time::MaxRowsAtCompileTime, internal::traits::MaxColsAtCompileTime>::ret), IsVectorAtCompileTime = internal::traits::MaxRowsAtCompileTime == 1 - || internal::traits::MaxColsAtCompileTime == 1 + || internal::traits::MaxColsAtCompileTime == 1, + NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2 }; /** Default constructor */ diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h index d2fe1e199..77ea3c261 100644 --- a/Eigen/src/Core/StableNorm.h +++ b/Eigen/src/Core/StableNorm.h @@ -50,6 +50,71 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc ssq += (bl*invScale).squaredNorm(); } +template +void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale) +{ + typedef typename VectorType::Scalar Scalar; + const Index blockSize = 4096; + + typedef typename internal::nested_eval::type VectorTypeCopy; + typedef typename internal::remove_all::type VectorTypeCopyClean; + const VectorTypeCopy copy(vec); + + enum { + CanAlign = ( (int(VectorTypeCopyClean::Flags)&DirectAccessBit) + || (int(internal::evaluator::Alignment)>0) // FIXME Alignment)>0 might not be enough + ) && (blockSize*sizeof(Scalar)*20) // if we cannot allocate on the stack, then let's not bother about this optimization + }; + typedef typename internal::conditional, internal::evaluator::Alignment>, + typename VectorTypeCopyClean::ConstSegmentReturnType>::type SegmentWrapper; + Index n = vec.size(); + + Index bi = internal::first_default_aligned(copy); + if (bi>0) + internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale); + for (; bi +typename VectorType::RealScalar +stable_norm_impl(const VectorType &vec, typename enable_if::type* = 0 ) +{ + using std::sqrt; + using std::abs; + + Index n = vec.size(); + + if(n==1) + return abs(vec.coeff(0)); + + typedef typename VectorType::RealScalar RealScalar; + RealScalar scale(0); + RealScalar invScale(1); + RealScalar ssq(0); // sum of squares + + stable_norm_impl_inner_step(vec, ssq, scale, invScale); + + return scale * sqrt(ssq); +} + +template +typename MatrixType::RealScalar +stable_norm_impl(const MatrixType &mat, typename enable_if::type* = 0 ) +{ + using std::sqrt; + + typedef typename MatrixType::RealScalar RealScalar; + RealScalar scale(0); + RealScalar invScale(1); + RealScalar ssq(0); // sum of squares + + for(Index j=0; j inline typename NumTraits::Scalar>::Real blueNorm_impl(const EigenBase& _vec) @@ -74,7 +139,7 @@ blueNorm_impl(const EigenBase& _vec) 
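The `stable_norm_impl` refactor above keeps the long-standing overflow/underflow-safe accumulation (`ssq`, `scale`, `invScale`) and merely factors it out of `MatrixBase::stableNorm()`. The recurrence it builds on is the classic rescaled sum of squares (the same idea as LAPACK's `dnrm2`); an illustrative scalar version, not the Eigen kernel itself:

```cpp
#include <cmath>
#include <cstddef>

// Rescaled sum-of-squares: whenever a larger magnitude appears, re-express
// the running sum relative to it, so neither x[i]^2 nor the accumulator can
// overflow or underflow before the final scale*sqrt(ssq).
double stable_norm_scalar(const double* x, std::size_t n) {
  double scale = 0.0, ssq = 1.0;
  for (std::size_t i = 0; i < n; ++i) {
    const double ax = std::abs(x[i]);
    if (ax == 0.0) continue;
    if (ax > scale) {
      const double r = scale / ax;
      ssq = 1.0 + ssq * r * r;   // fold the old sum into the new scale
      scale = ax;
    } else {
      const double r = ax / scale;
      ssq += r * r;
    }
  }
  return scale * std::sqrt(ssq);  // 0 for an all-zero (or empty) input
}
```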
// are used. For any specific computer, each of the assignment // statements can be replaced ibeta = std::numeric_limits::radix; // base for floating-point numbers - it = std::numeric_limits::digits; // number of base-beta digits in mantissa + it = NumTraits::digits(); // number of base-beta digits in mantissa iemin = std::numeric_limits::min_exponent; // minimum exponent iemax = std::numeric_limits::max_exponent; // maximum exponent rbig = (std::numeric_limits::max)(); // largest floating-point number @@ -98,12 +163,16 @@ blueNorm_impl(const EigenBase& _vec) RealScalar asml = RealScalar(0); RealScalar amed = RealScalar(0); RealScalar abig = RealScalar(0); - for(typename Derived::InnerIterator it(vec, 0); it; ++it) + + for(Index j=0; j ab2) abig += numext::abs2(ax*s2m); - else if(ax < b1) asml += numext::abs2(ax*s1m); - else amed += numext::abs2(ax); + for(typename Derived::InnerIterator it(vec, j); it; ++it) + { + RealScalar ax = abs(it.value()); + if(ax > ab2) abig += numext::abs2(ax*s2m); + else if(ax < b1) asml += numext::abs2(ax*s1m); + else amed += numext::abs2(ax); + } } if(amed!=amed) return amed; // we got a NaN @@ -156,35 +225,7 @@ template inline typename NumTraits::Scalar>::Real MatrixBase::stableNorm() const { - using std::sqrt; - using std::abs; - const Index blockSize = 4096; - RealScalar scale(0); - RealScalar invScale(1); - RealScalar ssq(0); // sum of square - - typedef typename internal::nested_eval::type DerivedCopy; - typedef typename internal::remove_all::type DerivedCopyClean; - DerivedCopy copy(derived()); - - enum { - CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit) - || (int(internal::evaluator::Alignment)>0) // FIXME Alignment)>0 might not be enough - ) && (blockSize*sizeof(Scalar)*2, internal::evaluator::Alignment>, - typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper; - Index n = size(); - - if(n==1) - return abs(this->coeff(0)); - - Index bi = internal::first_default_aligned(copy); - if (bi>0) - internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale); - for (; bi inline typename NumTraits::Scalar>::Real MatrixBase::hypotNorm() const { - return this->cwiseAbs().redux(internal::scalar_hypot_op()); + if(size()==1) + return numext::abs(coeff(0,0)); + else + return this->cwiseAbs().redux(internal::scalar_hypot_op()); } } // end namespace Eigen diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index 79b767bcc..d7c204579 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -79,6 +79,7 @@ template class Transpose nestedExpression() { return m_matrix; } /** \internal */ + EIGEN_DEVICE_FUNC void resize(Index nrows, Index ncols) { m_matrix.resize(ncols,nrows); } @@ -168,7 +169,7 @@ template class TransposeImpl * * \sa transposeInPlace(), adjoint() */ template -inline Transpose +EIGEN_DEVICE_FUNC inline Transpose DenseBase::transpose() { return TransposeReturnType(derived()); @@ -180,7 +181,7 @@ DenseBase::transpose() * * \sa transposeInPlace(), adjoint() */ template -inline typename DenseBase::ConstTransposeReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::ConstTransposeReturnType DenseBase::transpose() const { return ConstTransposeReturnType(derived()); @@ -206,7 +207,7 @@ DenseBase::transpose() const * * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */ template -inline const typename MatrixBase::AdjointReturnType +EIGEN_DEVICE_FUNC inline const typename MatrixBase::AdjointReturnType MatrixBase::adjoint() const { return 
AdjointReturnType(this->transpose()); @@ -281,7 +282,7 @@ struct inplace_transpose_selector { // non squ * * \sa transpose(), adjoint(), adjointInPlace() */ template -inline void DenseBase::transposeInPlace() +EIGEN_DEVICE_FUNC inline void DenseBase::transposeInPlace() { eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic)) && "transposeInPlace() called on a non-square non-resizable matrix"); @@ -312,7 +313,7 @@ inline void DenseBase::transposeInPlace() * * \sa transpose(), adjoint(), transposeInPlace() */ template -inline void MatrixBase::adjointInPlace() +EIGEN_DEVICE_FUNC inline void MatrixBase::adjointInPlace() { derived() = adjoint().eval(); } diff --git a/Eigen/src/Core/Transpositions.h b/Eigen/src/Core/Transpositions.h index 19c17bb4a..81a4a5855 100644 --- a/Eigen/src/Core/Transpositions.h +++ b/Eigen/src/Core/Transpositions.h @@ -84,7 +84,7 @@ class TranspositionsBase } // FIXME: do we want such methods ? - // might be usefull when the target matrix expression is complex, e.g.: + // might be useful when the target matrix expression is complex, e.g.: // object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..); /* template @@ -384,7 +384,7 @@ class Transpose > const Product operator*(const MatrixBase& matrix, const Transpose& trt) { - return Product(matrix.derived(), trt.derived()); + return Product(matrix.derived(), trt); } /** \returns the \a matrix with the inverse transpositions applied to the rows. diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 667ef09dc..521de6160 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -65,6 +65,7 @@ template class TriangularBase : public EigenBase inline Index innerStride() const { return derived().innerStride(); } // dummy resize function + EIGEN_DEVICE_FUNC void resize(Index rows, Index cols) { EIGEN_UNUSED_VARIABLE(rows); @@ -470,7 +471,7 @@ template class TriangularViewImpl<_Mat * \a Side==OnTheLeft (the default), or the right-inverse-multiply \a other * inverse(\c *this) if * \a Side==OnTheRight. * - * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft + * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft * * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this @@ -488,7 +489,6 @@ template class TriangularViewImpl<_Mat * \sa TriangularView::solveInPlace() */ template - EIGEN_DEVICE_FUNC inline const internal::triangular_solve_retval solve(const MatrixBase& other) const; @@ -497,7 +497,7 @@ template class TriangularViewImpl<_Mat * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here. * This function will const_cast it, so constness isn't honored here. * - * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft + * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft * * See TriangularView:solve() for the details. 
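The documentation fixed above describes `solve()`/`solveInPlace()` as forward or backward substitution. For reference, the forward-substitution algorithm for a lower-triangular system `L x = b` that it alludes to, as an illustrative scalar sketch (row-major storage, names hypothetical):

```cpp
#include <cstddef>
#include <vector>

// Forward substitution for a dense lower-triangular L (row-major, n x n):
// each unknown depends only on previously solved ones, so a single sweep
// suffices. The diagonal must be non-zero, as the docs above require.
std::vector<double> forward_substitute(const std::vector<double>& L,
                                       std::vector<double> b,
                                       std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    double s = b[i];
    for (std::size_t j = 0; j < i; ++j)
      s -= L[i * n + j] * b[j];   // subtract already-solved unknowns
    b[i] = s / L[i * n + i];
  }
  return b;                       // b now holds x
}
```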
*/ @@ -554,7 +554,7 @@ template class TriangularViewImpl<_Mat // FIXME should we keep that possibility template template -inline TriangularView& +EIGEN_DEVICE_FUNC inline TriangularView& TriangularViewImpl::operator=(const MatrixBase& other) { internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op()); @@ -564,7 +564,7 @@ TriangularViewImpl::operator=(const MatrixBase template -void TriangularViewImpl::lazyAssign(const MatrixBase& other) +EIGEN_DEVICE_FUNC void TriangularViewImpl::lazyAssign(const MatrixBase& other) { internal::call_assignment_no_alias(derived(), other.template triangularView()); } @@ -573,7 +573,7 @@ void TriangularViewImpl::lazyAssign(const MatrixBase template -inline TriangularView& +EIGEN_DEVICE_FUNC inline TriangularView& TriangularViewImpl::operator=(const TriangularBase& other) { eigen_assert(Mode == int(OtherDerived::Mode)); @@ -583,7 +583,7 @@ TriangularViewImpl::operator=(const TriangularBase template -void TriangularViewImpl::lazyAssign(const TriangularBase& other) +EIGEN_DEVICE_FUNC void TriangularViewImpl::lazyAssign(const TriangularBase& other) { eigen_assert(Mode == int(OtherDerived::Mode)); internal::call_assignment_no_alias(derived(), other.derived()); @@ -598,7 +598,7 @@ void TriangularViewImpl::lazyAssign(const TriangularBas * If the matrix is triangular, the opposite part is set to zero. */ template template -void TriangularBase::evalTo(MatrixBase &other) const +EIGEN_DEVICE_FUNC void TriangularBase::evalTo(MatrixBase &other) const { evalToLazy(other.derived()); } @@ -624,6 +624,7 @@ void TriangularBase::evalTo(MatrixBase &other) const */ template template +EIGEN_DEVICE_FUNC typename MatrixBase::template TriangularViewReturnType::Type MatrixBase::triangularView() { @@ -633,6 +634,7 @@ MatrixBase::triangularView() /** This is the const version of MatrixBase::triangularView() */ template template +EIGEN_DEVICE_FUNC typename MatrixBase::template ConstTriangularViewReturnType::Type MatrixBase::triangularView() const { @@ -715,6 +717,7 @@ struct unary_evaluator, IndexBased> { typedef TriangularView XprType; typedef evaluator::type> Base; + EIGEN_DEVICE_FUNC unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {} }; @@ -930,7 +933,7 @@ struct triangular_assignment_loop * If the matrix is triangular, the opposite part is set to zero. */ template template -void TriangularBase::evalToLazy(MatrixBase &other) const +EIGEN_DEVICE_FUNC void TriangularBase::evalToLazy(MatrixBase &other) const { other.derived().resize(this->rows(), this->cols()); internal::call_triangular_assignment_loop(other.derived(), derived().nestedExpression()); diff --git a/Eigen/src/Core/VectorBlock.h b/Eigen/src/Core/VectorBlock.h index d72fbf7e9..0ede5d58e 100644 --- a/Eigen/src/Core/VectorBlock.h +++ b/Eigen/src/Core/VectorBlock.h @@ -35,7 +35,7 @@ struct traits > * It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment(Index) and * most of the time this is the only way it is used. * - * However, if you want to directly maniputate sub-vector expressions, + * However, if you want to directly manipulate sub-vector expressions, * for instance if you want to write a function returning such an expression, you * will need to use this class. 
* diff --git a/Eigen/src/Core/VectorwiseOp.h b/Eigen/src/Core/VectorwiseOp.h index 4fe267e9f..893bc796f 100644 --- a/Eigen/src/Core/VectorwiseOp.h +++ b/Eigen/src/Core/VectorwiseOp.h @@ -670,7 +670,7 @@ template class VectorwiseOp * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting */ template -inline typename DenseBase::ColwiseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::ColwiseReturnType DenseBase::colwise() { return ColwiseReturnType(derived()); @@ -684,7 +684,7 @@ DenseBase::colwise() * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting */ template -inline typename DenseBase::RowwiseReturnType +EIGEN_DEVICE_FUNC inline typename DenseBase::RowwiseReturnType DenseBase::rowwise() { return RowwiseReturnType(derived()); diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 99439c8aa..7fa61969d 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -204,23 +204,7 @@ template<> struct conj_helper } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const - { return Packet4cf(Eigen::internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const - { return Packet4cf(Eigen::internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f) template<> EIGEN_STRONG_INLINE Packet4cf pdiv(const Packet4cf& a, const Packet4cf& b) { @@ -400,23 +384,7 @@ template<> struct conj_helper } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const - { return Packet2cd(Eigen::internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const - { return Packet2cd(Eigen::internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d) template<> EIGEN_STRONG_INLINE Packet2cd pdiv(const Packet2cd& a, const Packet2cd& b) { diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 636230944..774e64981 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -318,9 +318,9 @@ template<> EIGEN_STRONG_INLINE void pstore1(int* to, const int& a) } #ifndef EIGEN_VECTORIZE_AVX512 -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { 
_mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } #endif template<> EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) { @@ -343,9 +343,12 @@ template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a) { __m256d tmp = _mm256_shuffle_pd(a,a,5); return _mm256_permute2f128_pd(tmp, tmp, 1); - + #if 0 + // This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd + // exhibit the same latency/throughput, but it is here for future reference/benchmarking... __m256d swap_halves = _mm256_permute2f128_pd(a,a,1); return _mm256_permute_pd(swap_halves,5); + #endif } // pabs should be ok @@ -412,7 +415,7 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet4d& a) return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1)))); } -template<> EIGEN_STRONG_INLINE Packet4f predux_downto4(const Packet8f& a) +template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4(const Packet8f& a) { return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)); } diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index 399be0ee4..ba1246722 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -88,9 +88,9 @@ plog(const Packet16f& _x) { // x = x + x - 1.0; // } else { x = x - 1.0; } __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ); - Packet16f tmp = _mm512_mask_blend_ps(mask, x, _mm512_setzero_ps()); + Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x); x = psub(x, p16f_1); - e = psub(e, _mm512_mask_blend_ps(mask, p16f_1, _mm512_setzero_ps())); + e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1)); x = padd(x, tmp); Packet16f x2 = pmul(x, x); @@ -119,8 +119,9 @@ plog(const Packet16f& _x) { x = padd(x, y2); // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. - return _mm512_mask_blend_ps(iszero_mask, p16f_minus_inf, - _mm512_mask_blend_ps(invalid_mask, p16f_nan, x)); + return _mm512_mask_blend_ps(iszero_mask, + _mm512_mask_blend_ps(invalid_mask, x, p16f_nan), + p16f_minus_inf); } #endif @@ -257,50 +258,39 @@ pexp(const Packet8d& _x) { template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f psqrt(const Packet16f& _x) { - _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f); - _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000); + Packet16f neg_half = pmul(_x, pset1(-.5f)); + __mmask16 denormal_mask = _mm512_kand( + _mm512_cmp_ps_mask(_x, pset1((std::numeric_limits::min)()), + _CMP_LT_OQ), + _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ)); - Packet16f neg_half = pmul(_x, p16f_minus_half); - - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ); - Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x), - _mm512_setzero_ps()); + Packet16f x = _mm512_rsqrt14_ps(_x); // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); + x = pmul(x, pmadd(neg_half, pmul(x, x), pset1(1.5f))); - // Multiply the original _x by it's reciprocal square root to extract the - // square root. - return pmul(_x, x); + // Flush results for denormals to zero. 
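The systematic operand swaps in the AVX-512 hunks around here follow the semantics of `_mm512_mask_blend_ps`/`_mm512_mask_blend_pd`: the result takes the second source where the mask bit is set, the first where it is clear. A tiny demo of that convention (assumes an AVX-512F target, e.g. `-mavx512f`):

```cpp
#include <immintrin.h>
#include <cstdio>

// _mm512_mask_blend_ps(k, a, b): dst[i] = (k bit i set) ? b[i] : a[i].
// Hence "blend(mask, x, nan)" plants NaN only in masked lanes, which is
// why the patched calls place the fall-through value first.
int main() {
  __m512 a = _mm512_set1_ps(1.0f);
  __m512 b = _mm512_set1_ps(2.0f);
  __mmask16 k = 0x00FF;                      // low 8 lanes take b
  __m512 r = _mm512_mask_blend_ps(k, a, b);
  float out[16];
  _mm512_storeu_ps(out, r);
  std::printf("%g %g\n", out[0], out[15]);   // prints: 2 1
}
```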
+ return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps()); } template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d psqrt(const Packet8d& _x) { - _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5); - _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5); - _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL); + Packet8d neg_half = pmul(_x, pset1(-.5f)); + __mmask16 denormal_mask = _mm512_kand( + _mm512_cmp_pd_mask(_x, pset1((std::numeric_limits::min)()), + _CMP_LT_OQ), + _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ)); - Packet8d neg_half = pmul(_x, p8d_minus_half); + Packet8d x = _mm512_rsqrt14_pd(_x); - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ); - Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x), - _mm512_setzero_pd()); - - // Do a first step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + // Do a single step of Newton's iteration. + x = pmul(x, pmadd(neg_half, pmul(x, x), pset1(1.5f))); // Do a second step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + x = pmul(x, pmadd(neg_half, pmul(x, x), pset1(1.5f))); - // Multiply the original _x by it's reciprocal square root to extract the - // square root. - return pmul(_x, x); + return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd()); } #else template <> @@ -333,20 +323,18 @@ prsqrt(const Packet16f& _x) { // select only the inverse sqrt of positive normal inputs (denormals are // flushed to zero and cause infs as well). __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ); - Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), - _mm512_rsqrt14_ps(_x)); + Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps()); // Fill in NaNs and Infs for the negative/zero entries. __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ); Packet16f infs_and_nans = _mm512_mask_blend_ps( - neg_mask, p16f_nan, - _mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps())); + neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan); // Do a single step of Newton's iteration. x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); // Insert NaNs and Infs in all the right places. - return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x); + return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans); } template <> @@ -363,14 +351,12 @@ prsqrt(const Packet8d& _x) { // select only the inverse sqrt of positive normal inputs (denormals are // flushed to zero and cause infs as well). __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ); - Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), - _mm512_rsqrt14_pd(_x)); + Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd()); // Fill in NaNs and Infs for the negative/zero entries. __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ); Packet8d infs_and_nans = _mm512_mask_blend_pd( - neg_mask, p8d_nan, - _mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd())); + neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan); // Do a first step of Newton's iteration. 
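Each "step of Newton's iteration" in these `psqrt`/`prsqrt` kernels refines the ~14-bit `rsqrt14` estimate via `x' = x * (1.5 - 0.5 * a * x * x)`, roughly doubling the number of correct bits per step (one step for float, two for double). A scalar sketch of the update and its convergence:

```cpp
#include <cmath>
#include <cstdio>

// Newton-Raphson for y = 1/sqrt(a) with f(y) = 1/y^2 - a gives the
// division-free update y' = y * (1.5 - 0.5 * a * y * y), the same form as
// pmul(x, pmadd(neg_half, pmul(x, x), 1.5)) in the vector code above.
double refine_rsqrt(double a, double y) { return y * (1.5 - 0.5 * a * y * y); }

int main() {
  const double a = 2.0;
  double y = 0.7;                            // coarse guess for 1/sqrt(2)
  for (int step = 1; step <= 3; ++step) {
    y = refine_rsqrt(a, y);
    std::printf("step %d: err = %.3g\n", step, std::fabs(y - 1.0 / std::sqrt(a)));
  }
}
```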
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); @@ -379,9 +365,9 @@ prsqrt(const Packet8d& _x) { x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); // Insert NaNs and Infs in all the right places. - return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x); + return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans); } -#else +#elif defined(EIGEN_VECTORIZE_AVX512ER) template <> EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { return _mm512_rsqrt28_ps(x); diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 12b897572..9fbb256a1 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -54,6 +54,7 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size = 16, HasHalfPacket = 1, + HasBlend = 0, #if EIGEN_GNUC_AT_LEAST(5, 3) #ifdef EIGEN_VECTORIZE_AVX512DQ HasLog = 1, @@ -470,6 +471,8 @@ EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) { __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0)); return pairs; } + +#ifdef EIGEN_VECTORIZE_AVX512DQ // Loads 4 doubles from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, // a3} template <> @@ -481,6 +484,17 @@ EIGEN_STRONG_INLINE Packet8d ploaddup(const double* from) { x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3); return x; } +#else +template <> +EIGEN_STRONG_INLINE Packet8d ploaddup(const double* from) { + __m512d x = _mm512_setzero_pd(); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0)); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1)); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2)); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3)); + return x; +} +#endif // Loads 4 floats from memory a returns the packet // {a0, a0 a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3} @@ -537,7 +551,7 @@ EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet16i& from) { template <> EIGEN_DEVICE_FUNC inline Packet16f pgather(const float* from, Index stride) { - Packet16i stride_vector = _mm512_set1_epi32(stride); + Packet16i stride_vector = _mm512_set1_epi32(convert_index(stride)); Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); @@ -547,7 +561,7 @@ EIGEN_DEVICE_FUNC inline Packet16f pgather(const float* from, template <> EIGEN_DEVICE_FUNC inline Packet8d pgather(const double* from, Index stride) { - Packet8i stride_vector = _mm256_set1_epi32(stride); + Packet8i stride_vector = _mm256_set1_epi32(convert_index(stride)); Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); @@ -558,7 +572,7 @@ template <> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet16f& from, Index stride) { - Packet16i stride_vector = _mm512_set1_epi32(stride); + Packet16i stride_vector = _mm512_set1_epi32(convert_index(stride)); Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); @@ -568,7 +582,7 @@ template <> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet8d& from, Index stride) { - Packet8i stride_vector = _mm256_set1_epi32(stride); + Packet8i stride_vector = _mm256_set1_epi32(convert_index(stride)); Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 
1, 0); Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); _mm512_i32scatter_pd(to, indices, from, 8); @@ -590,9 +604,9 @@ EIGEN_STRONG_INLINE void pstore1(int* to, const int& a) { pstore(to, pa); } -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } template <> EIGEN_STRONG_INLINE float pfirst(const Packet16f& a) { @@ -620,13 +634,13 @@ template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a) template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) { // _mm512_abs_ps intrinsic not found, so hack around it - return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff)); + return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff))); } template <> EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) { // _mm512_abs_ps intrinsic not found, so hack around it - return (__m512d)_mm512_and_si512((__m512i)a, - _mm512_set1_epi64(0x7fffffffffffffff)); + return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a), + _mm512_set1_epi64(0x7fffffffffffffff))); } #ifdef EIGEN_VECTORIZE_AVX512DQ @@ -646,8 +660,7 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) { #ifdef EIGEN_VECTORIZE_AVX512DQ #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ - OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0); \ - OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1); + OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1); #else #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \ @@ -841,7 +854,7 @@ template<> EIGEN_STRONG_INLINE Packet8d preduxp(const Packet8d* vecs) final_1 = _mm256_add_pd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC)); - __m512d final_output = _mm512_insertf64x4(final_output, final_0, 0); + __m512d final_output = _mm512_castpd256_pd512(final_0); return _mm512_insertf64x4(final_output, final_1, 1); } @@ -874,7 +887,7 @@ EIGEN_STRONG_INLINE double predux(const Packet8d& a) { } template <> -EIGEN_STRONG_INLINE Packet8f predux_downto4(const Packet16f& a) { +EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) { #ifdef EIGEN_VECTORIZE_AVX512DQ __m256 lane0 = _mm512_extractf32x8_ps(a, 0); __m256 lane1 = _mm512_extractf32x8_ps(a, 1); @@ -890,7 +903,7 @@ EIGEN_STRONG_INLINE Packet8f predux_downto4(const Packet16f& a) { #endif } template <> -EIGEN_STRONG_INLINE Packet4d predux_downto4(const Packet8d& a) { +EIGEN_STRONG_INLINE Packet4d predux_half_dowto4(const Packet8d& a) { __m256d lane0 = _mm512_extractf64x4_pd(a, 0); __m256d lane1 = _mm512_extractf64x4_pd(a, 1); __m256d res = _mm256_add_pd(lane0, lane1); @@ -1272,11 +1285,38 @@ EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/, return Packet16f(); } template <> -EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/, - const Packet8d& /*thenPacket*/, - 
const Packet8d& /*elsePacket*/) { - assert(false && "To be implemented"); - return Packet8d(); +EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket, + const Packet8d& thenPacket, + const Packet8d& elsePacket) { + __mmask8 m = (ifPacket.select[0] ) + | (ifPacket.select[1]<<1) + | (ifPacket.select[2]<<2) + | (ifPacket.select[3]<<3) + | (ifPacket.select[4]<<4) + | (ifPacket.select[5]<<5) + | (ifPacket.select[6]<<6) + | (ifPacket.select[7]<<7); + return _mm512_mask_blend_pd(m, elsePacket, thenPacket); +} + +template<> EIGEN_STRONG_INLINE Packet16f pinsertfirst(const Packet16f& a, float b) +{ + return _mm512_mask_broadcastss_ps(a, (1), _mm_load_ss(&b)); +} + +template<> EIGEN_STRONG_INLINE Packet8d pinsertfirst(const Packet8d& a, double b) +{ + return _mm512_mask_broadcastsd_pd(a, (1), _mm_load_sd(&b)); +} + +template<> EIGEN_STRONG_INLINE Packet16f pinsertlast(const Packet16f& a, float b) +{ + return _mm512_mask_broadcastss_ps(a, (1<<15), _mm_load_ss(&b)); +} + +template<> EIGEN_STRONG_INLINE Packet8d pinsertlast(const Packet8d& a, double b) +{ + return _mm512_mask_broadcastsd_pd(a, (1<<7), _mm_load_sd(&b)); } } // end namespace internal diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 67db2f8ee..3e665730c 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -224,23 +224,7 @@ template<> struct conj_helper } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const - { return Packet2cf(internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const - { return Packet2cf(internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { @@ -416,23 +400,8 @@ template<> struct conj_helper return pconj(internal::pmul(a, b)); } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const - { return Packet1cd(internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const - { return Packet1cd(internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index b3f1ea199..7f4e90f75 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -103,7 +103,7 @@ static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4u static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; static Packet16uc 
p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; #else -static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; +static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; @@ -388,10 +388,30 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, co template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); } template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; } -template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) +{ + #ifdef __VSX__ + // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN + Packet4f ret; + __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); + return ret; + #else + return vec_min(a, b); + #endif +} template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) +{ + #ifdef __VSX__ + // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN + Packet4f ret; + __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); + return ret; + #else + return vec_max(a, b); + #endif +} template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } @@ -434,7 +454,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data } #else -// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX +// We also need to redefine little endian loading of Packet4i/Packet4f using VSX template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD @@ -500,7 +520,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& f vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part } #else -// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX +// We also need to redefine little endian loading of Packet4i/Packet4f using VSX template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE @@ -764,7 +784,7 @@ typedef __vector __bool long Packet2bl; static Packet2l p2l_ONE = { 1, 1 }; static Packet2l p2l_ZERO = reinterpret_cast(p4i_ZERO); -static Packet2d p2d_ONE = { 1.0, 1.0 }; +static Packet2d p2d_ONE = { 1.0, 1.0 }; static Packet2d p2d_ZERO = reinterpret_cast(p4f_ZERO); static Packet2d p2d_MZERO = { -0.0, -0.0 }; 
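The inline-asm `pmin`/`pmax` overloads above accept a roughly 10% slowdown versus `vec_min`/`vec_max` to get deterministic NaN behaviour. The scalar contract they target, per their comments, is the SSE one, where any unordered comparison makes the second operand fall through; a sketch:

```cpp
#include <cmath>
#include <cstdio>

// Compare-and-select min with SSE minps semantics: (a < b) ? a : b.
// Every comparison involving NaN is false, so b falls through -- a fixed,
// documented rule, unlike vec_min whose NaN result is not pinned down.
float pmin_like(float a, float b) { return a < b ? a : b; }

int main() {
  const float qnan = std::nanf("");
  std::printf("%g\n", pmin_like(1.0f, 2.0f));   // 1
  std::printf("%g\n", pmin_like(qnan, 2.0f));   // 2   (NaN first arg)
  std::printf("%g\n", pmin_like(1.0f, qnan));   // nan (NaN second arg)
}
```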
@@ -910,9 +930,21 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } -template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) +{ + // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN + Packet2d ret; + __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); + return ret; + } -template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) +{ + // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN + Packet2d ret; + __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); + return ret; +} template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } @@ -969,7 +1001,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) Packet2d v[2], sum; v[0] = vecs[0] + reinterpret_cast(vec_sld(reinterpret_cast(vecs[0]), reinterpret_cast(vecs[0]), 8)); v[1] = vecs[1] + reinterpret_cast(vec_sld(reinterpret_cast(vecs[1]), reinterpret_cast(vecs[1]), 8)); - + #ifdef _BIG_ENDIAN sum = reinterpret_cast(vec_sld(reinterpret_cast(v[0]), reinterpret_cast(v[1]), 8)); #else @@ -1022,7 +1054,7 @@ ptranspose(PacketBlock& kernel) { template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { Packet2l select = { ifPacket.select[0], ifPacket.select[1] }; - Packet2bl mask = vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p2l_ONE)); + Packet2bl mask = reinterpret_cast( vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p2l_ONE)) ); return vec_sel(elsePacket, thenPacket, mask); } #endif // __VSX__ diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h index 9c2536509..57d1201f4 100644 --- a/Eigen/src/Core/arch/CUDA/Complex.h +++ b/Eigen/src/Core/arch/CUDA/Complex.h @@ -16,7 +16,7 @@ namespace Eigen { namespace internal { -#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU) // Many std::complex methods such as operator+, operator-, operator* and // operator/ are not constexpr. 
Due to this, clang does not treat them as device @@ -55,7 +55,7 @@ template struct scalar_difference_op, std::complex struct scalar_product_op, const std::complex > : binary_op_base, const std::complex > { enum { - Vectorizable = packet_traits>::HasMul + Vectorizable = packet_traits >::HasMul }; typedef typename std::complex result_type; @@ -76,7 +76,7 @@ template struct scalar_product_op, std::complex > // Quotient template struct scalar_quotient_op, const std::complex > : binary_op_base, const std::complex > { enum { - Vectorizable = packet_traits>::HasDiv + Vectorizable = packet_traits >::HasDiv }; typedef typename std::complex result_type; diff --git a/Eigen/src/Core/arch/Default/ConjHelper.h b/Eigen/src/Core/arch/Default/ConjHelper.h new file mode 100644 index 000000000..4cfe34e05 --- /dev/null +++ b/Eigen/src/Core/arch/Default/ConjHelper.h @@ -0,0 +1,29 @@ + +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_ARCH_CONJ_HELPER_H +#define EIGEN_ARCH_CONJ_HELPER_H + +#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \ + template<> struct conj_helper { \ + EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \ + { return padd(c, pmul(x,y)); } \ + EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const \ + { return PACKET_CPLX(Eigen::internal::pmul(x, y.v)); } \ + }; \ + \ + template<> struct conj_helper { \ + EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \ + { return padd(c, pmul(x,y)); } \ + EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const \ + { return PACKET_CPLX(Eigen::internal::pmul(x.v, y)); } \ + }; + +#endif // EIGEN_ARCH_CONJ_HELPER_H diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/GPU/Half.h similarity index 60% rename from Eigen/src/Core/arch/CUDA/Half.h rename to Eigen/src/Core/arch/GPU/Half.h index db9878796..65b38bbfb 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/GPU/Half.h @@ -13,7 +13,7 @@ // Redistribution and use in source and binary forms, with or without // modification, are permitted. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, @@ -26,15 +26,15 @@ // Standard 16-bit float type, mostly useful for GPUs. Defines a new -// type Eigen::half (inheriting from CUDA's __half struct) with +// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with // operator overloads such that it behaves basically as an arithmetic // type. It will be quite slow on CPUs (so it is recommended to stay // in fp32 for CPUs, except for simple parameter conversions, I/O // to disk and the likes), but fast on GPUs. 
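The new `ConjHelper.h` above replaces four hand-written `conj_helper` specializations per SIMD backend with a single `EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(cplx, real)` invocation. The deduplication pattern in miniature, with hypothetical names rather than Eigen's:

```cpp
// A macro stamping out the two mixed real/complex specializations that each
// backend previously spelled out by hand; Cplx2 stands in for a complex
// packet wrapping its real packet in a .v member, as Eigen's do.
template <typename A, typename B> struct mixed_mul_helper;

#define MAKE_MIXED_MUL_HELPER(CPLX, REAL)                                   \
  template <> struct mixed_mul_helper<REAL, CPLX> {                         \
    CPLX mul(const REAL& x, const CPLX& y) const { return CPLX{x * y.v}; }  \
  };                                                                        \
  template <> struct mixed_mul_helper<CPLX, REAL> {                         \
    CPLX mul(const CPLX& x, const REAL& y) const { return CPLX{x.v * y}; }  \
  };

struct Cplx2 { double v; };
MAKE_MIXED_MUL_HELPER(Cplx2, double)
// mixed_mul_helper<double, Cplx2>().mul(2.0, Cplx2{3.0}).v == 6.0
```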
-#ifndef EIGEN_HALF_CUDA_H
-#define EIGEN_HALF_CUDA_H
+#ifndef EIGEN_HALF_GPU_H
+#define EIGEN_HALF_GPU_H
 #if __cplusplus > 199711L
 #define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
@@ -49,39 +49,107 @@ struct half;
 namespace half_impl {
-#if !defined(EIGEN_HAS_CUDA_FP16)
-
-// Make our own __half definition that is similar to CUDA's.
-struct __half {
-  EIGEN_DEVICE_FUNC __half() : x(0) {}
-  explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
+#if !defined(EIGEN_HAS_GPU_FP16)
+// Make our own __half_raw definition that is similar to CUDA's.
+struct __half_raw {
+  EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
+  explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
   unsigned short x;
 };
+#elif defined(EIGEN_HAS_HIP_FP16)
+  #if defined(EIGEN_HAS_OLD_HIP_FP16)
+// Make a __half_raw definition that
+// ++ is compatible with that of Eigen, and
+// ++ adds an implicit conversion to the native __half of the old HIP implementation.
+//
+// Keeping ".x" as "unsigned short" keeps the interface the same between the Eigen and HIP implementations.
+//
+// In the old HIP implementation,
+// ++ __half is a typedef of __fp16
+// ++ the "__h*" routines take "__half" arguments
+// so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make
+// that conversion in each call to a "__h*" routine... that is why we have the "operator __half" routine.
+struct __half_raw {
+  EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
+  explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
+  union {
+    unsigned short x;
+    __half data;
+  };
+  operator __half(void) const { return data; }
+};
+  #endif
+#elif defined(EIGEN_HAS_CUDA_FP16)
+  #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
+    typedef __half __half_raw;
+  #endif // defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+
+#elif defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
+typedef cl::sycl::half __half_raw;
 #endif
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
-struct half_base : public __half {
+struct half_base : public __half_raw {
   EIGEN_DEVICE_FUNC half_base() {}
-  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {}
-  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {}
+  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
+  EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
+
+#if defined(EIGEN_HAS_GPU_FP16)
+  #if defined(EIGEN_HAS_HIP_FP16)
+    #if defined(EIGEN_HAS_OLD_HIP_FP16)
+      EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(__half_as_ushort(h)) {}
+    #else
+      EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); }
+    #endif
+  #elif defined(EIGEN_HAS_CUDA_FP16)
+    #if (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000)
+      EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
+    #endif
+  #endif
+#endif
 };
 } // namespace half_impl
 // Class definition.
struct half : public half_impl::half_base { - #if !defined(EIGEN_HAS_CUDA_FP16) - typedef half_impl::__half __half; - #endif + + // Writing this out as separate #if-else blocks to make the code easier to follow + // The same applies to most #if-else blocks in this file +#if !defined(EIGEN_HAS_GPU_FP16) + typedef half_impl::__half_raw __half_raw; +#elif defined(EIGEN_HAS_HIP_FP16) + #if defined(EIGEN_HAS_OLD_HIP_FP16) + typedef half_impl::__half_raw __half_raw; + #endif +#elif defined(EIGEN_HAS_CUDA_FP16) + // Note that EIGEN_CUDACC_VER is set to 0 even when compiling with HIP, so (EIGEN_CUDACC_VER < 90000) is true even for HIP! + // So keeping this within #if defined(EIGEN_HAS_CUDA_FP16) is needed + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 + typedef half_impl::__half_raw __half_raw; + #endif +#endif EIGEN_DEVICE_FUNC half() {} - EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} + EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {} EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} + +#if defined(EIGEN_HAS_GPU_FP16) + #if defined(EIGEN_HAS_HIP_FP16) + EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} + #elif defined(EIGEN_HAS_CUDA_FP16) + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} + #endif + #endif +#endif + explicit EIGEN_DEVICE_FUNC half(bool b) : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} @@ -136,72 +204,136 @@ struct half : public half_impl::half_base { x = other.x; return *this; } + }; +} // end namespace Eigen + +namespace std { +template<> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_to_nearest; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 11; + static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int radix = 2; + static const int min_exponent = -13; + static const int min_exponent10 = -4; + static const int max_exponent = 16; + static const int max_exponent10 = 4; + static const bool traps = true; + static const bool tinyness_before = false; + + static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); } + static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); } + static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); } + static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); } + static Eigen::half round_error() { return Eigen::half(0.5); } + static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); } + static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } + static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); 
 }
+  static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
+};
+
+// If std::numeric_limits<T> is specialized, should also specialize
+// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
+// std::numeric_limits<const volatile T>
+// https://stackoverflow.com/a/16519653/
+template<>
+struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {};
+template<>
+struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {};
+template<>
+struct numeric_limits<const volatile Eigen::half> : numeric_limits<Eigen::half> {};
+} // end namespace std
+
+namespace Eigen {
+
 namespace half_impl {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
 // Intrinsics for native fp16 support. Note that on current hardware,
 // these are no faster than fp32 arithmetic (you need to use the half2
 // versions to get the ALU speed increased), but you do save the
 // conversion steps back and forth.
-__device__ half operator + (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
+#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+  return __hadd(::__half(a), ::__half(b));
+#else
   return __hadd(a, b);
+#endif
 }
-__device__ half operator * (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
   return __hmul(a, b);
 }
-__device__ half operator - (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
   return __hsub(a, b);
 }
-__device__ half operator / (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
+#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+  return __hdiv(a, b);
+#else
   float num = __half2float(a);
   float denom = __half2float(b);
   return __float2half(num / denom);
+#endif
 }
-__device__ half operator - (const half& a) {
+EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
   return __hneg(a);
 }
-__device__ half& operator += (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
   a = a + b;
   return a;
 }
-__device__ half& operator *= (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
   a = a * b;
   return a;
 }
-__device__ half& operator -= (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
   a = a - b;
   return a;
 }
-__device__ half& operator /= (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
   a = a / b;
   return a;
 }
-__device__ bool operator == (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
   return __heq(a, b);
 }
-__device__ bool operator != (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
   return __hne(a, b);
 }
-__device__ bool operator < (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
   return __hlt(a, b);
 }
-__device__ bool operator <= (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
   return __hle(a, b);
 }
-__device__ bool operator > (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
   return __hgt(a, b);
 }
-__device__ bool
operator >= (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { return __hge(a, b); } #else // Emulate support for half floats -// Definitions for CPUs and older CUDA, mostly working through conversion +// Definitions for CPUs and older HIP+CUDA, mostly working through conversion // to/from fp32. EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { @@ -238,10 +370,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) return a; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { - return float(a) == float(b); + return numext::equal_strict(float(a),float(b)); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { - return float(a) != float(b); + return numext::not_equal_strict(float(a), float(b)); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { return float(a) < float(b); @@ -269,34 +401,36 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { // these in hardware. If we need more performance on older/other CPUs, they are // also possible to vectorize directly. -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) { - __half h; +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) { + __half_raw h; h.x = x; return h; } -union FP32 { +union float32_bits { unsigned int u; float f; }; -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - return __float2half(ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) + __half tmp_ff = __float2half(ff); + return *(__half_raw*)&tmp_ff; #elif defined(EIGEN_HAS_FP16_C) - __half h; + __half_raw h; h.x = _cvtss_sh(ff, 0); return h; #else - FP32 f; f.f = ff; + float32_bits f; f.f = ff; - const FP32 f32infty = { 255 << 23 }; - const FP32 f16max = { (127 + 16) << 23 }; - const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; + const float32_bits f32infty = { 255 << 23 }; + const float32_bits f16max = { (127 + 16) << 23 }; + const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; unsigned int sign_mask = 0x80000000u; - __half o; + __half_raw o; o.x = static_cast(0x0u); unsigned int sign = f.u & sign_mask; @@ -335,17 +469,18 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { #endif } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __half2float(h); #elif defined(EIGEN_HAS_FP16_C) return _cvtsh_ss(h.x); #else - const FP32 magic = { 113 << 23 }; + const float32_bits magic = { 113 << 23 }; const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift - FP32 o; + float32_bits o; o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits unsigned int exp = shifted_exp & o.u; // just the exponent @@ -370,7 +505,8 @@ 
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) { return (a.x & 0x7fff) == 0x7c00; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __hisnan(a); #else return (a.x & 0x7fff) > 0x7c00; @@ -386,7 +522,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { return result; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) return half(hexp(a)); #else return half(::expf(float(a))); @@ -396,7 +533,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return half(::hlog(a)); #else return half(::logf(float(a))); @@ -409,7 +547,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { return half(::log10f(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) return half(hsqrt(a)); #else return half(::sqrtf(float(a))); @@ -431,14 +570,16 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { return half(::tanhf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) return half(hfloor(a)); #else return half(::floorf(float(a))); #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) return half(hceil(a)); #else return half(::ceilf(float(a))); @@ -446,7 +587,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __hlt(b, a) ? 
b : a;
 #else
   const float f1 = static_cast<float>(a);
@@ -455,7 +597,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
   return __hlt(a, b) ? b : a;
 #else
   const float f1 = static_cast<float>(a);
@@ -496,6 +639,13 @@ template<> struct is_arithmetic<half> { enum { value = true }; };
 template<> struct NumTraits<Eigen::half> : GenericNumTraits<Eigen::half> {
+  enum {
+    IsSigned = true,
+    IsInteger = false,
+    IsComplex = false,
+    RequireInitialization = false
+  };
+
   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
     return half_impl::raw_uint16_to_half(0x0800);
   }
@@ -526,7 +676,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
   return Eigen::half(::expf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+    defined(EIGEN_HIP_DEVICE_COMPILE)
   return Eigen::half(::hlog(a));
 #else
   return Eigen::half(::logf(float(a)));
@@ -560,14 +711,22 @@ struct hash<Eigen::half> {
 // Add the missing shfl_xor intrinsic
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+    defined(EIGEN_HIP_DEVICE_COMPILE)
+
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+  #if (EIGEN_CUDACC_VER < 90000) || \
+      defined(EIGEN_HAS_HIP_FP16)
   return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
+  #else
+  return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
+  #endif
 }
 #endif
-// ldg() has an overload for __half, but we also need one for Eigen::half.
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
+#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \
+    defined(EIGEN_HIP_DEVICE_COMPILE)
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
   return Eigen::half_impl::raw_uint16_to_half(
       __ldg(reinterpret_cast<const unsigned short*>(ptr)));
@@ -575,7 +734,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
 #endif
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 namespace Eigen {
 namespace numext {
@@ -601,4 +760,4 @@ bool (isfinite)(const Eigen::half& h) {
 } // namespace numext
 #endif
-#endif // EIGEN_HALF_CUDA_H
+#endif // EIGEN_HALF_GPU_H
diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/GPU/MathFunctions.h
similarity index 94%
rename from Eigen/src/Core/arch/CUDA/MathFunctions.h
rename to Eigen/src/Core/arch/GPU/MathFunctions.h
index 987a5291c..d2b3a2568 100644
--- a/Eigen/src/Core/arch/CUDA/MathFunctions.h
+++ b/Eigen/src/Core/arch/GPU/MathFunctions.h
@@ -7,8 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
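// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of the patch: the renamed GPU
// MathFunctions.h keeps one pattern throughout -- apply the scalar libm
// routine to each lane of the CUDA/HIP vector type (float4, double2). Below
// is a host-side analogue with a stand-in struct, since the built-in float4
// only exists in device code; names here are hypothetical.
// ---------------------------------------------------------------------------
#include <cmath>
#include <cstdio>

struct Float4 { float x, y, z, w; };  // stand-in for the built-in float4

static Float4 plog_sketch(const Float4& a) {
  // lane-wise log, mirroring plog<float4>(a) = make_float4(logf(a.x), ...)
  return { std::log(a.x), std::log(a.y), std::log(a.z), std::log(a.w) };
}

int main() {
  const Float4 r = plog_sketch({1.0f, 2.7182818f, 10.0f, 100.0f});
  std::printf("%f %f %f %f\n", r.x, r.y, r.z, r.w);  // ~0, ~1, ~2.303, ~4.605
  return 0;
}
// ---------------------------------------------------------------------------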
-#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H -#define EIGEN_MATH_FUNCTIONS_CUDA_H +#ifndef EIGEN_MATH_FUNCTIONS_GPU_H +#define EIGEN_MATH_FUNCTIONS_GPU_H namespace Eigen { @@ -17,7 +17,7 @@ namespace internal { // Make sure this is only available when targeting a GPU: we don't want to // introduce conflicts between these packet_traits definitions and the ones // we'll use on the host side (SSE, AVX, ...) -#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plog(const float4& a) { @@ -100,4 +100,4 @@ double2 prsqrt(const double2& a) } // end namespace Eigen -#endif // EIGEN_MATH_FUNCTIONS_CUDA_H +#endif // EIGEN_MATH_FUNCTIONS_GPU_H diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h similarity index 92% rename from Eigen/src/Core/arch/CUDA/PacketMath.h rename to Eigen/src/Core/arch/GPU/PacketMath.h index ad66399e0..ddf37b9c1 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -7,8 +7,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_PACKET_MATH_CUDA_H -#define EIGEN_PACKET_MATH_CUDA_H +#ifndef EIGEN_PACKET_MATH_GPU_H +#define EIGEN_PACKET_MATH_GPU_H namespace Eigen { @@ -17,7 +17,7 @@ namespace internal { // Make sure this is only available when targeting a GPU: we don't want to // introduce conflicts between these packet_traits definitions and the ones // we'll use on the host side (SSE, AVX, ...) -#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; @@ -44,7 +44,11 @@ template<> struct packet_traits : default_packet_traits HasPolygamma = 1, HasErf = 1, HasErfc = 1, + HasI0e = 1, + HasI1e = 1, HasIGamma = 1, + HasIGammaDerA = 1, + HasGammaSampleDerAlpha = 1, HasIGammac = 1, HasBetaInc = 1, @@ -73,7 +77,11 @@ template<> struct packet_traits : default_packet_traits HasPolygamma = 1, HasErf = 1, HasErfc = 1, + HasI0e = 1, + HasI1e = 1, HasIGamma = 1, + HasIGammaDerA = 1, + HasGammaSampleDerAlpha = 1, HasIGammac = 1, HasBetaInc = 1, @@ -167,10 +175,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu(const d return make_double2(from[0], from[1]); } -template<> EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { return make_float4(from[0], from[0], from[1], from[1]); } -template<> EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { return make_double2(from[0], from[0]); } @@ -196,7 +204,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 return __ldg((const float4*)from); #else return make_float4(from[0], from[1], from[2], from[3]); @@ -204,7 +212,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const fl } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 return 
__ldg((const double2*)from); #else return make_double2(from[0], from[1]); @@ -213,7 +221,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); #else return make_float4(from[0], from[1], from[2], from[3]); @@ -221,7 +229,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 return make_double2(__ldg(from+0), __ldg(from+1)); #else return make_double2(from[0], from[1]); @@ -291,7 +299,7 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - double tmp = kernel.packet[0].y; + float tmp = kernel.packet[0].y; kernel.packet[0].y = kernel.packet[1].x; kernel.packet[1].x = tmp; @@ -330,4 +338,4 @@ ptranspose(PacketBlock& kernel) { } // end namespace Eigen -#endif // EIGEN_PACKET_MATH_CUDA_H +#endif // EIGEN_PACKET_MATH_GPU_H diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h similarity index 71% rename from Eigen/src/Core/arch/CUDA/PacketMathHalf.h rename to Eigen/src/Core/arch/GPU/PacketMathHalf.h index b9a125b42..8787adcde 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -7,15 +7,16 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
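// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of the patch: one of the
// additions further down in this file is preverse<Packet8h>, which reverses
// eight packed half values with a single _mm_shuffle_epi8. Each byte pair of
// the mask -- (14,15), (12,13), ... -- selects one 16-bit lane, last lane
// first. A standalone check of that mask (requires SSSE3; build with -mssse3):
// ---------------------------------------------------------------------------
#include <tmmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  __m128i v = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);  // lanes 0..7
  __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
  __m128i r = _mm_shuffle_epi8(v, m);                   // lanes 7..0
  alignas(16) int16_t out[8];
  _mm_store_si128(reinterpret_cast<__m128i*>(out), r);
  for (int i = 0; i < 8; ++i) std::printf("%d ", out[i]);  // prints: 7 6 5 4 3 2 1 0
  std::printf("\n");
  return 0;
}
// ---------------------------------------------------------------------------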
-#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H -#define EIGEN_PACKET_MATH_HALF_CUDA_H +#ifndef EIGEN_PACKET_MATH_HALF_GPU_H +#define EIGEN_PACKET_MATH_HALF_GPU_H namespace Eigen { namespace internal { // Most of the following operations require arch >= 3.0 -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)) template<> struct is_arithmetic { enum { value = true }; }; @@ -42,70 +43,108 @@ template<> struct packet_traits : default_packet_traits template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; -template<> __device__ EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { + +#if defined(EIGEN_HIP_DEVICE_COMPILE) + +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return half2half2(from); +#else return __half2half2(from); +#endif + +#else // EIGEN_CUDA_ARCH + return __half2half2(from); +#endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { return *reinterpret_cast(from); } -template<> __device__ EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { return __halves2half2(from[0], from[1]); } -template<> EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { return __halves2half2(from[0], from[0]); } -template<> __device__ EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) { *reinterpret_cast(to) = from; } -template<> __device__ EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { to[0] = __low2half(from); to[1] = __high2half(from); } template<> - __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { -#if __CUDA_ARCH__ >= 350 + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { + +#if defined(EIGEN_HIP_DEVICE_COMPILE) + +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return __halves2half2((*(from+0)), (*(from+1))); +#else + return __ldg((const half2*)from); +#endif + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 350 return __ldg((const half2*)from); #else return __halves2half2(*(from+0), *(from+1)); #endif + +#endif } template<> -__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { -#if __CUDA_ARCH__ >= 350 +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { + +#if defined(EIGEN_HIP_DEVICE_COMPILE) + +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return __halves2half2((*(from+0)), (*(from+1))); +#else + return __halves2half2(__ldg(from+0), __ldg(from+1)); +#endif + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 350 return __halves2half2(__ldg(from+0), __ldg(from+1)); #else return __halves2half2(*(from+0), *(from+1)); #endif + +#endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { +template<> 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { return __halves2half2(from[0*stride], from[1*stride]); } -template<> __device__ EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) { to[stride*0] = __low2half(from); to[stride*1] = __high2half(from); } -template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { return __low2half(a); } -template<> __device__ EIGEN_STRONG_INLINE half2 pabs(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { half2 result; - result.x = a.x & 0x7FFF7FFF; + unsigned temp = *(reinterpret_cast(&(a))); + *(reinterpret_cast(&(result))) = temp & 0x7FFF7FFF; return result; } -__device__ EIGEN_STRONG_INLINE void +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { __half a1 = __low2half(kernel.packet[0]); __half a2 = __high2half(kernel.packet[0]); @@ -115,17 +154,31 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1] = __halves2half2(a2, b2); } -template<> __device__ EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { -#if __CUDA_ARCH__ >= 530 +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __halves2half2(a, __hadd(a, __float2half(1.0f))); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 return __halves2half2(a, __hadd(a, __float2half(1.0f))); #else float f = __half2float(a) + 1.0f; return __halves2half2(a, __float2half(f)); #endif + +#endif } -template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { -#if __CUDA_ARCH__ >= 530 +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hadd2(a, b); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 return __hadd2(a, b); #else float a1 = __low2float(a); @@ -136,10 +189,18 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, cons float r2 = a2 + b2; return __floats2half2_rn(r1, r2); #endif + +#endif } -template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { -#if __CUDA_ARCH__ >= 530 +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hsub2(a, b); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 return __hsub2(a, b); #else float a1 = __low2float(a); @@ -150,22 +211,38 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, cons float r2 = a2 - b2; return __floats2half2_rn(r1, r2); #endif + +#endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { -#if __CUDA_ARCH__ >= 530 +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hneg2(a); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 return __hneg2(a); #else float a1 = __low2float(a); float a2 = __high2float(a); return __floats2half2_rn(-a1, -a2); #endif + +#endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } -template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, 
const half2& b) { -#if __CUDA_ARCH__ >= 530 +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hmul2(a, b); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 return __hmul2(a, b); #else float a1 = __low2float(a); @@ -176,10 +253,18 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, cons float r2 = a2 * b2; return __floats2half2_rn(r1, r2); #endif + +#endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { -#if __CUDA_ARCH__ >= 530 +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hfma2(a, b, c); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 return __hfma2(a, b, c); #else float a1 = __low2float(a); @@ -192,9 +277,21 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd(const half2& a, con float r2 = a2 * b2 + c2; return __floats2half2_rn(r1, r2); #endif + +#endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return h2div(a, b); +#else + return __h2div(a, b); +#endif + +#else // EIGEN_CUDA_ARCH + float a1 = __low2float(a); float a2 = __high2float(a); float b1 = __low2float(b); @@ -202,9 +299,11 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv(const half2& a, cons float r1 = a1 / b1; float r2 = a2 / b2; return __floats2half2_rn(r1, r2); + +#endif } -template<> __device__ EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { float a1 = __low2float(a); float a2 = __high2float(a); float b1 = __low2float(b); @@ -214,7 +313,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmin(const half2& a, cons return __halves2half2(r1, r2); } -template<> __device__ EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) { float a1 = __low2float(a); float a2 = __high2float(a); float b1 = __low2float(b); @@ -224,18 +323,34 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax(const half2& a, cons return __halves2half2(r1, r2); } -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { -#if __CUDA_ARCH__ >= 530 +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hadd(__low2half(a), __high2half(a)); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 return __hadd(__low2half(a), __high2half(a)); #else float a1 = __low2float(a); float a2 = __high2float(a); - return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2))); + return Eigen::half(__float2half(a1 + a2)); +#endif + #endif } -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { -#if __CUDA_ARCH__ >= 530 +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + __half first = __low2half(a); + __half second = __high2half(a); + return __hgt(first, second) ? 
first : second; + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 __half first = __low2half(a); __half second = __high2half(a); return __hgt(first, second) ? first : second; @@ -244,10 +359,20 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const ha float a2 = __high2float(a); return a1 > a2 ? __low2half(a) : __high2half(a); #endif + +#endif } -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { -#if __CUDA_ARCH__ >= 530 +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + __half first = __low2half(a); + __half second = __high2half(a); + return __hlt(first, second) ? first : second; + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 __half first = __low2half(a); __half second = __high2half(a); return __hlt(first, second) ? first : second; @@ -256,19 +381,29 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const ha float a2 = __high2float(a); return a1 < a2 ? __low2half(a) : __high2half(a); #endif + +#endif } -template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { -#if __CUDA_ARCH__ >= 530 +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hmul(__low2half(a), __high2half(a)); + +#else // EIGEN_CUDA_ARCH + +#if EIGEN_CUDA_ARCH >= 530 return __hmul(__low2half(a), __high2half(a)); #else float a1 = __low2float(a); float a2 = __high2float(a); - return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2))); + return Eigen::half(__float2half(a1 * a2)); +#endif + #endif } -template<> __device__ EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { float a1 = __low2float(a); float a2 = __high2float(a); float r1 = log1pf(a1); @@ -276,7 +411,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { return __floats2half2_rn(r1, r2); } -template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { float a1 = __low2float(a); float a2 = __high2float(a); float r1 = expm1f(a1); @@ -284,31 +419,32 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { return __floats2half2_rn(r1, r2); } -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) -template<> __device__ EIGEN_STRONG_INLINE +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); } -template<> __device__ EIGEN_STRONG_INLINE +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); } -template<> __device__ EIGEN_STRONG_INLINE +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); } -template<> __device__ EIGEN_STRONG_INLINE +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); } #else -template<> __device__ EIGEN_STRONG_INLINE half2 plog(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { float a1 = __low2float(a); float a2 = __high2float(a); float r1 = logf(a1); @@ -316,7 +452,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog(const half2& a) { return 
__floats2half2_rn(r1, r2); } -template<> __device__ EIGEN_STRONG_INLINE half2 pexp(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { float a1 = __low2float(a); float a2 = __high2float(a); float r1 = expf(a1); @@ -324,7 +460,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return __floats2half2_rn(r1, r2); } -template<> __device__ EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { float a1 = __low2float(a); float a2 = __high2float(a); float r1 = sqrtf(a1); @@ -332,7 +468,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return __floats2half2_rn(r1, r2); } -template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { float a1 = __low2float(a); float a2 = __high2float(a); float r1 = rsqrtf(a1); @@ -361,10 +497,10 @@ struct packet_traits : default_packet_traits { AlignedOnScalar = 1, size = 16, HasHalfPacket = 0, - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -406,11 +542,30 @@ template<> EIGEN_STRONG_INLINE Packet16h ploadu(const Eigen::half* fr } template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet16h& from) { - _mm256_store_si256((__m256i*)to, from.x); + // (void*) -> workaround clang warning: + // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 + _mm256_store_si256((__m256i*)(void*)to, from.x); } template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet16h& from) { - _mm256_storeu_si256((__m256i*)to, from.x); + // (void*) -> workaround clang warning: + // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 + _mm256_storeu_si256((__m256i*)(void*)to, from.x); +} + +template<> EIGEN_STRONG_INLINE Packet16h +ploaddup(const Eigen::half* from) { + Packet16h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + unsigned short e = from[4].x; + unsigned short f = from[5].x; + unsigned short g = from[6].x; + unsigned short h = from[7].x; + result.x = _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a); + return result; } template<> EIGEN_STRONG_INLINE Packet16h @@ -485,6 +640,13 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { #endif } +template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) { + // FIXME we could do that with bit manipulation + Packet16f af = half2float(a); + Packet16f rf = pnegate(af); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, const Packet16h& b) { Packet16f af = half2float(a); Packet16f bf = half2float(b); @@ -492,6 +654,13 @@ template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, con return float2half(rf); } +template<> EIGEN_STRONG_INLINE Packet16h psub(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = psub(af, bf); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet16h pmul(const Packet16h& a, const Packet16h& b) { Packet16f af = half2float(a); Packet16f bf = half2float(b); @@ -504,6 +673,57 @@ template<> EIGEN_STRONG_INLINE half predux(const Packet16h& from) { return half(predux(from_float)); } +template<> 
EIGEN_STRONG_INLINE half predux_mul(const Packet16h& from) { + Packet16f from_float = half2float(from); + return half(predux_mul(from_float)); +} + +template<> EIGEN_STRONG_INLINE Packet16h preduxp(const Packet16h* p) { + Packet16f pf[16]; + pf[0] = half2float(p[0]); + pf[1] = half2float(p[1]); + pf[2] = half2float(p[2]); + pf[3] = half2float(p[3]); + pf[4] = half2float(p[4]); + pf[5] = half2float(p[5]); + pf[6] = half2float(p[6]); + pf[7] = half2float(p[7]); + pf[8] = half2float(p[8]); + pf[9] = half2float(p[9]); + pf[10] = half2float(p[10]); + pf[11] = half2float(p[11]); + pf[12] = half2float(p[12]); + pf[13] = half2float(p[13]); + pf[14] = half2float(p[14]); + pf[15] = half2float(p[15]); + Packet16f reduced = preduxp(pf); + return float2half(reduced); +} + +template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) +{ + __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + Packet16h res; + res.x = _mm256_insertf128_si256( + _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)), + _mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), 1); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet16h pinsertfirst(const Packet16h& a, Eigen::half b) +{ + Packet16h res; + res.x = _mm256_insert_epi16(a.x,b.x,0); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet16h pinsertlast(const Packet16h& a, Eigen::half b) +{ + Packet16h res; + res.x = _mm256_insert_epi16(a.x,b.x,15); + return res; +} + template<> EIGEN_STRONG_INLINE Packet16h pgather(const Eigen::half* from, Index stride) { Packet16h result; @@ -611,20 +831,20 @@ ptranspose(PacketBlock& kernel) { // NOTE: no unpacklo/hi instr in this case, so using permute instr. __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); - __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); - __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); - __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); - __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); - __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); - __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); - __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); - __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); - __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); - __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); - __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); - __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); - __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); - __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); + __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); + __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); + __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); + __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); + __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); + __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); + __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); + __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); + __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); + __m256i 
a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); + __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); + __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); + __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); + __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); kernel.packet[0].x = a_p_0; @@ -729,10 +949,10 @@ struct packet_traits : default_packet_traits { AlignedOnScalar = 1, size = 8, HasHalfPacket = 0, - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -781,6 +1001,17 @@ template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x); } +template<> EIGEN_STRONG_INLINE Packet8h +ploaddup(const Eigen::half* from) { + Packet8h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + result.x = _mm_set_epi16(d, d, c, c, b, b, a, a); + return result; +} + template<> EIGEN_STRONG_INLINE Packet8h ploadquad(const Eigen::half* from) { Packet8h result; @@ -834,6 +1065,13 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { + // FIXME we could do that with bit manipulation + Packet8f af = half2float(a); + Packet8f rf = pnegate(af); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { Packet8f af = half2float(a); Packet8f bf = half2float(b); @@ -841,6 +1079,13 @@ template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const return float2half(rf); } +template<> EIGEN_STRONG_INLINE Packet8h psub(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = psub(af, bf); + return float2half(rf); +} + template<> EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const Packet8h& b) { Packet8f af = half2float(a); Packet8f bf = half2float(b); @@ -893,6 +1138,52 @@ template<> EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8h& return Eigen::half(reduced); } +template<> EIGEN_STRONG_INLINE Packet8h preduxp(const Packet8h* p) { + Packet8f pf[8]; + pf[0] = half2float(p[0]); + pf[1] = half2float(p[1]); + pf[2] = half2float(p[2]); + pf[3] = half2float(p[3]); + pf[4] = half2float(p[4]); + pf[5] = half2float(p[5]); + pf[6] = half2float(p[6]); + pf[7] = half2float(p[7]); + Packet8f reduced = preduxp(pf); + return float2half(reduced); +} + +template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) +{ + __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); + Packet8h res; + res.x = _mm_shuffle_epi8(a.x,m); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet8h pinsertfirst(const Packet8h& a, Eigen::half b) +{ + Packet8h res; + res.x = _mm_insert_epi16(a.x,int(b.x),0); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::half b) +{ + Packet8h res; + res.x = _mm_insert_epi16(a.x,int(b.x),7); + return res; +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet8h& first, const Packet8h& second) + { + if (Offset!=0) + first.x = _mm_alignr_epi8(second.x,first.x, Offset*2); + } +}; + EIGEN_STRONG_INLINE void 
ptranspose(PacketBlock& kernel) { __m128i a = kernel.packet[0].x; @@ -1129,4 +1420,4 @@ ptranspose(PacketBlock& kernel) { } } -#endif // EIGEN_PACKET_MATH_HALF_CUDA_H +#endif // EIGEN_PACKET_MATH_HALF_GPU_H diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h similarity index 86% rename from Eigen/src/Core/arch/CUDA/TypeCasting.h rename to Eigen/src/Core/arch/GPU/TypeCasting.h index aa5fbce8e..57a55d08b 100644 --- a/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ b/Eigen/src/Core/arch/GPU/TypeCasting.h @@ -7,8 +7,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_TYPE_CASTING_CUDA_H -#define EIGEN_TYPE_CASTING_CUDA_H +#ifndef EIGEN_TYPE_CASTING_GPU_H +#define EIGEN_TYPE_CASTING_GPU_H namespace Eigen { @@ -19,7 +19,8 @@ struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef Eigen::half result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __float2half(a); #else return Eigen::half(a); @@ -37,7 +38,8 @@ struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef Eigen::half result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __float2half(static_cast(a)); #else return Eigen::half(static_cast(a)); @@ -55,7 +57,8 @@ struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef float result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __half2float(a); #else return static_cast(a); @@ -69,7 +72,8 @@ struct functor_traits > -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) template <> struct type_casting_traits { @@ -209,4 +213,4 @@ template<> EIGEN_STRONG_INLINE Packet4h pcast(const Packet4f } // end namespace Eigen -#endif // EIGEN_TYPE_CASTING_CUDA_H +#endif // EIGEN_TYPE_CASTING_GPU_H diff --git a/Eigen/src/Core/arch/HIP/hcc/math_constants.h b/Eigen/src/Core/arch/HIP/hcc/math_constants.h new file mode 100644 index 000000000..25375a0a4 --- /dev/null +++ b/Eigen/src/Core/arch/HIP/hcc/math_constants.h @@ -0,0 +1,23 @@ +/* + * math_constants.h - + * HIP equivalent of the CUDA header of the same name + */ + +#ifndef __MATH_CONSTANTS_H__ +#define __MATH_CONSTANTS_H__ + +/* single precision constants */ + +#define HIPRT_INF_F __int_as_float(0x7f800000) +#define HIPRT_NAN_F __int_as_float(0x7fffffff) +#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001) +#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff) +#define HIPRT_NEG_ZERO_F 
__int_as_float(0x80000000) +#define HIPRT_ZERO_F 0.0f +#define HIPRT_ONE_F 1.0f + +/* double precision constants */ +#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000) +#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000) + +#endif diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h new file mode 100644 index 000000000..9a45cf51e --- /dev/null +++ b/Eigen/src/Core/arch/MSA/Complex.h @@ -0,0 +1,759 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Wave Computing, Inc. +// Written by: +// Chris Larsen +// Alexey Frunze (afrunze@wavecomp.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_COMPLEX_MSA_H +#define EIGEN_COMPLEX_MSA_H + +#include + +namespace Eigen { + +namespace internal { + +//---------- float ---------- +struct Packet2cf { + EIGEN_STRONG_INLINE Packet2cf() { + } + EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex& a, + const std::complex& b) { + Packet4f t = { std::real(a), std::imag(a), std::real(b), std::imag(b) }; + v = t; + } + EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) { + } + EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) { + } + EIGEN_STRONG_INLINE Packet2cf& operator=(const Packet2cf& b) { + v = b.v; + return *this; + } + EIGEN_STRONG_INLINE Packet2cf conjugate(void) const { + return Packet2cf((Packet4f)__builtin_msa_bnegi_d((v2u64)v, 63)); + } + EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) { + Packet4f v1, v2; + + // Get the real values of a | a1_re | a1_re | a2_re | a2_re | + v1 = (Packet4f)__builtin_msa_ilvev_w((v4i32)v, (v4i32)v); + // Get the imag values of a | a1_im | a1_im | a2_im | a2_im | + v2 = (Packet4f)__builtin_msa_ilvod_w((v4i32)v, (v4i32)v); + // Multiply the real a with b + v1 = pmul(v1, b.v); + // Multiply the imag a with b + v2 = pmul(v2, b.v); + // Conjugate v2 + v2 = Packet2cf(v2).conjugate().v; + // Swap real/imag elements in v2. 
+ v2 = (Packet4f)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(1, 0, 3, 2)); + // Add and return the result + v = padd(v1, v2); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const { + return Packet2cf(*this) *= b; + } + EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) { + v = padd(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const { + return Packet2cf(*this) += b; + } + EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) { + v = psub(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const { + return Packet2cf(*this) -= b; + } + EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) { + *this *= b.conjugate(); + Packet4f s = pmul(b.v, b.v); + s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + v = pdiv(v, s); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const { + return Packet2cf(*this) /= b; + } + EIGEN_STRONG_INLINE Packet2cf operator-(void) const { + return Packet2cf(pnegate(v)); + } + + Packet4f v; +}; + +inline std::ostream& operator<<(std::ostream& os, const Packet2cf& value) { + os << "[ (" << value.v[0] << ", " << value.v[1] + << "i)," + " (" + << value.v[2] << ", " << value.v[3] << "i) ]"; + return os; +} + +template <> +struct packet_traits > : default_packet_traits { + typedef Packet2cf type; + typedef Packet2cf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0, + HasBlend = 1 + }; +}; + +template <> +struct unpacket_traits { + typedef std::complex type; + enum { size = 2, alignment = Aligned16 }; + typedef Packet2cf half; +}; + +template <> +EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { + EIGEN_MSA_DEBUG; + + float f0 = from.real(), f1 = from.imag(); + Packet4f v0 = { f0, f0, f0, f0 }; + Packet4f v1 = { f1, f1, f1, f1 }; + return Packet2cf((Packet4f)__builtin_msa_ilvr_w((Packet4i)v1, (Packet4i)v0)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return a + b; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return a - b; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return -a; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return a.conjugate(); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return a * b; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pand(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return Packet2cf(pand(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf por(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return Packet2cf(por(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pxor(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return Packet2cf(pxor(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return Packet2cf(pandnot(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) { + EIGEN_MSA_DEBUG; + + 
EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { + EIGEN_MSA_DEBUG; + + return pset1(*from); +} + +template <> +EIGEN_STRONG_INLINE void pstore >(std::complex* to, + const Packet2cf& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, + const Packet2cf& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>( + const std::complex* from, Index stride) { + EIGEN_MSA_DEBUG; + + return Packet2cf(from[0 * stride], from[1 * stride]); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, + const Packet2cf& from, + Index stride) { + EIGEN_MSA_DEBUG; + + *to = std::complex(from.v[0], from.v[1]); + to += stride; + *to = std::complex(from.v[2], from.v[3]); +} + +template <> +EIGEN_STRONG_INLINE void prefetch >(const std::complex* addr) { + EIGEN_MSA_DEBUG; + + prefetch(reinterpret_cast(addr)); +} + +template <> +EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return std::complex(a.v[0], a.v[1]); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); +} + +template <> +EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + Packet4f value = (Packet4f)preverse((Packet2d)a.v); + value += a.v; + return std::complex(value[0], value[1]); +} + +template <> +EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) { + EIGEN_MSA_DEBUG; + + Packet4f sum1, sum2, sum; + + // Add the first two 64-bit float32x2_t of vecs[0] + sum1 = (Packet4f)__builtin_msa_ilvr_d((v2i64)vecs[1].v, (v2i64)vecs[0].v); + sum2 = (Packet4f)__builtin_msa_ilvl_d((v2i64)vecs[1].v, (v2i64)vecs[0].v); + sum = padd(sum1, sum2); + + return Packet2cf(sum); +} + +template <> +EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) { + EIGEN_MSA_DEBUG; + + return std::complex((a.v[0] * a.v[2]) - (a.v[1] * a.v[3]), + (a.v[0] * a.v[3]) + (a.v[1] * a.v[2])); +} + +template +struct palign_impl { + EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) { + if (Offset == 1) { + first.v = (Packet4f)__builtin_msa_sldi_b((v16i8)second.v, (v16i8)first.v, Offset * 8); + } + } +}; + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, + const Packet2cf& c) const { + return padd(pmul(x, y), c); + } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { + return internal::pmul(a, pconj(b)); + } +}; + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, + const Packet2cf& c) const { + return padd(pmul(x, y), c); + } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { + return internal::pmul(pconj(a), b); + } +}; + +template <> 
+struct conj_helper { + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, + const Packet2cf& c) const { + return padd(pmul(x, y), c); + } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { + return pconj(internal::pmul(a, b)); + } +}; + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f) + +template <> +EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { + EIGEN_MSA_DEBUG; + + return a / b; +} + +inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { + os << "[ " << value.packet[0] << ", " << std::endl << " " << value.packet[1] << " ]"; + return os; +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + EIGEN_MSA_DEBUG; + + Packet4f tmp = + (Packet4f)__builtin_msa_ilvl_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v); + kernel.packet[0].v = + (Packet4f)__builtin_msa_ilvr_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v); + kernel.packet[1].v = tmp; +} + +template <> +EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, + const Packet2cf& elsePacket) { + return (Packet2cf)(Packet4f)pblend(ifPacket, (Packet2d)thenPacket.v, + (Packet2d)elsePacket.v); +} + +//---------- double ---------- + +struct Packet1cd { + EIGEN_STRONG_INLINE Packet1cd() { + } + EIGEN_STRONG_INLINE explicit Packet1cd(const std::complex& a) { + v[0] = std::real(a); + v[1] = std::imag(a); + } + EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) { + } + EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) { + } + EIGEN_STRONG_INLINE Packet1cd& operator=(const Packet1cd& b) { + v = b.v; + return *this; + } + EIGEN_STRONG_INLINE Packet1cd conjugate(void) const { + static const v2u64 p2ul_CONJ_XOR = { 0x0, 0x8000000000000000 }; + return (Packet1cd)pxor(v, (Packet2d)p2ul_CONJ_XOR); + } + EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) { + Packet2d v1, v2; + + // Get the real values of a | a1_re | a1_re + v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)v, (v2i64)v); + // Get the imag values of a | a1_im | a1_im + v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)v, (v2i64)v); + // Multiply the real a with b + v1 = pmul(v1, b.v); + // Multiply the imag a with b + v2 = pmul(v2, b.v); + // Conjugate v2 + v2 = Packet1cd(v2).conjugate().v; + // Swap real/imag elements in v2. 
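The division operators in this file reduce `a / b` to a multiply by `conj(b)` followed by a divide by `|b|^2` (the `s` vector in `operator/=` holds `re(b)^2 + im(b)^2` in every lane). A scalar check of that identity (hypothetical demo, not part of the patch):

    // a / b == (a * conj(b)) / |b|^2, the identity behind operator/=.
    #include <cassert>
    #include <complex>

    int main() {
      std::complex<float> a(1.0f, -2.0f), b(3.0f, 0.5f);
      std::complex<float> direct = a / b;
      std::complex<float> via_conj = (a * std::conj(b)) / std::norm(b);  // norm(b) = |b|^2
      assert(std::abs(direct - via_conj) < 1e-5f);
    }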
+ v2 = (Packet2d)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); + // Add and return the result + v = padd(v1, v2); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const { + return Packet1cd(*this) *= b; + } + EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) { + v = padd(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const { + return Packet1cd(*this) += b; + } + EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) { + v = psub(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const { + return Packet1cd(*this) -= b; + } + EIGEN_STRONG_INLINE Packet1cd& operator/=(const Packet1cd& b) { + *this *= b.conjugate(); + Packet2d s = pmul(b.v, b.v); + s = padd(s, preverse(s)); + v = pdiv(v, s); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const { + return Packet1cd(*this) /= b; + } + EIGEN_STRONG_INLINE Packet1cd operator-(void) const { + return Packet1cd(pnegate(v)); + } + + Packet2d v; +}; + +inline std::ostream& operator<<(std::ostream& os, const Packet1cd& value) { + os << "[ (" << value.v[0] << ", " << value.v[1] << "i) ]"; + return os; +} + +template <> +struct packet_traits > : default_packet_traits { + typedef Packet1cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 1, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef std::complex type; + enum { size = 1, alignment = Aligned16 }; + typedef Packet1cd half; +}; + +template <> +EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { + EIGEN_MSA_DEBUG; + + return Packet1cd(from); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return a + b; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return a - b; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return -a; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return a.conjugate(); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return a * b; +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pand(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return Packet1cd(pand(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd por(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return Packet1cd(por(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pxor(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return Packet1cd(pxor(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { + EIGEN_MSA_DEBUG; + + return Packet1cd(pandnot(a.v, b.v)); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd ploaddup(const 
std::complex* from) { + EIGEN_MSA_DEBUG; + + return pset1(*from); +} + +template <> +EIGEN_STRONG_INLINE void pstore >(std::complex* to, + const Packet1cd& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, + const Packet1cd& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); +} + +template <> +EIGEN_STRONG_INLINE void prefetch >(const std::complex* addr) { + EIGEN_MSA_DEBUG; + + prefetch(reinterpret_cast(addr)); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>( + const std::complex* from, Index stride __attribute__((unused))) { + EIGEN_MSA_DEBUG; + + Packet1cd res; + res.v[0] = std::real(from[0]); + res.v[1] = std::imag(from[0]); + return res; +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, + const Packet1cd& from, + Index stride + __attribute__((unused))) { + EIGEN_MSA_DEBUG; + + pstore(to, from); +} + +template <> +EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return std::complex(a.v[0], a.v[1]); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return a; +} + +template <> +EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return pfirst(a); +} + +template <> +EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) { + EIGEN_MSA_DEBUG; + + return vecs[0]; +} + +template <> +EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { + EIGEN_MSA_DEBUG; + + return pfirst(a); +} + +template +struct palign_impl { + static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) { + // FIXME is it sure we never have to align a Packet1cd? + // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes + // boundary... 
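The FIXME above points at a real gap: `sizeof(std::complex<double>)` is 16 bytes, but its alignment requirement is only that of `double`, which is weaker than the packet's `Aligned16`. A quick probe (hypothetical demo, not part of the patch):

    // sizeof(std::complex<double>) is 16, yet alignof is only that of double.
    #include <complex>
    #include <iostream>

    int main() {
      std::cout << "sizeof  = " << sizeof(std::complex<double>) << '\n'    // 16
                << "alignof = " << alignof(std::complex<double>) << '\n';  // typically 8
    }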
+  }
+};
+
+template <>
+struct conj_helper<Packet1cd, Packet1cd, false, true> {
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
+                                      const Packet1cd& c) const {
+    return padd(pmul(x, y), c);
+  }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template <>
+struct conj_helper<Packet1cd, Packet1cd, true, false> {
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
+                                      const Packet1cd& c) const {
+    return padd(pmul(x, y), c);
+  }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template <>
+struct conj_helper<Packet1cd, Packet1cd, true, true> {
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
+                                      const Packet1cd& c) const {
+    return padd(pmul(x, y), c);
+  }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
+    return pconj(internal::pmul(a, b));
+  }
+};
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a / b;
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet1cd, 2>& value) {
+  os << "[ " << value.packet[0] << ", " << std::endl << "  " << value.packet[1] << " ]";
+  return os;
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d v1, v2;
+
+  v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
+  // Get the imag values of a
+  v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
+
+  kernel.packet[0].v = v1;
+  kernel.packet[1].v = v2;
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_COMPLEX_MSA_H
diff --git a/Eigen/src/Core/arch/MSA/MathFunctions.h b/Eigen/src/Core/arch/MSA/MathFunctions.h
new file mode 100644
index 000000000..98e23e36f
--- /dev/null
+++ b/Eigen/src/Core/arch/MSA/MathFunctions.h
@@ -0,0 +1,387 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+// Copyright (C) 2016 Gael Guennebaud
+//
+// Copyright (C) 2018 Wave Computing, Inc.
+// Written by:
+//   Chris Larsen
+//   Alexey Frunze (afrunze@wavecomp.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+/* The tanh function of this file is an adaptation of
+ * template <typename T> T generic_fast_tanh_float(const T&)
+ * from MathFunctionsImpl.h.
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_MSA_H
+#define EIGEN_MATH_FUNCTIONS_MSA_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
+plog<Packet4f>(const Packet4f& _x) {
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174e-1f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+  static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+
+  // Convert negative argument into NAN (quiet negative, to be specific).
+  Packet4f zero = (Packet4f)__builtin_msa_ldi_w(0);
+  Packet4i neg_mask = __builtin_msa_fclt_w(_x, zero);
+  Packet4i zero_mask = __builtin_msa_fceq_w(_x, zero);
+  Packet4f non_neg_x_or_nan = padd(_x, (Packet4f)neg_mask);  // Add 0.0 or NAN.
+  Packet4f x = non_neg_x_or_nan;
+
+  // Extract exponent from x = mantissa * 2**exponent, where 1.0 <= mantissa < 2.0.
+  // N.B. the exponent is one less of what frexpf() would return.
+  Packet4i e_int = __builtin_msa_ftint_s_w(__builtin_msa_flog2_w(x));
+  // Multiply x by 2**(-exponent-1) to get 0.5 <= x < 1.0 as from frexpf().
+  x = __builtin_msa_fexp2_w(x, (Packet4i)__builtin_msa_nori_b((v16u8)e_int, 0));
+
+  /*
+     if (x < SQRTHF) {
+       x = x + x - 1.0;
+     } else {
+       e += 1;
+       x = x - 1.0;
+     }
+  */
+  Packet4f xx = padd(x, x);
+  Packet4i ge_mask = __builtin_msa_fcle_w(p4f_cephes_SQRTHF, x);
+  e_int = psub(e_int, ge_mask);
+  x = (Packet4f)__builtin_msa_bsel_v((v16u8)ge_mask, (v16u8)xx, (v16u8)x);
+  x = psub(x, p4f_1);
+  Packet4f e = __builtin_msa_ffint_s_w(e_int);
+
+  Packet4f x2 = pmul(x, x);
+  Packet4f x3 = pmul(x2, x);
+
+  Packet4f y, y1, y2;
+  y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
+  y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
+  y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
+  y = pmadd(y, x, p4f_cephes_log_p2);
+  y1 = pmadd(y1, x, p4f_cephes_log_p5);
+  y2 = pmadd(y2, x, p4f_cephes_log_p8);
+  y = pmadd(y, x3, y1);
+  y = pmadd(y, x3, y2);
+  y = pmul(y, x3);
+
+  y = pmadd(e, p4f_cephes_log_q1, y);
+  x = __builtin_msa_fmsub_w(x, x2, p4f_half);
+  x = padd(x, y);
+  x = pmadd(e, p4f_cephes_log_q2, x);
+
+  // x is now the logarithm result candidate. We still need to handle the
+  // extreme arguments of zero and positive infinity, though.
+  // N.B. if the argument is +INFINITY, x is NAN because the polynomial terms
+  // contain infinities of both signs (see the coefficients and code above).
+  // INFINITY - INFINITY is NAN.
+
+  // If the argument is +INFINITY, make it the new result candidate.
+  // To achieve that we choose the smaller of the result candidate and the
+  // argument.
+  // This is correct for all finite pairs of values (the logarithm is smaller
+  // than the argument).
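The frexp-style split above leans on three MSA instructions: `flog2` gives log2(x), `ftint` rounds it to an integer e, and `fexp2` with `~e` (which equals `-e - 1` in two's complement) rescales x by 2^(-e-1). Because `ftint` rounds to nearest in the default rounding mode rather than down, the mantissa can land slightly below 0.5; the SQRTHF branch right after compensates. A scalar model (hypothetical demo, not part of the patch):

    // Scalar model of the flog2/ftint/fexp2 mantissa-exponent split.
    #include <cassert>
    #include <cmath>

    int main() {
      float x = 200.0f;
      int e = (int)std::rint(std::log2(x));  // ftint_s_w(flog2_w(x))
      float m = std::ldexp(x, -(e + 1));     // fexp2_w(x, ~e): scale by 2^(-e-1)
      assert(m > 0.25f && m < 1.0f);         // near [0.354, 0.707]; SQRTHF fixes the rest
      assert(std::ldexp(m, e + 1) == x);     // power-of-two scaling is exact
    }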
+ // This is also correct in the special case when the argument is +INFINITY + // and the result candidate is NAN. This is because the fmin.df instruction + // prefers non-NANs to NANs. + x = __builtin_msa_fmin_w(x, non_neg_x_or_nan); + + // If the argument is zero (including -0.0), the result becomes -INFINITY. + Packet4i neg_infs = __builtin_msa_slli_w(zero_mask, 23); + x = (Packet4f)__builtin_msa_bsel_v((v16u8)zero_mask, (v16u8)x, (v16u8)neg_infs); + + return x; +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +pexp(const Packet4f& _x) { + // Limiting single-precision pexp's argument to [-128, +128] lets pexp + // reach 0 and INFINITY naturally. + static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f); + static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, +128.0f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894e-2f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); + + Packet4f x = _x; + + // Clamp x. + x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x, + (v16u8)p4f_exp_lo); + x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x, + (v16u8)p4f_exp_hi); + + // Round to nearest integer by adding 0.5 (with x's sign) and truncating. + Packet4f x2_add = (Packet4f)__builtin_msa_binsli_w((v4u32)p4f_half, (v4u32)x, 0); + Packet4f x2 = pmadd(x, p4f_cephes_LOG2EF, x2_add); + Packet4i x2_int = __builtin_msa_ftrunc_s_w(x2); + Packet4f x2_int_f = __builtin_msa_ffint_s_w(x2_int); + + x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C1); + x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C2); + + Packet4f z = pmul(x, x); + + Packet4f y = p4f_cephes_exp_p0; + y = pmadd(y, x, p4f_cephes_exp_p1); + y = pmadd(y, x, p4f_cephes_exp_p2); + y = pmadd(y, x, p4f_cephes_exp_p3); + y = pmadd(y, x, p4f_cephes_exp_p4); + y = pmadd(y, x, p4f_cephes_exp_p5); + y = pmadd(y, z, x); + y = padd(y, p4f_1); + + // y *= 2**exponent. + y = __builtin_msa_fexp2_w(y, x2_int); + + return y; +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +ptanh(const Packet4f& _x) { + static _EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 1e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f); + // The monomial coefficients of the numerator polynomial (odd). + static _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-5f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-8f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f); + static _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f); + // The monomial coefficients of the denominator polynomial (even). 
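The float pexp above uses the classic Cody-and-Waite reduction: n = round(x * log2(e)), then r = x - n*C1 - n*C2, where C1 + C2 is ln(2) split into a coarse and a fine part, so exp(x) = exp(r) * 2^n with r small. A scalar model (hypothetical demo, not part of the patch):

    // Scalar model of pexp's two-constant range reduction.
    #include <cassert>
    #include <cmath>

    int main() {
      const float LOG2EF = 1.44269504088896341f;
      const float C1 = 0.693359375f, C2 = -2.12194440e-4f;  // C1 + C2 ~= ln(2)
      float x = 3.7f;
      float n = std::rint(x * LOG2EF);
      float r = (x - n * C1) - n * C2;  // the two fmsub_w steps
      float rebuilt = std::exp(r) * std::ldexp(1.0f, (int)n);
      assert(std::fabs(rebuilt - std::exp(x)) < 1e-4f * std::exp(x));
    }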
+ static _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-6f); + + Packet4f x = pabs(_x); + Packet4i tiny_mask = __builtin_msa_fclt_w(x, p4f_tanh_tiny); + + // Clamp the inputs to the range [-9, 9] since anything outside + // this range is -/+1.0f in single-precision. + x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x, + (v16u8)p4f_tanh_hi); + + // Since the polynomials are odd/even, we need x**2. + Packet4f x2 = pmul(x, x); + + // Evaluate the numerator polynomial p. + Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11); + p = pmadd(x2, p, p4f_alpha_9); + p = pmadd(x2, p, p4f_alpha_7); + p = pmadd(x2, p, p4f_alpha_5); + p = pmadd(x2, p, p4f_alpha_3); + p = pmadd(x2, p, p4f_alpha_1); + p = pmul(x, p); + + // Evaluate the denominator polynomial q. + Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4); + q = pmadd(x2, q, p4f_beta_2); + q = pmadd(x2, q, p4f_beta_0); + + // Divide the numerator by the denominator. + p = pdiv(p, q); + + // Reinstate the sign. + p = (Packet4f)__builtin_msa_binsli_w((v4u32)p, (v4u32)_x, 0); + + // When the argument is very small in magnitude it's more accurate to just return it. + p = (Packet4f)__builtin_msa_bsel_v((v16u8)tiny_mask, (v16u8)p, (v16u8)_x); + + return p; +} + +template +Packet4f psincos_inner_msa_float(const Packet4f& _x) { + static _EIGEN_DECLARE_CONST_Packet4f(sincos_max_arg, 13176795.0f); // Approx. (2**24) / (4/Pi). + static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f); + static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); + static _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891e-4f); + static _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611e-1f); + static _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948e-5f); + static _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765e-3f); + static _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827e-2f); + static _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4/Pi. + static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); + + Packet4f x = pabs(_x); + + // Translate infinite arguments into NANs. + Packet4f zero_or_nan_if_inf = psub(_x, _x); + x = padd(x, zero_or_nan_if_inf); + // Prevent sin/cos from generating values larger than 1.0 in magnitude + // for very large arguments by setting x to 0.0. + Packet4i small_or_nan_mask = __builtin_msa_fcult_w(x, p4f_sincos_max_arg); + x = pand(x, (Packet4f)small_or_nan_mask); + + // Scale x by 4/Pi to find x's octant. + Packet4f y = pmul(x, p4f_cephes_FOPI); + // Get the octant. We'll reduce x by this number of octants or by one more than it. + Packet4i y_int = __builtin_msa_ftrunc_s_w(y); + // x's from even-numbered octants will translate to octant 0: [0, +Pi/4]. + // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0]. + // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1). + Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1); + Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0); + y = __builtin_msa_ffint_s_w(y_int2); + + // Compute the sign to apply to the polynomial. 
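The octant logic just computed y_int2, the even-rounded octant count; the sign and polynomial selection below key off it. What the reduction amounts to, as a scalar model (hypothetical demo, not part of the patch):

    // Scalar model of the octant-based argument reduction for sin/cos.
    #include <cassert>
    #include <cmath>

    int main() {
      const float FOPI = 1.27323954473516f;  // 4/pi
      const float DP1 = -0.78515625f;        // DP1+DP2+DP3 ~= -pi/4, split for precision
      const float DP2 = -2.4187564849853515625e-4f;
      const float DP3 = -3.77489497744594108e-8f;
      float x = 10.0f;                       // psincos works on |x|
      int y = (int)(x * FOPI);               // ftrunc: octant index
      y = (y + 1) & ~1;                      // addvi + bclri: round up to an even octant
      float yf = (float)y;
      float r = x + yf * DP1 + yf * DP2 + yf * DP3;  // x - y*(pi/4), extended precision
      assert(std::fabs(r) < 0.786f);         // reduced to roughly [-pi/4, +pi/4]
    }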
+ Packet4i sign_mask = sine ? pxor(__builtin_msa_slli_w(y_int1, 29), (Packet4i)_x) + : __builtin_msa_slli_w(__builtin_msa_addvi_w(y_int, 3), 29); + + // Get the polynomial selection mask. + // We'll calculate both (sin and cos) polynomials and then select from the two. + Packet4i poly_mask = __builtin_msa_ceqi_w(__builtin_msa_slli_w(y_int2, 30), 0); + + // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4. + // The magic pass: "Extended precision modular arithmetic" + // x = ((x - y * DP1) - y * DP2) - y * DP3 + Packet4f tmp1 = pmul(y, p4f_minus_cephes_DP1); + Packet4f tmp2 = pmul(y, p4f_minus_cephes_DP2); + Packet4f tmp3 = pmul(y, p4f_minus_cephes_DP3); + x = padd(x, tmp1); + x = padd(x, tmp2); + x = padd(x, tmp3); + + // Evaluate the cos(x) polynomial. + y = p4f_coscof_p0; + Packet4f z = pmul(x, x); + y = pmadd(y, z, p4f_coscof_p1); + y = pmadd(y, z, p4f_coscof_p2); + y = pmul(y, z); + y = pmul(y, z); + y = __builtin_msa_fmsub_w(y, z, p4f_half); + y = padd(y, p4f_1); + + // Evaluate the sin(x) polynomial. + Packet4f y2 = p4f_sincof_p0; + y2 = pmadd(y2, z, p4f_sincof_p1); + y2 = pmadd(y2, z, p4f_sincof_p2); + y2 = pmul(y2, z); + y2 = pmadd(y2, x, x); + + // Select the correct result from the two polynomials. + y = sine ? (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y, (v16u8)y2) + : (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y2, (v16u8)y); + + // Update the sign. + sign_mask = pxor(sign_mask, (Packet4i)y); + y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0); + return y; +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +psin(const Packet4f& x) { + return psincos_inner_msa_float(x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +pcos(const Packet4f& x) { + return psincos_inner_msa_float(x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d +pexp(const Packet2d& _x) { + // Limiting double-precision pexp's argument to [-1024, +1024] lets pexp + // reach 0 and INFINITY naturally. + static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0); + static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, +1024.0); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); + static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); + static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); + static _EIGEN_DECLARE_CONST_Packet2d(1, 1.0); + static _EIGEN_DECLARE_CONST_Packet2d(2, 2.0); + + Packet2d x = _x; + + // Clamp x. + x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(x, p2d_exp_lo), (v16u8)x, + (v16u8)p2d_exp_lo); + x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(p2d_exp_hi, x), (v16u8)x, + (v16u8)p2d_exp_hi); + + // Round to nearest integer by adding 0.5 (with x's sign) and truncating. 
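The comment above describes the same rounding trick as the float path: `binsli` copies x's sign bit onto 0.5, and adding that signed half before `ftrunc` rounds half away from zero. A scalar model (hypothetical demo, not part of the patch):

    // Round-to-nearest via "add 0.5 with x's sign, then truncate".
    #include <cassert>
    #include <cmath>

    float round_half_away(float x) {
      float half = std::copysign(0.5f, x);  // binsli: 0.5 carrying x's sign bit
      return std::trunc(x + half);          // ftrunc
    }

    int main() {
      assert(round_half_away(2.3f) == 2.0f);
      assert(round_half_away(2.7f) == 3.0f);
      assert(round_half_away(-2.3f) == -2.0f);
      assert(round_half_away(-2.7f) == -3.0f);
    }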
+ Packet2d x2_add = (Packet2d)__builtin_msa_binsli_d((v2u64)p2d_half, (v2u64)x, 0); + Packet2d x2 = pmadd(x, p2d_cephes_LOG2EF, x2_add); + Packet2l x2_long = __builtin_msa_ftrunc_s_d(x2); + Packet2d x2_long_d = __builtin_msa_ffint_s_d(x2_long); + + x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C1); + x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C2); + + x2 = pmul(x, x); + + Packet2d px = p2d_cephes_exp_p0; + px = pmadd(px, x2, p2d_cephes_exp_p1); + px = pmadd(px, x2, p2d_cephes_exp_p2); + px = pmul(px, x); + + Packet2d qx = p2d_cephes_exp_q0; + qx = pmadd(qx, x2, p2d_cephes_exp_q1); + qx = pmadd(qx, x2, p2d_cephes_exp_q2); + qx = pmadd(qx, x2, p2d_cephes_exp_q3); + + x = pdiv(px, psub(qx, px)); + x = pmadd(p2d_2, x, p2d_1); + + // x *= 2**exponent. + x = __builtin_msa_fexp2_d(x, x2_long); + + return x; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_MSA_H diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h new file mode 100644 index 000000000..094c874ee --- /dev/null +++ b/Eigen/src/Core/arch/MSA/PacketMath.h @@ -0,0 +1,1317 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Wave Computing, Inc. +// Written by: +// Chris Larsen +// Alexey Frunze (afrunze@wavecomp.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_MSA_H +#define EIGEN_PACKET_MATH_MSA_H + +#include +#include + +namespace Eigen { + +namespace internal { + +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#endif + +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 +#endif + +#if 0 +#define EIGEN_MSA_DEBUG \ + static bool firstTime = true; \ + do { \ + if (firstTime) { \ + std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \ + firstTime = false; \ + } \ + } while (0) +#else +#define EIGEN_MSA_DEBUG +#endif + +#define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a)) + +typedef v4f32 Packet4f; +typedef v4i32 Packet4i; +typedef v4u32 Packet4ui; + +#define _EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X } +#define _EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X } +#define _EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X } + +inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) { + os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Packet4i& value) { + os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Packet4ui& value) { + os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]"; + return os; +} + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4f type; + typedef Packet4f half; // Packet2f intrinsics not implemented yet + enum { + 
Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, // Packet2f intrinsics not implemented yet + // FIXME check the Has* + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasTanh = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasBlend = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4i type; + typedef Packet4i half; // Packet2i intrinsics not implemented yet + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, // Packet2i intrinsics not implemented yet + // FIXME check the Has* + HasDiv = 1, + HasBlend = 1 + }; +}; + +template <> +struct unpacket_traits { + typedef float type; + enum { size = 4, alignment = Aligned16 }; + typedef Packet4f half; +}; + +template <> +struct unpacket_traits { + typedef int32_t type; + enum { size = 4, alignment = Aligned16 }; + typedef Packet4i half; +}; + +template <> +EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { + EIGEN_MSA_DEBUG; + + Packet4f v = { from, from, from, from }; + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fill_w(from); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pload1(const float* from) { + EIGEN_MSA_DEBUG; + + float f = *from; + Packet4f v = { f, f, f, f }; + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4i pload1(const int32_t* from) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fill_w(*from); +} + +template <> +EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fadd_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_addv_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f plset(const float& a) { + EIGEN_MSA_DEBUG; + + static const Packet4f countdown = { 0.0f, 1.0f, 2.0f, 3.0f }; + return padd(pset1(a), countdown); +} + +template <> +EIGEN_STRONG_INLINE Packet4i plset(const int32_t& a) { + EIGEN_MSA_DEBUG; + + static const Packet4i countdown = { 0, 1, 2, 3 }; + return padd(pset1(a), countdown); +} + +template <> +EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fsub_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_subv_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)a, 0), 1); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fmul_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_mulv_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fdiv_w(a, b); +} 
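pnegate<Packet4i> above has no single negate instruction to lean on, so it builds two's complement by hand: `nori` flips every bit and `addvi` adds one. The identity, checked in scalar form (hypothetical demo, not part of the patch):

    // -a == ~a + 1 for two's-complement integers.
    #include <cassert>
    #include <cstdint>

    int main() {
      for (int32_t a : { 0, 1, -1, 42, -12345 }) {
        int32_t negated = (int32_t)(~(uint32_t)a + 1u);  // nori_b + addvi_w
        assert(negated == -a);
      }
    }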
+ +template <> +EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_div_s_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fmadd_w(c, a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { + EIGEN_MSA_DEBUG; + + // Use "asm" construct to avoid __builtin_msa_maddv_w GNU C bug. + Packet4i value = c; + __asm__("maddv.w %w[value], %w[a], %w[b]\n" + // Outputs + : [value] "+f"(value) + // Inputs + : [a] "f"(a), [b] "f"(b)); + return value; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_and_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return (Packet4i)__builtin_msa_and_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_or_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return (Packet4i)__builtin_msa_or_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_xor_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return (Packet4i)__builtin_msa_xor_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + + return pand(a, (Packet4f)__builtin_msa_xori_b((v16u8)b, 255)); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return pand(a, (Packet4i)__builtin_msa_xori_b((v16u8)b, 255)); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + // This prefers numbers to NaNs. + return __builtin_msa_fmin_w(a, b); +#else + // This prefers NaNs to numbers. + Packet4i aNaN = __builtin_msa_fcun_w(a, a); + Packet4i aMinOrNaN = por(__builtin_msa_fclt_w(a, b), aNaN); + return (Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_min_s_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + // This prefers numbers to NaNs. + return __builtin_msa_fmax_w(a, b); +#else + // This prefers NaNs to numbers. 
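The two pmin variants above pick opposite NaN conventions: the fast-math path uses `fmin.w`, which returns the numeric operand when the other is NaN, while the strict path (and the pmax path continuing below) propagates NaNs through `fcun` + `fclt` + `bsel`. A scalar model of the strict variant (hypothetical demo, not part of the patch):

    // Strict pmin: return a when a < b or a is NaN, else b (so NaNs propagate).
    #include <cassert>
    #include <cmath>

    float pmin_strict(float a, float b) {
      bool a_min_or_nan = (a < b) || std::isnan(a);  // fclt_w | fcun_w
      return a_min_or_nan ? a : b;                   // bsel_v
    }

    int main() {
      assert(pmin_strict(1.0f, 2.0f) == 1.0f);
      assert(std::isnan(pmin_strict(std::nanf(""), 2.0f)));  // NaN in a wins
      assert(std::isnan(pmin_strict(2.0f, std::nanf(""))));  // NaN in b falls through
    }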
+ Packet4i aNaN = __builtin_msa_fcun_w(a, a); + Packet4i aMaxOrNaN = por(__builtin_msa_fclt_w(b, a), aNaN); + return (Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_max_s_w(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pload(const float* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_LOAD return __builtin_msa_ld_w(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4i)__builtin_msa_ld_w(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) { + EIGEN_MSA_DEBUG; + + float f0 = from[0], f1 = from[1]; + Packet4f v0 = { f0, f0, f0, f0 }; + Packet4f v1 = { f1, f1, f1, f1 }; + return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0); +} + +template <> +EIGEN_STRONG_INLINE Packet4i ploaddup(const int32_t* from) { + EIGEN_MSA_DEBUG; + + int32_t i0 = from[0], i1 = from[1]; + Packet4i v0 = { i0, i0, i0, i0 }; + Packet4i v1 = { i1, i1, i1, i1 }; + return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0); +} + +template <> +EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(from, to, 0); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(from, to, 0); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { + EIGEN_MSA_DEBUG; + + float f = *from; + Packet4f v = { f, f, f, f }; + v[1] = from[stride]; + v[2] = from[2 * stride]; + v[3] = from[3 * stride]; + return v; +} + +template <> +EIGEN_DEVICE_FUNC inline Packet4i pgather(const int32_t* from, Index stride) { + EIGEN_MSA_DEBUG; + + int32_t i = *from; + Packet4i v = { i, i, i, i }; + v[1] = from[stride]; + v[2] = from[2 * stride]; + v[3] = from[3 * stride]; + return v; +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, + Index stride) { + EIGEN_MSA_DEBUG; + + *to = from[0]; + to += stride; + *to = from[1]; + to += stride; + *to = from[2]; + to += stride; + *to = from[3]; +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(int32_t* to, const Packet4i& from, + Index stride) { + EIGEN_MSA_DEBUG; + + *to = from[0]; + to += stride; + *to = from[1]; + to += stride; + *to = from[2]; + to += stride; + *to = from[3]; +} + +template <> +EIGEN_STRONG_INLINE void prefetch(const float* addr) { + EIGEN_MSA_DEBUG; + + __builtin_prefetch(addr); +} + +template <> +EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { + EIGEN_MSA_DEBUG; + + 
__builtin_prefetch(addr); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return a[0]; +} + +template <> +EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + return a[0]; +} + +template <> +EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0)); +} + +template <> +EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0)); +} + +template <> +EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + Packet4i zero = __builtin_msa_ldi_w(0); + return __builtin_msa_add_a_w(zero, a); +} + +template <> +EIGEN_STRONG_INLINE float predux(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + Packet4f s = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return s[0]; +} + +template <> +EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) { + EIGEN_MSA_DEBUG; + + v4i32 tmp1, tmp2, tmp3, tmp4; + Packet4f sum; + + tmp1 = __builtin_msa_ilvr_w((v4i32)vecs[1], (v4i32)vecs[0]); + tmp2 = __builtin_msa_ilvr_w((v4i32)vecs[3], (v4i32)vecs[2]); + tmp3 = __builtin_msa_ilvl_w((v4i32)vecs[1], (v4i32)vecs[0]); + tmp4 = __builtin_msa_ilvl_w((v4i32)vecs[3], (v4i32)vecs[2]); + + sum = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1); + sum = padd(sum, (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1)); + sum = padd(sum, (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3)); + sum = padd(sum, (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3)); + + return sum; +} + +template <> +EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) { + EIGEN_MSA_DEBUG; + + v4i32 tmp1, tmp2, tmp3, tmp4; + Packet4i sum; + + tmp1 = __builtin_msa_ilvr_w((v4i32)vecs[1], (v4i32)vecs[0]); + tmp2 = __builtin_msa_ilvr_w((v4i32)vecs[3], (v4i32)vecs[2]); + tmp3 = __builtin_msa_ilvl_w((v4i32)vecs[1], (v4i32)vecs[0]); + tmp4 = __builtin_msa_ilvl_w((v4i32)vecs[3], (v4i32)vecs[2]); + + sum = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1); + sum = padd(sum, (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1)); + sum = padd(sum, (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3)); + sum = padd(sum, (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3)); + + return sum; +} + +template <> +EIGEN_STRONG_INLINE int32_t predux(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + Packet4i s = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + s = padd(s, __builtin_msa_shf_w(s, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return s[0]; +} + +// Other reduction functions: +// mul +template <> +EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + Packet4f p = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + p = pmul(p, (Packet4f)__builtin_msa_shf_w((v4i32)p, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return p[0]; +} + +template <> +EIGEN_STRONG_INLINE int32_t predux_mul(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + Packet4i p = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + p = pmul(p, __builtin_msa_shf_w(p, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return p[0]; +} + +// min +template <> +EIGEN_STRONG_INLINE float 
predux_min<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Swap 64-bit halves of a.
+  Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+#if !EIGEN_FAST_MATH
+  // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
+  // masks of all zeroes/ones in low 64 bits.
+  v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
+  // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
+  unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
+#endif
+  // Continue with min computation.
+  Packet4f v = __builtin_msa_fmin_w(a, swapped);
+  v = __builtin_msa_fmin_w(
+      v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+#if !EIGEN_FAST_MATH
+  // Based on the mask select between v and 4 qNaNs.
+  v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
+  v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
+#endif
+  return v[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4i m = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+  m = pmin(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  return m[0];
+}
+
+// max
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Swap 64-bit halves of a.
+  Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+#if !EIGEN_FAST_MATH
+  // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
+  // masks of all zeroes/ones in low 64 bits.
+  v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
+  // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
+  unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
+#endif
+  // Continue with max computation.
+  Packet4f v = __builtin_msa_fmax_w(a, swapped);
+  v = __builtin_msa_fmax_w(
+      v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+#if !EIGEN_FAST_MATH
+  // Based on the mask select between v and 4 qNaNs.
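These horizontal reductions (including the qNaN select just below) follow the usual log2(n) shuffle ladder: combine the 64-bit halves, then the remaining pair. A scalar model of the four-lane min (hypothetical demo, not part of the patch):

    // Two-step horizontal min over four lanes.
    #include <algorithm>
    #include <cassert>

    float predux_min_ref(const float v[4]) {
      float a = std::min(v[0], v[2]);  // step 1: shf_w(2,3,0,1) + fmin_w
      float b = std::min(v[1], v[3]);
      return std::min(a, b);           // step 2: shf_w(1,0,3,2) + fmin_w
    }

    int main() {
      float v[4] = { 3.0f, -1.0f, 7.0f, 0.5f };
      assert(predux_min_ref(v) == -1.0f);
    }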
+ v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000); + v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v); +#endif + return v[0]; +} + +template <> +EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) { + EIGEN_MSA_DEBUG; + + Packet4i m = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); + m = pmax(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); + return m[0]; +} + +#define PALIGN_MSA(Offset, Type, Command) \ + template <> \ + struct palign_impl { \ + EIGEN_STRONG_INLINE static void run(Type& first, const Type& second) { \ + if (Offset != 0) first = (Type)(Command((v16i8)second, (v16i8)first, Offset * 4)); \ + } \ + }; + +PALIGN_MSA(0, Packet4f, __builtin_msa_sldi_b) +PALIGN_MSA(1, Packet4f, __builtin_msa_sldi_b) +PALIGN_MSA(2, Packet4f, __builtin_msa_sldi_b) +PALIGN_MSA(3, Packet4f, __builtin_msa_sldi_b) +PALIGN_MSA(0, Packet4i, __builtin_msa_sldi_b) +PALIGN_MSA(1, Packet4i, __builtin_msa_sldi_b) +PALIGN_MSA(2, Packet4i, __builtin_msa_sldi_b) +PALIGN_MSA(3, Packet4i, __builtin_msa_sldi_b) + +#undef PALIGN_MSA + +inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { + os << "[ " << value.packet[0] << "," << std::endl + << " " << value.packet[1] << "," << std::endl + << " " << value.packet[2] << "," << std::endl + << " " << value.packet[3] << " ]"; + return os; +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + EIGEN_MSA_DEBUG; + + v4i32 tmp1, tmp2, tmp3, tmp4; + + tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]); + tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]); + tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]); + tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]); + + kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1); + kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1); + kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3); + kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3); +} + +inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { + os << "[ " << value.packet[0] << "," << std::endl + << " " << value.packet[1] << "," << std::endl + << " " << value.packet[2] << "," << std::endl + << " " << value.packet[3] << " ]"; + return os; +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + EIGEN_MSA_DEBUG; + + v4i32 tmp1, tmp2, tmp3, tmp4; + + tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]); + tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]); + tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]); + tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]); + + kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1); + kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1); + kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3); + kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3); +} + +template <> +EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fsqrt_w(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + return __builtin_msa_frsqrt_w(a); +#else + Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1)); + return pdiv(ones, psqrt(a)); +#endif +} + +template <> +EIGEN_STRONG_INLINE 
Packet4f pfloor(const Packet4f& a) { + Packet4f v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY. + "ctcmsa $1, %[new_mode]\n" + "frint.w %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { + Packet4f v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" + "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY. + "ctcmsa $1, %[new_mode]\n" + "frint.w %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { + Packet4f v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" + "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even. + "ctcmsa $1, %[new_mode]\n" + "frint.w %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, + const Packet4f& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], + ifPacket.select[3] }; + Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0); + return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket); +} + +template <> +EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, + const Packet4i& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], + ifPacket.select[3] }; + Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0); + return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket); +} + +//---------- double ---------- + +typedef v2f64 Packet2d; +typedef v2i64 Packet2l; +typedef v2u64 Packet2ul; + +#define _EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X } +#define _EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X } +#define _EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X } + +inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) { + os << "[ " << value[0] << ", " << value[1] << " ]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Packet2l& value) { + os << "[ " << value[0] << ", " << value[1] << " ]"; + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Packet2ul& value) { + os << "[ " << value[0] << ", " << value[1] << " ]"; + return os; +} + +template <> +struct packet_traits : default_packet_traits { + typedef Packet2d type; + typedef Packet2d half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + // FIXME check the Has* + HasDiv = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasBlend = 1 + }; +}; + +template <> +struct unpacket_traits { + typedef double type; + enum { size = 2, alignment = Aligned16 }; + typedef Packet2d half; +}; + +template <> 
+EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { + EIGEN_MSA_DEBUG; + + Packet2d value = { from, from }; + return value; +} + +template <> +EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fadd_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d plset(const double& a) { + EIGEN_MSA_DEBUG; + + static const Packet2d countdown = { 0.0, 1.0 }; + return padd(pset1(a), countdown); +} + +template <> +EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fsub_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fmul_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fdiv_d(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fmadd_d(c, a, b); +} + +// Logical Operations are not supported for float, so we have to reinterpret casts using MSA +// intrinsics +template <> +EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_and_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_or_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_xor_v((v16u8)a, (v16u8)b); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + + return pand(a, (Packet2d)__builtin_msa_xori_b((v16u8)b, 255)); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pload(const double* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + // This prefers numbers to NaNs. + return __builtin_msa_fmin_d(a, b); +#else + // This prefers NaNs to numbers. + v2i64 aNaN = __builtin_msa_fcun_d(a, a); + v2i64 aMinOrNaN = por(__builtin_msa_fclt_d(a, b), aNaN); + return (Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + // This prefers numbers to NaNs. + return __builtin_msa_fmax_d(a, b); +#else + // This prefers NaNs to numbers. 
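The pandnot specializations above synthesize `a & ~b` by XOR-ing every byte of b with all ones (`xori.b 255` is a byte-wise complement). The bit identity in scalar form (hypothetical demo, not part of the patch):

    // a & ~b == a & (b ^ 0xFF) for bytes.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t a = 0xCA, b = 0xA1;
      assert((a & (uint8_t)(b ^ 0xFF)) == (a & (uint8_t)~b));  // xori_b(b, 255)
    }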
+ v2i64 aNaN = __builtin_msa_fcun_d(a, a); + v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN); + return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a); +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast(from), 0); +} + +template <> +EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { + EIGEN_MSA_DEBUG; + + Packet2d value = { *from, *from }; + return value; +} + +template <> +EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { + EIGEN_MSA_DEBUG; + + EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { + EIGEN_MSA_DEBUG; + + Packet2d value; + value[0] = *from; + from += stride; + value[1] = *from; + return value; +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, + Index stride) { + EIGEN_MSA_DEBUG; + + *to = from[0]; + to += stride; + *to = from[1]; +} + +template <> +EIGEN_STRONG_INLINE void prefetch(const double* addr) { + EIGEN_MSA_DEBUG; + + __builtin_prefetch(addr); +} + +template <> +EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return a[0]; +} + +template <> +EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); +} + +template <> +EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63); +} + +template <> +EIGEN_STRONG_INLINE double predux(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + Packet2d s = padd(a, preverse(a)); + return s[0]; +} + +template <> +EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) { + EIGEN_MSA_DEBUG; + + Packet2d v0 = (Packet2d)__builtin_msa_ilvev_d((v2i64)vecs[1], (v2i64)vecs[0]); + Packet2d v1 = (Packet2d)__builtin_msa_ilvod_d((v2i64)vecs[1], (v2i64)vecs[0]); + + return padd(v0, v1); +} + +// Other reduction functions: +// mul +template <> +EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + Packet2d p = pmul(a, preverse(a)); + return p[0]; +} + +// min +template <> +EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); + Packet2d v = __builtin_msa_fmin_d(a, swapped); + return v[0]; +#else + double a0 = a[0], a1 = a[1]; + return ((numext::isnan)(a0) || a0 < a1) ? a0 : a1; +#endif +} + +// max +template <> +EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); + Packet2d v = __builtin_msa_fmax_d(a, swapped); + return v[0]; +#else + double a0 = a[0], a1 = a[1]; + return ((numext::isnan)(a0) || a0 > a1) ? 
a0 : a1; +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) { + EIGEN_MSA_DEBUG; + + return __builtin_msa_fsqrt_d(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) { + EIGEN_MSA_DEBUG; + +#if EIGEN_FAST_MATH + return __builtin_msa_frsqrt_d(a); +#else + Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1)); + return pdiv(ones, psqrt(a)); +#endif +} + +#define PALIGN_MSA(Offset, Type, Command) \ + template <> \ + struct palign_impl { \ + EIGEN_STRONG_INLINE static void run(Type& first, const Type& second) { \ + if (Offset != 0) first = (Type)(Command((v16i8)second, (v16i8)first, Offset * 8)); \ + } \ + }; + +PALIGN_MSA(0, Packet2d, __builtin_msa_sldi_b) +PALIGN_MSA(1, Packet2d, __builtin_msa_sldi_b) + +#undef PALIGN_MSA + +inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { + os << "[ " << value.packet[0] << "," << std::endl << " " << value.packet[1] << " ]"; + return os; +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + EIGEN_MSA_DEBUG; + + Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]); + Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]); + kernel.packet[0] = trn1; + kernel.packet[1] = trn2; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { + Packet2d v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY. + "ctcmsa $1, %[new_mode]\n" + "frint.d %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { + Packet2d v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" + "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY. + "ctcmsa $1, %[new_mode]\n" + "frint.d %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { + Packet2d v = a; + int32_t old_mode, new_mode; + asm volatile( + "cfcmsa %[old_mode], $1\n" + "ori %[new_mode], %[old_mode], 3\n" + "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even. 
+ "ctcmsa $1, %[new_mode]\n" + "frint.d %w[v], %w[v]\n" + "ctcmsa $1, %[old_mode]\n" + : // outputs + [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), + [v] "+f"(v) + : // inputs + : // clobbers + ); + return v; +} + +template <> +EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, + const Packet2d& elsePacket) { + Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; + Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0); + return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PACKET_MATH_MSA_H diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 57e9b431f..306a309be 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -67,7 +67,7 @@ template<> struct unpacket_traits { typedef std::complex type; template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { float32x2_t r64; - r64 = vld1_f32((float *)&from); + r64 = vld1_f32((const float *)&from); return Packet2cf(vcombine_f32(r64, r64)); } @@ -142,7 +142,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf to[stride*1] = std::complex(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3)); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((float *)addr); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((const float *)addr); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { @@ -265,6 +265,8 @@ template<> struct conj_helper } }; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) + template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for NEON @@ -275,7 +277,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, con s = vmulq_f32(b.v, b.v); rev_s = vrev64q_f32(s); - return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); + return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); } EIGEN_DEVICE_FUNC inline void @@ -381,7 +383,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((double *)addr); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((const double *)addr); } template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride) { @@ -456,6 +458,8 @@ template<> struct conj_helper } }; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) + template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for NEON diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h index 6bb05bb92..c48c61023 100644 --- a/Eigen/src/Core/arch/NEON/MathFunctions.h +++ b/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -84,6 +84,98 @@ Packet4f pexp(const Packet4f& _x) return y; } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f plog(const Packet4f& _x) +{ + Packet4f x = _x; + 
_EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); + + _EIGEN_DECLARE_CONST_Packet4i(inv_mant_mask, ~0x7f800000); + + /* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 + */ + _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); + + x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */ + Packet4ui invalid_mask = vcleq_f32(x, vdupq_n_f32(0)); + + Packet4i ux = vreinterpretq_s32_f32(x); + + Packet4i emm0 = vshrq_n_s32(ux, 23); + + /* keep only the fractional part */ + ux = vandq_s32(ux, p4i_inv_mant_mask); + ux = vorrq_s32(ux, vreinterpretq_s32_f32(p4f_half)); + x = vreinterpretq_f32_s32(ux); + + emm0 = vsubq_s32(emm0, p4i_0x7f); + Packet4f e = vcvtq_f32_s32(emm0); + + e = vaddq_f32(e, p4f_1); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + Packet4ui mask = vcltq_f32(x, p4f_cephes_SQRTHF); + Packet4f tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); + x = vsubq_f32(x, p4f_1); + e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(p4f_1), mask))); + x = vaddq_f32(x, tmp); + + Packet4f z = vmulq_f32(x,x); + + Packet4f y = p4f_cephes_log_p0; + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_log_p1); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_log_p2); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_log_p3); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_log_p4); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_log_p5); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_log_p6); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_log_p7); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_log_p8); + y = vmulq_f32(y, x); + + y = vmulq_f32(y, z); + + tmp = vmulq_f32(e, p4f_cephes_log_q1); + y = vaddq_f32(y, tmp); + + + tmp = vmulq_f32(z, p4f_half); + y = vsubq_f32(y, tmp); + + tmp = vmulq_f32(e, p4f_cephes_log_q2); + x = vaddq_f32(x, y); + x = vaddq_f32(x, tmp); + x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN + return x; +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 84a56bdcc..010739380 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -36,12 +36,43 @@ namespace internal { #endif #endif +#if EIGEN_COMP_MSVC + +// In MSVC's arm_neon.h header file, all NEON vector types +// are aliases to the same underlying type __n128. +// We thus have to wrap them to make them different C++ types. 
+// (See also bug 1428)
+
+template<typename T,int unique_id>
+struct eigen_packet_wrapper
+{
+  operator T&() { return m_val; }
+  operator const T&() const { return m_val; }
+  eigen_packet_wrapper() {}
+  eigen_packet_wrapper(const T &v) : m_val(v) {}
+  eigen_packet_wrapper& operator=(const T &v) {
+    m_val = v;
+    return *this;
+  }
+
+  T m_val;
+};
+typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
+typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
+typedef eigen_packet_wrapper<int32x4_t,2>   Packet4i;
+typedef eigen_packet_wrapper<int32x2_t,3>   Packet2i;
+typedef eigen_packet_wrapper<uint32x4_t,4>  Packet4ui;
+
+#else
+
 typedef float32x2_t Packet2f;
 typedef float32x4_t Packet4f;
 typedef int32x4_t   Packet4i;
 typedef int32x2_t   Packet2i;
 typedef uint32x4_t  Packet4ui;
+
+#endif // EIGEN_COMP_MSVC
+
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
   const Packet4f p4f_##NAME = pset1<Packet4f>(X)
@@ -51,14 +82,17 @@ typedef uint32x4_t Packet4ui;
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
   const Packet4i p4i_##NAME = pset1<Packet4i>(X)
 
-// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
-// which available on LLVM and GCC (at least)
-#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+#if EIGEN_ARCH_ARM64
+  // __builtin_prefetch tends to do nothing on ARM64 compilers because the
+  // prefetch instructions there are too detailed for __builtin_prefetch to map
+  // meaningfully to them.
+  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
+#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
   #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
 #elif defined __pld
   #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
-#elif !EIGEN_ARCH_ARM64
-  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( " pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#elif EIGEN_ARCH_ARM32
+  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
 #else
   // by default no explicit prefetching
   #define EIGEN_ARM_PREFETCH(ADDR)
@@ -78,7 +112,7 @@ template<> struct packet_traits<float> : default_packet_traits
     // FIXME check the Has*
     HasSin  = 0,
     HasCos  = 0,
-    HasLog  = 0,
+    HasLog  = 1,
     HasExp  = 1,
     HasSqrt = 0
   };
@@ -113,7 +147,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from)
 
 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
 {
-  const float32_t f[] = {0, 1, 2, 3};
+  const float f[] = {0, 1, 2, 3};
   Packet4f countdown = vld1q_f32(f);
   return vaddq_f32(pset1<Packet4f>(a), countdown);
 }
diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h
new file mode 100644
index 000000000..95d1fd0e4
--- /dev/null
+++ b/Eigen/src/Core/arch/NEON/TypeCasting.h
@@ -0,0 +1,48 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Rasmus Munk Larsen
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
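+#if 0  // Expository sketch of the eigen_packet_wrapper idea above (hypothetical
+// names, inert block): on MSVC all NEON vector typedefs name one builtin type,
+// so overloads and template specializations taking different packet types
+// would collide. Tagging a thin wrapper with an integer restores distinct
+// C++ types:
+typedef int n128_standin;              // stand-in for MSVC's __n128
+typedef n128_standin f32x4_alias;      // both aliases are the *same* type...
+typedef n128_standin s32x4_alias;
+template <typename T, int unique_id> struct tagged_wrapper { T m_val; };
+inline void demo(tagged_wrapper<f32x4_alias, 0>) {}  // ...but the tagged
+inline void demo(tagged_wrapper<s32x4_alias, 1>) {}  // wrappers are distinct
+#endif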
+ +#ifndef EIGEN_TYPE_CASTING_NEON_H +#define EIGEN_TYPE_CASTING_NEON_H + +namespace Eigen { + +namespace internal { + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + + +template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { + return vcvtq_s32_f32(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { + return vcvtq_f32_s32(a); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_NEON_H diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 5607fe0ab..d075043ce 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -128,7 +128,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3))); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { @@ -229,23 +229,7 @@ template<> struct conj_helper } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const - { return Packet2cf(Eigen::internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const - { return Packet2cf(Eigen::internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { @@ -340,7 +324,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { @@ -430,23 +414,7 @@ template<> struct conj_helper } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const - { return Packet1cd(Eigen::internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const - { return 
Packet1cd(Eigen::internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h index 7b5f948e1..4af2c6cae 100644 --- a/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -242,7 +242,7 @@ Packet2d pexp(const Packet2d& _x) return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x); } -/* evaluation of 4 sines at onces, using SSE2 intrinsics. +/* evaluation of 4 sines at once, using SSE2 intrinsics. The code is the exact rewriting of the cephes sinf function. Precision is excellent as long as x < 8192 (I did not bother to diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 03c8a2c13..2e815c0c5 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -461,10 +461,16 @@ template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); } +#if EIGEN_COMP_PGI +typedef const void * SsePrefetchPtrType; +#else +typedef const char * SsePrefetchPtrType; +#endif + #ifndef EIGEN_VECTORIZE_AVX -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } #endif #if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64 @@ -657,7 +663,7 @@ template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) // TODO try to call _mm_mul_epu32 directly EIGEN_ALIGN16 int aux[4]; pstore(aux, a); - return (aux[0] * aux[1]) * (aux[2] * aux[3]);; + return (aux[0] * aux[1]) * (aux[2] * aux[3]); } // min @@ -928,4 +934,14 @@ template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, co } // end namespace Eigen +#if EIGEN_COMP_PGI +// PGI++ does not define the following intrinsics in C++ mode. 
+static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); } +static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); } +static inline __m128d _mm_castps_pd (__m128 x) { return reinterpret_cast<__m128d&>(x); } +static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); } +static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); } +static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); } +#endif + #endif // EIGEN_PACKET_MATH_SSE_H diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h index c84893230..c6ca8c716 100644 --- a/Eigen/src/Core/arch/SSE/TypeCasting.h +++ b/Eigen/src/Core/arch/SSE/TypeCasting.h @@ -14,6 +14,7 @@ namespace Eigen { namespace internal { +#ifndef EIGEN_VECTORIZE_AVX template <> struct type_casting_traits { enum { @@ -23,11 +24,6 @@ struct type_casting_traits { }; }; -template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { - return _mm_cvttps_epi32(a); -} - - template <> struct type_casting_traits { enum { @@ -37,11 +33,6 @@ struct type_casting_traits { }; }; -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { - return _mm_cvtepi32_ps(a); -} - - template <> struct type_casting_traits { enum { @@ -51,10 +42,6 @@ struct type_casting_traits { }; }; -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet2d& a, const Packet2d& b) { - return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6)); -} - template <> struct type_casting_traits { enum { @@ -63,6 +50,19 @@ struct type_casting_traits { TgtCoeffRatio = 2 }; }; +#endif + +template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { + return _mm_cvttps_epi32(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { + return _mm_cvtepi32_ps(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet2d& a, const Packet2d& b) { + return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6)); +} template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet4f& a) { // Simply discard the second half of the input diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h new file mode 100644 index 000000000..c1da40d14 --- /dev/null +++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h @@ -0,0 +1,104 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
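+#if 0  // Expository sketch (inert block): the PGI cast shims above must
+// preserve the bit pattern (like _mm_castsi128_ps) rather than convert the
+// value (like _mm_cvtepi32_ps). A scalar analogue of the distinction:
+#include <cstring>
+inline float bit_cast_demo(int i) {
+  float f;
+  std::memcpy(&f, &i, sizeof f);  // reinterpret bits: 1 -> ~1.4e-45f (denormal)
+  return f;
+}
+inline float value_convert_demo(int i) {
+  return static_cast<float>(i);   // convert value: 1 -> 1.0f
+}
+#endif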
+ +/***************************************************************** + * InteropHeaders.h + * + * \brief: + * InteropHeaders + * +*****************************************************************/ + +#ifndef EIGEN_INTEROP_HEADERS_SYCL_H +#define EIGEN_INTEROP_HEADERS_SYCL_H +#if defined EIGEN_USE_SYCL +namespace Eigen { + +namespace internal { +#define SYCL_PACKET_TRAITS(packet_type, val, unpacket_type, lengths)\ + template<> struct packet_traits : default_packet_traits\ + {\ + typedef packet_type type;\ + typedef packet_type half;\ + enum {\ + Vectorizable = 1,\ + AlignedOnScalar = 1,\ + size=lengths,\ + HasHalfPacket = 0,\ + HasDiv = 1,\ + HasLog = 1,\ + HasExp = 1,\ + HasSqrt = 1,\ + HasRsqrt = 1,\ + HasSin = 1,\ + HasCos = 1,\ + HasTan = 1,\ + HasASin = 1,\ + HasACos = 1,\ + HasATan = 1,\ + HasSinh = 1,\ + HasCosh = 1,\ + HasTanh = 1,\ + HasLGamma = 0,\ + HasDiGamma = 0,\ + HasZeta = 0,\ + HasPolygamma = 0,\ + HasErf = 0,\ + HasErfc = 0,\ + HasIGamma = 0,\ + HasIGammac = 0,\ + HasBetaInc = 0,\ + HasBlend = val,\ + HasMax=1,\ + HasMin=1,\ + HasMul=1,\ + HasAdd=1,\ + HasFloor=1,\ + HasRound=1,\ + HasLog1p=1,\ + HasExpm1=1,\ + HasCeil=1,\ + };\ + }; + +SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4) +SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4) +SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2) +SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2) +#undef SYCL_PACKET_TRAITS + + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) +#define SYCL_ARITHMETIC(packet_type) template<> struct is_arithmetic { enum { value = true }; }; +SYCL_ARITHMETIC(cl::sycl::cl_float4) +SYCL_ARITHMETIC(cl::sycl::cl_double2) +#undef SYCL_ARITHMETIC + +#define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)\ +template<> struct unpacket_traits {\ + typedef unpacket_type type;\ + enum {size=lengths, alignment=Aligned16};\ + typedef packet_type half;\ +}; +SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4) +SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2) + +#undef SYCL_UNPACKET_TRAITS + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_USE_SYCL +#endif // EIGEN_INTEROP_HEADERS_SYCL_H diff --git a/Eigen/src/Core/arch/SYCL/MathFunctions.h b/Eigen/src/Core/arch/SYCL/MathFunctions.h new file mode 100644 index 000000000..422839c6c --- /dev/null +++ b/Eigen/src/Core/arch/SYCL/MathFunctions.h @@ -0,0 +1,221 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * MathFunctions.h + * + * \brief: + * MathFunctions + * +*****************************************************************/ + +#ifndef EIGEN_MATH_FUNCTIONS_SYCL_H +#define EIGEN_MATH_FUNCTIONS_SYCL_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
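+#if 0  // Expository sketch (inert block): the SYCL_* macros defined just below
+// stamp out one packet-math specialization per SYCL vector type, each one
+// forwarding to the corresponding cl::sycl built-in. For example,
+// SYCL_PLOG(cl::sycl::cl_float4) expands (modulo whitespace) to:
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+cl::sycl::cl_float4 plog<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+  return cl::sycl::log(a);  // cl::sycl::log applies elementwise to the vector
+}
+#endif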
+//#if defined(__SYCL_DEVICE_ONLY__) && defined(EIGEN_USE_SYCL)
+#define SYCL_PLOG(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type plog<packet_type>(const packet_type& a) { return cl::sycl::log(a); }
+
+SYCL_PLOG(cl::sycl::cl_float4)
+SYCL_PLOG(cl::sycl::cl_double2)
+#undef SYCL_PLOG
+
+#define SYCL_PLOG1P(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type plog1p<packet_type>(const packet_type& a) { return cl::sycl::log1p(a); }
+
+SYCL_PLOG1P(cl::sycl::cl_float4)
+SYCL_PLOG1P(cl::sycl::cl_double2)
+#undef SYCL_PLOG1P
+
+#define SYCL_PLOG10(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type plog10<packet_type>(const packet_type& a) { return cl::sycl::log10(a); }
+
+SYCL_PLOG10(cl::sycl::cl_float4)
+SYCL_PLOG10(cl::sycl::cl_double2)
+#undef SYCL_PLOG10
+
+#define SYCL_PEXP(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pexp<packet_type>(const packet_type& a) { return cl::sycl::exp(a); }
+
+SYCL_PEXP(cl::sycl::cl_float4)
+SYCL_PEXP(cl::sycl::cl_double2)
+#undef SYCL_PEXP
+
+#define SYCL_PEXPM1(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pexpm1<packet_type>(const packet_type& a) { return cl::sycl::expm1(a); }
+
+SYCL_PEXPM1(cl::sycl::cl_float4)
+SYCL_PEXPM1(cl::sycl::cl_double2)
+#undef SYCL_PEXPM1
+
+#define SYCL_PSQRT(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type psqrt<packet_type>(const packet_type& a) { return cl::sycl::sqrt(a); }
+
+SYCL_PSQRT(cl::sycl::cl_float4)
+SYCL_PSQRT(cl::sycl::cl_double2)
+#undef SYCL_PSQRT
+
+
+#define SYCL_PRSQRT(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type prsqrt<packet_type>(const packet_type& a) { return cl::sycl::rsqrt(a); }
+
+SYCL_PRSQRT(cl::sycl::cl_float4)
+SYCL_PRSQRT(cl::sycl::cl_double2)
+#undef SYCL_PRSQRT
+
+
+/** \internal \returns the sine of \a a (coeff-wise) */
+#define SYCL_PSIN(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type psin<packet_type>(const packet_type& a) { return cl::sycl::sin(a); }
+
+SYCL_PSIN(cl::sycl::cl_float4)
+SYCL_PSIN(cl::sycl::cl_double2)
+#undef SYCL_PSIN
+
+
+/** \internal \returns the cosine of \a a (coeff-wise) */
+#define SYCL_PCOS(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pcos<packet_type>(const packet_type& a) { return cl::sycl::cos(a); }
+
+SYCL_PCOS(cl::sycl::cl_float4)
+SYCL_PCOS(cl::sycl::cl_double2)
+#undef SYCL_PCOS
+
+/** \internal \returns the tangent of \a a (coeff-wise) */
+#define SYCL_PTAN(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type ptan<packet_type>(const packet_type& a) { return cl::sycl::tan(a); }
+
+SYCL_PTAN(cl::sycl::cl_float4)
+SYCL_PTAN(cl::sycl::cl_double2)
+#undef SYCL_PTAN
+
+/** \internal \returns the arc sine of \a a (coeff-wise) */
+#define SYCL_PASIN(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pasin<packet_type>(const packet_type& a) { return cl::sycl::asin(a); }
+
+SYCL_PASIN(cl::sycl::cl_float4)
+SYCL_PASIN(cl::sycl::cl_double2)
+#undef SYCL_PASIN
+
+
+/** \internal \returns the arc cosine of \a a (coeff-wise) */
+#define SYCL_PACOS(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pacos<packet_type>(const packet_type& a) { return cl::sycl::acos(a); }
+
+SYCL_PACOS(cl::sycl::cl_float4)
+SYCL_PACOS(cl::sycl::cl_double2)
+#undef SYCL_PACOS
+
+/** \internal \returns the arc tangent of \a a (coeff-wise) */
+#define SYCL_PATAN(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type patan<packet_type>(const packet_type& a) { return cl::sycl::atan(a); }
+
+SYCL_PATAN(cl::sycl::cl_float4)
+SYCL_PATAN(cl::sycl::cl_double2)
+#undef SYCL_PATAN
+
+/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
+#define SYCL_PSINH(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type psinh<packet_type>(const packet_type& a) { return cl::sycl::sinh(a); }
+
+SYCL_PSINH(cl::sycl::cl_float4)
+SYCL_PSINH(cl::sycl::cl_double2)
+#undef SYCL_PSINH
+
+/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
+#define SYCL_PCOSH(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pcosh<packet_type>(const packet_type& a) { return cl::sycl::cosh(a); }
+
+SYCL_PCOSH(cl::sycl::cl_float4)
+SYCL_PCOSH(cl::sycl::cl_double2)
+#undef SYCL_PCOSH
+
+/** \internal \returns the hyperbolic tangent of \a a (coeff-wise) */
+#define SYCL_PTANH(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type ptanh<packet_type>(const packet_type& a) { return cl::sycl::tanh(a); }
+
+SYCL_PTANH(cl::sycl::cl_float4)
+SYCL_PTANH(cl::sycl::cl_double2)
+#undef SYCL_PTANH
+
+#define SYCL_PCEIL(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pceil<packet_type>(const packet_type& a) { return cl::sycl::ceil(a); }
+
+SYCL_PCEIL(cl::sycl::cl_float4)
+SYCL_PCEIL(cl::sycl::cl_double2)
+#undef SYCL_PCEIL
+
+
+#define SYCL_PROUND(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pround<packet_type>(const packet_type& a) { return cl::sycl::round(a); }
+
+SYCL_PROUND(cl::sycl::cl_float4)
+SYCL_PROUND(cl::sycl::cl_double2)
+#undef SYCL_PROUND
+
+#define SYCL_FLOOR(packet_type) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pfloor<packet_type>(const packet_type& a) { return cl::sycl::floor(a); }
+
+SYCL_FLOOR(cl::sycl::cl_float4)
+SYCL_FLOOR(cl::sycl::cl_double2)
+#undef SYCL_FLOOR
+
+
+#define SYCL_PMIN(packet_type, expr) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pmin<packet_type>(const packet_type& a, const packet_type& b) { return expr; }
+
+SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b))
+SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
+#undef SYCL_PMIN
+
+#define SYCL_PMAX(packet_type, expr) \
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+packet_type pmax<packet_type>(const packet_type& a, const packet_type& b) { return expr; }
+
+SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b))
+SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
+#undef SYCL_PMAX
+
+//#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATH_FUNCTIONS_SYCL_H
diff --git a/Eigen/src/Core/arch/SYCL/PacketMath.h b/Eigen/src/Core/arch/SYCL/PacketMath.h
new file mode 100644
index 000000000..820a83311
--- /dev/null
+++ b/Eigen/src/Core/arch/SYCL/PacketMath.h
@@ -0,0 +1,458 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact:
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
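+#if 0  // Expository sketch (hypothetical helper, inert block): generic Eigen
+// kernels are written once against the packet primitives, so a backend such as
+// this file only needs to supply pset1/ploadu/pmadd/pstoreu and friends. A
+// minimal axpy-style loop over any packet type P might look like:
+template <typename Scalar, typename P>
+void axpy_packets(Scalar a, const Scalar* x, Scalar* y, int n) {
+  const int step = Eigen::internal::unpacket_traits<P>::size;
+  const P pa = Eigen::internal::pset1<P>(a);
+  int i = 0;
+  for (; i + step <= n; i += step) {
+    // y[i..i+step) = a * x[i..i+step) + y[i..i+step)
+    P py = Eigen::internal::pmadd(pa, Eigen::internal::ploadu<P>(x + i),
+                                  Eigen::internal::ploadu<P>(y + i));
+    Eigen::internal::pstoreu(y + i, py);
+  }
+  for (; i < n; ++i) y[i] = a * x[i] + y[i];  // scalar tail
+}
+#endif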
+ +/***************************************************************** + * PacketMath.h + * + * \brief: + * PacketMath + * +*****************************************************************/ + +#ifndef EIGEN_PACKET_MATH_SYCL_H +#define EIGEN_PACKET_MATH_SYCL_H +#include +#if defined EIGEN_USE_SYCL +namespace Eigen { + +namespace internal { + +#define SYCL_PLOADT_RO(address_space_target)\ +template\ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\ + ploadt_ro(typename cl::sycl::multi_ptr::type,\ + cl::sycl::access::address_space::address_space_target>::pointer_t from) {\ + typedef typename unpacket_traits::type scalar;\ + typedef cl::sycl::multi_ptr multi_ptr;\ + auto res=packet_type(static_cast::type>(0));\ + res.load(0, multi_ptr(const_cast(from)));\ + return res;\ +} + +SYCL_PLOADT_RO(global_space) +SYCL_PLOADT_RO(local_space) + +#undef SYCL_PLOADT_RO + + +#define SYCL_PLOAD(address_space_target, Alignment, AlignedType)\ +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\ + pload##AlignedType(typename cl::sycl::multi_ptr::type,\ + cl::sycl::access::address_space::address_space_target>::pointer_t from) {\ + return ploadt_ro(from);\ + } + +// global space +SYCL_PLOAD(global_space, Unaligned, u) +SYCL_PLOAD(global_space, Aligned, ) + +// local space +SYCL_PLOAD(local_space, Unaligned, u) +SYCL_PLOAD(local_space, Aligned, ) + +// private space +//SYCL_PLOAD(private_space, Unaligned, u) +//SYCL_PLOAD(private_space, Aligned, ) + +#undef SYCL_PLOAD + + +/** \internal \returns a packet version of \a *from. + * The pointer \a from must be aligned on a \a Alignment bytes boundary. */ +#define SYCL_PLOADT(address_space_target)\ +template\ +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt(\ + typename cl::sycl::multi_ptr::type,\ + cl::sycl::access::address_space::address_space_target>::pointer_t from)\ +{\ + if(Alignment >= unpacket_traits::alignment)\ + return pload(from);\ + else\ + return ploadu(from);\ +} + +// global space +SYCL_PLOADT(global_space) +// local space +SYCL_PLOADT(local_space) + +//private_space +// There is no need to specialise it for private space as it can use the GenericPacketMath version + +#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment)\ + template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\ + ploadt_ro(const typename unpacket_traits::type * from) { \ + typedef typename unpacket_traits::type scalar;\ + auto res=packet_type(static_cast(0));\ + res. template load(0, const_cast(from));\ + return res;\ + } + +SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned) +SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned) +SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned) +SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned) + + +#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type)\ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\ + pload##alignment_type(const typename unpacket_traits::type * from) { \ + typedef typename unpacket_traits::type scalar;\ + auto res=packet_type(static_cast(0));\ + res. 
template load(0, const_cast(from));\ + return res;\ + } +SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4,) +SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2,) +SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u) +SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u) + +#undef SYCL_PLOAD_SPECIAL + +#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment)\ +template<>\ + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \ + typename cl::sycl::multi_ptr::pointer_t to, \ + const packet_type& from) {\ + typedef cl::sycl::multi_ptr multi_ptr;\ + from.store(0, multi_ptr(to));\ +} + +// global space +SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, ) +SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u) +SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, ) +SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u) + +SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, ) +SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u) +SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, ) +SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u) + +SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, ) +SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u) +SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, ) +SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u) + + +#define SYCL_PSTORE_T(scalar, packet_type, Alignment)\ +template<>\ +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(\ + scalar* to,\ + const packet_type& from) {\ + if(Alignment)\ + pstore(to, from);\ + else\ + pstoreu(to,from);\ +} + + +SYCL_PSTORE_T(float, cl::sycl::cl_float4, Aligned) + +SYCL_PSTORE_T(float, cl::sycl::cl_float4, Unaligned) + +SYCL_PSTORE_T(double, cl::sycl::cl_double2, Aligned) + +SYCL_PSTORE_T(double, cl::sycl::cl_double2, Unaligned) + + +#undef SYCL_PSTORE_T + +#define SYCL_PSET1(packet_type)\ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1(\ + const typename unpacket_traits::type& from) {\ + return packet_type(from);\ +} + +// global space +SYCL_PSET1(cl::sycl::cl_float4) +SYCL_PSET1(cl::sycl::cl_double2) + +#undef SYCL_PSET1 + + +template struct get_base_packet { +template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_ploaddup(sycl_multi_pointer ) {} + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_pgather(sycl_multi_pointer , Index ) {} +}; + +template <> struct get_base_packet { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup(sycl_multi_pointer from) { + return cl::sycl::cl_float4(from[0], from[0], from[1], from[1]); + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_pgather(sycl_multi_pointer from, Index stride) { + return cl::sycl::cl_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to , const cl::sycl::cl_float4& from, Index stride) { + auto tmp = stride; + to[0] = from.x(); + to[tmp] = from.y(); + to[tmp += stride] = from.z(); + to[tmp += stride] = from.w(); + } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 set_plset(const float& a) { + return cl::sycl::cl_float4(static_cast(a), static_cast(a+1), static_cast(a+2), static_cast(a+3)); + } +}; + +template <> struct get_base_packet { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_ploaddup(const sycl_multi_pointer from) { + return cl::sycl::cl_double2(from[0], from[0]); + } + + template + static EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_pgather(const sycl_multi_pointer from, Index stride) { + return cl::sycl::cl_double2(from[0*stride], from[1*stride]); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to , const cl::sycl::cl_double2& from, Index stride) { + to[0] = from.x(); + to[stride] = from.y(); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 set_plset(const double& a) { + return cl::sycl::cl_double2(static_cast(a), static_cast(a + 1)); + } +}; + +#define SYCL_PLOAD_DUP(address_space_target)\ +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \ +ploaddup(typename cl::sycl::multi_ptr::type,\ + cl::sycl::access::address_space::address_space_target>::pointer_t from)\ +{\ + return get_base_packet::get_ploaddup(from); \ +} + +// global space +SYCL_PLOAD_DUP(global_space) +// local_space +SYCL_PLOAD_DUP(local_space) +// private_space +//SYCL_PLOAD_DUP(private_space) +#undef SYCL_PLOAD_DUP + +#define SYCL_PLOAD_DUP_SPECILIZE(packet_type)\ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \ +ploaddup(const typename unpacket_traits::type * from)\ +{ \ + return get_base_packet::get_ploaddup(from); \ +} + +SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4) +SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2) + +#undef SYCL_PLOAD_DUP_SPECILIZE + +#define SYCL_PLSET(packet_type)\ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type plset(const typename unpacket_traits::type& a) {\ + return get_base_packet::set_plset(a);\ +} + +SYCL_PLSET(cl::sycl::cl_float4) +SYCL_PLSET(cl::sycl::cl_double2) + +#undef SYCL_PLSET + + +#define SYCL_PGATHER(address_space_target)\ +template EIGEN_DEVICE_FUNC inline packet_type pgather(\ + typename cl::sycl::multi_ptr::type,\ + cl::sycl::access::address_space::address_space_target>::pointer_t from, Index stride) {\ + return get_base_packet::get_pgather(from, stride); \ +} + +// global space +SYCL_PGATHER(global_space) +// local space +SYCL_PGATHER(local_space) +// private space +//SYCL_PGATHER(private_space) + +#undef SYCL_PGATHER + + +#define SYCL_PGATHER_SPECILIZE(scalar, packet_type)\ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \ +pgather(const typename unpacket_traits::type * from, Index stride)\ +{ \ + return get_base_packet::get_pgather(from, stride); \ +} + +SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4) +SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2) + +#undef SYCL_PGATHER_SPECILIZE + +#define SYCL_PSCATTER(address_space_target)\ +template EIGEN_DEVICE_FUNC inline void pscatter(\ + typename cl::sycl::multi_ptr::type,\ + cl::sycl::access::address_space::address_space_target>::pointer_t to,\ + const packet_type& from, Index stride) {\ + get_base_packet::set_pscatter(to, from, stride);\ +} + +// global space +SYCL_PSCATTER(global_space) +// local space +SYCL_PSCATTER(local_space) +// private space +//SYCL_PSCATTER(private_space) + +#undef SYCL_PSCATTER + + + +#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type)\ +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void \ +pscatter(typename unpacket_traits::type * to, const packet_type& from, Index stride)\ +{ \ + get_base_packet::set_pscatter(to, from, stride);\ +} + +SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4) +SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2) + +#undef SYCL_PSCATTER_SPECILIZE + + +#define SYCL_PMAD(packet_type)\ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pmadd( const packet_type& a,\ + const packet_type& b, const packet_type& c){\ + 
return cl::sycl::mad(a,b,c);\ +} + +SYCL_PMAD(cl::sycl::cl_float4) +SYCL_PMAD(cl::sycl::cl_double2) +#undef SYCL_PMAD + + + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst(const cl::sycl::cl_float4& a) { + return a.x(); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst(const cl::sycl::cl_double2& a) { + return a.x(); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux(const cl::sycl::cl_float4& a) { + return a.x() + a.y() + a.z() + a.w(); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux(const cl::sycl::cl_double2& a) { + return a.x() + a.y(); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max(const cl::sycl::cl_float4& a) { + return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()), cl::sycl::fmax(a.z(), a.w())); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max(const cl::sycl::cl_double2& a) { + return cl::sycl::fmax(a.x(), a.y()); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min(const cl::sycl::cl_float4& a) { + return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()), cl::sycl::fmin(a.z(), a.w())); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min(const cl::sycl::cl_double2& a) { + return cl::sycl::fmin(a.x(), a.y()); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul(const cl::sycl::cl_float4& a) { + return a.x() * a.y() * a.z() * a.w(); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul(const cl::sycl::cl_double2& a) { + return a.x() * a.y(); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pabs(const cl::sycl::cl_float4& a) { + return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()), cl::sycl::fabs(a.z()), cl::sycl::fabs(a.w())); +} +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 pabs(const cl::sycl::cl_double2& a) { + return cl::sycl::cl_double2(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y())); +} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void +ptranspose(PacketBlock& kernel) { + float tmp = kernel.packet[0].y(); + kernel.packet[0].y() = kernel.packet[1].x(); + kernel.packet[1].x() = tmp; +// std::swap(kernel.packet[0].y(), kernel.packet[1].x()); + + tmp = kernel.packet[0].z(); + kernel.packet[0].z() = kernel.packet[2].x(); + kernel.packet[2].x() = tmp; + //std::swap(kernel.packet[0].z(), kernel.packet[2].x()); + + tmp = kernel.packet[0].w(); + kernel.packet[0].w() = kernel.packet[3].x(); + kernel.packet[3].x() = tmp; + + //std::swap(kernel.packet[0].w(), kernel.packet[3].x()); + + tmp = kernel.packet[1].z(); + kernel.packet[1].z() = kernel.packet[2].y(); + kernel.packet[2].y() = tmp; +// std::swap(kernel.packet[1].z(), kernel.packet[2].y()); + + tmp = kernel.packet[1].w(); + kernel.packet[1].w() = kernel.packet[3].y(); + kernel.packet[3].y() = tmp; +// std::swap(kernel.packet[1].w(), kernel.packet[3].y()); + + tmp = kernel.packet[2].w(); + kernel.packet[2].w() = kernel.packet[3].z(); + kernel.packet[3].z() = tmp; +// std::swap(kernel.packet[2].w(), kernel.packet[3].z()); + +} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void +ptranspose(PacketBlock& kernel) { + double tmp = kernel.packet[0].y(); + kernel.packet[0].y() = kernel.packet[1].x(); + kernel.packet[1].x() = tmp; +//std::swap(kernel.packet[0].y(), kernel.packet[1].x()); +} + + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 +pblend(const Selector::size>& ifPacket, + const cl::sycl::cl_float4& thenPacket, const cl::sycl::cl_float4& elsePacket) { + cl::sycl::cl_int4 
condition(ifPacket.select[0] ? 0 : -1, + ifPacket.select[1] ? 0 : -1, + ifPacket.select[2] ? 0 : -1, + ifPacket.select[3] ? 0 : -1); + return cl::sycl::select(thenPacket, elsePacket, condition); +} + +template<> inline cl::sycl::cl_double2 +pblend(const Selector::size>& ifPacket, + const cl::sycl::cl_double2& thenPacket, const cl::sycl::cl_double2& elsePacket) { + cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1, + ifPacket.select[1] ? 0 : -1); + return cl::sycl::select(thenPacket, elsePacket, condition); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_USE_SYCL +#endif // EIGEN_PACKET_MATH_SYCL_H diff --git a/Eigen/src/Core/arch/SYCL/TypeCasting.h b/Eigen/src/Core/arch/SYCL/TypeCasting.h new file mode 100644 index 000000000..dedd5c84a --- /dev/null +++ b/Eigen/src/Core/arch/SYCL/TypeCasting.h @@ -0,0 +1,89 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TypeCasting.h + * + * \brief: + * TypeCasting + * +*****************************************************************/ + +#ifndef EIGEN_TYPE_CASTING_SYCL_H +#define EIGEN_TYPE_CASTING_SYCL_H + +namespace Eigen { + +namespace internal { +#ifdef __SYCL_DEVICE_ONLY__ +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_int4 pcast(const cl::sycl::cl_float4& a) { + return a. template convert(); +} + + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pcast(const cl::sycl::cl_int4& a) { + return a. template convert(); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 2, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pcast(const cl::sycl::cl_double2& a, const cl::sycl::cl_double2& b) { + auto a1=a. template convert(); + auto b1=b. 
template convert(); + return cl::sycl::float4(a1.x(), a1.y(), b1.x(), b1.y()); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 2 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 pcast(const cl::sycl::cl_float4& a) { + // Simply discard the second half of the input + return cl::sycl::cl_double2(a.x(), a.y()); +} + +#endif +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_SYCL_H diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index d39d2d105..95aba428f 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -15,6 +15,10 @@ namespace Eigen { namespace internal { +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +static Packet4ui p4ui_CONJ_XOR = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; //vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO); +#endif + static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 }; @@ -29,10 +33,14 @@ struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) union { Packet4f v; Packet1cd cd[2]; }; +#else + Packet4f v; +#endif }; template<> struct packet_traits > : default_packet_traits @@ -89,63 +97,27 @@ template<> struct unpacket_traits { typedef std::complex type /* Forward declaration */ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel); -template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } +/* complex first */ template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { /* here we really have to use unaligned loads :( */ return ploadu(&from); } -template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) -{ - Packet2cf res; - res.cd[0] = Packet1cd(vec_ld2f((const float *)&from)); - res.cd[1] = res.cd[0]; - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) -{ - std::complex EIGEN_ALIGN16 af[2]; - af[0] = from[0*stride]; - af[1] = 
from[1*stride]; - return pload(af); -} template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride EIGEN_UNUSED) { return pload(from); } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) -{ - std::complex EIGEN_ALIGN16 af[2]; - pstore >((std::complex *) af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; -} template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride EIGEN_UNUSED) { pstore >(to, from); } - -template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); } -template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); } template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } -template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); } -template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) -{ - Packet2cf res; - res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[0]))).v; - res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[1]))).v; - return res; -} - template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { Packet2d a_re, a_im, v1, v2; @@ -163,27 +135,12 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con return Packet1cd(v1 + v2); } -template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) -{ - Packet2cf res; - res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[0])), Packet1cd(reinterpret_cast(b.v.v4f[0]))).v; - res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[1])), Packet1cd(reinterpret_cast(b.v.v4f[1]))).v; - return res; -} - -template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } -template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot(a.v,b.v)); } - +template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); 
} +template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } -template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) @@ -193,61 +150,20 @@ template<> EIGEN_STRONG_INLINE std::complex pfirst(const Pac return res; } -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) -{ - std::complex EIGEN_ALIGN16 res[2]; - pstore >(res, a); - - return res[0]; -} template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) -{ - Packet2cf res; - res.cd[0] = a.cd[1]; - res.cd[1] = a.cd[0]; - return res; -} - template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { return pfirst(a); } -template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) -{ - std::complex res; - Packet1cd b = padd(a.cd[0], a.cd[1]); - vec_st2f(b.v, (float*)&res); - return res; -} - template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) { return vecs[0]; } -template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) -{ - PacketBlock transpose; - transpose.packet[0] = vecs[0]; - transpose.packet[1] = vecs[1]; - ptranspose(transpose); - - return padd(transpose.packet[0], transpose.packet[1]); -} - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) -{ - std::complex res; - Packet1cd b = pmul(a.cd[0], a.cd[1]); - vec_st2f(b.v, (float*)&res); - return res; -} - template struct palign_impl { @@ -258,18 +174,6 @@ struct palign_impl } }; -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset == 1) { - first.cd[0] = first.cd[1]; - first.cd[1] = second.cd[0]; - } - } -}; - template<> struct conj_helper { EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const @@ -303,6 +207,156 @@ template<> struct conj_helper } }; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) + +template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) +{ + // TODO optimize it for AltiVec + Packet1cd res = conj_helper().pmul(a,b); + Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); + return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); +} + +EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) +{ + return Packet1cd(preverse(Packet2d(x.v))); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} + +/* complex follows */ +template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const 
float*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } + +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) +{ + std::complex EIGEN_ALIGN16 res[2]; + pstore >(res, a); + + return res[0]; +} + + +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +{ + Packet2cf res; + res.cd[0] = Packet1cd(vec_ld2f((const float *)&from)); + res.cd[1] = res.cd[0]; + return res; +} +#else +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +{ + Packet2cf res; + if((std::ptrdiff_t(&from) % 16) == 0) + res.v = pload((const float *)&from); + else + res.v = ploadu((const float *)&from); + res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI); + return res; +} +#endif + +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload(af); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + pstore >((std::complex *) af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} + +template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); } + +template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } + +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } + + +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) +template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) +{ + Packet2cf res; + res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[0]))).v; + res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[1]))).v; + return res; +} + +template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) +{ + Packet2cf res; + res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[0])), Packet1cd(reinterpret_cast(b.v.v4f[0]))).v; + res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[1])), Packet1cd(reinterpret_cast(b.v.v4f[1]))).v; + return res; +} + +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) +{ + Packet2cf res; + res.cd[0] = a.cd[1]; + res.cd[1] = a.cd[0]; + return res; +} + 
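// ---------------------------------------------------------------------------
// Editorial sketch (illustration only, not part of the patch): on z13 a
// Packet2cf carries its two std::complex<float> lanes as two Packet1cd
// (double-precision) halves, which is why pconj and pmul above simply forward
// to the Packet1cd kernels per half. Lane for lane this is equivalent to the
// following scalar code; `emulated_lane_mul` is a hypothetical name used here
// purely for exposition.
//
//   #include <complex>
//   static inline std::complex<float>
//   emulated_lane_mul(std::complex<float> a, std::complex<float> b)
//   {
//     // widen one lane to double precision, multiply, narrow back to float,
//     // mirroring what the v4f[0]/v4f[1] halves do in the vector path
//     std::complex<double> wp = std::complex<double>(a) * std::complex<double>(b);
//     return std::complex<float>(float(wp.real()), float(wp.imag()));
//   }
// ---------------------------------------------------------------------------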
+template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) +{ + std::complex res; + Packet1cd b = padd(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) +{ + PacketBlock transpose; + transpose.packet[0] = vecs[0]; + transpose.packet[1] = vecs[1]; + ptranspose(transpose); + + return padd(transpose.packet[0], transpose.packet[1]); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) +{ + std::complex res; + Packet1cd b = pmul(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) + { + if (Offset == 1) { + first.cd[0] = first.cd[1]; + first.cd[1] = second.cd[0]; + } + } +}; + template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const @@ -336,13 +390,7 @@ template<> struct conj_helper } }; -template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) -{ - // TODO optimize it for AltiVec - Packet1cd res = conj_helper().pmul(a,b); - Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); - return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); -} +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { @@ -353,11 +401,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, con return res; } -EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) -{ - return Packet1cd(preverse(Packet2d(x.v))); -} - EIGEN_STRONG_INLINE Packet2cf pcplxflip/**/(const Packet2cf& x) { Packet2cf res; @@ -366,13 +409,6 @@ EIGEN_STRONG_INLINE Packet2cf pcplxflip/**/(const Packet2cf& x) return res; } -EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); - kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); - kernel.packet[0].v = tmp; -} - EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { Packet1cd tmp = kernel.packet[0].cd[1]; @@ -386,6 +422,139 @@ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, con result.v = pblend(ifPacket4, thenPacket.v, elsePacket.v); return result; } +#else +template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor(a.v, reinterpret_cast(p4ui_CONJ_XOR))); } +template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) +{ + Packet4f a_re, a_im, prod, prod_im; + + // Permute and multiply the real parts of a and b + a_re = vec_perm(a.v, a.v, p16uc_PSET32_WODD); + + // Get the imaginary parts of a + a_im = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); + + // multiply a_im * b and get the conjugate result + prod_im = a_im * b.v; + prod_im = pxor(prod_im, reinterpret_cast(p4ui_CONJ_XOR)); + // permute back to a proper order + prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV); + + // multiply a_re * b, add prod_im + prod = pmadd(a_re, b.v, prod_im); + + return Packet2cf(prod); +} + +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) +{ + Packet4f rev_a; + rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2); + return Packet2cf(rev_a); +} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) +{ + Packet4f b; + b = vec_sld(a.v, a.v, 8); + b = padd(a.v, b); + return 
pfirst(Packet2cf(b)); +} + +template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) +{ + Packet4f b1, b2; + b1 = vec_sld(vecs[0].v, vecs[1].v, 8); + b2 = vec_sld(vecs[1].v, vecs[0].v, 8); + b2 = vec_sld(b2, b2, 8); + b2 = padd(b1, b2); + + return Packet2cf(b2); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) +{ + Packet4f b; + Packet2cf prod; + b = vec_sld(a.v, a.v, 8); + prod = pmul(a, Packet2cf(b)); + + return pfirst(prod); +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) + { + if (Offset==1) + { + first.v = vec_sld(first.v, second.v, 8); + } + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) + +template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) +{ + // TODO optimize it for AltiVec + Packet2cf res = conj_helper().pmul(a, b); + Packet4f s = pmul(b.v, b.v); + return Packet2cf(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); +} + +template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& x) +{ + return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV)); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} + +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + Packet2cf result; + result.v = reinterpret_cast(pblend(ifPacket, reinterpret_cast(thenPacket.v), reinterpret_cast(elsePacket.v))); + return result; +} +#endif } // end namespace internal diff --git a/Eigen/src/Core/arch/ZVector/MathFunctions.h b/Eigen/src/Core/arch/ZVector/MathFunctions.h index 5c7aa7256..ff33a975f 100644 --- a/Eigen/src/Core/arch/ZVector/MathFunctions.h +++ b/Eigen/src/Core/arch/ZVector/MathFunctions.h @@ -20,6 +20,50 @@ namespace Eigen { namespace internal { +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); +static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); +static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); +static _EIGEN_DECLARE_CONST_Packet4i(23, 23); + +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); + +/* the smallest non denormalized float number */ +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f +static 
_EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff); + +/* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 +*/ +static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); + +static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); +static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); + +static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); +#endif + static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); @@ -93,40 +137,91 @@ Packet2d pexp(const Packet2d& _x) } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pexp(const Packet4f& x) +Packet4f pexp(const Packet4f& _x) { +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +/* + Packet4f x = _x; + + Packet4f tmp, fx; + Packet4i emm0; + + // clamp x + x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); + + // express exp(x) as exp(g + n*log(2)) + fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); + + fx = pfloor(fx); + + tmp = pmul(fx, p4f_cephes_exp_C1); + Packet4f z = pmul(fx, p4f_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + z = pmul(x,x); + + Packet4f y = p4f_cephes_exp_p0; + y = pmadd(y, x, p4f_cephes_exp_p1); + y = pmadd(y, x, p4f_cephes_exp_p2); + y = pmadd(y, x, p4f_cephes_exp_p3); + y = pmadd(y, x, p4f_cephes_exp_p4); + y = pmadd(y, x, p4f_cephes_exp_p5); + y = pmadd(y, z, x); + y = padd(y, p4f_1); + + // build 2^n + emm0 = vec_cts(fx, 0); + emm0 = emm0 + p4i_0x7f; + emm0 = emm0 << reinterpret_cast(p4i_23); + + // Altivec's max & min operators just drop silent NaNs. Check NaNs in + // inputs and return them unmodified. 
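  // (Editorial note: this entire vectorized z14 path is commented out in the
  //  patch -- control falls through to `return _x;` below, leaving pexp as the
  //  identity on this branch, which matches packet_traits advertising
  //  HasExp = 0 when __ARCH__ >= 12.)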
+ Packet4ui isnumber_mask = reinterpret_cast(vec_cmpeq(_x, _x)); + return vec_sel(_x, pmax(pmul(y, reinterpret_cast(emm0)), _x), + isnumber_mask);*/ + return _x; +#else Packet4f res; - res.v4f[0] = pexp(x.v4f[0]); - res.v4f[1] = pexp(x.v4f[1]); + res.v4f[0] = pexp(_x.v4f[0]); + res.v4f[1] = pexp(_x.v4f[1]); return res; +#endif } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d psqrt(const Packet2d& x) { - return __builtin_s390_vfsqdb(x); + return vec_sqrt(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psqrt(const Packet4f& x) { Packet4f res; +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) + res = vec_sqrt(x); +#else res.v4f[0] = psqrt(x.v4f[0]); res.v4f[1] = psqrt(x.v4f[1]); +#endif return res; } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d prsqrt(const Packet2d& x) { - // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation. return pset1(1.0) / psqrt(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& x) { Packet4f res; +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) + res = pset1(1.0) / psqrt(x); +#else res.v4f[0] = prsqrt(x.v4f[0]); res.v4f[1] = prsqrt(x.v4f[1]); +#endif return res; } diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 57b01fc63..0b37f4992 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -17,7 +17,7 @@ namespace Eigen { namespace internal { #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16 #endif #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD @@ -29,7 +29,7 @@ namespace internal { #endif #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif typedef __vector int Packet4i; @@ -41,9 +41,14 @@ typedef __vector double Packet2d; typedef __vector unsigned long long Packet2ul; typedef __vector long long Packet2l; +// Z14 has builtin support for float vectors +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +typedef __vector float Packet4f; +#else typedef struct { Packet2d v4f[2]; } Packet4f; +#endif typedef union { int32_t i[4]; @@ -51,11 +56,15 @@ typedef union { int64_t l[2]; uint64_t ul[2]; double d[2]; + float f[4]; Packet4i v4i; Packet4ui v4ui; Packet2l v2l; Packet2ul v2ul; Packet2d v2d; +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) + Packet4f v4f; +#endif } Packet; // We don't want to write the same code all the time, but we need to reuse the constants @@ -80,7 +89,7 @@ typedef union { Packet2l p2l_##NAME = pset1(X) // These constants are endian-agnostic -//static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} +static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1} static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0); @@ -90,6 +99,21 @@ static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1); static Packet2d p2d_ONE = { 1.0, 1.0 }; static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ + Packet4f p4f_##NAME = reinterpret_cast(vec_splat_s32(X)) + +#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ + 
Packet4f p4f_##NAME = pset1(X) + +#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ + const Packet4f p4f_##NAME = reinterpret_cast(pset1(X)) + +static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} +static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} +static Packet4f p4f_MZERO = { 0x80000000, 0x80000000, 0x80000000, 0x80000000}; +#endif + static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ZERO), reinterpret_cast(p2d_ONE), 8)); @@ -120,9 +144,9 @@ static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0 static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; -//static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; +static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; -//static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; +static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC @@ -169,7 +193,11 @@ template<> struct packet_traits : default_packet_traits HasSin = 0, HasCos = 0, HasLog = 0, +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) + HasExp = 0, +#else HasExp = 1, +#endif HasSqrt = 1, HasRsqrt = 1, HasRound = 1, @@ -258,31 +286,16 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) return s; } -/* Helper function to simulate a vec_splat_packet4f - */ -template EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) { - Packet4f splat; - switch (element) { - case 0: - splat.v4f[0] = vec_splat(from.v4f[0], 0); - splat.v4f[1] = splat.v4f[0]; - break; - case 1: - splat.v4f[0] = vec_splat(from.v4f[0], 1); - splat.v4f[1] = splat.v4f[0]; - break; - case 2: - splat.v4f[0] = vec_splat(from.v4f[1], 0); - splat.v4f[1] = splat.v4f[0]; - break; - case 3: - splat.v4f[0] = vec_splat(from.v4f[1], 1); - splat.v4f[1] = splat.v4f[0]; - break; - } - return splat; + Packet vt; + vt.v4f = v; + s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3]; + return s; } +#endif + template struct palign_impl @@ -300,31 +313,6 @@ struct palign_impl } }; -/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double - */ -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - switch (Offset % 4) { - case 1: - first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8); - first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8); - break; - case 2: - first.v4f[0] = first.v4f[1]; - first.v4f[1] = second.v4f[0]; - break; - case 3: - first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8); - first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8); - break; - } - } -}; - - template struct palign_impl { @@ -344,16 +332,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) return vfrom->v4i; } -template<> EIGEN_STRONG_INLINE 
Packet4f pload(const float* from) -{ - // FIXME: No intrinsic yet - EIGEN_DEBUG_ALIGNED_LOAD - Packet4f vfrom; - vfrom.v4f[0] = vec_ld2f(&from[0]); - vfrom.v4f[1] = vec_ld2f(&from[2]); - return vfrom; -} - template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { // FIXME: No intrinsic yet @@ -372,15 +350,6 @@ template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& f vto->v4i = from; } -template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) -{ - // FIXME: No intrinsic yet - EIGEN_DEBUG_ALIGNED_STORE - vec_st2f(from.v4f[0], &to[0]); - vec_st2f(from.v4f[1], &to[2]); -} - - template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { // FIXME: No intrinsic yet @@ -397,13 +366,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return vec_splats(from); } -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) -{ - Packet4f to; - to.v4f[0] = pset1(static_cast(from)); - to.v4f[1] = to.v4f[0]; - return to; -} template<> EIGEN_STRONG_INLINE void pbroadcast4(const int *a, @@ -416,17 +378,6 @@ pbroadcast4(const int *a, a3 = vec_splat(a3, 3); } -template<> EIGEN_STRONG_INLINE void -pbroadcast4(const float *a, - Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) -{ - a3 = pload(a); - a0 = vec_splat_packet4f<0>(a3); - a1 = vec_splat_packet4f<1>(a3); - a2 = vec_splat_packet4f<2>(a3); - a3 = vec_splat_packet4f<3>(a3); -} - template<> EIGEN_STRONG_INLINE void pbroadcast4(const double *a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) @@ -449,16 +400,6 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* f return pload(ai); } -template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) -{ - float EIGEN_ALIGN16 ai[4]; - ai[0] = from[0*stride]; - ai[1] = from[1*stride]; - ai[2] = from[2*stride]; - ai[3] = from[3*stride]; - return pload(ai); -} - template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { double EIGEN_ALIGN16 af[2]; @@ -477,16 +418,6 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const to[3*stride] = ai[3]; } -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) -{ - float EIGEN_ALIGN16 ai[4]; - pstore((float *)ai, from); - to[0*stride] = ai[0]; - to[1*stride] = ai[1]; - to[2*stride] = ai[2]; - to[3*stride] = ai[3]; -} - template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { double EIGEN_ALIGN16 af[2]; @@ -496,160 +427,52 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, } template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return (a + b); } -template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] + b.v4f[0]; - c.v4f[1] = a.v4f[1] + b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return (a + b); } template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return (a - b); } -template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] - b.v4f[0]; - c.v4f[1] = a.v4f[1] - b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return (a - b); } template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return (a * b); } 
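// ---------------------------------------------------------------------------
// Editorial sketch (illustration only, not part of the patch): every pgather/
// pscatter specialization in this file follows the same staging idiom -- copy
// the strided scalars into a 16-byte-aligned scratch array (the
// EIGEN_ALIGN16 buffer above), then issue a single aligned pload or pstore.
// A scalar-only equivalent, with hypothetical names:
//
//   #include <cstddef>
//   template <typename Scalar, int N>
//   static inline void gather_to_buffer(Scalar (&buf)[N], const Scalar* from,
//                                       std::ptrdiff_t stride)
//   {
//     // buf plays the role of the EIGEN_ALIGN16 array; pload(buf) follows
//     for (int k = 0; k < N; ++k)
//       buf[k] = from[k * stride];
//   }
// ---------------------------------------------------------------------------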
-template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] * b.v4f[0]; - c.v4f[1] = a.v4f[1] * b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { return (a * b); } template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b) { return (a / b); } -template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] / b.v4f[0]; - c.v4f[1] = a.v4f[1] / b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return (a / b); } template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); } -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) -{ - Packet4f c; - c.v4f[0] = -a.v4f[0]; - c.v4f[1] = -a.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); } template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a, b), c); } -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) -{ - Packet4f res; - res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]); - res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return padd(pset1(a), p4i_COUNTDOWN); } -template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return padd(pset1(a), p2d_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pmin(a.v4f[0], b.v4f[0]); - res.v4f[1] = pmin(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pmax(a.v4f[0], b.v4f[0]); - res.v4f[1] = pmax(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pand(a.v4f[0], b.v4f[0]); - res.v4f[1] = pand(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } -template<> 
EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pand(a.v4f[0], b.v4f[0]); - res.v4f[1] = pand(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pand(a.v4f[0], b.v4f[0]); - res.v4f[1] = pand(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return pand(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]); - res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]); - return res; -} -template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) -{ - Packet4f res; - res.v4f[0] = vec_round(a.v4f[0]); - res.v4f[1] = vec_round(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return vec_round(a); } -template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) -{ - Packet4f res; - res.v4f[0] = vec_ceil(a.v4f[0]); - res.v4f[1] = vec_ceil(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return vec_ceil(a); } -template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) -{ - Packet4f res; - res.v4f[0] = vec_floor(a.v4f[0]); - res.v4f[1] = vec_floor(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return vec_floor(a); } template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { return pload(from); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { return pload(from); } template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { return pload(from); } @@ -659,14 +482,6 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) return vec_perm(p, p, p16uc_DUPLICATE32_HI); } -template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) -{ - Packet4f p = pload(from); - p.v4f[1] = vec_splat(p.v4f[0], 1); - p.v4f[0] = vec_splat(p.v4f[0], 0); - return p; -} - template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { Packet2d p = pload(from); @@ -674,15 +489,12 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { pstore(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { pstore(to, from); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { 
double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) @@ -695,23 +507,8 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE64)); } -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) -{ - Packet4f rev; - rev.v4f[0] = preverse(a.v4f[1]); - rev.v4f[1] = preverse(a.v4f[0]); - return rev; -} - template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) -{ - Packet4f res; - res.v4f[0] = pabs(a.v4f[0]); - res.v4f[1] = pabs(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { @@ -730,13 +527,6 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) sum = padd(a, b); return pfirst(sum); } -template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) -{ - Packet2d sum; - sum = padd(a.v4f[0], a.v4f[1]); - double first = predux(sum); - return static_cast(first); -} template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) { @@ -777,21 +567,6 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) return sum; } -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) -{ - PacketBlock transpose; - transpose.packet[0] = vecs[0]; - transpose.packet[1] = vecs[1]; - transpose.packet[2] = vecs[2]; - transpose.packet[3] = vecs[3]; - ptranspose(transpose); - - Packet4f sum = padd(transpose.packet[0], transpose.packet[1]); - sum = padd(sum, transpose.packet[2]); - sum = padd(sum, transpose.packet[3]); - return sum; -} - // Other reduction functions: // mul template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) @@ -806,12 +581,6 @@ template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) return pfirst(pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } -template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) -{ - // Return predux_mul of the subvectors product - return static_cast(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1])))); -} - // min template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) { @@ -826,14 +595,6 @@ template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) return pfirst(pmin(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } -template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) -{ - Packet2d b, res; - b = pmin(a.v4f[0], a.v4f[1]); - res = pmin(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); - return static_cast(pfirst(res)); -} - // max template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) { @@ -849,14 +610,6 @@ template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) return pfirst(pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } -template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) -{ - Packet2d b, res; - b = pmax(a.v4f[0], a.v4f[1]); - res = pmax(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); - return static_cast(pfirst(res)); -} - EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); @@ -877,6 +630,321 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1] = t1; } +template<> EIGEN_STRONG_INLINE Packet4i 
pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} + + +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; + Packet2ul mask = vec_cmpeq(select, reinterpret_cast(p2l_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} + +/* z13 has no vector float support so we emulate that with double + z14 has proper vector float support. +*/ +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) +/* Helper function to simulate a vec_splat_packet4f + */ +template EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) +{ + Packet4f splat; + switch (element) { + case 0: + splat.v4f[0] = vec_splat(from.v4f[0], 0); + splat.v4f[1] = splat.v4f[0]; + break; + case 1: + splat.v4f[0] = vec_splat(from.v4f[0], 1); + splat.v4f[1] = splat.v4f[0]; + break; + case 2: + splat.v4f[0] = vec_splat(from.v4f[1], 0); + splat.v4f[1] = splat.v4f[0]; + break; + case 3: + splat.v4f[0] = vec_splat(from.v4f[1], 1); + splat.v4f[1] = splat.v4f[0]; + break; + } + return splat; +} + +/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double + */ +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) + { + switch (Offset % 4) { + case 1: + first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8); + first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8); + break; + case 2: + first.v4f[0] = first.v4f[1]; + first.v4f[1] = second.v4f[0]; + break; + case 3: + first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8); + first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8); + break; + } + } +}; + +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet4f vfrom; + vfrom.v4f[0] = vec_ld2f(&from[0]); + vfrom.v4f[1] = vec_ld2f(&from[2]); + return vfrom; +} + +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + vec_st2f(from.v4f[0], &to[0]); + vec_st2f(from.v4f[1], &to[2]); +} + +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) +{ + Packet4f to; + to.v4f[0] = pset1(static_cast(from)); + to.v4f[1] = to.v4f[0]; + return to; +} + +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = vec_splat_packet4f<0>(a3); + a1 = vec_splat_packet4f<1>(a3); + a2 = vec_splat_packet4f<2>(a3); + a3 = vec_splat_packet4f<3>(a3); +} + +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) +{ + float EIGEN_ALIGN16 ai[4]; + ai[0] = from[0*stride]; + ai[1] = from[1*stride]; + ai[2] = from[2*stride]; + ai[3] = from[3*stride]; + return pload(ai); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +{ + float EIGEN_ALIGN16 ai[4]; + pstore((float *)ai, from); + to[0*stride] = ai[0]; + to[1*stride] = ai[1]; + to[2*stride] = ai[2]; + to[3*stride] = ai[3]; +} + +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] + b.v4f[0]; + c.v4f[1] 
= a.v4f[1] + b.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] - b.v4f[0];
+  c.v4f[1] = a.v4f[1] - b.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] * b.v4f[0];
+  c.v4f[1] = a.v4f[1] * b.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] / b.v4f[0];
+  c.v4f[1] = a.v4f[1] / b.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
+{
+  Packet4f c;
+  c.v4f[0] = -a.v4f[0];
+  c.v4f[1] = -a.v4f[1];
+  return c;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{
+  Packet4f res;
+  res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
+  res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = por(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = por(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_round(a.v4f[0]);
+  res.v4f[1] = vec_round(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_ceil(a.v4f[0]);
+  res.v4f[1] = vec_ceil(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_floor(a.v4f[0]);
+  res.v4f[1] = vec_floor(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from)
+{
+  Packet4f p = pload(from);
+  p.v4f[1] = vec_splat(p.v4f[0], 1);
+  p.v4f[0] = vec_splat(p.v4f[0], 0);
+  return p;
+}
+
+template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  Packet4f rev;
+  rev.v4f[0] = preverse(a.v4f[1]);
+  rev.v4f[1] = preverse(a.v4f[0]);
+  return rev;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = pabs(a.v4f[0]);
+  res.v4f[1] = pabs(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a)
+{
+  Packet2d sum;
+  sum = padd(a.v4f[0], a.v4f[1]);
+  double
first = predux(sum); + return static_cast(first); +} + +template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) +{ + PacketBlock transpose; + transpose.packet[0] = vecs[0]; + transpose.packet[1] = vecs[1]; + transpose.packet[2] = vecs[2]; + transpose.packet[3] = vecs[3]; + ptranspose(transpose); + + Packet4f sum = padd(transpose.packet[0], transpose.packet[1]); + sum = padd(sum, transpose.packet[2]); + sum = padd(sum, transpose.packet[3]); + return sum; +} + +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +{ + // Return predux_mul of the subvectors product + return static_cast(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1])))); +} + +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +{ + Packet2d b, res; + b = pmin(a.v4f[0], a.v4f[1]); + res = pmin(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); + return static_cast(pfirst(res)); +} + +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + Packet2d b, res; + b = pmax(a.v4f[0], a.v4f[1]); + res = pmax(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); + return static_cast(pfirst(res)); +} + /* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one */ EIGEN_DEVICE_FUNC inline void @@ -915,12 +983,6 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3].v4f[1] = t3.packet[1]; } -template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { - Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; - Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); - return vec_sel(elsePacket, thenPacket, mask); -} - template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] }; Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] }; @@ -931,13 +993,197 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo); return result; } +#else +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) + { + switch (Offset % 4) { + case 1: + first = vec_sld(first, second, 4); break; + case 2: + first = vec_sld(first, second, 8); break; + case 3: + first = vec_sld(first, second, 12); break; + } + } +}; -template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { - Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; - Packet2ul mask = vec_cmpeq(select, reinterpret_cast(p2l_ONE)); +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet *vfrom; + vfrom = (Packet *) from; + return vfrom->v4f; +} + +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + Packet *vto; + vto = (Packet *) to; + vto->v4f = from; +} + +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) +{ + return vec_splats(from); +} + +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} + +template<> 
EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) +{ + float EIGEN_ALIGN16 af[4]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + af[2] = from[2*stride]; + af[3] = from[3*stride]; + return pload(af); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +{ + float EIGEN_ALIGN16 af[4]; + pstore((float*)af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; + to[2*stride] = af[2]; + to[3*stride] = af[3]; +} + +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return (a + b); } +template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return (a - b); } +template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { return (a * b); } +template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return (a / b); } +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return (-a); } +template<> EIGEN_STRONG_INLINE Packet4f pconj (const Packet4f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4f pmadd (const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); } +template<> EIGEN_STRONG_INLINE Packet4f pmin (const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pmax (const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pand (const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f por (const Packet4f& a, const Packet4f& b) { return vec_or(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pxor (const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet4f pround (const Packet4f& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet4f pceil (const Packet4f& a) { return vec_ceil(a); } +template<> EIGEN_STRONG_INLINE Packet4f pfloor (const Packet4f& a) { return vec_floor(a); } +template<> EIGEN_STRONG_INLINE Packet4f pabs (const Packet4f& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } + +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +{ + Packet4f p = pload(from); + return vec_perm(p, p, p16uc_DUPLICATE32_HI); +} + +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); +} + +template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) +{ + Packet4f b, sum; + b = vec_sld(a, a, 8); + sum = padd(a, b); + b = vec_sld(sum, sum, 4); + sum = padd(sum, b); + return pfirst(sum); +} + +template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) +{ + Packet4f v[4], sum[4]; + + // It's easier and faster to transpose then add as columns + // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation + // Do the transpose, first set of moves + v[0] = vec_mergeh(vecs[0], vecs[2]); + v[1] = vec_mergel(vecs[0], vecs[2]); + v[2] = vec_mergeh(vecs[1], vecs[3]); + v[3] = vec_mergel(vecs[1], vecs[3]); + // Get the resulting vectors + sum[0] = vec_mergeh(v[0], v[2]); + sum[1] = vec_mergel(v[0], v[2]); + sum[2] = vec_mergeh(v[1], v[3]); + sum[3] = vec_mergel(v[1], v[3]); + + // Now do the 
summation: + // Lines 0+1 + sum[0] = padd(sum[0], sum[1]); + // Lines 2+3 + sum[1] = padd(sum[2], sum[3]); + // Add the results + sum[0] = padd(sum[0], sum[1]); + + return sum[0]; +} + +// Other reduction functions: +// mul +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +{ + Packet4f prod; + prod = pmul(a, vec_sld(a, a, 8)); + return pfirst(pmul(prod, vec_sld(prod, prod, 4))); +} + +// min +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +{ + Packet4f b, res; + b = pmin(a, vec_sld(a, a, 8)); + res = pmin(b, vec_sld(b, b, 4)); + return pfirst(res); +} + +// max +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + Packet4f b, res; + b = pmax(a, vec_sld(a, a, 8)); + res = pmax(b, vec_sld(b, b, 4)); + return pfirst(res); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + Packet4f t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} + +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); return vec_sel(elsePacket, thenPacket, mask); } +#endif + +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu (const float* from) { return pload(from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } +template<> EIGEN_STRONG_INLINE Packet4f plset (const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h index 4153b877c..9765cc763 100644 --- a/Eigen/src/Core/functors/AssignmentFunctors.h +++ b/Eigen/src/Core/functors/AssignmentFunctors.h @@ -144,7 +144,7 @@ template struct swap_assign_op { EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { -#ifdef __CUDACC__ +#ifdef EIGEN_GPUCC // FIXME is there some kind of cuda::swap? 
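    // (Editorial note: as the FIXME above indicates, no device-side swap
    //  helper is assumed to exist, so the swap is spelled out manually with a
    //  temporary in the line that follows.)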
Scalar t=b; const_cast(b)=a; a=t; #else diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index 96747bac7..401d597d8 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -255,7 +255,7 @@ struct scalar_cmp_op : binary_op_base struct scalar_hypot_op : binary_op_base { EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op) -// typedef typename NumTraits::Real result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar &x, const Scalar &y) const { - EIGEN_USING_STD_MATH(sqrt) - Scalar p, qp; - if(_x>_y) - { - p = _x; - qp = _y / p; - } - else - { - p = _y; - qp = _x / p; - } - return p * sqrt(Scalar(1) + qp*qp); + // This functor is used by hypotNorm only for which it is faster to first apply abs + // on all coefficients prior to reduction through hypot. + // This way we avoid calling abs on positive and real entries, and this also permits + // to seamlessly handle complexes. Otherwise we would have to handle both real and complexes + // through the same functor... + return internal::positive_real_hypot(x,y); } }; template @@ -443,7 +436,7 @@ template struct bind1st_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; - bind1st_op(const first_argument_type &val) : m_value(val) {} + EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); } @@ -462,7 +455,7 @@ template struct bind2nd_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; - bind2nd_op(const second_argument_type &val) : m_value(val) {} + EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); } diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h index 6a30466fb..b03be0269 100644 --- a/Eigen/src/Core/functors/NullaryFunctors.h +++ b/Eigen/src/Core/functors/NullaryFunctors.h @@ -44,16 +44,16 @@ struct linspaced_op_impl { linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)), - m_interPacket(plset(0)), m_flip(numext::abs(high) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { + typedef typename NumTraits::Real RealScalar; if(m_flip) - return (i==0)? m_low : (m_high - (m_size1-i)*m_step); + return (i==0)? m_low : (m_high - RealScalar(m_size1-i)*m_step); else - return (i==m_size1)? m_high : (m_low + i*m_step); + return (i==m_size1)? 
m_high : (m_low + RealScalar(i)*m_step); } template @@ -63,7 +63,7 @@ struct linspaced_op_impl // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) ) if(m_flip) { - Packet pi = padd(pset1(Scalar(i-m_size1)),m_interPacket); + Packet pi = plset(Scalar(i-m_size1)); Packet res = padd(pset1(m_high), pmul(pset1(m_step), pi)); if(i==0) res = pinsertfirst(res, m_low); @@ -71,7 +71,7 @@ struct linspaced_op_impl } else { - Packet pi = padd(pset1(Scalar(i)),m_interPacket); + Packet pi = plset(Scalar(i)); Packet res = padd(pset1(m_low), pmul(pset1(m_step), pi)); if(i==m_size1-unpacket_traits::size+1) res = pinsertlast(res, m_high); @@ -83,7 +83,6 @@ struct linspaced_op_impl const Scalar m_high; const Index m_size1; const Scalar m_step; - const Packet m_interPacket; const bool m_flip; }; diff --git a/Eigen/src/Core/functors/StlFunctors.h b/Eigen/src/Core/functors/StlFunctors.h index 6df3fa501..9c1d75850 100644 --- a/Eigen/src/Core/functors/StlFunctors.h +++ b/Eigen/src/Core/functors/StlFunctors.h @@ -83,13 +83,17 @@ struct functor_traits > { enum { Cost = functor_traits::Cost, PacketAccess = false }; }; #endif +#if (__cplusplus < 201703L) && (EIGEN_COMP_MSVC < 1910) +// std::unary_negate is deprecated since c++17 and will be removed in c++20 template struct functor_traits > { enum { Cost = 1 + functor_traits::Cost, PacketAccess = false }; }; +// std::binary_negate is deprecated since c++17 and will be removed in c++20 template struct functor_traits > { enum { Cost = 1 + functor_traits::Cost, PacketAccess = false }; }; +#endif #ifdef EIGEN_STDEXT_SUPPORT diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index bfc046556..c1cc2ab3b 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -701,7 +701,7 @@ template struct scalar_isnan_op { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { #if defined(__SYCL_DEVICE_ONLY__) return numext::isnan(a); -#else +#else return (numext::isnan)(a); #endif } @@ -815,7 +815,7 @@ struct scalar_sign_op { template struct functor_traits > { enum { - Cost = + Cost = NumTraits::IsComplex ? 
( 8*NumTraits::MulCost ) // roughly : ( 3*NumTraits::AddCost), @@ -823,6 +823,34 @@ struct functor_traits > }; }; +/** \internal + * \brief Template functor to compute the logistic function of a scalar + * \sa class CwiseUnaryOp, ArrayBase::logistic() + */ +template +struct scalar_logistic_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { + const T one = T(1); + return one / (one + numext::exp(-x)); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(const Packet& x) const { + const Packet one = pset1(T(1)); + return pdiv(one, padd(one, pexp(pnegate(x)))); + } +}; +template +struct functor_traits > { + enum { + Cost = NumTraits::AddCost * 2 + NumTraits::MulCost * 6, + PacketAccess = packet_traits::HasAdd && packet_traits::HasDiv && + packet_traits::HasNegate && packet_traits::HasExp + }; +}; + + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 45230bce5..b012691c1 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -390,6 +390,7 @@ public: typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; + typedef LhsPacket LhsPacket4Packing; typedef ResPacket AccPacket; @@ -496,6 +497,7 @@ public: typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; + typedef LhsPacket LhsPacket4Packing; typedef ResPacket AccPacket; @@ -580,7 +582,7 @@ DoublePacket padd(const DoublePacket &a, const DoublePacket -const DoublePacket& predux_downto4(const DoublePacket &a) +const DoublePacket& predux_half_dowto4(const DoublePacket &a) { return a; } @@ -626,6 +628,7 @@ public: typedef typename packet_traits::type ScalarPacket; typedef DoublePacket DoublePacketType; + typedef typename conditional::type LhsPacket4Packing; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; @@ -777,6 +780,7 @@ public: typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; + typedef LhsPacket LhsPacket4Packing; typedef ResPacket AccPacket; @@ -972,7 +976,7 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); - R2 = r0.loadPacket(2 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); + R2 = r0.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); @@ -1035,9 +1039,9 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r1.template loadPacket(1 * Traits::ResPacketSize); + R2 = r1.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C1, alphav, R0); traits.acc(C5, alphav, R1); traits.acc(C9, alphav, R2); @@ -1045,9 +1049,9 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r2.template loadPacket(1 * Traits::ResPacketSize); + R2 = r2.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C10, alphav, R2); @@ -1055,9 +1059,9 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r3.template loadPacket(1 * 
Traits::ResPacketSize); + R2 = r3.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C3, alphav, R0); traits.acc(C7, alphav, R1); traits.acc(C11, alphav, R2); @@ -1134,9 +1138,9 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); - R2 = r0.loadPacket(2 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); + R2 = r0.template loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); @@ -1244,10 +1248,10 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); - R2 = r1.loadPacket(0 * Traits::ResPacketSize); - R3 = r1.loadPacket(1 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); + R2 = r1.template loadPacket(0 * Traits::ResPacketSize); + R3 = r1.template loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C1, alphav, R2); @@ -1257,10 +1261,10 @@ void gebp_kernel(0 * Traits::ResPacketSize); + R1 = r2.template loadPacket(1 * Traits::ResPacketSize); + R2 = r3.template loadPacket(0 * Traits::ResPacketSize); + R3 = r3.template loadPacket(1 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C3, alphav, R2); @@ -1337,8 +1341,8 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); r0.storePacket(0 * Traits::ResPacketSize, R0); @@ -1431,15 +1435,15 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r1.loadPacket(0 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); + R1 = r1.template loadPacket(0 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C1, alphav, R1); r0.storePacket(0 * Traits::ResPacketSize, R0); r1.storePacket(0 * Traits::ResPacketSize, R1); - R0 = r2.loadPacket(0 * Traits::ResPacketSize); - R1 = r3.loadPacket(0 * Traits::ResPacketSize); + R0 = r2.template loadPacket(0 * Traits::ResPacketSize); + R1 = r3.template loadPacket(0 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C3, alphav, R1); r2.storePacket(0 * Traits::ResPacketSize, R0); @@ -1504,7 +1508,7 @@ void gebp_kernel(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R0 = r0.template loadPacket(0 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); r0.storePacket(0 * Traits::ResPacketSize, R0); } @@ -1523,13 +1527,13 @@ void gebp_kernel::half SResPacketHalf; + const int SResPacketHalfSize = unpacket_traits::half>::size; if ((SwappedTraits::LhsProgress % 4) == 0 && (SwappedTraits::LhsProgress <= 8) && - (SwappedTraits::LhsProgress!=8 || unpacket_traits::size==nr)) + (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr)) { SAccPacket C0, C1, C2, C3; straits.initAcc(C0); @@ -1596,13 +1600,13 @@ void gebp_kernel -struct gemm_pack_lhs +template +struct gemm_pack_lhs { typedef typename DataMapper::LinearMapper LinearMapper; EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs 
+template +EIGEN_DONT_INLINE void gemm_pack_lhs ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - typedef typename packet_traits::type Packet; - enum { PacketSize = packet_traits::size }; + enum { PacketSize = unpacket_traits::size }; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -1725,9 +1728,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(i+0*PacketSize, k); + B = lhs.template loadPacket(i+1*PacketSize, k); + C = lhs.template loadPacket(i+2*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; pstore(blockA+count, cj.pconj(C)); count+=PacketSize; @@ -1745,8 +1748,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(i+0*PacketSize, k); + B = lhs.template loadPacket(i+1*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } @@ -1763,7 +1766,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(i+0*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } @@ -1793,19 +1796,18 @@ EIGEN_DONT_INLINE void gemm_pack_lhs -struct gemm_pack_lhs +template +struct gemm_pack_lhs { typedef typename DataMapper::LinearMapper LinearMapper; EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs +template +EIGEN_DONT_INLINE void gemm_pack_lhs ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - typedef typename packet_traits::type Packet; - enum { PacketSize = packet_traits::size }; + enum { PacketSize = unpacket_traits::size }; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -1837,7 +1839,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs kernel; - for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k); + for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); ptranspose(kernel); for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); } @@ -1924,7 +1926,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs kernel; @@ -1971,10 +1973,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs kernel; - kernel.packet[0] = dm0.loadPacket(k); - kernel.packet[1%PacketSize] = dm1.loadPacket(k); - kernel.packet[2%PacketSize] = dm2.loadPacket(k); - kernel.packet[3%PacketSize] = dm3.loadPacket(k); + kernel.packet[0 ] = dm0.template loadPacket(k); + kernel.packet[1%PacketSize] = dm1.template loadPacket(k); + kernel.packet[2%PacketSize] = dm2.template loadPacket(k); + kernel.packet[3%PacketSize] = dm3.template loadPacket(k); ptranspose(kernel); pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize])); @@ -2075,7 +2077,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs(k, j2); pstoreu(blockB+count, cj.pconj(A)); count += PacketSize; } else { diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 6440e1d09..f49abcad5 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -75,7 +75,7 @@ static void run(Index rows, Index cols, Index depth, Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction - gemm_pack_lhs pack_lhs; + gemm_pack_lhs 
pack_lhs; gemm_pack_rhs pack_rhs; gebp_kernel gebp; @@ -108,7 +108,7 @@ static void run(Index rows, Index cols, Index depth, // i.e., we test that info[tid].users equals 0. // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. while(info[tid].users!=0) {} - info[tid].users += threads; + info[tid].users = threads; pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length); @@ -146,7 +146,9 @@ static void run(Index rows, Index cols, Index depth, // Release all the sub blocks A'_i of A' for the current thread, // i.e., we simply decrement the number of users by 1 for(Index i=0; i template static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) - lazyproduct::evalTo(dst, lhs, rhs); + // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program + // to determine the following heuristic. + // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h, + // unless it has been specialized by the user or for a given architecture. + // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs. + // I'm not sure it is still required. + if((rhs.rows()+dst.rows()+dst.cols())0) + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op()); else { dst.setZero(); @@ -439,8 +447,8 @@ struct generic_product_impl template static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) - lazyproduct::addTo(dst, lhs, rhs); + if((rhs.rows()+dst.rows()+dst.cols())0) + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op()); else scaleAndAddTo(dst,lhs, rhs, Scalar(1)); } @@ -448,8 +456,8 @@ struct generic_product_impl template static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) - lazyproduct::subTo(dst, lhs, rhs); + if((rhs.rows()+dst.rows()+dst.cols())0) + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op()); else scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 7122efa60..ec2825bf0 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -84,7 +84,7 @@ struct general_matrix_matrix_triangular_product pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gebp_kernel gebp; tribb_kernel sybb; @@ -110,7 +110,6 @@ struct general_matrix_matrix_triangular_product enum { IsRowMajor = (internal::traits::Flags&RowMajorBit) ? 1 : 0, LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0, - RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0 + RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0, + SkipDiag = (UpLo&(UnitDiag|ZeroDiag))!=0 }; Index size = mat.cols(); + if(SkipDiag) + size--; Index depth = actualLhs.cols(); typedef internal::gemm_blocking_space internal::general_matrix_matrix_triangular_product + IsRowMajor ? RowMajor : ColMajor, UpLo&(Lower|Upper)> ::run(size, depth, - &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(), - mat.data(), mat.outerStride(), actualAlpha, blocking); + &actualLhs.coeffRef(SkipDiag&&(UpLo&Lower)==Lower ? 
1 : 0,0), actualLhs.outerStride(), + &actualRhs.coeffRef(0,SkipDiag&&(UpLo&Upper)==Upper ? 1 : 0), actualRhs.outerStride(), + mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? 1 : mat.outerStride() ) : 0), mat.outerStride(), actualAlpha, blocking); } }; template template -TriangularView& TriangularViewImpl::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) +EIGEN_DEVICE_FUNC TriangularView& TriangularViewImpl::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) { + EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED); eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols()); - + general_product_to_triangular_selector::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta); - + return derived(); } diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h index 5b7c15cca..49565c070 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h @@ -37,7 +37,7 @@ namespace Eigen { namespace internal { -template +template struct general_matrix_matrix_rankupdate : general_matrix_matrix_triangular_product< Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {}; @@ -52,7 +52,7 @@ struct general_matrix_matrix_triangular_product& blocking) \ { \ - if (lhs==rhs) { \ + if ( lhs==rhs && ((UpLo&(Lower|Upper))==UpLo) ) { \ general_matrix_matrix_rankupdate \ ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ } else { \ @@ -88,7 +88,7 @@ struct general_matrix_matrix_rankupdate(lhsStride), ldc=convert_index(resStride), n=convert_index(size), k=convert_index(depth); \ char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 
'T':'N'); \ EIGTYPE beta(1); \ - BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \ + BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda, (const BLASTYPE*)&numext::real_ref(beta), res, &ldc); \ } \ }; @@ -125,9 +125,13 @@ struct general_matrix_matrix_rankupdate(b_tmp.outerStride()); \ } else b = _rhs; \ \ - BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ }}; -GEMM_SPECIALIZATION(double, d, double, d) -GEMM_SPECIALIZATION(float, f, float, s) -GEMM_SPECIALIZATION(dcomplex, cd, double, z) -GEMM_SPECIALIZATION(scomplex, cf, float, c) +#ifdef EIGEN_USE_MKL +GEMM_SPECIALIZATION(double, d, double, dgemm) +GEMM_SPECIALIZATION(float, f, float, sgemm) +GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm) +GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm) +#else +GEMM_SPECIALIZATION(double, d, double, dgemm_) +GEMM_SPECIALIZATION(float, f, float, sgemm_) +GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_) +GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_) +#endif } // end namespase internal diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 41d8242e1..767feb99d 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -48,7 +48,7 @@ typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; -EIGEN_DONT_INLINE static void run( +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( Index rows, Index cols, const LhsMapper& lhs, const RhsMapper& rhs, @@ -57,7 +57,7 @@ EIGEN_DONT_INLINE static void run( }; template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs, @@ -201,7 +201,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; -EIGEN_DONT_INLINE static void run( +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( Index rows, Index cols, const LhsMapper& lhs, const RhsMapper& rhs, @@ -240,7 +240,7 @@ EIGEN_DONT_INLINE static void run( }; template -EIGEN_DONT_INLINE void general_matrix_vector_product::run( +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs, diff --git a/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h index e3a5d5892..6e36c2b3c 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h +++ b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h @@ -85,7 +85,7 @@ EIGEN_BLAS_GEMV_SPECIALIZE(float) EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex) EIGEN_BLAS_GEMV_SPECIALIZE(scomplex) -#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASPREFIX) \ +#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \ template \ struct general_matrix_vector_product_gemv \ { \ @@ -113,14 +113,21 @@ static void run( \ x_ptr=x_tmp.data(); \ incx=1; \ } else x_ptr=rhs; \ - 
BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ + BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; -EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, d) -EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, s) -EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z) -EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8 , cgemv) +#else +EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, cgemv_) +#endif } // end namespase internal diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index c2f084c82..92e9b0d9f 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -10,6 +10,10 @@ #ifndef EIGEN_PARALLELIZER_H #define EIGEN_PARALLELIZER_H +#if EIGEN_HAS_CXX11_ATOMIC +#include +#endif + namespace Eigen { namespace internal { @@ -75,8 +79,17 @@ template struct GemmParallelInfo { GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {} + // volatile is not enough on all architectures (see bug 1572) + // to guarantee that when thread A says to thread B that it is + // done with packing a block, then all writes have been really + // carried out... C++11 memory model+atomic guarantees this. +#if EIGEN_HAS_CXX11_ATOMIC + std::atomic sync; + std::atomic users; +#else Index volatile sync; int volatile users; +#endif Index lhs_start; Index lhs_length; @@ -87,11 +100,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, { // TODO when EIGEN_USE_BLAS is defined, // we should still enable OMP for other scalar types -#if !(defined (EIGEN_HAS_OPENMP)) || defined (EIGEN_USE_BLAS) + // Without C++11, we have to disable GEMM's parallelization on + // non x86 architectures because there volatile is not enough for our purpose. + // See bug 1572. +#if (! defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS) || ((!EIGEN_HAS_CXX11_ATOMIC) && !(EIGEN_ARCH_i386_OR_x86_64)) // FIXME the transpose variable is only needed to properly split // the matrix product when multithreading is enabled. This is a temporary // fix to support row-major destination matrices. This whole - // parallelizer mechanism has to be redisigned anyway. + // parallelizer mechanism has to be redesigned anyway. EIGEN_UNUSED_VARIABLE(depth); EIGEN_UNUSED_VARIABLE(transpose); func(0,rows, 0,cols); @@ -117,7 +133,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, // compute the number of threads we are going to use Index threads = std::min(nbThreads(), pb_max_threads); - // if multi-threading is explicitely disabled, not useful, or if we already are in a parallel session, + // if multi-threading is explicitly disabled, not useful, or if we already are in a parallel session, // then abort multi-threading // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp? 
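// The volatile -> std::atomic change above is what creates a formal
// happens-before edge between the thread that packs a block and the threads
// that consume it (cf. bug 1572). A minimal model of that handshake, using
// hypothetical helpers pack_block/use_block (not the actual Eigen code):
#include <atomic>
void pack_block(float* blockA);
void use_block(const float* blockA);
std::atomic<int> users(0);
void producer(float* blockA, int nthreads) {
  pack_block(blockA);                                // plain, non-atomic writes
  users.store(nthreads, std::memory_order_release);  // publish them
}
void consumer(const float* blockA) {
  while (users.load(std::memory_order_acquire) == 0) {} // spin until published
  use_block(blockA);  // acquire guarantees the packed data is visible here
}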
if((!Condition) || (threads==1) || (omp_get_num_threads()>1)) diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index da6f82abc..c84c71609 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -352,7 +352,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix gebp_kernel; symm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; - gemm_pack_lhs pack_lhs_transposed; + gemm_pack_lhs pack_lhs_transposed; for(Index k2=0; k2() + gemm_pack_lhs() (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); @@ -437,7 +437,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; symm_pack_rhs pack_rhs; for(Index k2=0; k2 \ @@ -81,13 +81,13 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } else b = _rhs; \ \ - BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -144,20 +144,26 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } \ \ - BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -EIGEN_BLAS_SYMM_L(double, double, d, d) -EIGEN_BLAS_SYMM_L(float, float, f, s) -EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z) -EIGEN_BLAS_HEMM_L(scomplex, float, cf, c) - +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMM_L(double, double, d, dsymm) +EIGEN_BLAS_SYMM_L(float, float, f, ssymm) +EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm) +EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm) +#else +EIGEN_BLAS_SYMM_L(double, double, d, dsymm_) +EIGEN_BLAS_SYMM_L(float, float, f, ssymm_) +EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_) +EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_) +#endif /* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */ -#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -197,13 +203,13 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } else b = _lhs; \ \ - BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -259,15 +265,21 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } \ \ - BLASPREFIX##hemm_(&side, &uplo, &m, &n, 
&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ } \ }; -EIGEN_BLAS_SYMM_R(double, double, d, d) -EIGEN_BLAS_SYMM_R(float, float, f, s) -EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z) -EIGEN_BLAS_HEMM_R(scomplex, float, cf, c) - +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMM_R(double, double, d, dsymm) +EIGEN_BLAS_SYMM_R(float, float, f, ssymm) +EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm) +EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm) +#else +EIGEN_BLAS_SYMM_R(double, double, d, dsymm_) +EIGEN_BLAS_SYMM_R(float, float, f, ssymm_) +EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_) +EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_) +#endif } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index 3fd180e6c..d38fd72b2 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -15,7 +15,7 @@ namespace Eigen { namespace internal { /* Optimized selfadjoint matrix * vector product: - * This algorithm processes 2 columns at onces that allows to both reduce + * This algorithm processes 2 columns at once that allows to both reduce * the number of load/stores of the result by a factor 2 and to reduce * the instruction dependency. */ @@ -27,7 +27,8 @@ template -EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product::run( +EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC +void selfadjoint_matrix_vector_product::run( Index size, const Scalar* lhs, Index lhsStride, const Scalar* rhs, @@ -62,8 +64,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product enum { LhsUpLo = LhsMode&(Upper|Lower) }; template - static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC + void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) { typedef typename Dest::Scalar ResScalar; typedef typename Rhs::Scalar RhsScalar; diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h index 38f23accf..1238345e3 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h @@ -95,14 +95,21 @@ const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \ x_tmp=map_x.conjugate(); \ x_ptr=x_tmp.data(); \ } else x_ptr=_rhs; \ - BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ + BLASFUNC(&uplo, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv) +EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv) +EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv) +EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv) +#else EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_) EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_) EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_) EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_) +#endif } // end namespace internal diff --git 
a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h index f038d686f..39c5b59ff 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -120,7 +120,7 @@ struct selfadjoint_product_selector template template -SelfAdjointView& SelfAdjointView +EIGEN_DEVICE_FUNC SelfAdjointView& SelfAdjointView ::rankUpdate(const MatrixBase& u, const Scalar& alpha) { selfadjoint_product_selector::run(_expression().const_cast_derived(), u.derived(), alpha); diff --git a/Eigen/src/Core/products/SelfadjointRank2Update.h b/Eigen/src/Core/products/SelfadjointRank2Update.h index 2ae364111..09209f733 100644 --- a/Eigen/src/Core/products/SelfadjointRank2Update.h +++ b/Eigen/src/Core/products/SelfadjointRank2Update.h @@ -24,7 +24,8 @@ struct selfadjoint_rank2_update_selector; template struct selfadjoint_rank2_update_selector { - static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) + static EIGEN_DEVICE_FUNC + void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) { const Index size = u.size(); for (Index i=0; i struct conj_expr_if template template -SelfAdjointView& SelfAdjointView +EIGEN_DEVICE_FUNC SelfAdjointView& SelfAdjointView ::rankUpdate(const MatrixBase& u, const MatrixBase& v, const Scalar& alpha) { typedef internal::blas_traits UBlasTraits; diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 6ec5a8a0b..85fd744b9 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -137,7 +137,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer((internal::constructor_without_unaligned_array_assert())); + // To work around an "error: member reference base type 'Matrix<...> + // (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is + // not a structure or union" compilation error in nvcc (tested V8.0.61), + // create a dummy internal::constructor_without_unaligned_array_assert + // object to pass to the Matrix constructor. + internal::constructor_without_unaligned_array_assert a; + Matrix triangularBuffer(a); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); @@ -145,7 +151,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; for(Index k2=IsLower ? 
depth : 0; @@ -216,7 +222,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix() + gemm_pack_lhs() (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, @@ -284,7 +290,8 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer((internal::constructor_without_unaligned_array_assert())); + internal::constructor_without_unaligned_array_assert a; + Matrix triangularBuffer(a); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); @@ -292,7 +299,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gemm_pack_rhs pack_rhs_panel; @@ -393,7 +400,9 @@ struct triangular_product_impl { template static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha) { - typedef typename Dest::Scalar Scalar; + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + typedef typename Dest::Scalar Scalar; typedef internal::blas_traits LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; @@ -405,8 +414,9 @@ struct triangular_product_impl typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) - * RhsBlasTraits::extractScalarFactor(a_rhs); + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(a_lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(a_rhs); + Scalar actualAlpha = alpha * lhs_alpha * rhs_alpha; typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? 
RowMajor : ColMajor,Scalar,Scalar, Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType; @@ -431,6 +441,21 @@ struct triangular_product_impl &dst.coeffRef(0,0), dst.outerStride(), // result info actualAlpha, blocking ); + + // Apply correction if the diagonal is unit and a scalar factor was nested: + if ((Mode&UnitDiag)==UnitDiag) + { + if (LhsIsTriangular && lhs_alpha!=LhsScalar(1)) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dst.topRows(diagSize) -= ((lhs_alpha-LhsScalar(1))*a_rhs).topRows(diagSize); + } + else if ((!LhsIsTriangular) && rhs_alpha!=RhsScalar(1)) + { + Index diagSize = (std::min)(rhs.rows(),rhs.cols()); + dst.leftCols(diagSize) -= (rhs_alpha-RhsScalar(1))*a_lhs.leftCols(diagSize); + } + } } }; diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h index aecded6bb..a25197ab0 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h @@ -75,7 +75,7 @@ EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true) EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false) // implements col-major += alpha * op(triangular) * op(general) -#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -172,7 +172,7 @@ struct product_triangular_matrix_matrix_trmm > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -180,13 +180,20 @@ struct product_triangular_matrix_matrix_trmm \ @@ -282,7 +289,7 @@ struct product_triangular_matrix_matrix_trmm > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -290,11 +297,17 @@ struct product_triangular_matrix_matrix_trmm struct trmv_selector typename internal::add_const_on_value_type::type actualLhs = LhsBlasTraits::extract(lhs); typename internal::add_const_on_value_type::type actualRhs = RhsBlasTraits::extract(rhs); - ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) - * RhsBlasTraits::extractScalarFactor(rhs); + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha; enum { // FIXME find a way to allow an inner stride on the result if packet_traits::size==1 @@ -274,6 +275,12 @@ template struct trmv_selector else dest = MappedDest(actualDestPtr, dest.size()); } + + if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) ) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize); + } } }; @@ -295,8 +302,9 @@ template struct trmv_selector typename add_const::type actualLhs = LhsBlasTraits::extract(lhs); typename add_const::type actualRhs = RhsBlasTraits::extract(rhs); - ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) - * RhsBlasTraits::extractScalarFactor(rhs); + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha; enum { DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 @@ -326,6 +334,12 @@ template struct trmv_selector actualRhsPtr,1, dest.data(),dest.innerStride(), actualAlpha); + + if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) ) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize); + } } }; diff --git 
a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h index 07bf26ce5..3d47a2b94 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +++ b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h @@ -71,7 +71,7 @@ EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex) EIGEN_BLAS_TRMV_SPECIALIZE(scomplex) // implements col-major: res += alpha * op(triangular) * vector -#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \ template \ struct triangular_matrix_vector_product_trmv { \ enum { \ @@ -121,10 +121,10 @@ struct triangular_matrix_vector_product_trmv(size); \ n = convert_index(cols-size); \ } \ - BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ + BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \ } \ } \ }; -EIGEN_BLAS_TRMV_CM(double, double, d, d) -EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z) -EIGEN_BLAS_TRMV_CM(float, float, f, s) -EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRMV_CM(double, double, d, d,) +EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z,) +EIGEN_BLAS_TRMV_CM(float, float, f, s,) +EIGEN_BLAS_TRMV_CM(scomplex, MKL_Complex8, cf, c,) +#else +EIGEN_BLAS_TRMV_CM(double, double, d, d, _) +EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _) +EIGEN_BLAS_TRMV_CM(float, float, f, s, _) +EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c, _) +#endif // implements row-major: res += alpha * op(triangular) * vector -#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \ template \ struct triangular_matrix_vector_product_trmv { \ enum { \ @@ -203,10 +210,10 @@ struct triangular_matrix_vector_product_trmv(size); \ n = convert_index(cols-size); \ } \ - BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ + BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \ } \ } \ }; -EIGEN_BLAS_TRMV_RM(double, double, d, d) -EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z) -EIGEN_BLAS_TRMV_RM(float, float, f, s) -EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRMV_RM(double, double, d, d,) +EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z,) +EIGEN_BLAS_TRMV_RM(float, float, f, s,) +EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8, cf, c,) +#else +EIGEN_BLAS_TRMV_RM(double, double, d, d,_) +EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z,_) +EIGEN_BLAS_TRMV_RM(float, float, f, s,_) +EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c,_) +#endif } // end namespase internal diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index 223c38b86..8ff2e9d9d 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -76,7 +76,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; gebp_kernel gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; // the 
goal here is to subdivise the Rhs panels such that we keep some cache @@ -229,7 +229,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix gebp_kernel; gemm_pack_rhs pack_rhs; gemm_pack_rhs pack_rhs_panel; - gemm_pack_lhs pack_lhs_panel; + gemm_pack_lhs pack_lhs_panel; for(Index k2=IsLower ? size : 0; IsLower ? k2>0 : k2 \ struct triangular_solve_matrix \ { \ @@ -80,18 +80,24 @@ struct triangular_solve_matrix \ struct triangular_solve_matrix \ { \ @@ -133,16 +139,22 @@ struct triangular_solve_matrix0) rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map >(rhs+s,k))).sum(); - if(!(Mode & UnitDiag)) + if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0))) rhs[i] /= cjLhs(i,i); } } @@ -114,20 +114,23 @@ struct triangular_solve_vector0) - Map >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r); + Index r = actualPanelWidth - k - 1; // remaining size + Index s = IsLower ? i+1 : i-r; + if (r>0) + Map >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r); + } } Index r = IsLower ? size - endBlock : startBlock; // remaining size if (r > 0) { // let's directly call the low level product function because: // 1 - it is faster to compile - // 2 - it is slighlty faster at runtime + // 2 - it is slightly faster at runtime general_matrix_vector_product::run( r, actualPanelWidth, LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride), diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index b1791fb3a..a32630ed7 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -24,7 +24,7 @@ struct gebp_kernel; template struct gemm_pack_rhs; -template +template struct gemm_pack_lhs; template< @@ -156,11 +156,9 @@ class BlasVectorMapper { }; template -class BlasLinearMapper { - public: - typedef typename packet_traits::type Packet; - typedef typename packet_traits::half HalfPacket; - +class BlasLinearMapper +{ +public: EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { @@ -171,29 +169,25 @@ class BlasLinearMapper { return m_data[i]; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { - return ploadt(m_data + i); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const { + return ploadt(m_data + i); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { - return ploadt(m_data + i); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const { + pstoret(m_data + i, p); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const { - pstoret(m_data + i, p); - } - - protected: +protected: Scalar *m_data; }; // Lightweight helper class to access matrix coefficients. template -class blas_data_mapper { - public: - typedef typename packet_traits::type Packet; - typedef typename packet_traits::half HalfPacket; - +class blas_data_mapper +{ +public: typedef BlasLinearMapper LinearMapper; typedef BlasVectorMapper VectorMapper; @@ -218,8 +212,9 @@ class blas_data_mapper { return m_data[StorageOrder==RowMajor ? 
j + i*m_stride : i + j*m_stride]; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { - return ploadt(&operator()(i, j)); + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const { + return ploadt(&operator()(i, j)); } template @@ -227,10 +222,6 @@ class blas_data_mapper { return ploadt(&operator()(i, j)); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { - return ploadt(&operator()(i, j)); - } - template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { pscatter(&operator()(i, j), p, m_stride); @@ -251,7 +242,7 @@ class blas_data_mapper { return internal::first_default_aligned(m_data, size); } - protected: +protected: Scalar* EIGEN_RESTRICT m_data; const Index m_stride; }; @@ -289,8 +280,8 @@ template struct blas_traits ExtractType, typename _ExtractType::PlainObject >::type DirectLinearAccessType; - static inline ExtractType extract(const XprType& x) { return x; } - static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); } + static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; } + static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); } }; // pop conjugate @@ -318,8 +309,8 @@ struct blas_traits, const CwiseNullaryOp typedef blas_traits Base; typedef CwiseBinaryOp, const CwiseNullaryOp,Plain>, NestedXpr> XprType; typedef typename Base::ExtractType ExtractType; - static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); } - static inline Scalar extractScalarFactor(const XprType& x) + static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); } + static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x) { return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); } }; template diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h new file mode 100644 index 000000000..e75c7d89e --- /dev/null +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -0,0 +1,436 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2018 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CONFIGURE_VECTORIZATION_H +#define EIGEN_CONFIGURE_VECTORIZATION_H + +// FIXME: not sure why this is needed, perhaps it is not needed anymore. +#ifdef __NVCC__ + #ifndef EIGEN_DONT_VECTORIZE + #define EIGEN_DONT_VECTORIZE + #endif +#endif + +//------------------------------------------------------------------------------------------ +// Static and dynamic alignment control +// +// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES +// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively. +// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not, +// a default value is automatically computed based on architecture, compiler, and OS. +// +// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX} +// to be used to declare statically aligned buffers. 
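// A short usage sketch of those shortcuts (illustrative only; the macros are
// defined further down in this file):
//
//   EIGEN_ALIGN16 float scratch16[4];                 // 16-byte-aligned buffer
//   EIGEN_ALIGN_TO_BOUNDARY(32) double scratch32[4];  // explicit 32-byte boundary
//   EIGEN_ALIGN_MAX float scratch_max[8];             // maximal alignment Eigen may use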
+//------------------------------------------------------------------------------------------ + + +/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements. + * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled, + * so that vectorization doesn't affect binary compatibility. + * + * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link + * vectorized and non-vectorized code. + */ +#if (defined EIGEN_CUDACC) + #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) + #define EIGEN_ALIGNOF(x) __alignof(x) +#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM + #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) + #define EIGEN_ALIGNOF(x) __alignof(x) +#elif EIGEN_COMP_MSVC + #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) + #define EIGEN_ALIGNOF(x) __alignof(x) +#elif EIGEN_COMP_SUNCC + // FIXME not sure about this one: + #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) + #define EIGEN_ALIGNOF(x) __alignof(x) +#else + #error Please tell me what is the equivalent of __attribute__((aligned(n))) and __alignof(x) for your compiler +#endif + +// If the user explicitly disable vectorization, then we also disable alignment +#if defined(EIGEN_DONT_VECTORIZE) + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0 +#elif defined(__AVX512F__) + // 64 bytes static alignment is preferred only if really required + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64 +#elif defined(__AVX__) + // 32 bytes static alignment is preferred only if really required + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32 +#else + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 +#endif + + +// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense +#define EIGEN_MIN_ALIGN_BYTES 16 + +// Defined the boundary (in bytes) on which the data needs to be aligned. Note +// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be +// aligned at all regardless of the value of this #define. + +#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)) && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0 +#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY. +#endif + +// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated +// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0 +#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN) + #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES + #undef EIGEN_MAX_STATIC_ALIGN_BYTES + #endif + #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 +#endif + +#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES + + // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES + + // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable + // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always + // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in + // certain common platform (compiler+architecture combinations) to avoid these problems. + // Only static alignment is really problematic (relies on nonstandard compiler extensions), + // try to keep heap alignment even when we have to disable static alignment. 
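// Rather than relying on this guess, a user can pin the value explicitly
// before including any Eigen header, e.g. to disable static alignment while
// keeping heap alignment (sketch of client code, not part of this file):
//
//   #define EIGEN_MAX_STATIC_ALIGN_BYTES 0  // synonym of EIGEN_DONT_ALIGN_STATICALLY
//   #include <Eigen/Core>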
+ #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS) + #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 + #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6) + // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support. + // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use. + // 4.8 and newer seem definitely unaffected. + #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 + #else + #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0 + #endif + + // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX + #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \ + && !EIGEN_GCC3_OR_OLDER \ + && !EIGEN_COMP_SUNCC \ + && !EIGEN_OS_QNX + #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1 + #else + #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 + #endif + + #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT + #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES + #else + #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 + #endif + +#endif + +// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_ALIGN_BYTES +#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES0 is the true test whether we want to align arrays on the stack or not. +// It takes into account both the user choice to explicitly enable/disable alignment (by setting EIGEN_MAX_STATIC_ALIGN_BYTES) +// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). +// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used. + + +// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY +#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8) +#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16) +#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32) +#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64) +#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 +#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES) +#else +#define EIGEN_ALIGN_MAX +#endif + + +// Dynamic alignment control + +#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0 +#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN. +#endif + +#ifdef EIGEN_DONT_ALIGN + #ifdef EIGEN_MAX_ALIGN_BYTES + #undef EIGEN_MAX_ALIGN_BYTES + #endif + #define EIGEN_MAX_ALIGN_BYTES 0 +#elif !defined(EIGEN_MAX_ALIGN_BYTES) + #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES +#endif + +#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES +#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES +#else +#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES +#endif + + +#ifndef EIGEN_UNALIGNED_VECTORIZE +#define EIGEN_UNALIGNED_VECTORIZE 1 +#endif + +//---------------------------------------------------------------------- + + + +// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into +// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks +#if EIGEN_MAX_ALIGN_BYTES==0 + #ifndef EIGEN_DONT_VECTORIZE + #define EIGEN_DONT_VECTORIZE + #endif +#endif + + +// The following (except #include and _M_IX86_FP ??) can likely be +// removed as gcc 4.1 and msvc 2008 are not supported anyways. 
+// The following (except #include <malloc.h> and _M_IX86_FP ??) can likely be
+// removed as gcc 4.1 and msvc 2008 are not supported anyways.
+#if EIGEN_COMP_MSVC
+  #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
+  #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
+    // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
+    #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
+      #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
+    #endif
+  #endif
+#else
+  #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
+    #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
+  #endif
+#endif
+
+
+#ifndef EIGEN_DONT_VECTORIZE
+
+  #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
+
+    // Defines symbols for compile-time detection of which instructions are used.
+    // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_SSE
+    #define EIGEN_VECTORIZE_SSE2
+
+    // Detect sse3/ssse3/sse4:
+    // gcc and icc define __SSE3__, ...
+    // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
+    // want to force the use of those instructions with msvc.
+    #ifdef __SSE3__
+      #define EIGEN_VECTORIZE_SSE3
+    #endif
+    #ifdef __SSSE3__
+      #define EIGEN_VECTORIZE_SSSE3
+    #endif
+    #ifdef __SSE4_1__
+      #define EIGEN_VECTORIZE_SSE4_1
+    #endif
+    #ifdef __SSE4_2__
+      #define EIGEN_VECTORIZE_SSE4_2
+    #endif
+    #ifdef __AVX__
+      #define EIGEN_VECTORIZE_AVX
+      #define EIGEN_VECTORIZE_SSE3
+      #define EIGEN_VECTORIZE_SSSE3
+      #define EIGEN_VECTORIZE_SSE4_1
+      #define EIGEN_VECTORIZE_SSE4_2
+    #endif
+    #ifdef __AVX2__
+      #define EIGEN_VECTORIZE_AVX2
+      #define EIGEN_VECTORIZE_AVX
+      #define EIGEN_VECTORIZE_SSE3
+      #define EIGEN_VECTORIZE_SSSE3
+      #define EIGEN_VECTORIZE_SSE4_1
+      #define EIGEN_VECTORIZE_SSE4_2
+    #endif
+    #ifdef __FMA__
+      #define EIGEN_VECTORIZE_FMA
+    #endif
+    #if defined(__AVX512F__)
+      #define EIGEN_VECTORIZE_AVX512
+      #define EIGEN_VECTORIZE_AVX2
+      #define EIGEN_VECTORIZE_AVX
+      #define EIGEN_VECTORIZE_FMA
+      #define EIGEN_VECTORIZE_SSE3
+      #define EIGEN_VECTORIZE_SSSE3
+      #define EIGEN_VECTORIZE_SSE4_1
+      #define EIGEN_VECTORIZE_SSE4_2
+      #ifdef __AVX512DQ__
+        #define EIGEN_VECTORIZE_AVX512DQ
+      #endif
+      #ifdef __AVX512ER__
+        #define EIGEN_VECTORIZE_AVX512ER
+      #endif
+    #endif
+
+    // include files
+
+    // This extern "C" works around a MINGW-w64 compilation issue
+    // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
+    // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
+    // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
+    // with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
+    // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
+    // notice that since these are C headers, the extern "C" is theoretically needed anyways.
+    extern "C" {
+      // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
+      // Doing so triggers some issues with ICC.
+      // However, old gcc versions seem to not have this file, thus:
+      #if EIGEN_COMP_ICC >= 1110
+        #include <immintrin.h>
+      #else
+        #include <mmintrin.h>
+        #include <emmintrin.h>
+        #include <xmmintrin.h>
+        #ifdef EIGEN_VECTORIZE_SSE3
+          #include <pmmintrin.h>
+        #endif
+        #ifdef EIGEN_VECTORIZE_SSSE3
+          #include <tmmintrin.h>
+        #endif
+        #ifdef EIGEN_VECTORIZE_SSE4_1
+          #include <smmintrin.h>
+        #endif
+        #ifdef EIGEN_VECTORIZE_SSE4_2
+          #include <nmmintrin.h>
+        #endif
+        #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
+          #include <immintrin.h>
+        #endif
+      #endif
+    } // end extern "C"
+
+  #elif defined __VSX__
+
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_VSX
+    #include <altivec.h>
+    // We need to #undef all these ugly tokens defined in <altivec.h>
+    // => use __vector instead of vector
+    #undef bool
+    #undef vector
+    #undef pixel
+
+  #elif defined __ALTIVEC__
+
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_ALTIVEC
+    #include <altivec.h>
+    // We need to #undef all these ugly tokens defined in <altivec.h>
+    // => use __vector instead of vector
+    #undef bool
+    #undef vector
+    #undef pixel
+
+  #elif (defined __ARM_NEON) || (defined __ARM_NEON__)
+
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_NEON
+    #include <arm_neon.h>
+
+  #elif (defined __s390x__ && defined __VEC__)
+
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_ZVECTOR
+    #include <vecintrin.h>
+
+  #elif defined __mips_msa
+
+    // Limit MSA optimizations to little-endian CPUs for now.
+    // TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs?
+    #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+      #if defined(__LP64__)
+        #define EIGEN_MIPS_64
+      #else
+        #define EIGEN_MIPS_32
+      #endif
+      #define EIGEN_VECTORIZE
+      #define EIGEN_VECTORIZE_MSA
+      #include <msa.h>
+    #endif
+
+  #endif
+#endif
+
+#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG)
+  // We can use the optimized fp16 to float and float to fp16 conversion routines
+  #define EIGEN_HAS_FP16_C
+#endif
+
+#if defined EIGEN_CUDACC
+  #define EIGEN_VECTORIZE_GPU
+  #include <vector_types.h>
+  #if EIGEN_CUDACC_VER >= 70500
+    #define EIGEN_HAS_CUDA_FP16
+  #endif
+#endif
+
+#if defined(EIGEN_HAS_CUDA_FP16)
+  #include <host_defines.h>
+  #include <cuda_fp16.h>
+#endif
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+  #define EIGEN_VECTORIZE_GPU
+  #include <hip/hip_vector_types.h>
+
+  #define EIGEN_HAS_HIP_FP16
+  #include <hip/hip_fp16.h>
+
+  #define HIP_PATCH_WITH_NEW_FP16 18215
+  #if (HIP_VERSION_PATCH < HIP_PATCH_WITH_NEW_FP16)
+    #define EIGEN_HAS_OLD_HIP_FP16
+    // Old HIP implementation does not have an explicit typedef for "half2"
+    typedef __half2 half2;
+  #endif
+
+#endif
+
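// Illustration (not part of the patch): inspecting which of the EIGEN_VECTORIZE_*
// paths enabled above was compiled in. SimdInstructionSetsInUse() is defined just
// below in this same header.
#include <Eigen/Core>
#include <iostream>

int main() {
  std::cout << "SIMD instruction sets in use: "
            << Eigen::SimdInstructionSetsInUse() << "\n";
#ifdef EIGEN_VECTORIZE_AVX
  std::cout << "AVX enabled: float packets hold 8 lanes\n";
#endif
  return 0;
}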
+/** \brief Namespace containing all symbols from the %Eigen library. */
+namespace Eigen {
+
+inline static const char *SimdInstructionSetsInUse(void) {
+#if defined(EIGEN_VECTORIZE_AVX512)
+  return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_AVX)
+  return "AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_2)
+  return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_1)
+  return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
+#elif defined(EIGEN_VECTORIZE_SSSE3)
+  return "SSE, SSE2, SSE3, SSSE3";
+#elif defined(EIGEN_VECTORIZE_SSE3)
+  return "SSE, SSE2, SSE3";
+#elif defined(EIGEN_VECTORIZE_SSE2)
+  return "SSE, SSE2";
+#elif defined(EIGEN_VECTORIZE_ALTIVEC)
+  return "AltiVec";
+#elif defined(EIGEN_VECTORIZE_VSX)
+  return "VSX";
+#elif defined(EIGEN_VECTORIZE_NEON)
+  return "ARM NEON";
+#elif defined(EIGEN_VECTORIZE_ZVECTOR)
+  return "S390X ZVECTOR";
+#elif defined(EIGEN_VECTORIZE_MSA)
+  return "MIPS MSA";
+#else
+  return "None";
+#endif
+}
+
+} // end namespace Eigen
+
+
+#endif // EIGEN_CONFIGURE_VECTORIZATION_H
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index 612dbf5e8..a5f63a9b5 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -470,6 +470,7 @@ namespace Architecture
   AltiVec = 0x2,
   VSX = 0x3,
   NEON = 0x4,
+  MSA = 0x5,
 #if defined EIGEN_VECTORIZE_SSE
   Target = SSE
 #elif defined EIGEN_VECTORIZE_ALTIVEC
@@ -478,6 +479,8 @@ namespace Architecture
   Target = VSX
 #elif defined EIGEN_VECTORIZE_NEON
   Target = NEON
+#elif defined EIGEN_VECTORIZE_MSA
+  Target = MSA
 #else
   Target = Generic
 #endif
diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index 4431f2fc4..6e93bbc0f 100755
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -45,16 +45,25 @@
     #pragma clang diagnostic ignored "-Wabsolute-value"
   #endif
 
-#elif defined __GNUC__ && __GNUC__>=6
+#elif defined __GNUC__
 
   #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
     #pragma GCC diagnostic push
   #endif
-  #pragma GCC diagnostic ignored "-Wignored-attributes"
+  // g++ warns about local variables shadowing member functions, which is too strict
+  #pragma GCC diagnostic ignored "-Wshadow"
+  #if __GNUC__ == 4 && __GNUC_MINOR__ < 8
+    // Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
+    #pragma GCC diagnostic ignored "-Wtype-limits"
+  #endif
+  #if __GNUC__>=6
+    #pragma GCC diagnostic ignored "-Wignored-attributes"
+  #endif
 
 #endif
 
 #if defined __NVCC__
+  #pragma diag_suppress boolean_controlling_expr_is_constant
   // Disable the "statement is unreachable" message
   #pragma diag_suppress code_is_unreachable
   // Disable the "dynamic initialization in unreachable code" message
@@ -72,6 +81,15 @@
   #pragma diag_suppress 2671
   #pragma diag_suppress 2735
   #pragma diag_suppress 2737
+  #pragma diag_suppress 2739
 #endif
 
+#else
+// warnings already disabled:
+# ifndef EIGEN_WARNINGS_DISABLED_2
+#   define EIGEN_WARNINGS_DISABLED_2
+# elif defined(EIGEN_INTERNAL_DEBUGGING)
+#   error "Do not include \"DisableStupidWarnings.h\" recursively more than twice!"
+# endif
+
 #endif // not EIGEN_WARNINGS_DISABLED
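// Illustration (not part of the patch): what the EIGEN_WARNINGS_DISABLED_2 guard
// added above buys us. One level of recursive inclusion is now tolerated:
//
//   #include "DisableStupidWarnings.h"   // first include: defines EIGEN_WARNINGS_DISABLED
//   #include "DisableStupidWarnings.h"   // nested include: defines EIGEN_WARNINGS_DISABLED_2
//   #include "ReenableStupidWarnings.h"  // only clears EIGEN_WARNINGS_DISABLED_2
//   #include "ReenableStupidWarnings.h"  // actually pops the diagnostic state
//
// A deeper recursion is treated as a bug; with EIGEN_INTERNAL_DEBUGGING defined it
// triggers the #error above (see the matching change to ReenableStupidWarnings.h below).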
diff --git a/Eigen/src/Core/util/IndexedViewHelper.h b/Eigen/src/Core/util/IndexedViewHelper.h
index ab01c857f..40e16fdb4 100644
--- a/Eigen/src/Core/util/IndexedViewHelper.h
+++ b/Eigen/src/Core/util/IndexedViewHelper.h
@@ -13,13 +13,6 @@
 
 namespace Eigen {
 
-/** \namespace Eigen::placeholders
-  * \ingroup Core_Module
-  *
-  * Namespace containing symbolic placeholder and identifiers
-  */
-namespace placeholders {
-
 namespace internal {
 struct symbolic_last_tag {};
 }
@@ -35,36 +28,35 @@ struct symbolic_last_tag {};
   * A typical usage example would be:
   * \code
   * using namespace Eigen;
-  * using Eigen::placeholders::last;
+  * using Eigen::last;
   * VectorXd v(n);
   * v(seq(2,last-2)).setOnes();
   * \endcode
   *
   * \sa end
   */
-static const Symbolic::SymbolExpr<internal::symbolic_last_tag> last;
+static const symbolic::SymbolExpr<internal::symbolic_last_tag> last; // PLEASE use Eigen::last instead of Eigen::placeholders::last
 
-/** \var end
+/** \var lastp1
   * \ingroup Core_Module
   *
-  * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically reference the last+1 element/row/columns
-  * of the underlying vector or matrix once passed to DenseBase::operator()(const RowIndices&, const ColIndices&).
+  * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically
+  * reference the last+1 element/row/columns of the underlying vector or matrix once
+  * passed to DenseBase::operator()(const RowIndices&, const ColIndices&).
   *
   * This symbolic placeholder support standard arithmetic operation.
-  * It is essentially an alias to last+1
+  * It is essentially an alias to last+fix<1>.
   *
   * \sa last
   */
 #ifdef EIGEN_PARSED_BY_DOXYGEN
-static const auto end = last+1;
+static const auto lastp1 = last+fix<1>;
 #else
 // Using a FixedExpr<1> expression is important here to make sure the compiler
 // can fully optimize the computation starting indices with zero overhead.
-static const Symbolic::AddExpr<Symbolic::SymbolExpr<internal::symbolic_last_tag>,Symbolic::ValueExpr<Eigen::internal::FixedInt<1> > > end(last+fix<1>());
+static const symbolic::AddExpr<symbolic::SymbolExpr<internal::symbolic_last_tag>,symbolic::ValueExpr<Eigen::internal::FixedInt<1> > > lastp1(last+fix<1>());
 #endif
 
-} // end namespace placeholders
-
 namespace internal {
 
 // Replace symbolic last/end "keywords" by their true runtime value
@@ -74,9 +66,9 @@ template<int N>
 FixedInt<N> eval_expr_given_size(FixedInt<N> x, Index /*size*/)   { return x; }
 
 template<typename Derived>
-Index eval_expr_given_size(const Symbolic::BaseExpr<Derived> &x, Index size)
+Index eval_expr_given_size(const symbolic::BaseExpr<Derived> &x, Index size)
 {
-  return x.derived().eval(placeholders::last=size-1);
+  return x.derived().eval(last=size-1);
 }
 
 // Extract increment/step at compile time
@@ -117,7 +109,7 @@ template<> struct get_compile_time_incr<SingleRange> {
   enum { value = 1 }; // 1 or 0 ??
 };
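// Illustration (not part of the patch): the renamed placeholders in action,
// assuming the Eigen 3.4 development headers this diff targets.
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::VectorXd v(6);
  v << 0, 1, 2, 3, 4, 5;
  // Eigen::last resolves to index 5 here; Eigen::lastp1 (the replacement for
  // the deprecated Eigen::placeholders::end) resolves to 6.
  Eigen::VectorXd middle = v(Eigen::seq(1, Eigen::last - 1)); // coefficients 1..4
  std::cout << middle.transpose() << std::endl;               // prints: 1 2 3 4
  std::cout << v(Eigen::all).sum() << std::endl;              // prints: 15
  return 0;
}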
 
-// Turn a single index into something that looks like an array (i.e., that exposes a .size(), and operatro[](int) methods)
+// Turn a single index into something that looks like an array (i.e., that exposes a .size(), and operator[](int) methods)
 template<typename T, int XprSize>
 struct IndexedViewCompatibleType<T,XprSize,typename internal::enable_if<internal::is_integral<T>::value>::type> {
   // Here we could simply use Array, but maybe it's less work for the compiler to use
@@ -127,13 +119,13 @@ struct IndexedViewCompatibleType<T,XprSize,typename internal::enable_if<internal::is_integral<T>::value>::type> {
 
 template<typename T>
-struct IndexedViewCompatibleType<T,1,typename internal::enable_if<Symbolic::is_symbolic<T>::value>::type> {
+struct IndexedViewCompatibleType<T,1,typename internal::enable_if<symbolic::is_symbolic<T>::value>::type> {
   typedef SingleRange type;
 };
 
 template<typename T>
-typename enable_if<Symbolic::is_symbolic<T>::value,SingleRange>::type
+typename enable_if<symbolic::is_symbolic<T>::value,SingleRange>::type
 makeIndexedViewCompatible(const T& id, Index size, SpecializedType) {
   return eval_expr_given_size(id,size);
 }
@@ -172,14 +164,21 @@ template<int Size> struct get_compile_time_incr<AllRange<Size> > {
 
 } // end namespace internal
 
-namespace placeholders {
-
 /** \var all
   * \ingroup Core_Module
   * Can be used as a parameter to DenseBase::operator()(const RowIndices&, const ColIndices&) to index all rows or columns
   */
-static const Eigen::internal::all_t all;
+static const Eigen::internal::all_t all;    // PLEASE use Eigen::all instead of Eigen::placeholders::all
+
+namespace placeholders {
+  typedef symbolic::SymbolExpr<internal::symbolic_last_tag> last_t;
+  typedef symbolic::AddExpr<symbolic::SymbolExpr<internal::symbolic_last_tag>,symbolic::ValueExpr<Eigen::internal::FixedInt<1> > > end_t;
+  typedef Eigen::internal::all_t all_t;
+
+  EIGEN_DEPRECATED static const all_t  all  = Eigen::all;    // PLEASE use Eigen::all    instead of Eigen::placeholders::all
+  EIGEN_DEPRECATED static const last_t last = Eigen::last;   // PLEASE use Eigen::last   instead of Eigen::placeholders::last
+  EIGEN_DEPRECATED static const end_t  end  = Eigen::lastp1; // PLEASE use Eigen::lastp1 instead of Eigen::placeholders::end
 }
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h
index c7d3b1c06..bf99cd8ab 100644
--- a/Eigen/src/Core/util/IntegralConstant.h
+++ b/Eigen/src/Core/util/IntegralConstant.h
@@ -151,9 +151,9 @@ struct get_fixed_value<variable_if_dynamic<T,N>,Default> {
   static const int value = N;
 };
 
-template<typename T> Index get_runtime_value(const T &x) { return x; }
+template<typename T> EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; }
 #if !EIGEN_HAS_CXX14
-template<int N> Index get_runtime_value(FixedInt<N> (*)()) { return N; }
+template<int N> EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt<N> (*)()) { return N; }
 #endif
 
 // Cleanup integer/FixedInt/VariableAndFixedInt/etc types:
diff --git a/Eigen/src/Core/util/MKL_support.h b/Eigen/src/Core/util/MKL_support.h
index 26b59669e..17963fad4 100755
--- a/Eigen/src/Core/util/MKL_support.h
+++ b/Eigen/src/Core/util/MKL_support.h
@@ -49,12 +49,17 @@
 #define EIGEN_USE_LAPACKE
 #endif
 
-#if defined(EIGEN_USE_MKL_VML)
+#if defined(EIGEN_USE_MKL_VML) && !defined(EIGEN_USE_MKL)
 #define EIGEN_USE_MKL
 #endif
 
+
 #if defined EIGEN_USE_MKL
-#   include <mkl.h>
+# if (!defined MKL_DIRECT_CALL) && (!defined EIGEN_MKL_NO_DIRECT_CALL)
+#   define MKL_DIRECT_CALL
+#   define MKL_DIRECT_CALL_JUST_SET
+# endif
+#   include <mkl.h>
 /*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/
 #   ifndef INTEL_MKL_VERSION
 #      undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */
@@ -68,6 +73,9 @@
 #      undef EIGEN_USE_MKL_VML
 #      undef EIGEN_USE_LAPACKE_STRICT
 #      undef EIGEN_USE_LAPACKE
+#      ifdef MKL_DIRECT_CALL_JUST_SET
+#        undef MKL_DIRECT_CALL
+#      endif
 #   endif
 #endif
 
@@ -108,6 +116,10 @@
 #endif
 #endif
 
+#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL)
+#include "../../misc/blas.h"
+#endif
+
 namespace Eigen {
 
 typedef std::complex<double> dcomplex;
@@ -121,8 +133,5 @@ typedef int BlasIndex;
 
 } // end namespace Eigen
 
-#if defined(EIGEN_USE_BLAS)
-#include "../../misc/blas.h"
-#endif
 
 #endif // EIGEN_MKL_SUPPORT_H
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 29c796647..3af6c4e37 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -11,6 +11,10 @@
 #ifndef EIGEN_MACROS_H
 #define EIGEN_MACROS_H
 
+//------------------------------------------------------------------------------------------
+// Eigen version and basic defaults
+//------------------------------------------------------------------------------------------
+
 #define EIGEN_WORLD_VERSION 3
 #define EIGEN_MAJOR_VERSION 3
 #define EIGEN_MINOR_VERSION 90
@@ -19,7 +23,40 @@
                                  (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
                                                  EIGEN_MINOR_VERSION>=z))))
 
+#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor
+#else
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor
+#endif
+
+#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t
+#endif
+
+// Upper bound on the C++ version to use.
+// Expected values are 03, 11, 14, 17, etc.
+// By default, let's use an arbitrarily large C++ version.
+#ifndef EIGEN_MAX_CPP_VER
+#define EIGEN_MAX_CPP_VER 99
+#endif
+
+/** Allows disabling some optimizations which might affect the accuracy of the result.
+  * Such optimizations are enabled by default; set EIGEN_FAST_MATH to 0 to disable them.
+  * They currently include:
+  *   - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization.
+  */
+#ifndef EIGEN_FAST_MATH
+#define EIGEN_FAST_MATH 1
+#endif
+
+#ifndef EIGEN_STACK_ALLOCATION_LIMIT
+// 131072 == 128 KB
+#define EIGEN_STACK_ALLOCATION_LIMIT 131072
+#endif
+
+//------------------------------------------------------------------------------------------
 // Compiler identification, EIGEN_COMP_*
+//------------------------------------------------------------------------------------------
 
 /// \internal EIGEN_COMP_GNUC set to 1 for all compilers compatible with GCC
 #ifdef __GNUC__
@@ -73,12 +110,17 @@
 
 // For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC:
 //  name        ver   MSC_VER
-//  2008         9      1500
-//  2010        10      1600
-//  2012        11      1700
-//  2013        12      1800
-//  2015        14      1900
-//  "15"        15      1900
+//  2008         9      1500
+//  2010        10      1600
+//  2012        11      1700
+//  2013        12      1800
+//  2015        14      1900
+//  "15"        15      1900
+//  2017-14.1   15.0    1910
+//  2017-14.11  15.3    1911
+//  2017-14.12  15.5    1912
+//  2017-14.13  15.6    1913
+//  2017-14.14  15.7    1914
 
 /// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not, e.g., ICC or clang-cl
 #if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG)
@@ -142,7 +184,11 @@
 #endif
 
+
+//------------------------------------------------------------------------------------------
 // Architecture identification, EIGEN_ARCH_*
+//------------------------------------------------------------------------------------------
+
 
 #if defined(__x86_64__) || defined(_M_X64) || defined(__amd64)
   #define EIGEN_ARCH_x86_64 1
@@ -212,7 +258,9 @@
 
 
+//------------------------------------------------------------------------------------------
 // Operating system identification, EIGEN_OS_*
+//------------------------------------------------------------------------------------------
 
 /// \internal EIGEN_OS_UNIX set to 1 if the OS is a unix variant
 #if defined(__unix__) || defined(__unix)
@@ -314,6 +362,108 @@
 #endif
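// Illustration (not part of the patch): compile-time gating on the version
// macros defined above.
#include <Eigen/Core>
#include <iostream>

int main() {
  std::cout << "Eigen " << EIGEN_WORLD_VERSION << "."
            << EIGEN_MAJOR_VERSION << "." << EIGEN_MINOR_VERSION << "\n";
#if EIGEN_VERSION_AT_LEAST(3,3,90)
  std::cout << "development-branch API (e.g. Eigen::lastp1) is available\n";
#endif
  return 0;
}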
+//------------------------------------------------------------------------------------------
+// Detect GPU compilers and architectures
+//------------------------------------------------------------------------------------------
+
+// NVCC is not supported as the target platform for HIPCC
+// Note that this also makes EIGEN_CUDACC and EIGEN_HIPCC mutually exclusive
+#if defined(__NVCC__) && defined(__HIPCC__)
+  #error "NVCC as the target platform for HIPCC is currently not supported."
+#endif
+
+#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
+  // Means the compiler is either nvcc or clang with CUDA enabled
+  #define EIGEN_CUDACC __CUDACC__
+#endif
+
+#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
+  // Means we are generating code for the device
+  #define EIGEN_CUDA_ARCH __CUDA_ARCH__
+#endif
+
+// Starting with CUDA 9 the composite __CUDACC_VER__ is not available.
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+  #define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
+#elif defined(__CUDACC_VER__)
+  #define EIGEN_CUDACC_VER __CUDACC_VER__
+#else
+  #define EIGEN_CUDACC_VER 0
+#endif
+
+#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
+  // Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
+  #define EIGEN_HIPCC __HIPCC__
+
+  // We need to include hip_runtime.h here because it pulls in
+  //   ++ hip_common.h which contains the define for  __HIP_DEVICE_COMPILE__
+  //   ++ host_defines.h which contains the defines for the __host__ and __device__ macros
+  #include <hip/hip_runtime.h>
+
+  #if defined(__HIP_DEVICE_COMPILE__)
+    // analogous to EIGEN_CUDA_ARCH, but for HIP
+    #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
+  #endif
+#endif
+
+// Unify CUDA/HIPCC
+
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
+//
+// If either EIGEN_CUDACC or EIGEN_HIPCC is defined, then define EIGEN_GPUCC
+//
+#define EIGEN_GPUCC
+//
+// EIGEN_HIPCC implies the HIP compiler and is used to tweak Eigen code for use in HIP kernels
+// EIGEN_CUDACC implies the CUDA compiler and is used to tweak Eigen code for use in CUDA kernels
+//
+// In most cases the same tweaks are required to the Eigen code to enable in both the HIP and CUDA kernels.
+// For those cases, the corresponding code should be guarded with
+//      #if defined(EIGEN_GPUCC)
+// instead of
+//      #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
+//
+// For cases where the tweak is specific to HIP, the code should be guarded with
+//      #if defined(EIGEN_HIPCC)
+//
+// For cases where the tweak is specific to CUDA, the code should be guarded with
+//      #if defined(EIGEN_CUDACC)
+//
+#endif
+
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// If either EIGEN_CUDA_ARCH or EIGEN_HIP_DEVICE_COMPILE is defined, then define EIGEN_GPU_COMPILE_PHASE
+//
+#define EIGEN_GPU_COMPILE_PHASE
+//
+// GPU compilers (HIPCC, NVCC) typically do two passes over the source code,
+//   + one to compile the source for the "host" (ie CPU)
+//   + another to compile the source for the "device" (ie. GPU)
+//
+// Code that needs to be enabled only during either the "host" or the "device" compilation phase
+// needs to be guarded with a macro that indicates the current compilation phase
+//
+// EIGEN_HIP_DEVICE_COMPILE implies the device compilation phase in HIP
+// EIGEN_CUDA_ARCH implies the device compilation phase in CUDA
+//
+// In most cases, the "host" / "device" specific code is the same for both HIP and CUDA
+// For those cases, the code should be guarded with
+//       #if defined(EIGEN_GPU_COMPILE_PHASE)
+// instead of
+//       #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// For cases where the tweak is specific to HIP, the code should be guarded with
+//      #if defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// For cases where the tweak is specific to CUDA, the code should be guarded with
+//      #if defined(EIGEN_CUDA_ARCH)
+//
+#endif
+
+//------------------------------------------------------------------------------------------
+// Detect Compiler/Architecture/OS specific features
+//------------------------------------------------------------------------------------------
 
 #if EIGEN_GNUC_AT_MOST(4,3) && !EIGEN_COMP_CLANG
   // see bug 89
@@ -322,20 +472,6 @@
 #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1
 #endif
 
-// This macro can be used to prevent from macro expansion, e.g.:
-//   std::max EIGEN_NOT_A_MACRO(a,b)
-#define EIGEN_NOT_A_MACRO
-
-#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
-#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor
-#else
-#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor
-#endif
-
-#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t
-#endif
-
 // Cross compiler wrapper around LLVM's __has_builtin
 #ifdef __has_builtin
 #  define EIGEN_HAS_BUILTIN(x) __has_builtin(x)
@@ -357,13 +493,6 @@
 #define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 0
 #endif
 
-// Upper bound on the C++ version to use.
-// Expected values are 03, 11, 14, 17, etc.
-// By default, let's use an arbitrarily large C++ version.
-#ifndef EIGEN_MAX_CPP_VER
-#define EIGEN_MAX_CPP_VER 99
-#endif
-
 #if EIGEN_MAX_CPP_VER>=11 && (defined(__cplusplus) && (__cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900)
 #define EIGEN_HAS_CXX11 1
 #else
@@ -389,6 +518,8 @@
 #endif
 
 // Does the compiler support C99?
+// Need to include <cmath> to make sure _GLIBCXX_USE_C99 gets defined
+#include <cmath>
 #ifndef EIGEN_HAS_C99_MATH
 #if EIGEN_MAX_CPP_VER>=11 && \
     ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \
@@ -402,18 +533,35 @@
 #endif
 
 // Does the compiler support result_of?
+// It's likely that MSVC 2013 supports result_of but I could not find a good source for that,
+// so let's be conservative.
 #ifndef EIGEN_HAS_STD_RESULT_OF
-#if EIGEN_MAX_CPP_VER>=11 && ((__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L)))
+#if EIGEN_MAX_CPP_VER>=11 && \
+    (__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900)
 #define EIGEN_HAS_STD_RESULT_OF 1
 #else
 #define EIGEN_HAS_STD_RESULT_OF 0
 #endif
 #endif
 
+// Does the compiler support type_traits?
+// - full support of type traits was added only to GCC 5.1.0.
+// - 20150626 corresponds to the last release of 4.x libstdc++ +#ifndef EIGEN_HAS_TYPE_TRAITS +#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || EIGEN_COMP_MSVC >= 1700) \ + && ((!EIGEN_COMP_GNUC_STRICT) || EIGEN_GNUC_AT_LEAST(5, 1)) \ + && ((!defined(__GLIBCXX__)) || __GLIBCXX__ > 20150626) +#define EIGEN_HAS_TYPE_TRAITS 1 +#define EIGEN_INCLUDE_TYPE_TRAITS +#else +#define EIGEN_HAS_TYPE_TRAITS 0 +#endif +#endif + // Does the compiler support variadic templates? #ifndef EIGEN_HAS_VARIADIC_TEMPLATES #if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \ - && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) ) + && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) ) // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices: // this prevents nvcc from crashing when compiling Eigen on Tegra X1 #define EIGEN_HAS_VARIADIC_TEMPLATES 1 @@ -426,23 +574,22 @@ // Does the compiler fully support const expressions? (as in c++14) #ifndef EIGEN_HAS_CONSTEXPR + #if defined(EIGEN_CUDACC) + // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above + #if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500)) + #define EIGEN_HAS_CONSTEXPR 1 + #endif + #elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \ + (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L)) || \ + (EIGEN_COMP_CLANG >= 306 && (__cplusplus > 199711L))) + #define EIGEN_HAS_CONSTEXPR 1 + #endif -#if defined(__CUDACC__) -// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above -#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500)) - #define EIGEN_HAS_CONSTEXPR 1 -#endif -#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \ - (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L)) || \ - (EIGEN_COMP_CLANG >= 306 && (__cplusplus > 199711L))) -#define EIGEN_HAS_CONSTEXPR 1 -#endif + #ifndef EIGEN_HAS_CONSTEXPR + #define EIGEN_HAS_CONSTEXPR 0 + #endif -#ifndef EIGEN_HAS_CONSTEXPR -#define EIGEN_HAS_CONSTEXPR 0 -#endif - -#endif +#endif // EIGEN_HAS_CONSTEXPR // Does the compiler support C++11 math? // Let's be conservative and enable the default C++11 implementation only if we are sure it exists @@ -480,15 +627,42 @@ #endif #endif -/** Allows to disable some optimizations which might affect the accuracy of the result. - * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. - * They currently include: - * - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization. 
-  */
-#ifndef EIGEN_FAST_MATH
-#define EIGEN_FAST_MATH 1
+#ifndef EIGEN_HAS_CXX11_ATOMIC
+  #if EIGEN_MAX_CPP_VER>=11 && \
+      (__has_feature(cxx_atomic) \
+    || (__cplusplus > 201103L) \
+    || ((__cplusplus >= 201103L) && (EIGEN_COMP_MSVC==0 || EIGEN_COMP_MSVC >= 1700)))
+    #define EIGEN_HAS_CXX11_ATOMIC 1
+  #else
+    #define EIGEN_HAS_CXX11_ATOMIC 0
+  #endif
 #endif
 
+#if defined(EIGEN_CUDACC) && EIGEN_HAS_CONSTEXPR
+  // While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules
+  #if defined(__NVCC__)
+    // nvcc considers constexpr functions as __host__ __device__ with the option --expt-relaxed-constexpr
+    #ifdef __CUDACC_RELAXED_CONSTEXPR__
+      #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+    #endif
+  // See bug 1580: clang/CUDA fails to make the following calls
+  // to constexpr bool std::equal_to<float>::operator() even when
+  // EIGEN_CONSTEXPR_ARE_DEVICE_FUNC is defined in c++14 only.
+  // #elif defined(__clang__) && defined(__CUDA__) && EIGEN_HAS_CONSTEXPR == 1
+  //   // clang++ always considers constexpr functions as implicitly __host__ __device__
+  //   #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+  #endif
+#endif
+
+
+//------------------------------------------------------------------------------------------
+// Preprocessor programming helpers
+//------------------------------------------------------------------------------------------
+
+// This macro can be used to prevent from macro expansion, e.g.:
+//   std::max EIGEN_NOT_A_MACRO(a,b)
+#define EIGEN_NOT_A_MACRO
+
 #define EIGEN_DEBUG_VAR(x) std::cerr << #x << " = " << x << std::endl;
 
 // concatenate two tokens
@@ -504,11 +678,13 @@
 // EIGEN_STRONG_INLINE is a stronger version of the inline, using __forceinline on MSVC,
 // but it still doesn't use GCC's always_inline. This is useful in (common) situations where MSVC needs forceinline
 // but GCC is still doing fine with just inline.
+#ifndef EIGEN_STRONG_INLINE
 #if EIGEN_COMP_MSVC || EIGEN_COMP_ICC
 #define EIGEN_STRONG_INLINE __forceinline
 #else
 #define EIGEN_STRONG_INLINE inline
 #endif
+#endif
 
 // EIGEN_ALWAYS_INLINE is the strongest, it has the effect of making the function inline and adding every possible
 // attribute to maximize inlining. This should only be used when really necessary: in particular,
@@ -538,12 +714,42 @@
 #define EIGEN_PERMISSIVE_EXPR
 #endif
 
+// GPU stuff
+
+// Disable some features when compiling with GPU compilers (NVCC/clang-cuda/SYCL/HIPCC)
+#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__) || defined(EIGEN_HIPCC)
+  // Do not try asserts on device code
+  #ifndef EIGEN_NO_DEBUG
+  #define EIGEN_NO_DEBUG
+  #endif
+
+  #ifdef EIGEN_INTERNAL_DEBUGGING
+  #undef EIGEN_INTERNAL_DEBUGGING
+  #endif
+
+  #ifdef EIGEN_EXCEPTIONS
+  #undef EIGEN_EXCEPTIONS
+  #endif
+#endif
+
+// All functions callable from CUDA/HIP code must be qualified with __device__
+#ifdef EIGEN_GPUCC
+  #ifndef EIGEN_DONT_VECTORIZE
+  #define EIGEN_DONT_VECTORIZE
+  #endif
+
+  #define EIGEN_DEVICE_FUNC __host__ __device__
+#else
+  #define EIGEN_DEVICE_FUNC
+#endif
+
+
 // this macro allows to get rid of linking errors about multiply defined functions.
 //  - static is not very good because it prevents definitions from different object files to be merged.
 //           So static causes the resulting linked executable to be bloated with multiple copies of the same function.
 //  - inline is not perfect either as it unwantedly hints the compiler toward inlining the function.
-#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS inline +#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC +#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC inline #ifdef NDEBUG # ifndef EIGEN_NO_DEBUG @@ -649,169 +855,6 @@ namespace Eigen { # define EIGEN_CONST_CONDITIONAL(cond) cond #endif -//------------------------------------------------------------------------------------------ -// Static and dynamic alignment control -// -// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES -// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively. -// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not, -// a default value is automatically computed based on architecture, compiler, and OS. -// -// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX} -// to be used to declare statically aligned buffers. -//------------------------------------------------------------------------------------------ - - -/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements. - * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled, - * so that vectorization doesn't affect binary compatibility. - * - * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link - * vectorized and non-vectorized code. - */ -#if (defined __CUDACC__) - #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) -#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM - #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) -#elif EIGEN_COMP_MSVC - #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) -#elif EIGEN_COMP_SUNCC - // FIXME not sure about this one: - #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) -#else - #error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler -#endif - -// If the user explicitly disable vectorization, then we also disable alignment -#if defined(EIGEN_DONT_VECTORIZE) - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0 -#elif defined(EIGEN_VECTORIZE_AVX512) - // 64 bytes static alignmeent is preferred only if really required - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64 -#elif defined(__AVX__) - // 32 bytes static alignmeent is preferred only if really required - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32 -#else - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 -#endif - - -// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense -#define EIGEN_MIN_ALIGN_BYTES 16 - -// Defined the boundary (in bytes) on which the data needs to be aligned. Note -// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be -// aligned at all regardless of the value of this #define. - -#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)) && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0 -#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY. 
-#endif
-
-// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprectated
-// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0
-#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)
-  #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES
-    #undef EIGEN_MAX_STATIC_ALIGN_BYTES
-  #endif
-  #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
-#endif
-
-#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES
-
-  // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES
-
-  // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
-  // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
-  // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
-  // certain common platform (compiler+architecture combinations) to avoid these problems.
-  // Only static alignment is really problematic (relies on nonstandard compiler extensions),
-  // try to keep heap alignment even when we have to disable static alignment.
-  #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64)
-    #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
-  #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6)
-    // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support.
-    // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use.
-    // 4.8 and newer seem definitely unaffected.
-    #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
-  #else
-    #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
-  #endif
-
-  // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
-  #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
-   && !EIGEN_GCC3_OR_OLDER \
-   && !EIGEN_COMP_SUNCC \
-   && !EIGEN_OS_QNX
-    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
-  #else
-    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
-  #endif
-
-  #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT
-    #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
-  #else
-    #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
-  #endif
-
-#endif
-
-// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_STATIC_ALIGN_BYTES
-#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES<EIGEN_MAX_STATIC_ALIGN_BYTES
-#undef EIGEN_MAX_STATIC_ALIGN_BYTES
-#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
-#endif
-
-// At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not.
-// It takes into account both the user choice to explicitly enable/disable alignment (by settting EIGEN_MAX_STATIC_ALIGN_BYTES)
-// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT).
-// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used.
-
-
-// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY
-#define EIGEN_ALIGN8  EIGEN_ALIGN_TO_BOUNDARY(8)
-#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
-#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
-#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
-#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
-#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES)
-#else
-#define EIGEN_ALIGN_MAX
-#endif
-
-
-// Dynamic alignment control
-
-#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0
-#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN.
-#endif
-
-#ifdef EIGEN_DONT_ALIGN
-  #ifdef EIGEN_MAX_ALIGN_BYTES
-    #undef EIGEN_MAX_ALIGN_BYTES
-  #endif
-  #define EIGEN_MAX_ALIGN_BYTES 0
-#elif !defined(EIGEN_MAX_ALIGN_BYTES)
-  #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
-#endif
-
-#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES
-#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
-#else
-#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
-#endif
-
-
-#ifndef EIGEN_UNALIGNED_VECTORIZE
-#define EIGEN_UNALIGNED_VECTORIZE 1
-#endif
-
-//----------------------------------------------------------------------
-
-
 #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD
   #define EIGEN_RESTRICT
 #endif
@@ -819,10 +862,6 @@ namespace Eigen {
 #define EIGEN_RESTRICT __restrict
 #endif
 
-#ifndef EIGEN_STACK_ALLOCATION_LIMIT
-// 131072 == 128 KB
-#define EIGEN_STACK_ALLOCATION_LIMIT 131072
-#endif
 
 #ifndef EIGEN_DEFAULT_IO_FORMAT
 #ifdef EIGEN_MAKING_DOCS
@@ -837,7 +876,20 @@ namespace Eigen {
 // just an empty macro !
 #define EIGEN_EMPTY
 
-#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || defined(__CUDACC_VER__)) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
+
+// When compiling CUDA/HIP device code with NVCC or HIPCC
+// pull in math functions from the global namespace.
+// In host mode, and when device code is compiled with clang,
+// use the std versions.
+#if (defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)) || defined(EIGEN_HIP_DEVICE_COMPILE)
+  #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
+#else
+  #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
+#endif
+
+
+#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0)
+  // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
   #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
     using Base::operator =;
 #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
@@ -955,7 +1007,7 @@ namespace Eigen {
                 const typename internal::plain_constant_type<EXPR,SCALAR>::type, const EXPR>
 
 // Workaround for MSVC 2010 (see ML thread "patch with compile for for MSVC 2010")
-#if EIGEN_COMP_MSVC_STRICT<=1600
+#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC_STRICT<=1600)
 #define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) typename internal::enable_if<true,X>::type
 #else
 #define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) X
@@ -984,15 +1036,23 @@ namespace Eigen {
     EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME)
 
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_HIP_DEVICE_COMPILE)
+  #define EIGEN_EXCEPTIONS
+#endif
+
+
 #ifdef EIGEN_EXCEPTIONS
 #  define EIGEN_THROW_X(X) throw X
 #  define EIGEN_THROW throw
 #  define EIGEN_TRY try
 #  define EIGEN_CATCH(X) catch (X)
 #else
-#  ifdef __CUDA_ARCH__
+#  if defined(EIGEN_CUDA_ARCH)
 #    define EIGEN_THROW_X(X) asm("trap;")
 #    define EIGEN_THROW asm("trap;")
+#  elif defined(EIGEN_HIP_DEVICE_COMPILE)
+#    define EIGEN_THROW_X(X) asm("s_trap 0")
+#    define EIGEN_THROW asm("s_trap 0")
 #  else
 #    define EIGEN_THROW_X(X) std::abort()
 #    define EIGEN_THROW std::abort()
@@ -1012,7 +1072,38 @@ namespace Eigen {
 #  define EIGEN_NOEXCEPT
 #  define EIGEN_NOEXCEPT_IF(x)
 #  define EIGEN_NO_THROW throw()
-#  define EIGEN_EXCEPTION_SPEC(X) throw(X)
+#  if EIGEN_COMP_MSVC
+     // MSVC does not support exception specifications (warning C4290),
+     // and they are deprecated in c++11 anyway.
+#    define EIGEN_EXCEPTION_SPEC(X) throw()
+#  else
+#    define EIGEN_EXCEPTION_SPEC(X) throw(X)
+#  endif
+#endif
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+// The all function is used to enable a variadic version of eigen_assert which can take a parameter pack as its input.
+namespace Eigen {
+namespace internal {
+
+inline bool all(){ return true; }
+
+template<typename T, typename ...Ts>
+bool all(T t, Ts ... ts){ return t && all(ts...); }
+
+}
+}
+#endif
+
+// Wrapping #pragma unroll in a macro since it is required for SYCL
+#if defined(__SYCL_DEVICE_ONLY__)
+  #if defined(_MSC_VER)
+    #define EIGEN_UNROLL_LOOP __pragma(unroll)
+  #else
+    #define EIGEN_UNROLL_LOOP _Pragma("unroll")
+  #endif
+#else
+  #define EIGEN_UNROLL_LOOP
+#endif
 
 #endif // EIGEN_MACROS_H
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 7d9053496..9dd2e0252 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -70,7 +70,20 @@ inline void throw_std_bad_alloc()
   throw std::bad_alloc();
   #else
   std::size_t huge = static_cast<std::size_t>(-1);
+  #if defined(EIGEN_HIPCC)
+  //
+  // calls to "::operator new" are to be treated as opaque function calls (i.e. no inlining),
+  // and as a consequence the code in the #else block triggers the hipcc warning:
+  // "no overloaded function has restriction specifiers that are compatible with the ambient context"
+  //
+  // "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects
+  // the same on "operator new".
+  // Reverting code back to the old version in this #if block for the hipcc compiler
+  //
+  new int[huge];
+  #else
+  ::operator new(huge);
+  #endif
   #endif
 }
 
@@ -83,11 +96,12 @@ inline void throw_std_bad_alloc()
 /** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned.
   * Fast, but wastes 16 additional bytes of memory. Does not throw any exception.
   */
-inline void* handmade_aligned_malloc(std::size_t size)
+inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
 {
-  void *original = std::malloc(size+EIGEN_DEFAULT_ALIGN_BYTES);
+  eigen_assert(alignment >= sizeof(void*) && (alignment & -alignment) == alignment && "Alignment must be at least sizeof(void*) and a power of 2");
+  void *original = std::malloc(size+alignment);
   if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES);
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(alignment-1))) + alignment);
   *(reinterpret_cast<void**>(aligned) - 1) = original;
   return aligned;
 }
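// Illustration (not part of the patch): the contract of the aligned allocation
// helpers above. aligned_malloc (an internal helper, normally used indirectly)
// returns memory aligned to EIGEN_DEFAULT_ALIGN_BYTES, falling back to the
// handmade over-allocate-and-shift scheme when the system malloc gives no
// such guarantee.
#include <Eigen/Core>
#include <cassert>
#include <cstdint>

int main() {
#if EIGEN_DEFAULT_ALIGN_BYTES > 0
  void* p = Eigen::internal::aligned_malloc(1000);
  assert(reinterpret_cast<std::uintptr_t>(p) % EIGEN_DEFAULT_ALIGN_BYTES == 0);
  Eigen::internal::aligned_free(p);
#endif
  return 0;
}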
@@ -156,9 +170,15 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
   void *result;
   #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
+
+    #if defined(EIGEN_HIP_DEVICE_COMPILE)
+    result = ::malloc(size);
+    #else
     result = std::malloc(size);
+    #endif
+
     #if EIGEN_DEFAULT_ALIGN_BYTES==16
-    eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade alignd memory allocator.");
+    eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade aligned memory allocator.");
     #endif
   #else
     result = handmade_aligned_malloc(size);
@@ -174,7 +194,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
 EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
 {
   #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
+
+    #if defined(EIGEN_HIP_DEVICE_COMPILE)
+    ::free(ptr);
+    #else
     std::free(ptr);
+    #endif
+
   #else
     handmade_aligned_free(ptr);
   #endif
@@ -218,7 +244,12 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std::size_t size)
 {
   check_that_malloc_is_allowed();
 
+  #if defined(EIGEN_HIP_DEVICE_COMPILE)
+  void *result = ::malloc(size);
+  #else
   void *result = std::malloc(size);
+  #endif
+
   if(!result && size)
     throw_std_bad_alloc();
   return result;
@@ -232,7 +263,11 @@ template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr)
 template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
 {
+  #if defined(EIGEN_HIP_DEVICE_COMPILE)
+  ::free(ptr);
+  #else
   std::free(ptr);
+  #endif
 }
 
 template<bool Align> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size)
@@ -493,7 +528,11 @@ template<typename T> struct smart_copy_helper<T,true> {
   IntPtr size = IntPtr(end)-IntPtr(start);
   if(size==0) return;
   eigen_internal_assert(start!=0 && end!=0 && target!=0);
-  memcpy(target, start, size);
+  #if defined(EIGEN_HIP_DEVICE_COMPILE)
+  ::memcpy(target, start, size);
+  #else
+  std::memcpy(target, start, size);
+  #endif
 }
 };
 
@@ -542,7 +581,7 @@ template<typename T> struct smart_memmove_helper {
 
 // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
 // to the appropriate stack allocation function
-#ifndef EIGEN_ALLOCA
+#if ! defined EIGEN_ALLOCA && ! defined EIGEN_GPU_COMPILE_PHASE
   #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca)
     #define EIGEN_ALLOCA alloca
   #elif EIGEN_COMP_MSVC
@@ -550,6 +589,15 @@ template<typename T> struct smart_memmove_helper {
   #endif
 #endif
 
+// With clang -Oz -mthumb, alloca changes the stack pointer in a way that is
+// not allowed in Thumb2. -DEIGEN_STACK_ALLOCATION_LIMIT=0 doesn't work because
+// the compiler still emits bad code because stack allocation checks use "<=".
+// TODO: Eliminate after https://bugs.llvm.org/show_bug.cgi?id=23772
+// is fixed.
+#if defined(__clang__) && defined(__thumb__)
+  #undef EIGEN_ALLOCA
+#endif
+
 // This helper class constructs the allocated memory, and takes care of destructing and freeing the handled data
 // at destruction time. In practice this helper class is mainly useful to avoid memory leaks in case of exceptions.
 template<typename T> class aligned_stack_memory_handler : noncopyable
@@ -561,12 +609,14 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
     * In this case, the buffer elements will also be destructed when this handler will be destructed.
     * Finally, if \a dealloc is true, then the pointer \a ptr is freed.
    **/
+  EIGEN_DEVICE_FUNC
   aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc)
     : m_ptr(ptr), m_size(size), m_deallocate(dealloc)
   {
     if(NumTraits<T>::RequireInitialization && m_ptr)
       Eigen::internal::construct_elements_of_array(m_ptr, size);
   }
+  EIGEN_DEVICE_FUNC
   ~aligned_stack_memory_handler()
   {
     if(NumTraits<T>::RequireInitialization && m_ptr)
@@ -580,6 +630,60 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
     bool m_deallocate;
 };
 
+#ifdef EIGEN_ALLOCA
+
+template<typename Xpr, int NbEvaluations,
+         bool MapExternalBuffer = nested_eval<Xpr,NbEvaluations>::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic
+         >
+struct local_nested_eval_wrapper
+{
+  static const bool NeedExternalBuffer = false;
+  typedef typename Xpr::Scalar Scalar;
+  typedef typename nested_eval<Xpr,NbEvaluations>::type ObjectType;
+  ObjectType object;
+
+  EIGEN_DEVICE_FUNC
+  local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr)
+  {
+    EIGEN_UNUSED_VARIABLE(ptr);
+    eigen_internal_assert(ptr==0);
+  }
+};
+
+template<typename Xpr, int NbEvaluations>
+struct local_nested_eval_wrapper<Xpr,NbEvaluations,true>
+{
+  static const bool NeedExternalBuffer = true;
+  typedef typename Xpr::Scalar Scalar;
+  typedef typename plain_object_eval<Xpr>::type PlainObject;
+  typedef Map<PlainObject,EIGEN_DEFAULT_ALIGN_BYTES> ObjectType;
+  ObjectType object;
+
+  EIGEN_DEVICE_FUNC
+  local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr)
+    : object(ptr==0 ? reinterpret_cast<Scalar*>(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()),
+      m_deallocate(ptr==0)
+  {
+    if(NumTraits<Scalar>::RequireInitialization && object.data())
+      Eigen::internal::construct_elements_of_array(object.data(), object.size());
+    object = xpr;
+  }
+
+  EIGEN_DEVICE_FUNC
+  ~local_nested_eval_wrapper()
+  {
+    if(NumTraits<Scalar>::RequireInitialization && object.data())
+      Eigen::internal::destruct_elements_of_array(object.data(), object.size());
+    if(m_deallocate)
+      Eigen::internal::aligned_free(object.data());
+  }
+
+private:
+  bool m_deallocate;
+};
+
+#endif // EIGEN_ALLOCA
+
 template<typename T> class scoped_array : noncopyable
 {
   T* m_ptr;
@@ -607,9 +711,11 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
 
 } // end namespace internal
 
 /** \internal
-  * Declares, allocates and constructs an aligned buffer named NAME of SIZE elements of type TYPE on the stack
-  * if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
-  * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap.
+  *
+  * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates,
+  * and constructs an aligned buffer named NAME of SIZE elements of type TYPE on the stack
+  * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
+  * (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap.
   * The allocated buffer is automatically deleted when exiting the scope of this declaration.
  * If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs.
  * Here is an example:
@@ -620,6 +726,14 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
   * }
   * \endcode
   * The underlying stack allocation function can be controlled with the EIGEN_ALLOCA preprocessor token.
+  *
+  * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogous to
+  * \code
+  *   typename internal::nested_eval<XPR_T,N>::type NAME(XPR);
+  * \endcode
+  * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown.
+  * This is accomplished through alloca if the latter is supported and if the required number of bytes
+  * is below EIGEN_STACK_ALLOCATION_LIMIT.
   */
 #ifdef EIGEN_ALLOCA
 
@@ -639,6 +753,13 @@
       : Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) );  \
     Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT)
 
+
+  #define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \
+    Eigen::internal::local_nested_eval_wrapper<XPR_T,N> EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast<typename XPR_T::Scalar*>( \
+      ( (Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*XPR.size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \
+        ? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*XPR.size() ) : 0 ) ) ; \
+    typename Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object)
+
 #else
 
   #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
@@ -646,6 +767,9 @@
     TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE)); \
     Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true)
 
+
+#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) typename Eigen::internal::nested_eval<XPR_T,N>::type NAME(XPR)
+
 #endif
 
@@ -688,15 +812,27 @@
 #endif
 
 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
-#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_MAX_ALIGN_BYTES==0)))
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool( \
+    ((Size)!=Eigen::Dynamic) && \
+    (((EIGEN_MAX_ALIGN_BYTES>=16) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES  )==0)) || \
+     ((EIGEN_MAX_ALIGN_BYTES>=32) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/2)==0)) || \
+     ((EIGEN_MAX_ALIGN_BYTES>=64) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/4)==0))   )))
 
 /****************************************************************************/
 
 /** \class aligned_allocator
 * \ingroup Core_Module
 *
-* \brief STL compatible allocator to use with with 16 byte aligned types
+* \brief STL compatible allocator to use with types requiring a non-standard alignment.
+*
+* The memory is aligned as for dynamically aligned matrix/array types such as MatrixXd.
+* By default, it will thus provide at least 16 bytes alignment and more in the following cases:
+*   - 32 bytes alignment if AVX is enabled.
+*   - 64 bytes alignment if AVX512 is enabled.
+*
+* This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented
+* \link TopicPreprocessorDirectivesPerformance there \endlink.
*
* Example:
* \code
diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h
index 90eda6e70..1e4f95581 100755
--- a/Eigen/src/Core/util/Meta.h
+++ b/Eigen/src/Core/util/Meta.h
@@ -11,9 +11,18 @@
 #ifndef EIGEN_META_H
 #define EIGEN_META_H
 
-#if defined(__CUDA_ARCH__)
-#include <cfloat>
-#include <math_constants.h>
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+
+ #include <cfloat>
+
+ #if defined(EIGEN_CUDA_ARCH)
+  #include <math_constants.h>
+ #endif
+
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+  #include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
+ #endif
+
 #endif
 
 #if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L
@@ -98,6 +107,8 @@ template<> struct is_arithmetic<unsigned int>  { enum { value = true }; };
 template<> struct is_arithmetic<signed long>   { enum { value = true }; };
 template<> struct is_arithmetic<unsigned long> { enum { value = true }; };
 
 #if EIGEN_HAS_CXX11
+template<> struct is_arithmetic<signed long long>   { enum { value = true }; };
+template<> struct is_arithmetic<unsigned long long> { enum { value = true }; };
 using std::is_integral;
 #else
 template<typename T> struct is_integral { enum { value = false }; };
@@ -111,8 +122,33 @@ template<> struct is_integral<signed int>      { enum { value = true }; };
 template<> struct is_integral<unsigned int>    { enum { value = true }; };
 template<> struct is_integral<signed long>     { enum { value = true }; };
 template<> struct is_integral<unsigned long>   { enum { value = true }; };
+#if EIGEN_COMP_MSVC
+template<> struct is_integral<signed __int64>   { enum { value = true }; };
+template<> struct is_integral<unsigned __int64> { enum { value = true }; };
+#endif
 #endif
 
+#if EIGEN_HAS_CXX11
+using std::make_unsigned;
+#else
+// TODO: Possibly improve this implementation of make_unsigned.
+// It is currently used only by
+// template<typename Scalar> struct random_default_impl<Scalar, false, true>.
+template<typename> struct make_unsigned;
+template<> struct make_unsigned<char>             { typedef unsigned char type; };
+template<> struct make_unsigned<signed char>      { typedef unsigned char type; };
+template<> struct make_unsigned<unsigned char>    { typedef unsigned char type; };
+template<> struct make_unsigned<signed short>     { typedef unsigned short type; };
+template<> struct make_unsigned<unsigned short>   { typedef unsigned short type; };
+template<> struct make_unsigned<signed int>       { typedef unsigned int type; };
+template<> struct make_unsigned<unsigned int>     { typedef unsigned int type; };
+template<> struct make_unsigned<signed long>      { typedef unsigned long type; };
+template<> struct make_unsigned<unsigned long>    { typedef unsigned long type; };
+#if EIGEN_COMP_MSVC
+template<> struct make_unsigned<signed __int64>   { typedef unsigned __int64 type; };
+template<> struct make_unsigned<unsigned __int64> { typedef unsigned __int64 type; };
+#endif
+#endif
 
 template<typename T> struct add_const { typedef const T type; };
 template<typename T> struct add_const<T&> { typedef T& type; };
@@ -139,16 +175,19 @@ private:
   struct yes {int a[1];};
   struct no  {int a[2];};
 
-  static yes test(const To&, int);
+  template<typename T>
+  static yes test(T, int);
+
+  template<typename T>
   static no  test(any_conversion, ...);
 
 public:
-  static From ms_from;
+  static typename internal::remove_reference<From>::type* ms_from;
 #ifdef __INTEL_COMPILER
   #pragma warning push
   #pragma warning ( disable : 2259 )
 #endif
-  enum { value = sizeof(test(ms_from, 0))==sizeof(yes) };
+  enum { value = sizeof(test<To>(*ms_from, 0))==sizeof(yes) };
 #ifdef __INTEL_COMPILER
   #pragma warning pop
 #endif
@@ -157,8 +196,7 @@ public:
 template<typename From, typename To>
 struct is_convertible
 {
-  enum { value = is_convertible_impl<typename remove_all<From>::type,
-                                     typename remove_all<To>::type>::value };
+  enum { value = is_convertible_impl<From,To>::value };
 };
 
 /** \internal Allows to enable/disable an overload
@@ -169,7 +207,7 @@ template<bool Condition, typename T=void> struct enable_if;
 
 template<typename T> struct enable_if<true,T>
 { typedef T type; };
 
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 #if !defined(__FLT_EPSILON__)
 #define __FLT_EPSILON__ FLT_EPSILON
 #define __DBL_EPSILON__ DBL_EPSILON
@@ -191,13 +229,31 @@ template<> struct numeric_limits<float>
EIGEN_DEVICE_FUNC static float epsilon() { return __FLT_EPSILON__; } EIGEN_DEVICE_FUNC - static float (max)() { return CUDART_MAX_NORMAL_F; } + static float (max)() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_MAX_NORMAL_F; + #else + return HIPRT_MAX_NORMAL_F; + #endif + } EIGEN_DEVICE_FUNC static float (min)() { return FLT_MIN; } EIGEN_DEVICE_FUNC - static float infinity() { return CUDART_INF_F; } + static float infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF_F; + #else + return HIPRT_INF_F; + #endif + } EIGEN_DEVICE_FUNC - static float quiet_NaN() { return CUDART_NAN_F; } + static float quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN_F; + #else + return HIPRT_NAN_F; + #endif + } }; template<> struct numeric_limits { @@ -208,9 +264,21 @@ template<> struct numeric_limits EIGEN_DEVICE_FUNC static double (min)() { return DBL_MIN; } EIGEN_DEVICE_FUNC - static double infinity() { return CUDART_INF; } + static double infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF; + #else + return HIPRT_INF; + #endif + } EIGEN_DEVICE_FUNC - static double quiet_NaN() { return CUDART_NAN; } + static double quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN; + #else + return HIPRT_NAN; + #endif + } }; template<> struct numeric_limits { @@ -272,7 +340,7 @@ template<> struct numeric_limits #endif /** \internal - * A base class do disable default copy ctor and copy assignement operator. + * A base class to disable default copy ctor and copy assignment operator. */ class noncopyable { @@ -433,10 +501,10 @@ struct meta_no { char a[2]; }; template struct has_ReturnType { - template static meta_yes testFunctor(typename C::ReturnType const *); - template static meta_no testFunctor(...); + template static meta_yes testFunctor(C const *, typename C::ReturnType const * = 0); + template static meta_no testFunctor(...); - enum { value = sizeof(testFunctor(0)) == sizeof(meta_yes) }; + enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; }; template const T* return_ptr(); @@ -522,14 +590,14 @@ template struct scalar_product_traits } // end namespace internal namespace numext { - -#if defined(__CUDA_ARCH__) + +#if defined(EIGEN_GPU_COMPILE_PHASE) template EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; } #else template EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } #endif -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_GPU_COMPILE_PHASE) using internal::device::numeric_limits; #else using std::numeric_limits; @@ -538,11 +606,36 @@ using std::numeric_limits; // Integer division with rounding up. // T is assumed to be an integer type with a>=0, and b>0 template +EIGEN_DEVICE_FUNC T div_ceil(const T &a, const T &b) { return (a+b-1) / b; } +// The aim of the following functions is to bypass -Wfloat-equal warnings +// when we really want a strict equality comparison on floating points.
+template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool equal_strict(const X& x,const Y& y) { return x == y; } + +#if !defined(EIGEN_GPU_COMPILE_PHASE) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC) +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool equal_strict(const float& x,const float& y) { return std::equal_to()(x,y); } + +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool equal_strict(const double& x,const double& y) { return std::equal_to()(x,y); } +#endif + +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool not_equal_strict(const X& x,const Y& y) { return x != y; } + +#if !defined(EIGEN_GPU_COMPILE_PHASE) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC) +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to()(x,y); } + +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to()(x,y); } +#endif + } // end namespace numext } // end namespace Eigen diff --git a/Eigen/src/Core/util/ReenableStupidWarnings.h b/Eigen/src/Core/util/ReenableStupidWarnings.h index 86b60f52f..e23a128d1 100644 --- a/Eigen/src/Core/util/ReenableStupidWarnings.h +++ b/Eigen/src/Core/util/ReenableStupidWarnings.h @@ -1,4 +1,8 @@ -#ifdef EIGEN_WARNINGS_DISABLED +#ifdef EIGEN_WARNINGS_DISABLED_2 +// "DisableStupidWarnings.h" was included twice recursively: Do not reenable warnings yet! +# undef EIGEN_WARNINGS_DISABLED_2 + +#elif defined(EIGEN_WARNINGS_DISABLED) #undef EIGEN_WARNINGS_DISABLED #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS @@ -8,7 +12,7 @@ #pragma warning pop #elif defined __clang__ #pragma clang diagnostic pop - #elif defined __GNUC__ && __GNUC__>=6 + #elif defined __GNUC__ #pragma GCC diagnostic pop #endif diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h index 983361a45..500e47792 100644 --- a/Eigen/src/Core/util/StaticAssert.h +++ b/Eigen/src/Core/util/StaticAssert.h @@ -24,6 +24,7 @@ * */ +#ifndef EIGEN_STATIC_ASSERT #ifndef EIGEN_NO_STATIC_ASSERT #if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (defined(__cplusplus) && __cplusplus >= 201103L) || (EIGEN_COMP_MSVC >= 1600)) @@ -44,64 +45,65 @@ struct static_assertion { enum { - YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX, - YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES, - YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES, - THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE, - THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE, - THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE, - OUT_OF_RANGE_ACCESS, - YOU_MADE_A_PROGRAMMING_MISTAKE, - EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT, - EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE, - YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR, - YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR, - UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC, - THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES, - FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED, - NUMERIC_TYPE_MUST_BE_REAL, - COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED, - WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED, - THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE, - INVALID_MATRIX_PRODUCT, - INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS, - INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION, - 
YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY, - THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES, - THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES, - INVALID_MATRIX_TEMPLATE_PARAMETERS, - INVALID_MATRIXBASE_TEMPLATE_PARAMETERS, - BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER, - THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX, - THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE, - THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES, - YOU_ALREADY_SPECIFIED_THIS_STRIDE, - INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION, - THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD, - PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1, - THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS, - YOU_CANNOT_MIX_ARRAYS_AND_MATRICES, - YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION, - THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY, - YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT, - THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS, - THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS, - THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL, - THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES, - YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED, - YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED, - THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE, - THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH, - OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG, - IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY, - STORAGE_LAYOUT_DOES_NOT_MATCH, - EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE, - THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS, - MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY, - THIS_TYPE_IS_NOT_SUPPORTED, - STORAGE_KIND_MUST_MATCH, - STORAGE_INDEX_MUST_MATCH, - CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY + YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX=1, + YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES=1, + YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES=1, + THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE=1, + THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE=1, + THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE=1, + OUT_OF_RANGE_ACCESS=1, + YOU_MADE_A_PROGRAMMING_MISTAKE=1, + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT=1, + EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE=1, + YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR=1, + YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR=1, + UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC=1, + THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES=1, + FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED=1, + NUMERIC_TYPE_MUST_BE_REAL=1, + COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED=1, + WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED=1, + THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE=1, + INVALID_MATRIX_PRODUCT=1, + INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS=1, + INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION=1, + YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY=1, + THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES=1, + THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES=1, + INVALID_MATRIX_TEMPLATE_PARAMETERS=1, + INVALID_MATRIXBASE_TEMPLATE_PARAMETERS=1, + 
BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER=1, + THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX=1, + THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE=1, + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES=1, + YOU_ALREADY_SPECIFIED_THIS_STRIDE=1, + INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION=1, + THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD=1, + PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1=1, + THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS=1, + YOU_CANNOT_MIX_ARRAYS_AND_MATRICES=1, + YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION=1, + THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY=1, + YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT=1, + THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS=1, + THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS=1, + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL=1, + THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES=1, + YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED=1, + YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED=1, + THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE=1, + THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH=1, + OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG=1, + IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY=1, + STORAGE_LAYOUT_DOES_NOT_MATCH=1, + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE=1, + THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS=1, + MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY=1, + THIS_TYPE_IS_NOT_SUPPORTED=1, + STORAGE_KIND_MUST_MATCH=1, + STORAGE_INDEX_MUST_MATCH=1, + CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1, + SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1 }; }; @@ -131,7 +133,7 @@ #define EIGEN_STATIC_ASSERT(CONDITION,MSG) eigen_assert((CONDITION) && #MSG); #endif // EIGEN_NO_STATIC_ASSERT - +#endif // EIGEN_STATIC_ASSERT // static assertion failing if the type \a TYPE is not a vector type #define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) \ diff --git a/Eigen/src/Core/util/SymbolicIndex.h b/Eigen/src/Core/util/SymbolicIndex.h index bb6349eb9..17cf46f05 100644 --- a/Eigen/src/Core/util/SymbolicIndex.h +++ b/Eigen/src/Core/util/SymbolicIndex.h @@ -12,7 +12,7 @@ namespace Eigen { -/** \namespace Eigen::Symbolic +/** \namespace Eigen::symbolic * \ingroup Core_Module * * This namespace defines a set of classes and functions to build and evaluate symbolic expressions of scalar type Index. @@ -20,9 +20,9 @@ namespace Eigen { * * \code * // First step, defines symbols: - * struct x_tag {}; static const Symbolic::SymbolExpr x; - * struct y_tag {}; static const Symbolic::SymbolExpr y; - * struct z_tag {}; static const Symbolic::SymbolExpr z; + * struct x_tag {}; static const symbolic::SymbolExpr x; + * struct y_tag {}; static const symbolic::SymbolExpr y; + * struct z_tag {}; static const symbolic::SymbolExpr z; * * // Defines an expression: * auto expr = (x+3)/y+z; @@ -35,10 +35,10 @@ namespace Eigen { * std::cout << expr98.eval(x=6) << "\n"; * \endcode * - * It is currently only used internally to define and minipulate the placeholders::last and placeholders::end symbols in Eigen::seq and Eigen::seqN. + * It is currently only used internally to define and manipulate the Eigen::last and Eigen::lastp1 symbols in Eigen::seq and Eigen::seqN. 
* */ -namespace Symbolic { +namespace symbolic { template class Symbol; template class NegateExpr; @@ -187,17 +187,10 @@ public: template struct is_symbolic { - // BaseExpr has no conversion ctor, so we only have to check whether T can be staticaly cast to its base class BaseExpr. + // BaseExpr has no conversion ctor, so we only have to check whether T can be statically cast to its base class BaseExpr. enum { value = internal::is_convertible >::value }; }; -// Specialization for functions, because is_convertible fails in this case. -// Useful in c++98/11 mode when testing is_symbolic)> -template -struct is_symbolic { - enum { value = false }; -}; - /** Represents the actual value of a symbol identified by its tag * * It is the return type of SymbolValue::operator=, and most of the time this is the only way it is used. */ @@ -293,7 +286,7 @@ protected: Arg1 m_arg1; }; -} // end namespace Symbolic +} // end namespace symbolic } // end namespace Eigen diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 4b337f29f..836ff4711 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -34,6 +34,26 @@ inline IndexDest convert_index(const IndexSrc& idx) { return IndexDest(idx); } +// true if T can be considered as an integral index (i.e., an integral type or enum) +template struct is_valid_index_type +{ + enum { value = +#if EIGEN_HAS_TYPE_TRAITS + internal::is_integral::value || std::is_enum::value +#elif EIGEN_COMP_MSVC + internal::is_integral::value || __is_enum(T) +#else + // without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index. + internal::is_convertible::value +#endif + }; +}; + +// true unless both types are valid index types +template +struct valid_indexed_view_overload { + enum { value = !(internal::is_valid_index_type::value && internal::is_valid_index_type::value) }; +}; // promote_scalar_arg is a helper used in operations between an expression and a scalar, like: // expression * scalar @@ -385,7 +405,7 @@ template struct plain_matrix_type_row_major typedef Matrix::Scalar, Rows, Cols, - (MaxCols==1&&MaxRows!=1) ? RowMajor : ColMajor, + (MaxCols==1&&MaxRows!=1) ? ColMajor : RowMajor, MaxRows, MaxCols > type; @@ -440,7 +460,7 @@ template { enum { ScalarReadCost = NumTraits::Scalar>::ReadCost, - CoeffReadCost = evaluator::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a tempory? + CoeffReadCost = evaluator::CoeffReadCost, // NOTE What if an evaluator evaluates itself into a temporary? // Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1. // This situation is already taken care of by the EvalBeforeNestingBit flag, which is turned ON // for all evaluators creating a temporary. This flag is then propagated by the parent evaluators.
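A note on the is_valid_index_type hunk above: the trait simply accepts integral types and enums as indices. A minimal standalone sketch of the C++11 branch, using std::is_integral/std::is_enum directly instead of Eigen's internal traits (the names here are illustrative, not part of the patch):
\code
#include <type_traits>

// Accept integral types and enums as index types; reject everything else.
template<typename T> struct is_valid_index_type_sketch
{
  enum { value = std::is_integral<T>::value || std::is_enum<T>::value };
};

enum Axis { X = 0, Y = 1 };

static_assert( is_valid_index_type_sketch<int>::value,    "integral types are valid indices");
static_assert( is_valid_index_type_sketch<Axis>::value,   "enums are treated as indices too");
static_assert(!is_valid_index_type_sketch<double>::value, "floating-point types are not");
\endcode
In C++98 mode the patch instead falls back to is_convertible to Index, which is the only portable way there to let enums through.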
@@ -656,17 +676,32 @@ template struct is_diagonal > template struct is_diagonal > { enum { ret = true }; }; + +template struct is_identity +{ enum { value = false }; }; + +template struct is_identity, T> > +{ enum { value = true }; }; + + template struct glue_shapes; template<> struct glue_shapes { typedef TriangularShape type; }; template -bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if::ret&&has_direct_access::ret, T1>::type * = 0) +struct possibly_same_dense { + enum { value = has_direct_access::ret && has_direct_access::ret && is_same::value }; +}; + +template +EIGEN_DEVICE_FUNC +bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if::value>::type * = 0) { return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride()); } template -bool is_same_dense(const T1 &, const T2 &, typename enable_if::ret&&has_direct_access::ret), T1>::type * = 0) +EIGEN_DEVICE_FUNC +bool is_same_dense(const T1 &, const T2 &, typename enable_if::value>::type * = 0) { return false; } diff --git a/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/Eigen/src/Eigenvalues/ComplexEigenSolver.h index dc5fae06a..081e918f1 100644 --- a/Eigen/src/Eigenvalues/ComplexEigenSolver.h +++ b/Eigen/src/Eigenvalues/ComplexEigenSolver.h @@ -214,7 +214,7 @@ template class ComplexEigenSolver /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ ComputationInfo info() const { diff --git a/Eigen/src/Eigenvalues/ComplexSchur.h b/Eigen/src/Eigenvalues/ComplexSchur.h index 7f38919f7..b8b3490c6 100644 --- a/Eigen/src/Eigenvalues/ComplexSchur.h +++ b/Eigen/src/Eigenvalues/ComplexSchur.h @@ -212,7 +212,7 @@ template class ComplexSchur /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ ComputationInfo info() const { diff --git a/Eigen/src/Eigenvalues/EigenSolver.h b/Eigen/src/Eigenvalues/EigenSolver.h index f205b185d..997bebe7b 100644 --- a/Eigen/src/Eigenvalues/EigenSolver.h +++ b/Eigen/src/Eigenvalues/EigenSolver.h @@ -277,7 +277,7 @@ template class EigenSolver template EigenSolver& compute(const EigenBase& matrix, bool computeEigenvectors = true); - /** \returns NumericalIssue if the input contains INF or NaN values or overflow occured. Returns Success otherwise. */ + /** \returns NumericalIssue if the input contains INF or NaN values or overflow occurred. Returns Success otherwise. 
*/ ComputationInfo info() const { eigen_assert(m_isInitialized && "EigenSolver is not initialized."); diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h index 36a91dffc..87d789b3f 100644 --- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h @@ -311,7 +311,6 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp // Aliases: Map v(reinterpret_cast(m_tmp.data()), size); ComplexVectorType &cv = m_tmp; - const MatrixType &mZ = m_realQZ.matrixZ(); const MatrixType &mS = m_realQZ.matrixS(); const MatrixType &mT = m_realQZ.matrixT(); @@ -351,7 +350,7 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp } } } - m_eivec.col(i).real().noalias() = mZ.transpose() * v; + m_eivec.col(i).real().noalias() = m_realQZ.matrixZ().transpose() * v; m_eivec.col(i).real().normalize(); m_eivec.col(i).imag().setConstant(0); } @@ -400,7 +399,7 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp / (alpha*mT.coeffRef(j,j) - static_cast(beta*mS.coeffRef(j,j))); } } - m_eivec.col(i+1).noalias() = (mZ.transpose() * cv); + m_eivec.col(i+1).noalias() = (m_realQZ.matrixZ().transpose() * cv); m_eivec.col(i+1).normalize(); m_eivec.col(i) = m_eivec.col(i+1).conjugate(); } diff --git a/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h index 5f6bb8289..d0f9091be 100644 --- a/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h @@ -121,7 +121,7 @@ class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver<_MatrixT * * \returns Reference to \c *this * - * Accoring to \p options, this function computes eigenvalues and (if requested) + * According to \p options, this function computes eigenvalues and (if requested) * the eigenvectors of one of the following three generalized eigenproblems: * - \c Ax_lBx: \f$ Ax = \lambda B x \f$ * - \c ABx_lx: \f$ ABx = \lambda x \f$ diff --git a/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/Eigen/src/Eigenvalues/HessenbergDecomposition.h index f647f69b0..d947dac4e 100644 --- a/Eigen/src/Eigenvalues/HessenbergDecomposition.h +++ b/Eigen/src/Eigenvalues/HessenbergDecomposition.h @@ -315,7 +315,7 @@ void HessenbergDecomposition::_compute(MatrixType& matA, CoeffVector // A = A H' matA.rightCols(remainingSize) - .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1).conjugate(), numext::conj(h), &temp.coeffRef(0)); + .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1), numext::conj(h), &temp.coeffRef(0)); } } diff --git a/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h index 4fec8af0a..66e5a3dbb 100644 --- a/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +++ b/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h @@ -66,7 +66,6 @@ template inline typename MatrixBase::EigenvaluesReturnType MatrixBase::eigenvalues() const { - typedef typename internal::traits::Scalar Scalar; return internal::eigenvalues_selector::IsComplex>::run(derived()); } @@ -85,10 +84,9 @@ MatrixBase::eigenvalues() const * \sa SelfAdjointEigenSolver::eigenvalues(), MatrixBase::eigenvalues() */ template -inline typename SelfAdjointView::EigenvaluesReturnType +EIGEN_DEVICE_FUNC inline typename SelfAdjointView::EigenvaluesReturnType SelfAdjointView::eigenvalues() const { - typedef typename SelfAdjointView::PlainObject PlainObject; PlainObject thisAsMatrix(*this); return 
SelfAdjointEigenSolver(thisAsMatrix, false).eigenvalues(); } @@ -149,7 +147,7 @@ MatrixBase::operatorNorm() const * \sa eigenvalues(), MatrixBase::operatorNorm() */ template -inline typename SelfAdjointView::RealScalar +EIGEN_DEVICE_FUNC inline typename SelfAdjointView::RealScalar SelfAdjointView::operatorNorm() const { return eigenvalues().cwiseAbs().maxCoeff(); diff --git a/Eigen/src/Eigenvalues/RealQZ.h b/Eigen/src/Eigenvalues/RealQZ.h index b3a910dd9..e2b37f40e 100644 --- a/Eigen/src/Eigenvalues/RealQZ.h +++ b/Eigen/src/Eigenvalues/RealQZ.h @@ -161,7 +161,7 @@ namespace Eigen { /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ ComputationInfo info() const { diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h index f5c86041d..aca8a8279 100644 --- a/Eigen/src/Eigenvalues/RealSchur.h +++ b/Eigen/src/Eigenvalues/RealSchur.h @@ -190,7 +190,7 @@ template class RealSchur RealSchur& computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ, bool computeU); /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ ComputationInfo info() const { @@ -270,8 +270,13 @@ RealSchur& RealSchur::compute(const EigenBase // Step 1. Reduce to Hessenberg form m_hess.compute(matrix.derived()/scale); - // Step 2. Reduce to real Schur form - computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU); + // Step 2. Reduce to real Schur form + // Note: we copy m_hess.matrixQ() into m_matU here and not in computeFromHessenberg + // to be able to pass our working-space buffer for the Householder to Dense evaluation. + m_workspaceVector.resize(matrix.cols()); + if(computeU) + m_hess.matrixQ().evalTo(m_matU, m_workspaceVector); + computeFromHessenberg(m_hess.matrixH(), m_matU, computeU); m_matT *= scale; @@ -284,13 +289,13 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa using std::abs; m_matT = matrixH; - if(computeU) + m_workspaceVector.resize(m_matT.cols()); + if(computeU && !internal::is_same_dense(m_matU,matrixQ)) m_matU = matrixQ; Index maxIters = m_maxIters; if (maxIters == -1) maxIters = m_maxIterationsPerRow * matrixH.rows(); - m_workspaceVector.resize(m_matT.cols()); Scalar* workspace = &m_workspaceVector.coeffRef(0); // The matrix m_matT is divided in three parts. 
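The two RealSchur hunks above are linked: compute() now evaluates m_hess.matrixQ() directly into m_matU (reusing m_workspaceVector for the Householder-to-dense evaluation) and then passes m_matU itself as the matrixQ argument, so computeFromHessenberg() needs the is_same_dense() guard to avoid copying m_matU onto itself. is_same_dense() returns true only for two expressions with direct storage access, the same scalar type, and identical data pointers and strides. A small illustrative sketch, not part of the patch:
\code
#include <Eigen/Dense>

void is_same_dense_sketch()
{
  Eigen::MatrixXd A(4,4), B(4,4);
  Eigen::Map<Eigen::MatrixXd> mapA(A.data(), 4, 4);

  // Same buffer and same strides: recognized as the same dense storage,
  // so an assignment between the two can be skipped.
  bool same   = Eigen::internal::is_same_dense(A, mapA); // true
  // Distinct buffers: not the same storage, a copy is really needed.
  bool differ = Eigen::internal::is_same_dense(A, B);    // false
  (void)same; (void)differ;
}
\endcode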
@@ -303,7 +308,7 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa Scalar exshift(0); // sum of exceptional shifts Scalar norm = computeNormOfT(); - if(norm!=0) + if(norm!=Scalar(0)) { while (iu >= 0) { @@ -327,7 +332,7 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa else // No convergence yet { // The firstHouseholderVector vector has to be initialized to something to get rid of a silly GCC warning (-O1 -Wall -DNDEBUG ) - Vector3s firstHouseholderVector(0,0,0), shiftInfo; + Vector3s firstHouseholderVector = Vector3s::Zero(), shiftInfo; computeShift(iu, iter, exshift, shiftInfo); iter = iter + 1; totalIter = totalIter + 1; diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index 9ddd553f2..f95606206 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -20,7 +20,9 @@ class GeneralizedSelfAdjointEigenSolver; namespace internal { template struct direct_selfadjoint_eigenvalues; + template +EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec); } @@ -119,6 +121,7 @@ template class SelfAdjointEigenSolver : m_eivec(), m_eivalues(), m_subdiag(), + m_info(InvalidInput), m_isInitialized(false) { } @@ -337,7 +340,7 @@ template class SelfAdjointEigenSolver /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, \c NoConvergence otherwise. + * \returns \c Success if computation was successful, \c NoConvergence otherwise. */ EIGEN_DEVICE_FUNC ComputationInfo info() const @@ -354,7 +357,8 @@ template class SelfAdjointEigenSolver static const int m_maxIterations = 30; protected: - static void check_template_parameters() + static EIGEN_DEVICE_FUNC + void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } @@ -403,7 +407,7 @@ SelfAdjointEigenSolver& SelfAdjointEigenSolver const InputType &matrix(a_matrix.derived()); - using std::abs; + EIGEN_USING_STD_MATH(abs); eigen_assert(matrix.cols() == matrix.rows()); eigen_assert((options&~(EigVecMask|GenEigMask))==0 && (options&EigVecMask)!=EigVecMask @@ -479,9 +483,10 @@ namespace internal { * \returns \c Success or \c NoConvergence */ template +EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec) { - using std::abs; + EIGEN_USING_STD_MATH(abs); ComputationInfo info; typedef typename MatrixType::Scalar Scalar; @@ -535,7 +540,7 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag diag.segment(i,n-i).minCoeff(&k); if (k > 0) { - std::swap(diag[i], diag[k+i]); + numext::swap(diag[i], diag[k+i]); if(computeEigenvectors) eivec.col(i).swap(eivec.col(k+i)); } @@ -605,7 +610,8 @@ template struct direct_selfadjoint_eigenvalues res, Ref representative) { - using std::abs; + EIGEN_USING_STD_MATH(abs); + EIGEN_USING_STD_MATH(sqrt); Index i0; // Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal): mat.diagonal().cwiseAbs().maxCoeff(&i0); @@ -616,8 +622,8 @@ template struct direct_selfadjoint_eigenvaluesn1) res = c0/std::sqrt(n0); - else res = c1/std::sqrt(n1); + if(n0>n1) res = c0/sqrt(n0); + else res = c1/sqrt(n1); return true; } @@ -719,7 +725,7 @@ struct direct_selfadjoint_eigenvalues EIGEN_DEVICE_FUNC static inline void computeRoots(const 
MatrixType& m, VectorType& roots) { - using std::sqrt; + EIGEN_USING_STD_MATH(sqrt); const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*numext::abs2(m(1,0))); const Scalar t1 = Scalar(0.5) * (m(0,0) + m(1,1)); roots(0) = t1 - t0; @@ -807,7 +813,7 @@ template EIGEN_DEVICE_FUNC static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n) { - using std::abs; + EIGEN_USING_STD_MATH(abs); RealScalar td = (diag[end-1] - diag[end])*RealScalar(0.5); RealScalar e = subdiag[end-1]; // Note that thanks to scaling, e^2 or td^2 cannot overflow, however they can still diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h index 3891cf883..b0c947dc0 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h @@ -37,7 +37,7 @@ namespace Eigen { /** \internal Specialization for the data types supported by LAPACKe */ -#define EIGEN_LAPACKE_EIG_SELFADJ(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, EIGCOLROW, LAPACKE_COLROW ) \ +#define EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, EIGCOLROW ) \ template<> template inline \ SelfAdjointEigenSolver >& \ SelfAdjointEigenSolver >::compute(const EigenBase& matrix, int options) \ @@ -47,7 +47,7 @@ SelfAdjointEigenSolver >::compute(c && (options&EigVecMask)!=EigVecMask \ && "invalid option parameter"); \ bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors; \ - lapack_int n = internal::convert_index(matrix.cols()), lda, matrix_order, info; \ + lapack_int n = internal::convert_index(matrix.cols()), lda, info; \ m_eivalues.resize(n,1); \ m_subdiag.resize(n-1); \ m_eivec = matrix; \ @@ -63,27 +63,24 @@ SelfAdjointEigenSolver >::compute(c } \ \ lda = internal::convert_index(m_eivec.outerStride()); \ - matrix_order=LAPACKE_COLROW; \ char jobz, uplo='L'/*, range='A'*/; \ jobz = computeEigenvectors ? 'V' : 'N'; \ \ - info = LAPACKE_##LAPACKE_NAME( matrix_order, jobz, uplo, n, (LAPACKE_TYPE*)m_eivec.data(), lda, (LAPACKE_RTYPE*)m_eivalues.data() ); \ + info = LAPACKE_##LAPACKE_NAME( LAPACK_COL_MAJOR, jobz, uplo, n, (LAPACKE_TYPE*)m_eivec.data(), lda, (LAPACKE_RTYPE*)m_eivalues.data() ); \ m_info = (info==0) ? 
Success : NoConvergence; \ m_isInitialized = true; \ m_eigenvectorsOk = computeEigenvectors; \ return *this; \ } +#define EIGEN_LAPACKE_EIG_SELFADJ(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME ) \ + EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, ColMajor ) \ + EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, RowMajor ) -EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev, ColMajor, LAPACK_COL_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev, ColMajor, LAPACK_COL_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, ColMajor, LAPACK_COL_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev, ColMajor, LAPACK_COL_MAJOR) - -EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev, RowMajor, LAPACK_ROW_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev, RowMajor, LAPACK_ROW_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, RowMajor, LAPACK_ROW_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev, RowMajor, LAPACK_ROW_MAJOR) +EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev) +EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev) +EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev) +EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev) } // end namespace Eigen diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h index 1d102c17b..c5c1acf46 100644 --- a/Eigen/src/Eigenvalues/Tridiagonalization.h +++ b/Eigen/src/Eigenvalues/Tridiagonalization.h @@ -25,6 +25,7 @@ struct traits > }; template +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs); } @@ -344,6 +345,7 @@ namespace internal { * \sa Tridiagonalization::packedMatrix() */ template +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs) { using numext::conj; @@ -424,6 +426,7 @@ struct tridiagonalization_inplace_selector; * \sa class Tridiagonalization */ template +EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) { eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1); @@ -439,7 +442,8 @@ struct tridiagonalization_inplace_selector typedef typename Tridiagonalization::CoeffVectorType CoeffVectorType; typedef typename Tridiagonalization::HouseholderSequenceType HouseholderSequenceType; template - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) + static EIGEN_DEVICE_FUNC + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) { CoeffVectorType hCoeffs(mat.cols()-1); tridiagonalization_inplace(mat,hCoeffs); @@ -508,7 +512,8 @@ struct tridiagonalization_inplace_selector typedef typename MatrixType::Scalar Scalar; template - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ) + static EIGEN_DEVICE_FUNC + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ) { diag(0,0) = numext::real(mat(0,0)); if(extractQ) diff --git a/Eigen/src/Geometry/AngleAxis.h b/Eigen/src/Geometry/AngleAxis.h index 0af3c1b08..83ee1be46 100644 --- a/Eigen/src/Geometry/AngleAxis.h +++ b/Eigen/src/Geometry/AngleAxis.h @@ -178,7 +178,7 @@ EIGEN_DEVICE_FUNC AngleAxis& AngleAxis::operator=(const Quaterni if (n != Scalar(0)) { m_angle = Scalar(2)*atan2(n, 
abs(q.w())); - if(q.w() < 0) + if(q.w() < Scalar(0)) n = -n; m_axis = q.vec() / n; } diff --git a/Eigen/src/Geometry/Hyperplane.h b/Eigen/src/Geometry/Hyperplane.h index 05929b299..cebe03557 100644 --- a/Eigen/src/Geometry/Hyperplane.h +++ b/Eigen/src/Geometry/Hyperplane.h @@ -119,7 +119,7 @@ public: * If the dimension of the ambient space is greater than 2, then there isn't uniqueness, * so an arbitrary choice is made. */ - // FIXME to be consitent with the rest this could be implemented as a static Through function ?? + // FIXME to be consistent with the rest this could be implemented as a static Through function ?? EIGEN_DEVICE_FUNC explicit Hyperplane(const ParametrizedLine& parametrized) { normal() = parametrized.direction().unitOrthogonal(); diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h index f6ef1bcf6..faea62f17 100644 --- a/Eigen/src/Geometry/Quaternion.h +++ b/Eigen/src/Geometry/Quaternion.h @@ -43,6 +43,11 @@ class QuaternionBase : public RotationBase typedef typename internal::traits::Scalar Scalar; typedef typename NumTraits::Real RealScalar; typedef typename internal::traits::Coefficients Coefficients; + typedef typename Coefficients::CoeffReturnType CoeffReturnType; + typedef typename internal::conditional::Flags&LvalueBit), + Scalar&, CoeffReturnType>::type NonConstCoeffReturnType; + + enum { Flags = Eigen::internal::traits::Flags }; @@ -58,22 +63,22 @@ class QuaternionBase : public RotationBase /** \returns the \c x coefficient */ - EIGEN_DEVICE_FUNC inline Scalar x() const { return this->derived().coeffs().coeff(0); } + EIGEN_DEVICE_FUNC inline CoeffReturnType x() const { return this->derived().coeffs().coeff(0); } /** \returns the \c y coefficient */ - EIGEN_DEVICE_FUNC inline Scalar y() const { return this->derived().coeffs().coeff(1); } + EIGEN_DEVICE_FUNC inline CoeffReturnType y() const { return this->derived().coeffs().coeff(1); } /** \returns the \c z coefficient */ - EIGEN_DEVICE_FUNC inline Scalar z() const { return this->derived().coeffs().coeff(2); } + EIGEN_DEVICE_FUNC inline CoeffReturnType z() const { return this->derived().coeffs().coeff(2); } /** \returns the \c w coefficient */ - EIGEN_DEVICE_FUNC inline Scalar w() const { return this->derived().coeffs().coeff(3); } + EIGEN_DEVICE_FUNC inline CoeffReturnType w() const { return this->derived().coeffs().coeff(3); } - /** \returns a reference to the \c x coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& x() { return this->derived().coeffs().coeffRef(0); } - /** \returns a reference to the \c y coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& y() { return this->derived().coeffs().coeffRef(1); } - /** \returns a reference to the \c z coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& z() { return this->derived().coeffs().coeffRef(2); } - /** \returns a reference to the \c w coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& w() { return this->derived().coeffs().coeffRef(3); } + /** \returns a reference to the \c x coefficient (if Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType x() { return this->derived().coeffs().x(); } + /** \returns a reference to the \c y coefficient (if Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType y() { return this->derived().coeffs().y(); } + /** \returns a reference to the \c z coefficient (if Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType z() { return this->derived().coeffs().z(); } + /** \returns a reference to the \c w coefficient (if 
Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType w() { return this->derived().coeffs().w(); } /** \returns a read-only vector expression of the imaginary part (x,y,z) */ EIGEN_DEVICE_FUNC inline const VectorBlock vec() const { return coeffs().template head<3>(); } @@ -271,6 +276,27 @@ public: EIGEN_DEVICE_FUNC explicit inline Quaternion(const Quaternion& other) { m_coeffs = other.coeffs().template cast(); } +#if EIGEN_HAS_RVALUE_REFERENCES + // We define a copy constructor, which means we don't get an implicit move constructor or assignment operator. + /** Default move constructor */ + EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible::value) + : m_coeffs(std::move(other.coeffs())) + {} + + /** Default move assignment operator */ + EIGEN_DEVICE_FUNC Quaternion& operator=(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) + { + m_coeffs = std::move(other.coeffs()); + return *this; + } + + // And now because we declared a constructor, we don't get an implicit copy constructor. Say we want one. + /** Default copy constructor */ + EIGEN_DEVICE_FUNC Quaternion(const Quaternion& other) + : m_coeffs(other.coeffs()) + {} +#endif + EIGEN_DEVICE_FUNC static Quaternion UnitRandom(); template @@ -423,7 +449,7 @@ typedef Map, Aligned> QuaternionMapAlignedd; // Generic Quaternion * Quaternion product // This product can be specialized for a given architecture via the Arch template argument. namespace internal { -template struct quat_product +template struct quat_product { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Quaternion run(const QuaternionBase& a, const QuaternionBase& b){ return Quaternion @@ -446,8 +472,7 @@ QuaternionBase::operator* (const QuaternionBase& other) c EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) return internal::quat_product::Scalar, - EIGEN_PLAIN_ENUM_MIN(internal::traits::Alignment, internal::traits::Alignment)>::run(*this, other); + typename internal::traits::Scalar>::run(*this, other); } /** \sa operator*(Quaternion) */ @@ -624,7 +649,7 @@ EIGEN_DEVICE_FUNC Quaternion Quaternion::UnitRan const Scalar u1 = internal::random(0, 1), u2 = internal::random(0, 2*EIGEN_PI), u3 = internal::random(0, 2*EIGEN_PI); - const Scalar a = sqrt(1 - u1), + const Scalar a = sqrt(Scalar(1) - u1), b = sqrt(u1); return Quaternion (a * sin(u2), a * cos(u2), b * sin(u3), b * cos(u3)); } @@ -672,7 +697,7 @@ EIGEN_DEVICE_FUNC inline Quaternion::Scalar> // Generic conjugate of a Quaternion namespace internal { -template struct quat_conj +template struct quat_conj { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Quaternion run(const QuaternionBase& q){ return Quaternion(q.w(),-q.x(),-q.y(),-q.z()); @@ -691,8 +716,7 @@ EIGEN_DEVICE_FUNC inline Quaternion::Scalar> QuaternionBase::conjugate() const { return internal::quat_conj::Scalar, - internal::traits::Alignment>::run(*this); + typename internal::traits::Scalar>::run(*this); } diff --git a/Eigen/src/Geometry/Scaling.h b/Eigen/src/Geometry/Scaling.h index f58ca03d9..df650fda6 100755 --- a/Eigen/src/Geometry/Scaling.h +++ b/Eigen/src/Geometry/Scaling.h @@ -29,6 +29,22 @@ namespace Eigen { * * \sa Scaling(), class DiagonalMatrix, MatrixBase::asDiagonal(), class Translation, class Transform */ + +namespace internal +{ + // This helper helps nvcc+MSVC to properly parse this file. + // See bug 1412. 
+ template + struct uniformscaling_times_affine_returntype + { + enum + { + NewMode = int(Mode) == int(Isometry) ? Affine : Mode + }; + typedef Transform type; + }; +} + template class UniformScaling { @@ -60,9 +76,11 @@ public: /** Concatenates a uniform scaling and an affine transformation */ template - inline Transform operator* (const Transform& t) const + inline typename + internal::uniformscaling_times_affine_returntype::type + operator* (const Transform& t) const { - Transform res = t; + typename internal::uniformscaling_times_affine_returntype::type res = t; res.prescale(factor()); return res; } @@ -70,7 +88,7 @@ public: /** Concatenates a uniform scaling and a linear transformation matrix */ // TODO returns an expression template - inline typename internal::plain_matrix_type::type operator* (const MatrixBase& other) const + inline typename Eigen::internal::plain_matrix_type::type operator* (const MatrixBase& other) const { return other * m_factor; } template @@ -110,7 +128,7 @@ public: /** Concatenates a linear transformation matrix and a uniform scaling * \relates UniformScaling */ -// NOTE this operator is defiend in MatrixBase and not as a friend function +// NOTE this operator is defined in MatrixBase and not as a friend function // of UniformScaling to fix an internal crash of Intel's ICC template EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,Scalar,product) diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index 2d36dfadf..75991aaed 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h @@ -115,7 +115,7 @@ template struct transform_make_affine; * \end{array} \right) \f$ * * Note that for a projective transformation the last row can be anything, - * and then the interpretation of different parts might be sightly different. + * and then the interpretation of different parts might be slightly different. * * However, unlike a plain matrix, the Transform class provides many features * simplifying both its assembly and usage. In particular, it can be composed diff --git a/Eigen/src/Geometry/Translation.h b/Eigen/src/Geometry/Translation.h index 51d9a82eb..23b19f74f 100644 --- a/Eigen/src/Geometry/Translation.h +++ b/Eigen/src/Geometry/Translation.h @@ -70,18 +70,18 @@ public: /** Constructs and initialize the translation transformation from a vector of translation coefficients */ EIGEN_DEVICE_FUNC explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {} - /** \brief Retruns the x-translation by value. **/ + /** \brief Returns the x-translation by value. **/ EIGEN_DEVICE_FUNC inline Scalar x() const { return m_coeffs.x(); } - /** \brief Retruns the y-translation by value. **/ + /** \brief Returns the y-translation by value. **/ EIGEN_DEVICE_FUNC inline Scalar y() const { return m_coeffs.y(); } - /** \brief Retruns the z-translation by value. **/ + /** \brief Returns the z-translation by value. **/ EIGEN_DEVICE_FUNC inline Scalar z() const { return m_coeffs.z(); } - /** \brief Retruns the x-translation as a reference. **/ + /** \brief Returns the x-translation as a reference. **/ EIGEN_DEVICE_FUNC inline Scalar& x() { return m_coeffs.x(); } - /** \brief Retruns the y-translation as a reference. **/ + /** \brief Returns the y-translation as a reference. **/ EIGEN_DEVICE_FUNC inline Scalar& y() { return m_coeffs.y(); } - /** \brief Retruns the z-translation as a reference. **/ + /** \brief Returns the z-translation as a reference. 
**/ EIGEN_DEVICE_FUNC inline Scalar& z() { return m_coeffs.z(); } EIGEN_DEVICE_FUNC const VectorType& vector() const { return m_coeffs; } diff --git a/Eigen/src/Geometry/arch/Geometry_SSE.h b/Eigen/src/Geometry/arch/Geometry_SSE.h index 1a86ff837..d4346aa1c 100644 --- a/Eigen/src/Geometry/arch/Geometry_SSE.h +++ b/Eigen/src/Geometry/arch/Geometry_SSE.h @@ -16,34 +16,43 @@ namespace Eigen { namespace internal { template -struct quat_product +struct quat_product { + enum { + AAlignment = traits::Alignment, + BAlignment = traits::Alignment, + ResAlignment = traits >::Alignment + }; static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) { Quaternion res; - const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f); - __m128 a = _a.coeffs().template packet(0); - __m128 b = _b.coeffs().template packet(0); - __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); - __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1)); - pstore(&res.x(), - _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)), - _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0), + const Packet4f mask = _mm_setr_ps(0.f,0.f,0.f,-0.f); + Packet4f a = _a.coeffs().template packet(0); + Packet4f b = _b.coeffs().template packet(0); + Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); + Packet4f s2 = pmul(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1)); + pstoret( + &res.x(), + padd(psub(pmul(a,vec4f_swizzle1(b,3,3,3,3)), + pmul(vec4f_swizzle1(a,2,0,1,0), vec4f_swizzle1(b,1,2,0,0))), - _mm_xor_ps(mask,_mm_add_ps(s1,s2)))); + pxor(mask,padd(s1,s2)))); return res; } }; -template -struct quat_conj +template +struct quat_conj { + enum { + ResAlignment = traits >::Alignment + }; static inline Quaternion run(const QuaternionBase& q) { Quaternion res; const __m128 mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f); - pstore(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet(0))); + pstoret(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet::Alignment>(0))); return res; } }; @@ -52,6 +61,9 @@ struct quat_conj template struct cross3_impl { + enum { + ResAlignment = traits::type>::Alignment + }; static inline typename plain_matrix_type::type run(const VectorLhs& lhs, const VectorRhs& rhs) { @@ -60,7 +72,7 @@ struct cross3_impl __m128 mul1=_mm_mul_ps(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3)); __m128 mul2=_mm_mul_ps(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3)); typename plain_matrix_type::type res; - pstore(&res.x(),_mm_sub_ps(mul1,mul2)); + pstoret(&res.x(),_mm_sub_ps(mul1,mul2)); return res; } }; @@ -68,9 +80,14 @@ struct cross3_impl -template -struct quat_product +template +struct quat_product { + enum { + BAlignment = traits::Alignment, + ResAlignment = traits >::Alignment + }; + static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) { const Packet2d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); @@ -78,8 +95,8 @@ struct quat_product Quaternion res; const double* a = _a.coeffs().data(); - Packet2d b_xy = _b.coeffs().template packet(0); - Packet2d b_zw = _b.coeffs().template packet(2); + Packet2d b_xy = _b.coeffs().template packet(0); + Packet2d b_zw = _b.coeffs().template packet(2); Packet2d a_xx = pset1(a[0]); Packet2d a_yy = pset1(a[1]); Packet2d a_zz = pset1(a[2]); @@ -97,9 +114,9 @@ struct quat_product t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw)); #ifdef EIGEN_VECTORIZE_SSE3 EIGEN_UNUSED_VARIABLE(mask) - pstore(&res.x(), _mm_addsub_pd(t1, preverse(t2))); + pstoret(&res.x(), _mm_addsub_pd(t1, 
preverse(t2))); #else - pstore(&res.x(), padd(t1, pxor(mask,preverse(t2)))); + pstoret(&res.x(), padd(t1, pxor(mask,preverse(t2)))); #endif /* @@ -111,25 +128,28 @@ struct quat_product t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy)); #ifdef EIGEN_VECTORIZE_SSE3 EIGEN_UNUSED_VARIABLE(mask) - pstore(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2))); + pstoret(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2))); #else - pstore(&res.z(), psub(t1, pxor(mask,preverse(t2)))); + pstoret(&res.z(), psub(t1, pxor(mask,preverse(t2)))); #endif return res; } }; -template -struct quat_conj +template +struct quat_conj { + enum { + ResAlignment = traits >::Alignment + }; static inline Quaternion run(const QuaternionBase& q) { Quaternion res; const __m128d mask0 = _mm_setr_pd(-0.,-0.); const __m128d mask2 = _mm_setr_pd(-0.,0.); - pstore(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet(0))); - pstore(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet(2))); + pstoret(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet::Alignment>(0))); + pstoret(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet::Alignment>(2))); return res; } }; diff --git a/Eigen/src/Householder/BlockHouseholder.h b/Eigen/src/Householder/BlockHouseholder.h index 01a7ed188..39ce1c2a0 100644 --- a/Eigen/src/Householder/BlockHouseholder.h +++ b/Eigen/src/Householder/BlockHouseholder.h @@ -63,8 +63,15 @@ void make_block_householder_triangular_factor(TriangularFactorType& triFactor, c triFactor.row(i).tail(rt).noalias() = -hCoeffs(i) * vectors.col(i).tail(rs).adjoint() * vectors.bottomRightCorner(rs, rt).template triangularView(); - // FIXME add .noalias() once the triangular product can work inplace - triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView(); + // FIXME use the following line with .noalias() once the triangular product can work inplace + // triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView(); + for(Index j=nbVecs-1; j>i; --j) + { + typename TriangularFactorType::Scalar z = triFactor(i,j); + triFactor(i,j) = z * triFactor(j,j); + if(nbVecs-j-1>0) + triFactor.row(i).tail(nbVecs-j-1) += z * triFactor.row(j).tail(nbVecs-j-1); + } } triFactor(i,i) = hCoeffs(i); diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h index 80de2c305..5bc037f00 100644 --- a/Eigen/src/Householder/Householder.h +++ b/Eigen/src/Householder/Householder.h @@ -39,6 +39,7 @@ template struct decrement_size * MatrixBase::applyHouseholderOnTheRight() */ template +EIGEN_DEVICE_FUNC void MatrixBase::makeHouseholderInPlace(Scalar& tau, RealScalar& beta) { VectorBlock::ret> essentialPart(derived(), 1, size()-1); @@ -62,6 +63,7 @@ void MatrixBase::makeHouseholderInPlace(Scalar& tau, RealScalar& beta) */ template template +EIGEN_DEVICE_FUNC void MatrixBase::makeHouseholder( EssentialPart& essential, Scalar& tau, @@ -103,13 +105,14 @@ void MatrixBase::makeHouseholder( * \param essential the essential part of the vector \c v * \param tau the scaling factor of the Householder transformation * \param workspace a pointer to working space with at least - * this->cols() * essential.size() entries + * this->cols() entries * * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), * MatrixBase::applyHouseholderOnTheRight() */ template template +EIGEN_DEVICE_FUNC void MatrixBase::applyHouseholderOnTheLeft( const EssentialPart& essential, const Scalar& tau, @@ -140,13 +143,14 @@ void 
MatrixBase::applyHouseholderOnTheLeft( * \param essential the essential part of the vector \c v * \param tau the scaling factor of the Householder transformation * \param workspace a pointer to working space with at least - * this->cols() * essential.size() entries + * this->rows() entries * * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), * MatrixBase::applyHouseholderOnTheLeft() */ template template +EIGEN_DEVICE_FUNC void MatrixBase::applyHouseholderOnTheRight( const EssentialPart& essential, const Scalar& tau, @@ -160,10 +164,10 @@ void MatrixBase::applyHouseholderOnTheRight( { Map::type> tmp(workspace,rows()); Block right(derived(), 0, 1, rows(), cols()-1); - tmp.noalias() = right * essential.conjugate(); + tmp.noalias() = right * essential; tmp += this->col(0); this->col(0) -= tau * tmp; - right.noalias() -= tau * tmp * essential.transpose(); + right.noalias() -= tau * tmp * essential.adjoint(); } } diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h index 3ce0a693d..e62befcb6 100644 --- a/Eigen/src/Householder/HouseholderSequence.h +++ b/Eigen/src/Householder/HouseholderSequence.h @@ -87,7 +87,7 @@ struct hseq_side_dependent_impl { typedef Block EssentialVectorType; typedef HouseholderSequence HouseholderSequenceType; - static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) + static EIGEN_DEVICE_FUNC inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) { Index start = k+1+h.m_shift; return Block(h.m_vectors, start, k, h.rows()-start, 1); @@ -140,6 +140,22 @@ template class HouseholderS Side > ConjugateReturnType; + typedef HouseholderSequence< + VectorsType, + typename internal::conditional::IsComplex, + typename internal::remove_all::type, + CoeffsType>::type, + Side + > AdjointReturnType; + + typedef HouseholderSequence< + typename internal::conditional::IsComplex, + typename internal::remove_all::type, + VectorsType>::type, + CoeffsType, + Side + > TransposeReturnType; + /** \brief Constructor. * \param[in] v %Matrix containing the essential parts of the Householder vectors * \param[in] h Vector containing the Householder coefficients @@ -157,17 +173,19 @@ template class HouseholderS * * \sa setLength(), setShift() */ + EIGEN_DEVICE_FUNC HouseholderSequence(const VectorsType& v, const CoeffsType& h) - : m_vectors(v), m_coeffs(h), m_trans(false), m_length(v.diagonalSize()), + : m_vectors(v), m_coeffs(h), m_reverse(false), m_length(v.diagonalSize()), m_shift(0) { } /** \brief Copy constructor. */ + EIGEN_DEVICE_FUNC HouseholderSequence(const HouseholderSequence& other) : m_vectors(other.m_vectors), m_coeffs(other.m_coeffs), - m_trans(other.m_trans), + m_reverse(other.m_reverse), m_length(other.m_length), m_shift(other.m_shift) { @@ -177,12 +195,14 @@ template class HouseholderS * \returns Number of rows * \details This equals the dimension of the space that the transformation acts on. */ + EIGEN_DEVICE_FUNC Index rows() const { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); } /** \brief Number of columns of transformation viewed as a matrix. * \returns Number of columns * \details This equals the dimension of the space that the transformation acts on. */ + EIGEN_DEVICE_FUNC Index cols() const { return rows(); } /** \brief Essential part of a Householder vector. 
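Worth spelling out for the applyHouseholderOnTheRight fix above: with the full Householder vector \f$ v^T = [1, \mathrm{essential}^T] \f$ the reflector is \f$ H = I - \tau v v^* \f$, so applying it on the right must compute \f$ A H = A - \tau (A v) v^* \f$. That is what the corrected body does: tmp = A v with no conjugation of v, followed by a rank-1 update with essential.adjoint(). The previous code multiplied by essential.conjugate() and updated with essential.transpose(), which in effect applied the transposed reflector \f$ H^T = I - \tau \bar{v} v^T \f$; that coincides with \f$ H \f$ only for real scalars.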
@@ -199,6 +219,7 @@ template class HouseholderS * * \sa setShift(), shift() */ + EIGEN_DEVICE_FUNC const EssentialVectorType essentialVector(Index k) const { eigen_assert(k >= 0 && k < m_length); @@ -206,31 +227,39 @@ template class HouseholderS } /** \brief %Transpose of the Householder sequence. */ - HouseholderSequence transpose() const + TransposeReturnType transpose() const { - return HouseholderSequence(*this).setTrans(!m_trans); + return TransposeReturnType(m_vectors.conjugate(), m_coeffs) + .setReverseFlag(!m_reverse) + .setLength(m_length) + .setShift(m_shift); } /** \brief Complex conjugate of the Householder sequence. */ ConjugateReturnType conjugate() const { return ConjugateReturnType(m_vectors.conjugate(), m_coeffs.conjugate()) - .setTrans(m_trans) + .setReverseFlag(m_reverse) .setLength(m_length) .setShift(m_shift); } /** \brief Adjoint (conjugate transpose) of the Householder sequence. */ - ConjugateReturnType adjoint() const + AdjointReturnType adjoint() const { - return conjugate().setTrans(!m_trans); + return AdjointReturnType(m_vectors, m_coeffs.conjugate()) + .setReverseFlag(!m_reverse) + .setLength(m_length) + .setShift(m_shift); } /** \brief Inverse of the Householder sequence (equals the adjoint). */ - ConjugateReturnType inverse() const { return adjoint(); } + AdjointReturnType inverse() const { return adjoint(); } /** \internal */ - template inline void evalTo(DestType& dst) const + template + inline EIGEN_DEVICE_FUNC + void evalTo(DestType& dst) const { Matrix workspace(rows()); @@ -239,6 +268,7 @@ template class HouseholderS /** \internal */ template + EIGEN_DEVICE_FUNC void evalTo(Dest& dst, Workspace& workspace) const { workspace.resize(rows()); @@ -251,7 +281,7 @@ template class HouseholderS for(Index k = vecs-1; k >= 0; --k) { Index cornerSize = rows() - k - m_shift; - if(m_trans) + if(m_reverse) dst.bottomRightCorner(cornerSize, cornerSize) .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data()); else @@ -265,18 +295,26 @@ template class HouseholderS for(Index k = 0; kBlockSize) + { + dst.setIdentity(rows(), rows()); + if(m_reverse) + applyThisOnTheLeft(dst,workspace,true); + else + applyThisOnTheLeft(dst,workspace,true); + } else { dst.setIdentity(rows(), rows()); for(Index k = vecs-1; k >= 0; --k) { Index cornerSize = rows() - k - m_shift; - if(m_trans) + if(m_reverse) dst.bottomRightCorner(cornerSize, cornerSize) - .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0)); + .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data()); else dst.bottomRightCorner(cornerSize, cornerSize) - .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0)); + .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), workspace.data()); } } } @@ -295,31 +333,34 @@ template class HouseholderS workspace.resize(dst.rows()); for(Index k = 0; k < m_length; ++k) { - Index actual_k = m_trans ? m_length-k-1 : k; + Index actual_k = m_reverse ? 
m_length-k-1 : k; dst.rightCols(rows()-m_shift-actual_k) .applyHouseholderOnTheRight(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data()); } } /** \internal */ - template inline void applyThisOnTheLeft(Dest& dst) const + template inline void applyThisOnTheLeft(Dest& dst, bool inputIsIdentity = false) const { Matrix workspace; - applyThisOnTheLeft(dst, workspace); + applyThisOnTheLeft(dst, workspace, inputIsIdentity); } /** \internal */ template - inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace) const + inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace, bool inputIsIdentity = false) const { - const Index BlockSize = 48; + if(inputIsIdentity && m_reverse) + inputIsIdentity = false; // if the entries are large enough, then apply the reflectors by block if(m_length>=BlockSize && dst.cols()>1) { - for(Index i = 0; i < m_length; i+=BlockSize) + // Make sure we have at least 2 useful blocks, otherwise it is point-less: + Index blockSize = m_length class HouseholderS Side==OnTheRight ? bs : m_vectors.rows()-start, Side==OnTheRight ? m_vectors.cols()-start : bs); typename internal::conditional, SubVectorsType&>::type sub_vecs(sub_vecs1); - Block sub_dst(dst,dst.rows()-rows()+m_shift+k,0, rows()-m_shift-k,dst.cols()); - apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_trans); + + Index dstStart = dst.rows()-rows()+m_shift+k; + Index dstRows = rows()-m_shift-k; + Block sub_dst(dst, + dstStart, + inputIsIdentity ? dstStart : 0, + dstRows, + inputIsIdentity ? dstRows : dst.cols()); + apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_reverse); } } else @@ -338,8 +386,9 @@ template class HouseholderS workspace.resize(dst.cols()); for(Index k = 0; k < m_length; ++k) { - Index actual_k = m_trans ? k : m_length-k-1; - dst.bottomRows(rows()-m_shift-actual_k) + Index actual_k = m_reverse ? k : m_length-k-1; + Index dstStart = rows()-m_shift-actual_k; + dst.bottomRightCorner(dstStart, inputIsIdentity ? dstStart : dst.cols()) .applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data()); } } @@ -357,7 +406,7 @@ template class HouseholderS { typename internal::matrix_type_times_scalar_type::Type res(other.template cast::ResultScalar>()); - applyThisOnTheLeft(res); + applyThisOnTheLeft(res, internal::is_identity::value && res.rows()==res.cols()); return res; } @@ -372,6 +421,7 @@ template class HouseholderS * * \sa length() */ + EIGEN_DEVICE_FUNC HouseholderSequence& setLength(Index length) { m_length = length; @@ -389,13 +439,17 @@ template class HouseholderS * * \sa shift() */ + EIGEN_DEVICE_FUNC HouseholderSequence& setShift(Index shift) { m_shift = shift; return *this; } + EIGEN_DEVICE_FUNC Index length() const { return m_length; } /**< \brief Returns the length of the Householder sequence. */ + + EIGEN_DEVICE_FUNC Index shift() const { return m_shift; } /**< \brief Returns the shift of the Householder sequence. */ /* Necessary for .adjoint() and .conjugate() */ @@ -403,27 +457,30 @@ template class HouseholderS protected: - /** \brief Sets the transpose flag. - * \param [in] trans New value of the transpose flag. + /** \internal + * \brief Sets the reverse flag. + * \param [in] reverse New value of the reverse flag. * - * By default, the transpose flag is not set. If the transpose flag is set, then this object represents - * \f$ H^T = H_{n-1}^T \ldots H_1^T H_0^T \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$. + * By default, the reverse flag is not set. 
If the reverse flag is set, then this object represents + * \f$ H^r = H_{n-1} \ldots H_1 H_0 \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$. + * \note For real valued HouseholderSequence this is equivalent to transposing \f$ H \f$. * - * \sa trans() + * \sa reverseFlag(), transpose(), adjoint() */ - HouseholderSequence& setTrans(bool trans) + HouseholderSequence& setReverseFlag(bool reverse) { - m_trans = trans; + m_reverse = reverse; return *this; } - bool trans() const { return m_trans; } /**< \brief Returns the transpose flag. */ + bool reverseFlag() const { return m_reverse; } /**< \internal \brief Returns the reverse flag. */ typename VectorsType::Nested m_vectors; typename CoeffsType::Nested m_coeffs; - bool m_trans; + bool m_reverse; Index m_length; Index m_shift; + enum { BlockSize = 48 }; }; /** \brief Computes the product of a matrix with a Householder sequence. diff --git a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h index 358444aff..f66c846ef 100644 --- a/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +++ b/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h @@ -152,13 +152,28 @@ class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar> { // Compute the inverse squared-norm of each column of mat m_invdiag.resize(mat.cols()); - for(Index j=0; j0) - m_invdiag(j) = RealScalar(1)/sum; - else - m_invdiag(j) = RealScalar(1); + m_invdiag.setZero(); + for(Index j=0; jRealScalar(0)) + m_invdiag(j) = RealScalar(1)/numext::real(m_invdiag(j)); + } + else + { + for(Index j=0; jRealScalar(0)) + m_invdiag(j) = RealScalar(1)/sum; + else + m_invdiag(j) = RealScalar(1); + } } Base::m_isInitialized = true; return *this; diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 395daa8e4..f7ce47134 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -50,7 +50,8 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, tol_error = 0; return; } - RealScalar threshold = tol*tol*rhsNorm2; + const RealScalar considerAsZero = (std::numeric_limits::min)(); + RealScalar threshold = numext::maxi(tol*tol*rhsNorm2,considerAsZero); RealScalar residualNorm2 = residual.squaredNorm(); if (residualNorm2 < threshold) { @@ -58,7 +59,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, tol_error = sqrt(residualNorm2 / rhsNorm2); return; } - + VectorType p(n); p = precond.solve(residual); // initial search direction diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h index 338e6f10a..43bd8e8f6 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h @@ -136,7 +136,7 @@ class IncompleteLUT : public SparseSolverBase::analyzePattern(const _MatrixType& amat) SparseMatrix mat1 = amat; SparseMatrix mat2 = amat.transpose(); // FIXME for a matrix with nearly symmetric pattern, mat2+mat1 is the appropriate choice. - // on the other hand for a really non-symmetric pattern, mat2*mat1 should be prefered... + // on the other hand for a really non-symmetric pattern, mat2*mat1 should be preferred... 
SparseMatrix AtA = mat2 + mat1; AMDOrdering ordering; ordering(AtA,m_P); diff --git a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h index 7c2326eb7..bfeee71cd 100644 --- a/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +++ b/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h @@ -275,7 +275,7 @@ public: const Preconditioner& preconditioner() const { return m_preconditioner; } /** \returns the max number of iterations. - * It is either the value setted by setMaxIterations or, by default, + * It is either the value set by setMaxIterations or, by default, * twice the number of columns of the matrix. */ Index maxIterations() const diff --git a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h index 0ace45177..79e1e4819 100644 --- a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +++ b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h @@ -108,7 +108,7 @@ struct Assignment, interna } }; -} // end namepsace internal +} // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index d25af8e90..4ccd49a04 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -37,17 +37,20 @@ template class JacobiRotation typedef typename NumTraits::Real RealScalar; /** Default constructor without any initialization. */ + EIGEN_DEVICE_FUNC JacobiRotation() {} /** Construct a planar rotation from a cosine-sine pair (\a c, \c s). */ + EIGEN_DEVICE_FUNC JacobiRotation(const Scalar& c, const Scalar& s) : m_c(c), m_s(s) {} - Scalar& c() { return m_c; } - Scalar c() const { return m_c; } - Scalar& s() { return m_s; } - Scalar s() const { return m_s; } + EIGEN_DEVICE_FUNC Scalar& c() { return m_c; } + EIGEN_DEVICE_FUNC Scalar c() const { return m_c; } + EIGEN_DEVICE_FUNC Scalar& s() { return m_s; } + EIGEN_DEVICE_FUNC Scalar s() const { return m_s; } /** Concatenates two planar rotation */ + EIGEN_DEVICE_FUNC JacobiRotation operator*(const JacobiRotation& other) { using numext::conj; @@ -56,20 +59,27 @@ template class JacobiRotation } /** Returns the transposed transformation */ + EIGEN_DEVICE_FUNC JacobiRotation transpose() const { using numext::conj; return JacobiRotation(m_c, -conj(m_s)); } /** Returns the adjoint transformation */ + EIGEN_DEVICE_FUNC JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); } template + EIGEN_DEVICE_FUNC bool makeJacobi(const MatrixBase&, Index p, Index q); + EIGEN_DEVICE_FUNC bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z); - void makeGivens(const Scalar& p, const Scalar& q, Scalar* z=0); + EIGEN_DEVICE_FUNC + void makeGivens(const Scalar& p, const Scalar& q, Scalar* r=0); protected: - void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::true_type); - void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::false_type); + EIGEN_DEVICE_FUNC + void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type); + EIGEN_DEVICE_FUNC + void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type); Scalar m_c, m_s; }; @@ -80,11 +90,12 @@ template class JacobiRotation * \sa MatrixBase::makeJacobi(const MatrixBase&, Index, Index), MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template +EIGEN_DEVICE_FUNC bool JacobiRotation::makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z) { using std::sqrt; using std::abs; - typedef 
typename NumTraits::Real RealScalar; + RealScalar deno = RealScalar(2)*abs(y); if(deno < (std::numeric_limits::min)()) { @@ -124,6 +135,7 @@ bool JacobiRotation::makeJacobi(const RealScalar& x, const Scalar& y, co */ template template +EIGEN_DEVICE_FUNC inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Index p, Index q) { return makeJacobi(numext::real(m.coeff(p,p)), m.coeff(p,q), numext::real(m.coeff(q,q))); @@ -133,7 +145,7 @@ inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Ind * \f$ V = \left ( \begin{array}{c} p \\ q \end{array} \right )\f$ yields: * \f$ G^* V = \left ( \begin{array}{c} r \\ 0 \end{array} \right )\f$. * - * The value of \a z is returned if \a z is not null (the default is null). + * The value of \a r is returned if \a r is not null (the default is null). * Also note that G is built such that the cosine is always real. * * Example: \include Jacobi_makeGivens.cpp @@ -146,14 +158,16 @@ inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Ind * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template -void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* z) +EIGEN_DEVICE_FUNC +void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r) { - makeGivens(p, q, z, typename internal::conditional::IsComplex, internal::true_type, internal::false_type>::type()); + makeGivens(p, q, r, typename internal::conditional::IsComplex, internal::true_type, internal::false_type>::type()); } // specialization for complexes template +EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type) { using std::sqrt; @@ -213,6 +227,7 @@ void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar // specialization for reals template +EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type) { using std::sqrt; @@ -264,6 +279,7 @@ namespace internal { * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template +EIGEN_DEVICE_FUNC void apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j); } @@ -275,6 +291,7 @@ void apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& */ template template +EIGEN_DEVICE_FUNC inline void MatrixBase::applyOnTheLeft(Index p, Index q, const JacobiRotation& j) { RowXpr x(this->row(p)); @@ -290,6 +307,7 @@ inline void MatrixBase::applyOnTheLeft(Index p, Index q, const JacobiRo */ template template +EIGEN_DEVICE_FUNC inline void MatrixBase::applyOnTheRight(Index p, Index q, const JacobiRotation& j) { ColXpr x(this->col(p)); @@ -298,12 +316,146 @@ inline void MatrixBase::applyOnTheRight(Index p, Index q, const JacobiR } namespace internal { + +template +struct apply_rotation_in_the_plane_selector +{ + static EIGEN_DEVICE_FUNC + inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) + { + for(Index i=0; i +struct apply_rotation_in_the_plane_selector +{ + static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) + { + enum { + PacketSize = packet_traits::size, + OtherPacketSize = packet_traits::size + }; + typedef typename packet_traits::type Packet; + typedef typename packet_traits::type OtherPacket; + + /*** dynamic-size vectorized paths ***/ + if(SizeAtCompileTime == Dynamic && ((incrx==1 && incry==1) || PacketSize == 1)) + { + // both vectors are sequentially stored in memory => vectorization + enum { Peeling 
= 2 }; + + Index alignedStart = internal::first_default_aligned(y, size); + Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize; + + const OtherPacket pc = pset1(c); + const OtherPacket ps = pset1(s); + conj_helper::IsComplex,false> pcj; + conj_helper pm; + + for(Index i=0; i(px); + Packet yi = pload(py); + pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + px += PacketSize; + py += PacketSize; + } + } + else + { + Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize); + for(Index i=alignedStart; i(px); + Packet xi1 = ploadu(px+PacketSize); + Packet yi = pload (py); + Packet yi1 = pload (py+PacketSize); + pstoreu(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstoreu(px+PacketSize, padd(pm.pmul(pc,xi1),pcj.pmul(ps,yi1))); + pstore (py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pm.pmul(ps,xi1))); + px += Peeling*PacketSize; + py += Peeling*PacketSize; + } + if(alignedEnd!=peelingEnd) + { + Packet xi = ploadu(x+peelingEnd); + Packet yi = pload (y+peelingEnd); + pstoreu(x+peelingEnd, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + } + } + + for(Index i=alignedEnd; i0) // FIXME should be compared to the required alignment + { + const OtherPacket pc = pset1(c); + const OtherPacket ps = pset1(s); + conj_helper::IsComplex,false> pcj; + conj_helper pm; + Scalar* EIGEN_RESTRICT px = x; + Scalar* EIGEN_RESTRICT py = y; + for(Index i=0; i(px); + Packet yi = pload(py); + pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + px += PacketSize; + py += PacketSize; + } + } + + /*** non-vectorized path ***/ + else + { + apply_rotation_in_the_plane_selector::run(x,incrx,y,incry,size,c,s); + } + } +}; + template +EIGEN_DEVICE_FUNC void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j) { typedef typename VectorX::Scalar Scalar; - enum { PacketSize = packet_traits::size }; - typedef typename packet_traits::type Packet; + const bool Vectorizable = (VectorX::Flags & VectorY::Flags & PacketAccessBit) + && (int(packet_traits::size) == int(packet_traits::size)); + eigen_assert(xpr_x.size() == xpr_y.size()); Index size = xpr_x.size(); Index incrx = xpr_x.derived().innerStride(); @@ -317,113 +469,11 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x if (c==OtherScalar(1) && s==OtherScalar(0)) return; - /*** dynamic-size vectorized paths ***/ - - if(VectorX::SizeAtCompileTime == Dynamic && - (VectorX::Flags & VectorY::Flags & PacketAccessBit) && - ((incrx==1 && incry==1) || PacketSize == 1)) - { - // both vectors are sequentially stored in memory => vectorization - enum { Peeling = 2 }; - - Index alignedStart = internal::first_default_aligned(y, size); - Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize; - - const Packet pc = pset1(c); - const Packet ps = pset1(s); - conj_helper::IsComplex,false> pcj; - - for(Index i=0; i(px); - Packet yi = pload(py); - pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi))); - px += PacketSize; - py += PacketSize; - } - } - else - { - Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize); - for(Index i=alignedStart; i(px); - Packet xi1 = ploadu(px+PacketSize); - Packet yi = pload (py); - Packet yi1 = pload (py+PacketSize); - 
pstoreu(px, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstoreu(px+PacketSize, padd(pmul(pc,xi1),pcj.pmul(ps,yi1))); - pstore (py, psub(pcj.pmul(pc,yi),pmul(ps,xi))); - pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pmul(ps,xi1))); - px += Peeling*PacketSize; - py += Peeling*PacketSize; - } - if(alignedEnd!=peelingEnd) - { - Packet xi = ploadu(x+peelingEnd); - Packet yi = pload (y+peelingEnd); - pstoreu(x+peelingEnd, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pmul(ps,xi))); - } - } - - for(Index i=alignedEnd; i::Alignment, evaluator::Alignment)>0)) // FIXME should be compared to the required alignment - { - const Packet pc = pset1(c); - const Packet ps = pset1(s); - conj_helper::IsComplex,false> pcj; - Scalar* EIGEN_RESTRICT px = x; - Scalar* EIGEN_RESTRICT py = y; - for(Index i=0; i(px); - Packet yi = pload(py); - pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi))); - px += PacketSize; - py += PacketSize; - } - } - - /*** non-vectorized path ***/ - else - { - for(Index i=0; i::Alignment, evaluator::Alignment), + Vectorizable>::run(x,incrx,y,incry,size,c,s); } } // end namespace internal diff --git a/Eigen/src/KLUSupport/KLUSupport.h b/Eigen/src/KLUSupport/KLUSupport.h new file mode 100644 index 000000000..d2633a935 --- /dev/null +++ b/Eigen/src/KLUSupport/KLUSupport.h @@ -0,0 +1,358 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Kyle Macfarlan +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_KLUSUPPORT_H +#define EIGEN_KLUSUPPORT_H + +namespace Eigen { + +/* TODO extract L, extract U, compute det, etc... */ + +/** \ingroup KLUSupport_Module + * \brief A sparse LU factorization and solver based on KLU + * + * This class allows to solve for A.X = B sparse linear problems via a LU factorization + * using the KLU library. The sparse matrix A must be squared and full rank. + * The vectors or matrices X and B can be either dense or sparse. + * + * \warning The input matrix A should be in a \b compressed and \b column-major form. + * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix. 
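+ *
+ * A usage sketch (illustrative only; it assumes the accompanying Eigen/KLUSupport
+ * module header, and that A and b have been filled in):
+ * \code
+ * #include <Eigen/KLUSupport>
+ * SparseMatrix<double> A;                 // square, full rank, compressed column-major
+ * VectorXd b, x;
+ * // ... fill A and b ...
+ * KLU<SparseMatrix<double> > solver(A);   // analyzePattern() + factorize()
+ * if(solver.info() == Success)
+ *   x = solver.solve(b);
+ * \endcode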
+ * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+ *
+ * \implsparsesolverconcept
+ *
+ * \sa \ref TutorialSparseSolverConcept, class UmfPackLU, class SparseLU
+ */
+
+
+inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B [ ], klu_common *Common, double) {
+   return klu_solve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), B, Common);
+}
+
+inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complex<double> B[], klu_common *Common, std::complex<double>) {
+   return klu_z_solve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), &numext::real_ref(B[0]), Common);
+}
+
+inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B[], klu_common *Common, double) {
+   return klu_tsolve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), B, Common);
+}
+
+inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complex<double> B[], klu_common *Common, std::complex<double>) {
+   return klu_z_tsolve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), &numext::real_ref(B[0]), 0, Common);
+}
+
+inline klu_numeric* klu_factor(int Ap [ ], int Ai [ ], double Ax [ ], klu_symbolic *Symbolic, klu_common *Common, double) {
+   return klu_factor(Ap, Ai, Ax, Symbolic, Common);
+}
+
+inline klu_numeric* klu_factor(int Ap[], int Ai[], std::complex<double> Ax[], klu_symbolic *Symbolic, klu_common *Common, std::complex<double>) {
+   return klu_z_factor(Ap, Ai, &numext::real_ref(Ax[0]), Symbolic, Common);
+}
+
+
+template<typename _MatrixType>
+class KLU : public SparseSolverBase<KLU<_MatrixType> >
+{
+  protected:
+    typedef SparseSolverBase<KLU<_MatrixType> > Base;
+    using Base::m_isInitialized;
+  public:
+    using Base::_solve_impl;
+    typedef _MatrixType MatrixType;
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef Matrix<Scalar,Dynamic,1> Vector;
+    typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
+    typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
+    typedef SparseMatrix<Scalar> LUMatrixType;
+    typedef SparseMatrix<Scalar,ColMajor,int> KLUMatrixType;
+    typedef Ref<const KLUMatrixType, StandardCompressedFormat> KLUMatrixRef;
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+
+  public:
+
+    KLU()
+      : m_dummy(0,0), mp_matrix(m_dummy)
+    {
+      init();
+    }
+
+    template<typename InputMatrixType>
+    explicit KLU(const InputMatrixType& matrix)
+      : mp_matrix(matrix)
+    {
+      init();
+      compute(matrix);
+    }
+
+    ~KLU()
+    {
+      if(m_symbolic) klu_free_symbolic(&m_symbolic,&m_common);
+      if(m_numeric)  klu_free_numeric(&m_numeric,&m_common);
+    }
+
+    inline Index rows() const { return mp_matrix.rows(); }
+    inline Index cols() const { return mp_matrix.cols(); }
+
+    /** \brief Reports whether previous computation was successful.
+      *
+      * \returns \c Success if computation was successful,
+      *          \c NumericalIssue if the matrix appears to be singular.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+      return m_info;
+    }
+#if 0 // not implemented yet
+    inline const LUMatrixType& matrixL() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_l;
+    }
+
+    inline const LUMatrixType& matrixU() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_u;
+    }
+
+    inline const IntColVectorType& permutationP() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_p;
+    }
+
+    inline const IntRowVectorType& permutationQ() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_q;
+    }
+#endif
+    /** Computes the sparse LU decomposition of \a matrix
+      * Note that the matrix should be column-major, and in compressed format for best performance.
+      * \sa SparseMatrix::makeCompressed().
+      */
+    template<typename InputMatrixType>
+    void compute(const InputMatrixType& matrix)
+    {
+      if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common);
+      if(m_numeric)  klu_free_numeric(&m_numeric, &m_common);
+      grab(matrix.derived());
+      analyzePattern_impl();
+      factorize_impl();
+    }
+
+    /** Performs a symbolic decomposition on the sparsity of \a matrix.
+      *
+      * This function is particularly useful when solving several problems having the same structure.
+      *
+      * \sa factorize(), compute()
+      */
+    template<typename InputMatrixType>
+    void analyzePattern(const InputMatrixType& matrix)
+    {
+      if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common);
+      if(m_numeric)  klu_free_numeric(&m_numeric, &m_common);
+
+      grab(matrix.derived());
+
+      analyzePattern_impl();
+    }
+
+
+    /** Provides access to the control settings array used by KLU.
+      *
+      * See KLU documentation for details.
+      */
+    inline const klu_common& kluCommon() const
+    {
+      return m_common;
+    }
+
+    /** Provides access to the control settings array used by KLU.
+      *
+      * If this array contains NaN's, the default values are used.
+      *
+      * See KLU documentation for details.
+      */
+    inline klu_common& kluCommon()
+    {
+      return m_common;
+    }
+
+    /** Performs a numeric decomposition of \a matrix
+      *
+      * The given matrix must have the same sparsity as the matrix on which the pattern analysis has been performed.
+      *
+      * \sa analyzePattern(), compute()
+      */
+    template<typename InputMatrixType>
+    void factorize(const InputMatrixType& matrix)
+    {
+      eigen_assert(m_analysisIsOk && "KLU: you must first call analyzePattern()");
+      if(m_numeric)
+        klu_free_numeric(&m_numeric,&m_common);
+
+      grab(matrix.derived());
+
+      factorize_impl();
+    }
+
+    /** \internal */
+    template<typename BDerived,typename XDerived>
+    bool _solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const;
+
+#if 0 // not implemented yet
+    Scalar determinant() const;
+
+    void extractData() const;
+#endif
+
+  protected:
+
+    void init()
+    {
+      m_info                  = InvalidInput;
+      m_isInitialized         = false;
+      m_numeric               = 0;
+      m_symbolic              = 0;
+      m_extractedDataAreDirty = true;
+
+      klu_defaults(&m_common);
+    }
+
+    void analyzePattern_impl()
+    {
+      m_info = InvalidInput;
+      m_analysisIsOk = false;
+      m_factorizationIsOk = false;
+      m_symbolic = klu_analyze(internal::convert_index<int>(mp_matrix.rows()),
+                               const_cast<StorageIndex*>(mp_matrix.outerIndexPtr()), const_cast<StorageIndex*>(mp_matrix.innerIndexPtr()),
+                               &m_common);
+      if (m_symbolic) {
+        m_isInitialized = true;
+        m_info = Success;
+        m_analysisIsOk = true;
+        m_extractedDataAreDirty = true;
+      }
+    }
+
+    void factorize_impl()
+    {
+
+      m_numeric = klu_factor(const_cast<StorageIndex*>(mp_matrix.outerIndexPtr()), const_cast<StorageIndex*>(mp_matrix.innerIndexPtr()), const_cast<Scalar*>(mp_matrix.valuePtr()),
+                             m_symbolic, &m_common, Scalar());
+
+
+      m_info = m_numeric ? Success : NumericalIssue;
+      m_factorizationIsOk = m_numeric ? 1 : 0;
+      m_extractedDataAreDirty = true;
+    }
+
+    template<typename MatrixDerived>
+    void grab(const EigenBase<MatrixDerived> &A)
+    {
+      mp_matrix.~KLUMatrixRef();
+      ::new (&mp_matrix) KLUMatrixRef(A.derived());
+    }
+
+    void grab(const KLUMatrixRef &A)
+    {
+      if(&(A.derived()) != &mp_matrix)
+      {
+        mp_matrix.~KLUMatrixRef();
+        ::new (&mp_matrix) KLUMatrixRef(A);
+      }
+    }
+
+    // cached data to reduce reallocation, etc.
+#if 0 // not implemented yet
+    mutable LUMatrixType m_l;
+    mutable LUMatrixType m_u;
+    mutable IntColVectorType m_p;
+    mutable IntRowVectorType m_q;
+#endif
+
+    KLUMatrixType m_dummy;
+    KLUMatrixRef  mp_matrix;
+
+    klu_numeric*  m_numeric;
+    klu_symbolic* m_symbolic;
+    klu_common    m_common;
+    mutable ComputationInfo m_info;
+    int m_factorizationIsOk;
+    int m_analysisIsOk;
+    mutable bool m_extractedDataAreDirty;
+
+  private:
+    KLU(const KLU& ) { }
+};
+
+#if 0 // not implemented yet
+template<typename MatrixType>
+void KLU<MatrixType>::extractData() const
+{
+  if (m_extractedDataAreDirty)
+  {
+    eigen_assert(false && "KLU: extractData Not Yet Implemented");
+
+    // get size of the data
+    int lnz, unz, rows, cols, nz_udiag;
+    umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar());
+
+    // allocate data
+    m_l.resize(rows,(std::min)(rows,cols));
+    m_l.resizeNonZeros(lnz);
+
+    m_u.resize((std::min)(rows,cols),cols);
+    m_u.resizeNonZeros(unz);
+
+    m_p.resize(rows);
+    m_q.resize(cols);
+
+    // extract
+    umfpack_get_numeric(m_l.outerIndexPtr(), m_l.innerIndexPtr(), m_l.valuePtr(),
+                        m_u.outerIndexPtr(), m_u.innerIndexPtr(), m_u.valuePtr(),
+                        m_p.data(), m_q.data(), 0, 0, 0, m_numeric);
+
+    m_extractedDataAreDirty = false;
+  }
+}
+
+template<typename MatrixType>
+typename KLU<MatrixType>::Scalar KLU<MatrixType>::determinant() const
+{
+  eigen_assert(false && "KLU: extractData Not Yet Implemented");
+  return Scalar();
+}
+#endif
+
+template<typename MatrixType>
+template<typename BDerived,typename XDerived>
+bool KLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const
+{
+  Index rhsCols = b.cols();
+  EIGEN_STATIC_ASSERT((XDerived::Flags&RowMajorBit)==0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+  eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()");
+
+  x = b;
+  int info = klu_solve(m_symbolic, m_numeric, b.rows(), rhsCols, x.const_cast_derived().data(), const_cast<klu_common*>(&m_common), Scalar());
+
+  m_info = info!=0 ?
Success : NumericalIssue; + return true; +} + +} // end namespace Eigen + +#endif // EIGEN_KLUSUPPORT_H diff --git a/Eigen/src/LU/Determinant.h b/Eigen/src/LU/Determinant.h index d6a3c1e5a..6af63a6e7 100644 --- a/Eigen/src/LU/Determinant.h +++ b/Eigen/src/LU/Determinant.h @@ -15,6 +15,7 @@ namespace Eigen { namespace internal { template +EIGEN_DEVICE_FUNC inline const typename Derived::Scalar bruteforce_det3_helper (const MatrixBase& matrix, int a, int b, int c) { @@ -23,6 +24,7 @@ inline const typename Derived::Scalar bruteforce_det3_helper } template +EIGEN_DEVICE_FUNC const typename Derived::Scalar bruteforce_det4_helper (const MatrixBase& matrix, int j, int k, int m, int n) { @@ -44,7 +46,8 @@ template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return m.coeff(0,0); } @@ -52,7 +55,8 @@ template struct determinant_impl template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return m.coeff(0,0) * m.coeff(1,1) - m.coeff(1,0) * m.coeff(0,1); } @@ -60,7 +64,8 @@ template struct determinant_impl template struct determinant_impl { - static inline typename traits::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { return bruteforce_det3_helper(m,0,1,2) - bruteforce_det3_helper(m,1,0,2) @@ -70,7 +75,8 @@ template struct determinant_impl template struct determinant_impl { - static typename traits::Scalar run(const Derived& m) + static EIGEN_DEVICE_FUNC + typename traits::Scalar run(const Derived& m) { // trick by Martin Costabel to compute 4x4 det with only 30 muls return bruteforce_det4_helper(m,0,1,2,3) @@ -89,6 +95,7 @@ template struct determinant_impl * \returns the determinant of this matrix */ template +EIGEN_DEVICE_FUNC inline typename internal::traits::Scalar MatrixBase::determinant() const { eigen_assert(rows() == cols()); diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 03b6af706..344ec8926 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -48,12 +48,12 @@ template struct traits > * The data of the LU decomposition can be directly accessed through the methods matrixLU(), * permutationP(), permutationQ(). * - * As an exemple, here is how the original matrix can be retrieved: + * As an example, here is how the original matrix can be retrieved: * \include class_FullPivLU.cpp * Output: \verbinclude class_FullPivLU.out * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. - * + * * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse() */ template class FullPivLU @@ -320,7 +320,7 @@ template class FullPivLU return m_usePrescribedThreshold ? m_prescribedThreshold // this formula comes from experimenting (see "LU precision tuning" thread on the list) // and turns out to be identical to Higham's formula used already in LDLt. - : NumTraits::epsilon() * m_lu.diagonalSize(); + : NumTraits::epsilon() * RealScalar(m_lu.diagonalSize()); } /** \returns the rank of the matrix of which *this is the LU decomposition. 
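The threshold hunk above only changes how the default pivot cutoff is typed; its effect is visible through rank(). A small sketch (an assumed example, not taken from the patch) of the two ways the threshold can be chosen:

    #include <Eigen/Dense>
    using namespace Eigen;

    int main()
    {
      MatrixXd A = MatrixXd::Random(5,5);
      A.col(4) = A.col(0) + A.col(1);                 // force a rank-deficient column
      FullPivLU<MatrixXd> lu(A);
      Index r_default = lu.rank();                    // uses epsilon()*RealScalar(diagonalSize), as above
      Index r_custom  = lu.setThreshold(1e-8).rank(); // a prescribed threshold takes precedence
      return (r_default == 4 && r_custom == 4) ? 0 : 1;  // both almost surely 4 here
    }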
@@ -411,11 +411,9 @@ template class FullPivLU #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; template - EIGEN_DEVICE_FUNC void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif diff --git a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h index 018f99b58..1bab00c01 100644 --- a/Eigen/src/LU/InverseImpl.h +++ b/Eigen/src/LU/InverseImpl.h @@ -290,6 +290,7 @@ template struct Assignment, internal::assign_op, Dense2Dense> { typedef Inverse SrcXprType; + EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { Index dstRows = src.rows(); @@ -332,6 +333,7 @@ struct Assignment, internal::assign_op +EIGEN_DEVICE_FUNC inline const Inverse MatrixBase::inverse() const { EIGEN_STATIC_ASSERT(!NumTraits::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES) @@ -404,7 +406,7 @@ inline void MatrixBase::computeInverseWithCheck( const RealScalar& absDeterminantThreshold ) const { - RealScalar determinant; + Scalar determinant; // i'd love to put some static assertions there, but SFINAE means that they have no effect... eigen_assert(rows() == cols()); computeInverseAndDetWithCheck(inverse,determinant,invertible,absDeterminantThreshold); diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index d43961887..bfcd2c95b 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -420,8 +420,8 @@ struct partial_lu_impl * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise. * * \note This very low level interface using pointers, etc. is to: - * 1 - reduce the number of instanciations to the strict minimum - * 2 - avoid infinite recursion of the instanciations with Block > > + * 1 - reduce the number of instantiations to the strict minimum + * 2 - avoid infinite recursion of the instantiations with Block > > */ static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions, PivIndex& nb_transpositions, Index maxBlockSize=256) { diff --git a/Eigen/src/OrderingMethods/Eigen_Colamd.h b/Eigen/src/OrderingMethods/Eigen_Colamd.h index 933cd564b..67fcad3f7 100644 --- a/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -1004,7 +1004,7 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (head [min_score] >= COLAMD_EMPTY) ; /* get pivot column from head of minimum degree list */ - while (head [min_score] == COLAMD_EMPTY && min_score < n_col) + while (min_score < n_col && head [min_score] == COLAMD_EMPTY) { min_score++ ; } @@ -1493,7 +1493,7 @@ static inline void order_children c = Col [c].shared1.parent ; /* continue until we hit an ordered column. There are */ - /* guarranteed not to be anymore unordered columns */ + /* guaranteed not to be anymore unordered columns */ /* above an ordered column */ } while (Col [c].shared2.order == COLAMD_EMPTY) ; @@ -1638,7 +1638,7 @@ static void detect_super_cols COLAMD_ASSERT (ROW_IS_ALIVE (*cp1)) ; COLAMD_ASSERT (ROW_IS_ALIVE (*cp2)) ; /* row indices will same order for both supercols, */ - /* no gather scatter nessasary */ + /* no gather scatter necessary */ if (*cp1++ != *cp2++) { break ; @@ -1688,7 +1688,7 @@ static void detect_super_cols /* Defragments and compacts columns and rows in the workspace A. Used when - all avaliable memory has been used while performing row merging. 
Returns + all available memory has been used while performing row merging. Returns the index of the first free position in A, after garbage collection. The time taken by this routine is linear is the size of the array A, which is itself linear in the number of nonzeros in the input matrix. diff --git a/Eigen/src/OrderingMethods/Ordering.h b/Eigen/src/OrderingMethods/Ordering.h index 7ea9b14d7..34dbef487 100644 --- a/Eigen/src/OrderingMethods/Ordering.h +++ b/Eigen/src/OrderingMethods/Ordering.h @@ -31,7 +31,7 @@ void ordering_helper_at_plus_a(const MatrixType& A, MatrixType& symmat) for (int i = 0; i < C.rows(); i++) { for (typename MatrixType::InnerIterator it(C, i); it; ++it) - it.valueRef() = 0.0; + it.valueRef() = typename MatrixType::Scalar(0); } symmat = C + A; } diff --git a/Eigen/src/PaStiXSupport/PaStiXSupport.h b/Eigen/src/PaStiXSupport/PaStiXSupport.h index d2ebfd7bb..37426877a 100644 --- a/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -64,28 +64,28 @@ namespace internal typedef typename _MatrixType::StorageIndex StorageIndex; }; - void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm) + inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm) { if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; } if (nbrhs == 0) {x = NULL; nbrhs=1;} s_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm); } - void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, double *vals, int *perm, int * invp, double *x, int nbrhs, int *iparm, double *dparm) + inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, double *vals, int *perm, int * invp, double *x, int nbrhs, int *iparm, double *dparm) { if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; } if (nbrhs == 0) {x = NULL; nbrhs=1;} d_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm); } - void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex *vals, int *perm, int * invp, std::complex *x, int nbrhs, int *iparm, double *dparm) + inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex *vals, int *perm, int * invp, std::complex *x, int nbrhs, int *iparm, double *dparm) { if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; } if (nbrhs == 0) {x = NULL; nbrhs=1;} c_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast(vals), perm, invp, reinterpret_cast(x), nbrhs, iparm, dparm); } - void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex *vals, int *perm, int * invp, std::complex *x, int nbrhs, int *iparm, double *dparm) + inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex *vals, int *perm, int * invp, std::complex *x, int nbrhs, int *iparm, double *dparm) { if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; } if (nbrhs == 0) {x = NULL; nbrhs=1;} @@ -203,7 +203,7 @@ class PastixBase : public SparseSolverBase /** \brief Reports whether previous computation was successful. 
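   *
   * A typical check (illustrative sketch only: PastixLU is one of the concrete
   * solvers built on PastixBase, and A is assumed to be a filled SparseMatrix):
   * \code
   * PastixLU<SparseMatrix<double> > solver(A);
   * bool ok = (solver.info() == Success);   // verify before calling solve()
   * \endcode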
   *
-  * \returns \c Success if computation was succesful,
+  * \returns \c Success if computation was successful,
   *          \c NumericalIssue if the PaStiX reports a problem
   *          \c InvalidInput if the input matrix is invalid
   *
diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h
index 091c3970e..fb2ba04b4 100644
--- a/Eigen/src/PardisoSupport/PardisoSupport.h
+++ b/Eigen/src/PardisoSupport/PardisoSupport.h
@@ -140,7 +140,7 @@ class PardisoImpl : public SparseSolverBase<Derived>
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix appears to be negative.
       */
     ComputationInfo info() const
diff --git a/Eigen/src/QR/ColPivHouseholderQR.h b/Eigen/src/QR/ColPivHouseholderQR.h
index 0e47c8332..1faa3442e 100644
--- a/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/Eigen/src/QR/ColPivHouseholderQR.h
@@ -402,7 +402,7 @@ template<typename _MatrixType> class ColPivHouseholderQR
       */
     RealScalar maxPivot() const { return m_maxpivot; }
 
-    /** \brief Reports whether the QR factorization was succesful.
+    /** \brief Reports whether the QR factorization was successful.
       *
       * \note This function always returns \c Success. It is provided for compatibility
       *       with other factorization routines.
@@ -416,7 +416,6 @@ template<typename _MatrixType> class ColPivHouseholderQR
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
-    EIGEN_DEVICE_FUNC
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
     #endif
 
@@ -506,8 +505,8 @@ void ColPivHouseholderQR<MatrixType>::computeInPlace()
     m_colNormsUpdated.coeffRef(k) = m_colNormsDirect.coeffRef(k);
   }
 
-  RealScalar threshold_helper =  numext::abs2<Scalar>(m_colNormsUpdated.maxCoeff() * NumTraits<Scalar>::epsilon()) / RealScalar(rows);
-  RealScalar norm_downdate_threshold = numext::sqrt(NumTraits<Scalar>::epsilon());
+  RealScalar threshold_helper =  numext::abs2<RealScalar>(m_colNormsUpdated.maxCoeff() * NumTraits<RealScalar>::epsilon()) / RealScalar(rows);
+  RealScalar norm_downdate_threshold = numext::sqrt(NumTraits<RealScalar>::epsilon());
 
   m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case)
   m_maxpivot = RealScalar(0);
@@ -553,12 +552,12 @@ void ColPivHouseholderQR<MatrixType>::computeInPlace()
       // http://www.netlib.org/lapack/lawnspdf/lawn176.pdf
       // and used in LAPACK routines xGEQPF and xGEQP3.
      // See lines 278-297 in http://www.netlib.org/lapack/explore-html/dc/df4/sgeqpf_8f_source.html
-      if (m_colNormsUpdated.coeffRef(j) != 0) {
+      if (m_colNormsUpdated.coeffRef(j) != RealScalar(0)) {
        RealScalar temp = abs(m_qr.coeffRef(k, j)) / m_colNormsUpdated.coeffRef(j);
        temp = (RealScalar(1) + temp) * (RealScalar(1) - temp);
-        temp = temp < 0 ? 0 : temp;
-        RealScalar temp2 = temp * numext::abs2<Scalar>(m_colNormsUpdated.coeffRef(j) /
-                                                       m_colNormsDirect.coeffRef(j));
+        temp = temp < RealScalar(0) ? RealScalar(0) : temp;
+        RealScalar temp2 = temp * numext::abs2<RealScalar>(m_colNormsUpdated.coeffRef(j) /
+                                                           m_colNormsDirect.coeffRef(j));
         if (temp2 <= norm_downdate_threshold) {
           // The updated norm has become too inaccurate so re-compute the column
           // norm directly.
@@ -596,11 +595,7 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &
 
   typename RhsType::PlainObject c(rhs);
 
-  // Note that the matrix Q = H_0^* H_1^*...
so its inverse is Q^* = (H_0 H_1 ...)^T - c.applyOnTheLeft(householderSequence(m_qr, m_hCoeffs) - .setLength(nonzero_pivots) - .transpose() - ); + c.applyOnTheLeft(householderQ().setLength(nonzero_pivots).adjoint() ); m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots) .template triangularView() diff --git a/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/Eigen/src/QR/CompleteOrthogonalDecomposition.h index 34c637b70..03017a375 100644 --- a/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ b/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -353,7 +353,7 @@ class CompleteOrthogonalDecomposition { inline RealScalar maxPivot() const { return m_cpqr.maxPivot(); } /** \brief Reports whether the complete orthogonal decomposition was - * succesful. + * successful. * * \note This function always returns \c Success. It is provided for * compatibility @@ -367,7 +367,7 @@ class CompleteOrthogonalDecomposition { #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType& rhs, DstType& dst) const; + void _solve_impl(const RhsType& rhs, DstType& dst) const; #endif protected: @@ -452,7 +452,7 @@ void CompleteOrthogonalDecomposition::computeInPlace() // Apply Z(k) to the first k rows of X_k m_cpqr.m_qr.topRightCorner(k, cols - rank + 1) .applyHouseholderOnTheRight( - m_cpqr.m_qr.row(k).tail(cols - rank).transpose(), m_zCoeffs(k), + m_cpqr.m_qr.row(k).tail(cols - rank).adjoint(), m_zCoeffs(k), &m_temp(0)); } if (k != rank - 1) { @@ -500,11 +500,8 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( } // Compute c = Q^* * rhs - // Note that the matrix Q = H_0^* H_1^*... so its inverse is - // Q^* = (H_0 H_1 ...)^T typename RhsType::PlainObject c(rhs); - c.applyOnTheLeft( - householderSequence(matrixQTZ(), hCoeffs()).setLength(rank).transpose()); + c.applyOnTheLeft(matrixQ().setLength(rank).adjoint()); // Solve T z = c(1:rank, :) dst.topRows(rank) = matrixT() diff --git a/Eigen/src/QR/FullPivHouseholderQR.h b/Eigen/src/QR/FullPivHouseholderQR.h index e489bddc2..c31e47cc4 100644 --- a/Eigen/src/QR/FullPivHouseholderQR.h +++ b/Eigen/src/QR/FullPivHouseholderQR.h @@ -392,22 +392,21 @@ template class FullPivHouseholderQR * diagonal coefficient of U. */ RealScalar maxPivot() const { return m_maxpivot; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } - + void computeInPlace(); - + MatrixType m_qr; HCoeffsType m_hCoeffs; IntDiagSizeVectorType m_rows_transpositions; diff --git a/Eigen/src/QR/HouseholderQR.h b/Eigen/src/QR/HouseholderQR.h index 3513d995c..33cb9c8ff 100644 --- a/Eigen/src/QR/HouseholderQR.h +++ b/Eigen/src/QR/HouseholderQR.h @@ -204,28 +204,27 @@ template class HouseholderQR inline Index rows() const { return m_qr.rows(); } inline Index cols() const { return m_qr.cols(); } - + /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q. * * For advanced uses only. 
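 *
 * (Illustrative sketch, assuming a computed decomposition \c qr of a complex
 * matrix and a vector \c c: with the changes in this file the inverse of
 * \c Q is requested as
 * \code
 * c.applyOnTheLeft(qr.householderQ().adjoint());   // c = Q^* c
 * \endcode
 * which coincides with the transpose only in the real case.)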
*/ const HCoeffsType& hCoeffs() const { return m_hCoeffs; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } void computeInPlace(); - + MatrixType m_qr; HCoeffsType m_hCoeffs; RowVectorType m_temp; @@ -292,7 +291,7 @@ template struct householder_qr_inplace_blocked { - // This is specialized for MKL-supported Scalar types in HouseholderQR_MKL.h + // This is specialized for LAPACK-supported Scalar types in HouseholderQR_LAPACKE.h static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index maxBlockSize=32, typename MatrixQR::Scalar* tempData = 0) { @@ -354,11 +353,7 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c typename RhsType::PlainObject c(rhs); - // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T - c.applyOnTheLeft(householderSequence( - m_qr.leftCols(rank), - m_hCoeffs.head(rank)).transpose() - ); + c.applyOnTheLeft(householderQ().setLength(rank).adjoint() ); m_qr.topLeftCorner(rank, rank) .template triangularView() diff --git a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h index 953d57c9d..1a5c5254e 100644 --- a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +++ b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h @@ -220,7 +220,7 @@ class SPQR : public SparseSolverBase > /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the sparse QR can not be computed */ ComputationInfo info() const diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 25fca6f4d..4daa9dd21 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -11,7 +11,7 @@ // Copyright (C) 2013 Jean Ceccato // Copyright (C) 2013 Pierre Zoppitelli // Copyright (C) 2013 Jitse Niesen -// Copyright (C) 2014-2016 Gael Guennebaud +// Copyright (C) 2014-2017 Gael Guennebaud // // Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -22,6 +22,11 @@ // #define EIGEN_BDCSVD_DEBUG_VERBOSE // #define EIGEN_BDCSVD_SANITY_CHECKS +#ifdef EIGEN_BDCSVD_SANITY_CHECKS +#undef eigen_internal_assert +#define eigen_internal_assert(X) assert(X); +#endif + namespace Eigen { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE @@ -57,7 +62,7 @@ struct traits > * recommended and can several order of magnitude faster. * * \warning this algorithm is unlikely to provide accurate result when compiled with unsafe math optimizations. - * For instance, this concerns Intel's compiler (ICC), which perfroms such optimization by default unless + * For instance, this concerns Intel's compiler (ICC), which performs such optimization by default unless * you compile with the \c -fp-model \c precise option. Likewise, the \c -ffast-math option of GCC or clang will * significantly degrade the accuracy. 
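 *
 * A usage sketch (illustrative only, assuming a filled MatrixXd A and VectorXd b):
 * \code
 * BDCSVD<MatrixXd> svd(A, ComputeThinU | ComputeThinV);
 * VectorXd x = svd.solve(b);   // least-squares solution of A x = b
 * \endcode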
* @@ -77,6 +82,7 @@ public: typedef _MatrixType MatrixType; typedef typename MatrixType::Scalar Scalar; typedef typename NumTraits::Real RealScalar; + typedef typename NumTraits::Literal Literal; enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, ColsAtCompileTime = MatrixType::ColsAtCompileTime, @@ -211,7 +217,7 @@ public: // Method to allocate and initialize matrix and attributes template -void BDCSVD::allocate(Index rows, Index cols, unsigned int computationOptions) +void BDCSVD::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions) { m_isTranspose = (cols > rows); @@ -259,7 +265,7 @@ BDCSVD& BDCSVD::compute(const MatrixType& matrix, unsign //**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows RealScalar scale = matrix.cwiseAbs().maxCoeff(); - if(scale==RealScalar(0)) scale = RealScalar(1); + if(scale==Literal(0)) scale = Literal(1); MatrixX copy; if (m_isTranspose) copy = matrix.adjoint()/scale; else copy = matrix/scale; @@ -351,13 +357,13 @@ void BDCSVD::structured_update(Block A, co Index k1=0, k2=0; for(Index j=0; j::structured_update(Block A, co //@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix // to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper. template -void BDCSVD::divide (Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift) +void BDCSVD::divide (Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift) { // requires rows = cols + 1; using std::pow; @@ -449,11 +455,11 @@ void BDCSVD::divide (Index firstCol, Index lastCol, Index firstRowW, l = m_naiveU.row(1).segment(firstCol, k); f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1); } - if (m_compV) m_naiveV(firstRowW+k, firstColW) = 1; + if (m_compV) m_naiveV(firstRowW+k, firstColW) = Literal(1); if (r0::divide (Index firstCol, Index lastCol, Index firstRowW, // handling of round-off errors, be consistent in ordering // For instance, to solve the secular equation using FMM, see http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf template -void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) +void BDCSVD::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) { const RealScalar considerZero = (std::numeric_limits::min)(); using std::abs; ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n); m_workspace.head(n) = m_computed.block(firstCol, firstCol, n, n).diagonal(); ArrayRef diag = m_workspace.head(n); - diag(0) = 0; + diag(0) = Literal(0); // Allocate space for singular values and vectors singVals.resize(n); @@ -590,7 +596,7 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec // but others are interleaved and we must ignore them at this stage. 
// To this end, let's compute a permutation skipping them: Index actual_n = n; - while(actual_n>1 && diag(actual_n-1)==0) --actual_n; + while(actual_n>1 && diag(actual_n-1)==Literal(0)) {--actual_n; eigen_internal_assert(col0(actual_n)==Literal(0)); } Index m = 0; // size of the deflated problem for(Index k=0;kconsiderZero) @@ -617,13 +623,11 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec std::cout << " shift: " << shifts.transpose() << "\n"; { - Index actual_n = n; - while(actual_n>1 && abs(col0(actual_n-1))= 0).all()); std::cout << " check2 (>0) : " << ((singVals.array()-diag) / singVals.array()).head(actual_n).transpose() << "\n\n"; - std::cout << " check3 (>0) : " << ((diag.segment(1,actual_n-1)-singVals.head(actual_n-1).array()) / singVals.head(actual_n-1).array()).transpose() << "\n\n\n"; - std::cout << " check4 (>0) : " << ((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).transpose() << "\n\n\n"; + assert((((singVals.array()-diag) / singVals.array()).head(actual_n) >= 0).all()); } #endif @@ -651,13 +655,13 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec #endif #ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert(U.allFinite()); - assert(V.allFinite()); - assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 1e-14 * n); - assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 1e-14 * n); assert(m_naiveU.allFinite()); assert(m_naiveV.allFinite()); assert(m_computed.allFinite()); + assert(U.allFinite()); + assert(V.allFinite()); +// assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 100*NumTraits::epsilon() * n); +// assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 100*NumTraits::epsilon() * n); #endif // Because of deflation, the singular values might not be completely sorted. @@ -672,6 +676,15 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec if(m_compV) V.col(i).swap(V.col(i+1)); } } + +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + { + bool singular_values_sorted = (((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).array() >= 0).all(); + if(!singular_values_sorted) + std::cout << "Singular values are not sorted: " << singVals.segment(1,actual_n).transpose() << "\n"; + assert(singular_values_sorted); + } +#endif // Reverse order so that singular values in increased order // Because of deflation, the zeros singular-values are already at the end @@ -691,11 +704,13 @@ template typename BDCSVD::RealScalar BDCSVD::secularEq(RealScalar mu, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift) { Index m = perm.size(); - RealScalar res = 1; + RealScalar res = Literal(1); for(Index i=0; i::computeSingVals(const ArrayRef& col0, const ArrayRef& d { using std::abs; using std::swap; + using std::sqrt; Index n = col0.size(); Index actual_n = n; - while(actual_n>1 && col0(actual_n-1)==0) --actual_n; + // Note that here actual_n is computed based on col0(i)==0 instead of diag(i)==0 as above + // because 1) we have diag(i)==0 => col0(i)==0 and 2) if col0(i)==0, then diag(i) is already a singular value. 
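+ // (For reference, with z = col0 and d = diag as above: the singular values computed
+ // below are the roots of the secular equation
+ //    f(sigma) = 1 + sum_j z_j^2 / ((d_j - sigma) * (d_j + sigma)) = 0,
+ // which secularEq() evaluates in shifted form. Deflated entries (z_j == 0) drop out
+ // of the sum, hence skipping them here is safe.)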
+ while(actual_n>1 && col0(actual_n-1)==Literal(0)) --actual_n; for (Index k = 0; k < n; ++k) { - if (col0(k) == 0 || actual_n==1) + if (col0(k) == Literal(0) || actual_n==1) { // if col0(k) == 0, then entry is deflated, so singular value is on diagonal // if actual_n==1, then the deflated problem is already diagonalized singVals(k) = k==0 ? col0(0) : diag(k); - mus(k) = 0; + mus(k) = Literal(0); shifts(k) = k==0 ? col0(0) : diag(k); continue; } @@ -731,31 +749,36 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d right = (diag(actual_n-1) + col0.matrix().norm()); else { - // Skip deflated singular values + // Skip deflated singular values, + // recall that at this stage we assume that z[j]!=0 and all entries for which z[j]==0 have been put aside. + // This should be equivalent to using perm[] Index l = k+1; - while(col0(l)==0) { ++l; eigen_internal_assert(l 0) ? left : right; + RealScalar shift = (k == actual_n-1 || fMid > Literal(0)) ? left : right; // measure everything relative to shift Map diagShifted(m_workspace.data()+4*n, n); @@ -785,26 +808,29 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d // rational interpolation: fit a function of the form a / mu + b through the two previous // iterates and use its zero to compute the next iterate - bool useBisection = fPrev*fCur>0; - while (fCur!=0 && abs(muCur - muPrev) > 8 * NumTraits::epsilon() * numext::maxi(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits::epsilon() && !useBisection) + bool useBisection = fPrev*fCur>Literal(0); + while (fCur!=Literal(0) && abs(muCur - muPrev) > Literal(8) * NumTraits::epsilon() * numext::maxi(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits::epsilon() && !useBisection) { ++m_numIters; // Find a and b such that the function f(mu) = a / mu + b matches the current and previous samples. - RealScalar a = (fCur - fPrev) / (1/muCur - 1/muPrev); + RealScalar a = (fCur - fPrev) / (Literal(1)/muCur - Literal(1)/muPrev); RealScalar b = fCur - a / muCur; // And find mu such that f(mu)==0: RealScalar muZero = -a/b; RealScalar fZero = secularEq(muZero, col0, diag, perm, diagShifted, shift); + +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + assert((std::isfinite)(fZero)); +#endif muPrev = muCur; fPrev = fCur; muCur = muZero; fCur = fZero; - - if (shift == left && (muCur < 0 || muCur > right - left)) useBisection = true; - if (shift == right && (muCur < -(right - left) || muCur > 0)) useBisection = true; + if (shift == left && (muCur < Literal(0) || muCur > right - left)) useBisection = true; + if (shift == right && (muCur < -(right - left) || muCur > Literal(0))) useBisection = true; if (abs(fCur)>abs(fPrev)) useBisection = true; } @@ -817,37 +843,59 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d RealScalar leftShifted, rightShifted; if (shift == left) { - leftShifted = (std::numeric_limits::min)(); + // to avoid overflow, we must have mu > max(real_min, |z(k)|/sqrt(real_max)), + // the factor 2 is to be more conservative + leftShifted = numext::maxi( (std::numeric_limits::min)(), Literal(2) * abs(col0(k)) / sqrt((std::numeric_limits::max)()) ); + + // check that we did it right: + eigen_internal_assert( (numext::isfinite)( (col0(k)/leftShifted)*(col0(k)/(diag(k)+shift+leftShifted)) ) ); // I don't understand why the case k==0 would be special there: - // if (k == 0) rightShifted = right - left; else - rightShifted = (k==actual_n-1) ? 
right : ((right - left) * RealScalar(0.6)); // theoretically we can take 0.5, but let's be safe + // if (k == 0) rightShifted = right - left; else + rightShifted = (k==actual_n-1) ? right : ((right - left) * RealScalar(0.51)); // theoretically we can take 0.5, but let's be safe } else { - leftShifted = -(right - left) * RealScalar(0.6); - rightShifted = -(std::numeric_limits::min)(); + leftShifted = -(right - left) * RealScalar(0.51); + if(k+1( (std::numeric_limits::min)(), abs(col0(k+1)) / sqrt((std::numeric_limits::max)()) ); + else + rightShifted = -(std::numeric_limits::min)(); } - + RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift); -#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_DEBUG_VERBOSE +#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_SANITY_CHECKS RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift); #endif +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + if(!(std::isfinite)(fLeft)) + std::cout << "f(" << leftShifted << ") =" << fLeft << " ; " << left << " " << shift << " " << right << "\n"; + assert((std::isfinite)(fLeft)); + + if(!(std::isfinite)(fRight)) + std::cout << "f(" << rightShifted << ") =" << fRight << " ; " << left << " " << shift << " " << right << "\n"; +// assert((std::isfinite)(fRight)); +#endif + #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE if(!(fLeft * fRight<0)) { - std::cout << "fLeft: " << leftShifted << " - " << diagShifted.head(10).transpose() << "\n ; " << bool(left==shift) << " " << (left-shift) << "\n"; - std::cout << k << " : " << fLeft << " * " << fRight << " == " << fLeft * fRight << " ; " << left << " - " << right << " -> " << leftShifted << " " << rightShifted << " shift=" << shift << "\n"; + std::cout << "f(leftShifted) using leftShifted=" << leftShifted << " ; diagShifted(1:10):" << diagShifted.head(10).transpose() << "\n ; " + << "left==shift=" << bool(left==shift) << " ; left-shift = " << (left-shift) << "\n"; + std::cout << "k=" << k << ", " << fLeft << " * " << fRight << " == " << fLeft * fRight << " ; " + << "[" << left << " .. 
" << right << "] -> [" << leftShifted << " " << rightShifted << "], shift=" << shift << " , f(right)=" << secularEq(0, col0, diag, perm, diagShifted, shift) << " == " << secularEq(right, col0, diag, perm, diag, 0) << "\n"; } #endif - eigen_internal_assert(fLeft * fRight < 0); + eigen_internal_assert(fLeft * fRight < Literal(0)); - while (rightShifted - leftShifted > 2 * NumTraits::epsilon() * numext::maxi(abs(leftShifted), abs(rightShifted))) + while (rightShifted - leftShifted > Literal(2) * NumTraits::epsilon() * numext::maxi(abs(leftShifted), abs(rightShifted))) { - RealScalar midShifted = (leftShifted + rightShifted) / 2; + RealScalar midShifted = (leftShifted + rightShifted) / Literal(2); fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift); - if (fLeft * fMid < 0) + eigen_internal_assert((numext::isfinite)(fMid)); + + if (fLeft * fMid < Literal(0)) { rightShifted = midShifted; } @@ -858,13 +906,22 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d } } - muCur = (leftShifted + rightShifted) / 2; + muCur = (leftShifted + rightShifted) / Literal(2); } singVals[k] = shift + muCur; shifts[k] = shift; mus[k] = muCur; +#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE + if(k+1=singVals[k-1]); + assert(singVals[k]>=diag(k)); +#endif + // perturb singular value slightly if it equals diagonal entry to avoid division by zero later // (deflation is supposed to avoid this from happening) // - this does no seem to be necessary anymore - @@ -888,25 +945,53 @@ void BDCSVD::perturbCol0 zhat.setZero(); return; } - Index last = perm(m-1); + Index lastIdx = perm(m-1); // The offset permits to skip deflated entries while computing zhat for (Index k = 0; k < n; ++k) { - if (col0(k) == 0) // deflated - zhat(k) = 0; + if (col0(k) == Literal(0)) // deflated + zhat(k) = Literal(0); else { // see equation (3.6) RealScalar dk = diag(k); - RealScalar prod = (singVals(last) + dk) * (mus(last) + (shifts(last) - dk)); + RealScalar prod = (singVals(lastIdx) + dk) * (mus(lastIdx) + (shifts(lastIdx) - dk)); +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + if(prod<0) { + std::cout << "k = " << k << " ; z(k)=" << col0(k) << ", diag(k)=" << dk << "\n"; + std::cout << "prod = " << "(" << singVals(lastIdx) << " + " << dk << ") * (" << mus(lastIdx) << " + (" << shifts(lastIdx) << " - " << dk << "))" << "\n"; + std::cout << " = " << singVals(lastIdx) + dk << " * " << mus(lastIdx) + (shifts(lastIdx) - dk) << "\n"; + } + assert(prod>=0); +#endif for(Index l = 0; l=k && (l==0 || l-1>=m)) + { + std::cout << "Error in perturbCol0\n"; + std::cout << " " << k << "/" << n << " " << l << "/" << m << " " << i << "/" << n << " ; " << col0(k) << " " << diag(k) << " " << "\n"; + std::cout << " " <=0); +#endif #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE if(i!=k && std::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 ) std::cout << " " << ((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) << " == (" << (singVals(j)+dk) << " * " << (mus(j)+(shifts(j)-dk)) @@ -915,10 +1000,13 @@ void BDCSVD::perturbCol0 } } #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE - std::cout << "zhat(" << k << ") = sqrt( " << prod << ") ; " << (singVals(last) + dk) << " * " << mus(last) + shifts(last) << " - " << dk << "\n"; + std::cout << "zhat(" << k << ") = sqrt( " << prod << ") ; " << (singVals(lastIdx) + dk) << " * " << mus(lastIdx) + shifts(lastIdx) << " - " << dk << "\n"; #endif RealScalar tmp = sqrt(prod); - zhat(k) = col0(k) > 0 ? 
tmp : -tmp; +#ifdef EIGEN_BDCSVD_SANITY_CHECKS + assert((std::isfinite)(tmp)); +#endif + zhat(k) = col0(k) > Literal(0) ? tmp : -tmp; } } } @@ -934,7 +1022,7 @@ void BDCSVD::computeSingVecs for (Index k = 0; k < n; ++k) { - if (zhat(k) == 0) + if (zhat(k) == Literal(0)) { U.col(k) = VectorType::Unit(n+1, k); if (m_compV) V.col(k) = VectorType::Unit(n, k); @@ -947,7 +1035,7 @@ void BDCSVD::computeSingVecs Index i = perm(l); U(i,k) = zhat(i)/(((diag(i) - shifts(k)) - mus(k)) )/( (diag(i) + singVals[k])); } - U(n,k) = 0; + U(n,k) = Literal(0); U.col(k).normalize(); if (m_compV) @@ -958,7 +1046,7 @@ void BDCSVD::computeSingVecs Index i = perm(l); V(i,k) = diag(i) * zhat(i) / (((diag(i) - shifts(k)) - mus(k)) )/( (diag(i) + singVals[k])); } - V(0,k) = -1; + V(0,k) = Literal(-1); V.col(k).normalize(); } } @@ -971,7 +1059,7 @@ void BDCSVD::computeSingVecs // i >= 1, di almost null and zi non null. // We use a rotation to zero out zi applied to the left of M template -void BDCSVD::deflation43(Index firstCol, Index shift, Index i, Index size) +void BDCSVD::deflation43(Eigen::Index firstCol, Eigen::Index shift, Eigen::Index i, Eigen::Index size) { using std::abs; using std::sqrt; @@ -979,15 +1067,15 @@ void BDCSVD::deflation43(Index firstCol, Index shift, Index i, Index Index start = firstCol + shift; RealScalar c = m_computed(start, start); RealScalar s = m_computed(start+i, start); - RealScalar r = sqrt(numext::abs2(c) + numext::abs2(s)); - if (r == 0) + RealScalar r = numext::hypot(c,s); + if (r == Literal(0)) { - m_computed(start+i, start+i) = 0; + m_computed(start+i, start+i) = Literal(0); return; } m_computed(start,start) = r; - m_computed(start+i, start) = 0; - m_computed(start+i, start+i) = 0; + m_computed(start+i, start) = Literal(0); + m_computed(start+i, start+i) = Literal(0); JacobiRotation J(c/r,-s/r); if (m_compU) m_naiveU.middleRows(firstCol, size+1).applyOnTheRight(firstCol, firstCol+i, J); @@ -1000,7 +1088,7 @@ void BDCSVD::deflation43(Index firstCol, Index shift, Index i, Index // We apply two rotations to have zj = 0; // TODO deflation44 is still broken and not properly tested template -void BDCSVD::deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size) +void BDCSVD::deflation44(Eigen::Index firstColu , Eigen::Index firstColm, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index i, Eigen::Index j, Eigen::Index size) { using std::abs; using std::sqrt; @@ -1020,16 +1108,16 @@ void BDCSVD::deflation44(Index firstColu , Index firstColm, Index fi << m_computed(firstColm + i+1, firstColm+i+1) << " " << m_computed(firstColm + i+2, firstColm+i+2) << "\n"; #endif - if (r==0) + if (r==Literal(0)) { m_computed(firstColm + i, firstColm + i) = m_computed(firstColm + j, firstColm + j); return; } c/=r; s/=r; - m_computed(firstColm + i, firstColm) = r; + m_computed(firstColm + i, firstColm) = r; m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i); - m_computed(firstColm + j, firstColm) = 0; + m_computed(firstColm + j, firstColm) = Literal(0); JacobiRotation J(c,-s); if (m_compU) m_naiveU.middleRows(firstColu, size+1).applyOnTheRight(firstColu + i, firstColu + j, J); @@ -1040,7 +1128,7 @@ void BDCSVD::deflation44(Index firstColu , Index firstColm, Index fi // acts on block from (firstCol+shift, firstCol+shift) to (lastCol+shift, lastCol+shift) [inclusive] template -void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift) +void 
BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index k, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift) { using std::sqrt; using std::abs; @@ -1053,7 +1141,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index const RealScalar considerZero = (std::numeric_limits::min)(); RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff(); RealScalar epsilon_strict = numext::maxi(considerZero,NumTraits::epsilon() * maxDiag); - RealScalar epsilon_coarse = 8 * NumTraits::epsilon() * numext::maxi(col0.cwiseAbs().maxCoeff(), maxDiag); + RealScalar epsilon_coarse = Literal(8) * NumTraits::epsilon() * numext::maxi(col0.cwiseAbs().maxCoeff(), maxDiag); #ifdef EIGEN_BDCSVD_SANITY_CHECKS assert(m_naiveU.allFinite()); @@ -1081,7 +1169,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << "deflation 4.2, set z(" << i << ") to zero because " << abs(col0(i)) << " < " << epsilon_strict << " (diag(" << i << ")=" << diag(i) << ")\n"; #endif - col0(i) = 0; + col0(i) = Literal(0); } //condition 4.3 @@ -1101,6 +1189,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index #endif #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << "to be sorted: " << diag.transpose() << "\n\n"; + std::cout << " : " << col0.transpose() << "\n\n"; #endif { // Check for total deflation @@ -1191,7 +1280,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index if( (diag(i) - diag(i-1)) < NumTraits::epsilon()*maxDiag ) { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE - std::cout << "deflation 4.4 with i = " << i << " because " << (diag(i) - diag(i-1)) << " < " << NumTraits::epsilon()*diag(i) << "\n"; + std::cout << "deflation 4.4 with i = " << i << " because " << diag(i) << " - " << diag(i-1) << " == " << (diag(i) - diag(i-1)) << " < " << NumTraits::epsilon()*/*diag(i)*/maxDiag << "\n"; #endif eigen_internal_assert(abs(diag(i) - diag(i-1))::deflation(Index firstCol, Index lastCol, Index k, Index #endif }//end deflation -#ifndef __CUDACC__ +#if !defined(EIGEN_GPUCC) /** \svd_module * * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index 43488b1e0..1c7c80376 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -610,7 +610,7 @@ template class JacobiSVD }; template -void JacobiSVD::allocate(Index rows, Index cols, unsigned int computationOptions) +void JacobiSVD::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions) { eigen_assert(rows >= 0 && cols >= 0); diff --git a/Eigen/src/SVD/JacobiSVD_LAPACKE.h b/Eigen/src/SVD/JacobiSVD_LAPACKE.h index 50272154f..ff0516f61 100644 --- a/Eigen/src/SVD/JacobiSVD_LAPACKE.h +++ b/Eigen/src/SVD/JacobiSVD_LAPACKE.h @@ -61,9 +61,10 @@ JacobiSVD, ColPiv u = (LAPACKE_TYPE*)m_matrixU.data(); \ } else { ldu=1; u=&dummy; }\ MatrixType localV; \ - ldvt = (m_computeFullV) ? internal::convert_index(m_cols) : (m_computeThinV) ? internal::convert_index(m_diagSize) : 1; \ + lapack_int vt_rows = (m_computeFullV) ? internal::convert_index(m_cols) : (m_computeThinV) ? 
internal::convert_index(m_diagSize) : 1; \ if (computeV()) { \ - localV.resize(ldvt, m_cols); \ + localV.resize(vt_rows, m_cols); \ + ldvt = internal::convert_index(localV.outerStride()); \ vt = (LAPACKE_TYPE*)localV.data(); \ } else { ldvt=1; vt=&dummy; }\ Matrix superb; superb.resize(m_diagSize, 1); \ diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h index cc90a3b75..429414797 100644 --- a/Eigen/src/SVD/SVDBase.h +++ b/Eigen/src/SVD/SVDBase.h @@ -212,7 +212,6 @@ public: #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; #endif diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h index 0b1460894..997defc47 100644 --- a/Eigen/src/SVD/UpperBidiagonalization.h +++ b/Eigen/src/SVD/UpperBidiagonalization.h @@ -127,7 +127,7 @@ void upperbidiagonalization_inplace_unblocked(MatrixType& mat, .makeHouseholderInPlace(mat.coeffRef(k,k+1), upper_diagonal[k]); // apply householder transform to remaining part of mat on the left mat.bottomRightCorner(remainingRows-1, remainingCols) - .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).transpose(), mat.coeff(k,k+1), tempData); + .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).adjoint(), mat.coeff(k,k+1), tempData); } } @@ -159,6 +159,8 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, traits::Flags & RowMajorBit> > Y) { typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef typename NumTraits::Literal Literal; enum { StorageOrder = traits::Flags & RowMajorBit }; typedef InnerStride ColInnerStride; typedef InnerStride RowInnerStride; @@ -200,7 +202,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, { SubColumnType y_k( Y.col(k).tail(remainingCols) ); - // let's use the begining of column k of Y as a temporary vector + // let's use the beginning of column k of Y as a temporary vector SubColumnType tmp( Y.col(k).head(k) ); y_k.noalias() = A.block(k,k+1, remainingRows,remainingCols).adjoint() * v_k; // bottleneck tmp.noalias() = V_k1.adjoint() * v_k; @@ -229,7 +231,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, { SubColumnType x_k ( X.col(k).tail(remainingRows-1) ); - // let's use the begining of column k of X as a temporary vectors + // let's use the beginning of column k of X as a temporary vectors // note that tmp0 and tmp1 overlaps SubColumnType tmp0 ( X.col(k).head(k) ), tmp1 ( X.col(k).head(k+1) ); @@ -263,7 +265,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, SubMatType A10( A.block(bs,0, brows-bs,bs) ); SubMatType A01( A.block(0,bs, bs,bcols-bs) ); Scalar tmp = A01(bs-1,0); - A01(bs-1,0) = 1; + A01(bs-1,0) = Literal(1); A11.noalias() -= A10 * Y.topLeftCorner(bcols,bs).bottomRows(bcols-bs).adjoint(); A11.noalias() -= X.topLeftCorner(brows,bs).bottomRows(brows-bs) * A01; A01(bs-1,0) = tmp; diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index 2907f6529..b9ca94bc3 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -101,7 +101,7 @@ class SimplicialCholeskyBase : public SparseSolverBase /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. 
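// Aside (not part of the patch): the transpose()-to-adjoint() change in
// UpperBidiagonalization above only matters for complex scalars. A Householder
// reflector H = I - tau * v * v^* built from a row stays unitary only with the
// conjugated vector. A self-contained check using Eigen's public API:
#include <Eigen/Dense>
#include <complex>
#include <iostream>

int main()
{
  using Mat = Eigen::Matrix3cd;

  Eigen::Vector3cd x = Eigen::Vector3cd::Random();
  Eigen::Vector2cd essential;
  std::complex<double> tau;
  double beta;
  x.makeHouseholder(essential, tau, beta); // v = [1; essential], H*x = beta*e1

  Eigen::Vector3cd v;
  v << 1.0, essential;

  Mat H  = Mat::Identity() - tau * v * v.adjoint();   // unitary
  Mat Ht = Mat::Identity() - tau * v * v.transpose(); // loses unitarity (complex)

  std::cout << (H.adjoint()*H   - Mat::Identity()).norm() << "\n"; // ~1e-16
  std::cout << (Ht.adjoint()*Ht - Mat::Identity()).norm() << "\n"; // O(1)
}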
*/ ComputationInfo info() const diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h index 31e06995b..0aa92f8bc 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h @@ -5,7 +5,7 @@ /* -NOTE: thes functions vave been adapted from the LDL library: +NOTE: these functions have been adapted from the LDL library: LDL Copyright (c) 2005 by Timothy A. Davis. All Rights Reserved. @@ -122,7 +122,7 @@ void SimplicialCholeskyBase::factorize_preordered(const CholMatrixType& for(StorageIndex k = 0; k < size; ++k) { // compute nonzero pattern of kth row of L, in topological order - y[k] = 0.0; // Y(0:k) is now all zero + y[k] = Scalar(0); // Y(0:k) is now all zero StorageIndex top = size; // stack for pattern is empty tags[k] = k; // mark node k as visited m_nonZerosPerCol[k] = 0; // count of nonzeros in column k of L @@ -146,12 +146,12 @@ void SimplicialCholeskyBase::factorize_preordered(const CholMatrixType& /* compute numerical values kth row of L (a sparse triangular solve) */ RealScalar d = numext::real(y[k]) * m_shiftScale + m_shiftOffset; // get D(k,k), apply the shift function, and clear Y(k) - y[k] = 0.0; + y[k] = Scalar(0); for(; top < size; ++top) { Index i = pattern[top]; /* pattern[top:n-1] is pattern of L(:,k) */ Scalar yi = y[i]; /* get and clear Y(i) */ - y[i] = 0.0; + y[i] = Scalar(0); /* the nonzero entry L(k,i) */ Scalar l_ki; diff --git a/Eigen/src/SparseCore/AmbiVector.h b/Eigen/src/SparseCore/AmbiVector.h index 8a5cc91f2..e0295f2af 100644 --- a/Eigen/src/SparseCore/AmbiVector.h +++ b/Eigen/src/SparseCore/AmbiVector.h @@ -94,7 +94,7 @@ class AmbiVector Index allocSize = m_allocatedElements * sizeof(ListEl); allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar); Scalar* newBuffer = new Scalar[allocSize]; - memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl)); + std::memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl)); delete[] m_buffer; m_buffer = newBuffer; } diff --git a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h index 492eb0a29..9db119b67 100644 --- a/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +++ b/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h @@ -17,7 +17,9 @@ namespace internal { template static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res, bool sortedInsertion = false) { - typedef typename remove_all::type::Scalar Scalar; + typedef typename remove_all::type::Scalar LhsScalar; + typedef typename remove_all::type::Scalar RhsScalar; + typedef typename remove_all::type::Scalar ResScalar; // make sure to call innerSize/outerSize since we fake the storage order. 
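// Aside (not part of the patch): the 0.0 -> Scalar(0) edits above look
// cosmetic, but they are what lets factorize_preordered() instantiate for
// scalar types that do not implicitly convert from double (multiprecision or
// autodiff scalars, say). The pattern distilled, with hypothetical names:
#include <complex>
#include <cstddef>
#include <vector>

template <typename Scalar>
Scalar sparseDotSketch(const std::vector<int>& idx,
                       const std::vector<Scalar>& vals,
                       const std::vector<Scalar>& dense)
{
  Scalar acc = Scalar(0); // not 'Scalar acc = 0.0;'
  for (std::size_t p = 0; p < idx.size(); ++p)
    acc += vals[p] * dense[idx[p]];
  return acc;
}

int main()
{
  std::vector<int> idx{0, 2};
  std::vector<std::complex<float> > vals {{1.f, 1.f}, {2.f, 0.f}};
  std::vector<std::complex<float> > dense{{1.f, 0.f}, {0.f, 0.f}, {0.f, 3.f}};
  std::complex<float> r = sparseDotSketch(idx, vals, dense);
  (void)r; // r = (1+1i)*1 + 2*(3i) = 1 + 7i
}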
Index rows = lhs.innerSize(); @@ -25,7 +27,7 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r eigen_assert(lhs.outerSize() == rhs.innerSize()); ei_declare_aligned_stack_constructed_variable(bool, mask, rows, 0); - ei_declare_aligned_stack_constructed_variable(Scalar, values, rows, 0); + ei_declare_aligned_stack_constructed_variable(ResScalar, values, rows, 0); ei_declare_aligned_stack_constructed_variable(Index, indices, rows, 0); std::memset(mask,0,sizeof(bool)*rows); @@ -51,12 +53,12 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r Index nnz = 0; for (typename evaluator::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt) { - Scalar y = rhsIt.value(); + RhsScalar y = rhsIt.value(); Index k = rhsIt.index(); for (typename evaluator::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt) { Index i = lhsIt.index(); - Scalar x = lhsIt.value(); + LhsScalar x = lhsIt.value(); if(!mask[i]) { mask[i] = true; @@ -166,11 +168,12 @@ struct conservative_sparse_sparse_product_selector RowMajorMatrix; - RowMajorMatrix rhsRow = rhs; - RowMajorMatrix resRow(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl(rhsRow, lhs, resRow); - res = resRow; + typedef SparseMatrix RowMajorRhs; + typedef SparseMatrix RowMajorRes; + RowMajorRhs rhsRow = rhs; + RowMajorRes resRow(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl(rhsRow, lhs, resRow); + res = resRow; } }; @@ -179,10 +182,11 @@ struct conservative_sparse_sparse_product_selector RowMajorMatrix; - RowMajorMatrix lhsRow = lhs; - RowMajorMatrix resRow(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl(rhs, lhsRow, resRow); + typedef SparseMatrix RowMajorLhs; + typedef SparseMatrix RowMajorRes; + RowMajorLhs lhsRow = lhs; + RowMajorRes resRow(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl(rhs, lhsRow, resRow); res = resRow; } }; @@ -219,10 +223,11 @@ struct conservative_sparse_sparse_product_selector ColMajorMatrix; - ColMajorMatrix lhsCol = lhs; - ColMajorMatrix resCol(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl(lhsCol, rhs, resCol); + typedef SparseMatrix ColMajorLhs; + typedef SparseMatrix ColMajorRes; + ColMajorLhs lhsCol = lhs; + ColMajorRes resCol(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl(lhsCol, rhs, resCol); res = resCol; } }; @@ -232,10 +237,11 @@ struct conservative_sparse_sparse_product_selector ColMajorMatrix; - ColMajorMatrix rhsCol = rhs; - ColMajorMatrix resCol(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl(lhs, rhsCol, resCol); + typedef SparseMatrix ColMajorRhs; + typedef SparseMatrix ColMajorRes; + ColMajorRhs rhsCol = rhs; + ColMajorRes resCol(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl(lhs, rhsCol, resCol); res = resCol; } }; @@ -263,7 +269,8 @@ namespace internal { template static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res) { - typedef typename remove_all::type::Scalar Scalar; + typedef typename remove_all::type::Scalar LhsScalar; + typedef typename remove_all::type::Scalar RhsScalar; Index cols = rhs.outerSize(); eigen_assert(lhs.outerSize() == rhs.innerSize()); @@ -274,12 +281,12 @@ static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, { for (typename evaluator::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt) { - Scalar y = rhsIt.value(); + RhsScalar y = rhsIt.value(); 
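// Aside (not part of the patch): the kernel being retyped in this hunk is a
// classic Gustavson sparse-product step: a dense boolean mask marks live
// output rows, a dense accumulator collects values, and an index list records
// the pattern. One output column over plain CSC arrays (illustrative sketch,
// not Eigen's evaluator-based code):
#include <cstddef>
#include <cstdio>
#include <vector>

struct CscSketch {
  std::vector<int>    ptr, idx; // column pointers, row indices
  std::vector<double> val;
  int rows;
};

void productColumn(const CscSketch& A, const CscSketch& B, int j,
                   std::vector<int>& outIdx, std::vector<double>& outVal)
{
  std::vector<bool>   mask(A.rows, false);
  std::vector<double> acc(A.rows, 0.0);
  for (int p = B.ptr[j]; p < B.ptr[j+1]; ++p) {
    double y = B.val[p];                    // like rhsIt.value() above
    int    k = B.idx[p];
    for (int q = A.ptr[k]; q < A.ptr[k+1]; ++q) {
      int i = A.idx[q];                     // like lhsIt.index() above
      if (!mask[i]) { mask[i] = true; outIdx.push_back(i); acc[i]  = A.val[q]*y; }
      else          {                                      acc[i] += A.val[q]*y; }
    }
  }
  for (std::size_t t = 0; t < outIdx.size(); ++t)
    outVal.push_back(acc[outIdx[t]]);       // unsorted, like the fast path
}

int main()
{
  CscSketch I{{0,1,2}, {0,1}, {1.0,1.0}, 2}; // 2x2 identity
  std::vector<int> ci; std::vector<double> cv;
  productColumn(I, I, 0, ci, cv);
  std::printf("C(:,0): nnz=%zu, row %d = %g\n", cv.size(), ci[0], cv[0]);
}
// Splitting Scalar into LhsScalar/RhsScalar/ResScalar in the real kernel just
// lets the two operands carry different scalar types through this algorithm.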
Index k = rhsIt.index(); for (typename evaluator::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt) { Index i = lhsIt.index(); - Scalar x = lhsIt.value(); + LhsScalar x = lhsIt.value(); res.coeffRef(i,j) += x * y; } } @@ -310,9 +317,9 @@ struct sparse_sparse_to_dense_product_selector ColMajorMatrix; - ColMajorMatrix lhsCol(lhs); - internal::sparse_sparse_to_dense_product_impl(lhsCol, rhs, res); + typedef SparseMatrix ColMajorLhs; + ColMajorLhs lhsCol(lhs); + internal::sparse_sparse_to_dense_product_impl(lhsCol, rhs, res); } }; @@ -321,9 +328,9 @@ struct sparse_sparse_to_dense_product_selector ColMajorMatrix; - ColMajorMatrix rhsCol(rhs); - internal::sparse_sparse_to_dense_product_impl(lhs, rhsCol, res); + typedef SparseMatrix ColMajorRhs; + ColMajorRhs rhsCol(rhs); + internal::sparse_sparse_to_dense_product_impl(lhs, rhsCol, res); } }; diff --git a/Eigen/src/SparseCore/SparseAssign.h b/Eigen/src/SparseCore/SparseAssign.h index 18352a847..113463258 100644 --- a/Eigen/src/SparseCore/SparseAssign.h +++ b/Eigen/src/SparseCore/SparseAssign.h @@ -83,7 +83,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src) // eval without temporary dst.resize(src.rows(), src.cols()); dst.setZero(); - dst.reserve((std::max)(src.rows(),src.cols())*2); + dst.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2)); for (Index j=0; j -typename SparseMatrixBase::InnerVectorReturnType SparseMatrixBase::innerVector(Index outer) -{ return InnerVectorReturnType(derived(), outer); } - -/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this - * is col-major (resp. row-major). Read-only. - */ -template -const typename SparseMatrixBase::ConstInnerVectorReturnType SparseMatrixBase::innerVector(Index outer) const -{ return ConstInnerVectorReturnType(derived(), outer); } - -/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this - * is col-major (resp. row-major). - */ -template -typename SparseMatrixBase::InnerVectorsReturnType -SparseMatrixBase::innerVectors(Index outerStart, Index outerSize) -{ - return Block(derived(), - IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, - IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize); - -} - -/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this - * is col-major (resp. row-major). Read-only. - */ -template -const typename SparseMatrixBase::ConstInnerVectorsReturnType -SparseMatrixBase::innerVectors(Index outerStart, Index outerSize) const -{ - return Block(derived(), - IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, - IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize); - -} - /** Generic implementation of sparse Block expression. * Real-only. 
*/ diff --git a/Eigen/src/SparseCore/SparseDenseProduct.h b/Eigen/src/SparseCore/SparseDenseProduct.h index 0547db596..f005a18a1 100644 --- a/Eigen/src/SparseCore/SparseDenseProduct.h +++ b/Eigen/src/SparseCore/SparseDenseProduct.h @@ -88,10 +88,11 @@ struct sparse_time_dense_product_impl::type Lhs; typedef typename internal::remove_all::type Rhs; typedef typename internal::remove_all::type Res; - typedef typename evaluator::InnerIterator LhsInnerIterator; + typedef evaluator LhsEval; + typedef typename LhsEval::InnerIterator LhsInnerIterator; static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha) { - evaluator lhsEval(lhs); + LhsEval lhsEval(lhs); for(Index c=0; c::type Lhs; typedef typename internal::remove_all::type Rhs; typedef typename internal::remove_all::type Res; - typedef typename evaluator::InnerIterator LhsInnerIterator; + typedef evaluator LhsEval; + typedef typename LhsEval::InnerIterator LhsInnerIterator; static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha) { - evaluator lhsEval(lhs); - for(Index j=0; j1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000) { - typename Res::RowXpr res_j(res.row(j)); - for(LhsInnerIterator it(lhsEval,j); it ;++it) - res_j += (alpha*it.value()) * rhs.row(it.index()); + #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads) + for(Index i=0; i::dummy_precision()) { prune(default_prunning_func(reference,epsilon)); @@ -604,9 +604,9 @@ class SparseMatrix m_outerIndex = newOuterIndex; if (outerChange > 0) { - StorageIndex last = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize]; + StorageIndex lastIdx = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize]; for(Index i=m_outerSize; i template diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h index c6b548f11..229449f02 100644 --- a/Eigen/src/SparseCore/SparseMatrixBase.h +++ b/Eigen/src/SparseCore/SparseMatrixBase.h @@ -87,6 +87,11 @@ template class SparseMatrixBase * we are dealing with a column-vector (if there is only one column) or with * a row-vector (if there is only one row). */ + NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2, + /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors, + * and 2 for matrices. + */ + Flags = internal::traits::Flags, /**< This stores expression \ref flags flags which may or may not be inherited by new expressions * constructed from this one. See the \ref flags "list of flags". 
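// Aside (not part of the patch): the new OpenMP path above parallelizes the
// row-major sparse times dense kernel over output rows, and only when
// nonZerosEstimate()*rhs.cols() suggests enough work to amortize the threads.
// The same idea over plain CSR arrays; the chunk size 64 is an arbitrary
// stand-in for Eigen's (n+threads*4-1)/(threads*4) heuristic:
#include <vector>

void csrMatVecSketch(const std::vector<int>& ptr, const std::vector<int>& idx,
                     const std::vector<double>& val, const std::vector<double>& x,
                     std::vector<double>& y)
{
  const int n = static_cast<int>(ptr.size()) - 1;
  // Dynamic schedule: rows can have very different nonzero counts.
  #pragma omp parallel for schedule(dynamic, 64)
  for (int i = 0; i < n; ++i) {
    double acc = 0.0;
    for (int p = ptr[i]; p < ptr[i+1]; ++p)
      acc += val[p] * x[idx[p]];
    y[i] = acc;
  }
}

int main()
{
  std::vector<int> ptr{0, 1, 2}, idx{0, 1};
  std::vector<double> val{2.0, 3.0}, x{1.0, 1.0}, y(2, 0.0);
  csrMatVecSketch(ptr, idx, val, x, y); // y = diag(2,3)*x = (2, 3)
}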
@@ -350,18 +355,6 @@ template class SparseMatrixBase const ConstTransposeReturnType transpose() const { return ConstTransposeReturnType(derived()); } const AdjointReturnType adjoint() const { return AdjointReturnType(transpose()); } - // inner-vector - typedef Block InnerVectorReturnType; - typedef Block ConstInnerVectorReturnType; - InnerVectorReturnType innerVector(Index outer); - const ConstInnerVectorReturnType innerVector(Index outer) const; - - // set of inner-vectors - typedef Block InnerVectorsReturnType; - typedef Block ConstInnerVectorsReturnType; - InnerVectorsReturnType innerVectors(Index outerStart, Index outerSize); - const ConstInnerVectorsReturnType innerVectors(Index outerStart, Index outerSize) const; - DenseMatrixType toDense() const { return DenseMatrixType(derived()); diff --git a/Eigen/src/SparseCore/SparseProduct.h b/Eigen/src/SparseCore/SparseProduct.h index 4cbf68781..c495a7398 100644 --- a/Eigen/src/SparseCore/SparseProduct.h +++ b/Eigen/src/SparseCore/SparseProduct.h @@ -17,7 +17,7 @@ namespace Eigen { * The automatic pruning of the small values can be achieved by calling the pruned() function * in which case a totally different product algorithm is employed: * \code - * C = (A*B).pruned(); // supress numerical zeros (exact) + * C = (A*B).pruned(); // suppress numerical zeros (exact) * C = (A*B).pruned(ref); * C = (A*B).pruned(ref,epsilon); * \endcode diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h index 9e39be738..65611b3d4 100644 --- a/Eigen/src/SparseCore/SparseSelfAdjointView.h +++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h @@ -47,6 +47,7 @@ template class SparseSelfAdjointView enum { Mode = _Mode, + TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0), RowsAtCompileTime = internal::traits::RowsAtCompileTime, ColsAtCompileTime = internal::traits::ColsAtCompileTime }; @@ -310,7 +311,7 @@ inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, cons while (i && i.index() dstT(dst); - internal::sparse_selfadjoint_time_dense_product(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha); + internal::sparse_selfadjoint_time_dense_product(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha); } }; diff --git a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h index 21c419002..88820a48f 100644 --- a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +++ b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h @@ -21,7 +21,8 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r { // return sparse_sparse_product_with_pruning_impl2(lhs,rhs,res); - typedef typename remove_all::type::Scalar Scalar; + typedef typename remove_all::type::Scalar RhsScalar; + typedef typename remove_all::type::Scalar ResScalar; typedef typename remove_all::type::StorageIndex StorageIndex; // make sure to call innerSize/outerSize since we fake the storage order. 
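// Aside (not part of the patch): the supress -> suppress spelling fix above
// sits in the documentation of the public pruned() product API. In use (sizes
// and thresholds illustrative):
#include <Eigen/Sparse>

int main()
{
  typedef Eigen::SparseMatrix<double> SpMat;
  SpMat A(100,100), B(100,100), C;
  A.insert(0,0) = 1.0;  A.insert(1,0) = -1.0;
  B.insert(0,0) = 1.0;  B.insert(0,1) =  1.0;
  A.makeCompressed(); B.makeCompressed();

  C = (A*B).pruned();             // suppress exact numerical zeros
  C = (A*B).pruned(1.0, 1e-12);   // suppress |c_ij| <= 1.0 * 1e-12
}
// Unlike the default conservative product, the pruned variant rebuilds the
// nonzero pattern from the computed values, dropping entries below threshold.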
@@ -31,7 +32,7 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r eigen_assert(lhs.outerSize() == rhs.innerSize()); // allocate a temporary buffer - AmbiVector tempVector(rows); + AmbiVector tempVector(rows); // mimics a resizeByInnerOuter: if(ResultType::IsRowMajor) @@ -63,14 +64,14 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r { // FIXME should be written like this: tmp += rhsIt.value() * lhs.col(rhsIt.index()) tempVector.restart(); - Scalar x = rhsIt.value(); + RhsScalar x = rhsIt.value(); for (typename evaluator::InnerIterator lhsIt(lhsEval, rhsIt.index()); lhsIt; ++lhsIt) { tempVector.coeffRef(lhsIt.index()) += lhsIt.value() * x; } } res.startVec(j); - for (typename AmbiVector::Iterator it(tempVector,tolerance); it; ++it) + for (typename AmbiVector::Iterator it(tempVector,tolerance); it; ++it) res.insertBackByOuterInner(j,it.index()) = it.value(); } res.finalize(); @@ -85,7 +86,6 @@ struct sparse_sparse_product_with_pruning_selector; template struct sparse_sparse_product_with_pruning_selector { - typedef typename traits::type>::Scalar Scalar; typedef typename ResultType::RealScalar RealScalar; static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) @@ -129,8 +129,8 @@ struct sparse_sparse_product_with_pruning_selector ColMajorMatrixLhs; - typedef SparseMatrix ColMajorMatrixRhs; + typedef SparseMatrix ColMajorMatrixLhs; + typedef SparseMatrix ColMajorMatrixRhs; ColMajorMatrixLhs colLhs(lhs); ColMajorMatrixRhs colRhs(rhs); internal::sparse_sparse_product_with_pruning_impl(colLhs, colRhs, res, tolerance); @@ -149,7 +149,7 @@ struct sparse_sparse_product_with_pruning_selector RowMajorMatrixLhs; + typedef SparseMatrix RowMajorMatrixLhs; RowMajorMatrixLhs rowLhs(lhs); sparse_sparse_product_with_pruning_selector(rowLhs,rhs,res,tolerance); } @@ -161,7 +161,7 @@ struct sparse_sparse_product_with_pruning_selector RowMajorMatrixRhs; + typedef SparseMatrix RowMajorMatrixRhs; RowMajorMatrixRhs rowRhs(rhs); sparse_sparse_product_with_pruning_selector(lhs,rowRhs,res,tolerance); } @@ -173,7 +173,7 @@ struct sparse_sparse_product_with_pruning_selector ColMajorMatrixRhs; + typedef SparseMatrix ColMajorMatrixRhs; ColMajorMatrixRhs colRhs(rhs); internal::sparse_sparse_product_with_pruning_impl(lhs, colRhs, res, tolerance); } @@ -185,7 +185,7 @@ struct sparse_sparse_product_with_pruning_selector ColMajorMatrixLhs; + typedef SparseMatrix ColMajorMatrixLhs; ColMajorMatrixLhs colLhs(lhs); internal::sparse_sparse_product_with_pruning_impl(colLhs, rhs, res, tolerance); } diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h index 19b0fbc9d..05779be68 100644 --- a/Eigen/src/SparseCore/SparseVector.h +++ b/Eigen/src/SparseCore/SparseVector.h @@ -281,7 +281,7 @@ class SparseVector } /** Swaps the values of \c *this and \a other. - * Overloaded for performance: this version performs a \em shallow swap by swaping pointers and attributes only. + * Overloaded for performance: this version performs a \em shallow swap by swapping pointers and attributes only. * \sa SparseMatrixBase::swap() */ inline void swap(SparseVector& other) diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h index f883ab383..91b6369ac 100644 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -193,7 +193,7 @@ class SparseLU : public SparseSolverBase >, /** \brief Reports whether previous computation was successful. 
* - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the LU factorization reports a problem, zero diagonal for instance * \c InvalidInput if the input matrix is invalid * @@ -499,11 +499,8 @@ void SparseLU::factorize(const MatrixType& matrix) eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); eigen_assert((matrix.rows() == matrix.cols()) && "Only for squared matrices"); - typedef typename IndexVector::Scalar StorageIndex; - m_isInitialized = true; - // Apply the column permutation computed in analyzepattern() // m_mat = matrix * m_perm_c.inverse(); m_mat = matrix; @@ -706,8 +703,8 @@ struct SparseLUMatrixLReturnType : internal::no_assignment_operator typedef typename MappedSupernodalType::Scalar Scalar; explicit SparseLUMatrixLReturnType(const MappedSupernodalType& mapL) : m_mapL(mapL) { } - Index rows() { return m_mapL.rows(); } - Index cols() { return m_mapL.cols(); } + Index rows() const { return m_mapL.rows(); } + Index cols() const { return m_mapL.cols(); } template void solveInPlace( MatrixBase &X) const { @@ -723,8 +720,8 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator SparseLUMatrixUReturnType(const MatrixLType& mapL, const MatrixUType& mapU) : m_mapL(mapL),m_mapU(mapU) { } - Index rows() { return m_mapL.rows(); } - Index cols() { return m_mapL.cols(); } + Index rows() const { return m_mapL.rows(); } + Index cols() const { return m_mapL.cols(); } template void solveInPlace(MatrixBase &X) const { @@ -747,8 +744,9 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator } else { + // FIXME: the following lines should use Block expressions and not Map! Map, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + Map< Matrix, 0, OuterStride<> > U (&(X.coeffRef(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); U = A.template triangularView().solve(U); } diff --git a/Eigen/src/SparseLU/SparseLU_Memory.h b/Eigen/src/SparseLU/SparseLU_Memory.h index 4dc42e87b..349bfd585 100644 --- a/Eigen/src/SparseLU/SparseLU_Memory.h +++ b/Eigen/src/SparseLU/SparseLU_Memory.h @@ -51,7 +51,7 @@ inline Index LUTempSpace(Index&m, Index& w) /** - * Expand the existing storage to accomodate more fill-ins + * Expand the existing storage to accommodate more fill-ins * \param vec Valid pointer to the vector to allocate or expand * \param[in,out] length At input, contain the current length of the vector that is to be increased. 
At output, length of the newly allocated vector * \param[in] nbElts Current number of elements in the factors diff --git a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h index 721e1883b..8583b1b69 100644 --- a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +++ b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h @@ -75,12 +75,12 @@ class MappedSuperNodalMatrix /** * Number of rows */ - Index rows() { return m_row; } + Index rows() const { return m_row; } /** * Number of columns */ - Index cols() { return m_col; } + Index cols() const { return m_col; } /** * Return the array of nonzero values packed by column diff --git a/Eigen/src/SparseLU/SparseLU_column_dfs.h b/Eigen/src/SparseLU/SparseLU_column_dfs.h index c98b30e32..5a2c941b4 100644 --- a/Eigen/src/SparseLU/SparseLU_column_dfs.h +++ b/Eigen/src/SparseLU/SparseLU_column_dfs.h @@ -151,7 +151,7 @@ Index SparseLUImpl::column_dfs(const Index m, const Index j StorageIndex ito = glu.xlsub(fsupc+1); glu.xlsub(jcolm1) = ito; StorageIndex istop = ito + jptr - jm1ptr; - xprune(jcolm1) = istop; // intialize xprune(jcol-1) + xprune(jcolm1) = istop; // initialize xprune(jcol-1) glu.xlsub(jcol) = istop; for (StorageIndex ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito) @@ -166,7 +166,7 @@ Index SparseLUImpl::column_dfs(const Index m, const Index j // Tidy up the pointers before exit glu.xsup(nsuper+1) = jcolp1; glu.supno(jcolp1) = nsuper; - xprune(jcol) = StorageIndex(nextl); // Intialize upper bound for pruning + xprune(jcol) = StorageIndex(nextl); // Initialize upper bound for pruning glu.xlsub(jcolp1) = StorageIndex(nextl); return 0; diff --git a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h index 95ba7413f..e37c2fe0d 100644 --- a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +++ b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h @@ -215,7 +215,7 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const if(RK==4){ a3 = pload(A3+i+(I+1)*PacketSize); }\ pstore(C0+i+(I)*PacketSize, c0); - // agressive vectorization and peeling + // aggressive vectorization and peeling for(Index i=0; i @@ -196,9 +197,9 @@ class SparseQR : public SparseSolverBase > Index rank = this->rank(); - // Compute Q^T * b; + // Compute Q^* * b; typename Dest::PlainObject y, b; - y = this->matrixQ().transpose() * B; + y = this->matrixQ().adjoint() * B; b = y; // Solve with the triangular matrix R @@ -604,7 +605,7 @@ struct SparseQR_QProduct : ReturnByValue=0; k--) + Index start_k = internal::is_identity::value ? 
numext::mini(j,diagSize-1) : diagSize-1; + for (Index k = start_k; k >=0; k--) { Scalar tau = Scalar(0); tau = m_qr.m_Q.col(k).dot(res.col(j)); if(tau==Scalar(0)) continue; - tau = tau * m_qr.m_hcoeffs(k); + tau = tau * numext::conj(m_qr.m_hcoeffs(k)); res.col(j) -= tau * m_qr.m_Q.col(k); } } @@ -650,7 +655,7 @@ struct SparseQR_QProduct : ReturnByValue @@ -668,13 +673,14 @@ struct SparseQRMatrixQReturnType : public EigenBase(m_qr,other.derived(),false); } + // To use for operations with the adjoint of Q SparseQRMatrixQTransposeReturnType adjoint() const { return SparseQRMatrixQTransposeReturnType(m_qr); } inline Index rows() const { return m_qr.rows(); } - inline Index cols() const { return (std::min)(m_qr.rows(),m_qr.cols()); } - // To use for operations with the transpose of Q + inline Index cols() const { return m_qr.rows(); } + // To use for operations with the transpose of Q FIXME this is the same as adjoint at the moment SparseQRMatrixQTransposeReturnType transpose() const { return SparseQRMatrixQTransposeReturnType(m_qr); @@ -682,6 +688,7 @@ struct SparseQRMatrixQReturnType : public EigenBase struct SparseQRMatrixQTransposeReturnType { @@ -712,7 +719,7 @@ struct Assignment, internal: typedef typename DstXprType::StorageIndex StorageIndex; static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &/*func*/) { - typename DstXprType::PlainObject idMat(src.m_qr.rows(), src.m_qr.rows()); + typename DstXprType::PlainObject idMat(src.rows(), src.cols()); idMat.setIdentity(); // Sort the sparse householder reflectors if needed const_cast(&src.m_qr)->_sort_matrix_Q(); diff --git a/Eigen/src/StlSupport/StdDeque.h b/Eigen/src/StlSupport/StdDeque.h index cf1fedf92..045da7b4d 100644 --- a/Eigen/src/StlSupport/StdDeque.h +++ b/Eigen/src/StlSupport/StdDeque.h @@ -36,7 +36,7 @@ namespace std \ deque(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : deque_base(first, last, a) {} \ deque(const deque& c) : deque_base(c) {} \ explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \ - deque(iterator start, iterator end) : deque_base(start, end) {} \ + deque(iterator start_, iterator end_) : deque_base(start_, end_) {} \ deque& operator=(const deque& x) { \ deque_base::operator=(x); \ return *this; \ @@ -62,7 +62,7 @@ namespace std { : deque_base(first, last, a) {} \ deque(const deque& c) : deque_base(c) {} \ explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \ - deque(iterator start, iterator end) : deque_base(start, end) {} \ + deque(iterator start_, iterator end_) : deque_base(start_, end_) {} \ deque& operator=(const deque& x) { \ deque_base::operator=(x); \ return *this; \ diff --git a/Eigen/src/StlSupport/StdList.h b/Eigen/src/StlSupport/StdList.h index e1eba4985..8ba3fada0 100644 --- a/Eigen/src/StlSupport/StdList.h +++ b/Eigen/src/StlSupport/StdList.h @@ -35,7 +35,7 @@ namespace std \ list(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : list_base(first, last, a) {} \ list(const list& c) : list_base(c) {} \ explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \ - list(iterator start, iterator end) : list_base(start, end) {} \ + list(iterator start_, iterator end_) : list_base(start_, end_) {} \ list& operator=(const list& x) { \ list_base::operator=(x); \ return *this; \ @@ -62,7 +62,7 @@ namespace std : list_base(first, last, a) {} \ list(const list& c) : list_base(c) {} \ 
explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \ - list(iterator start, iterator end) : list_base(start, end) {} \ + list(iterator start_, iterator end_) : list_base(start_, end_) {} \ list& operator=(const list& x) { \ list_base::operator=(x); \ return *this; \ diff --git a/Eigen/src/StlSupport/StdVector.h b/Eigen/src/StlSupport/StdVector.h index ec22821d2..9fcf19bce 100644 --- a/Eigen/src/StlSupport/StdVector.h +++ b/Eigen/src/StlSupport/StdVector.h @@ -36,7 +36,7 @@ namespace std \ vector(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : vector_base(first, last, a) {} \ vector(const vector& c) : vector_base(c) {} \ explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \ - vector(iterator start, iterator end) : vector_base(start, end) {} \ + vector(iterator start_, iterator end_) : vector_base(start_, end_) {} \ vector& operator=(const vector& x) { \ vector_base::operator=(x); \ return *this; \ @@ -62,7 +62,7 @@ namespace std { : vector_base(first, last, a) {} \ vector(const vector& c) : vector_base(c) {} \ explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \ - vector(iterator start, iterator end) : vector_base(start, end) {} \ + vector(iterator start_, iterator end_) : vector_base(start_, end_) {} \ vector& operator=(const vector& x) { \ vector_base::operator=(x); \ return *this; \ diff --git a/Eigen/src/SuperLUSupport/SuperLUSupport.h b/Eigen/src/SuperLUSupport/SuperLUSupport.h index 50a69f306..354e33de5 100644 --- a/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -297,8 +297,8 @@ SluMatrix asSluMatrix(MatrixType& mat) template MappedSparseMatrix map_superlu(SluMatrix& sluMat) { - eigen_assert((Flags&RowMajor)==RowMajor && sluMat.Stype == SLU_NR - || (Flags&ColMajor)==ColMajor && sluMat.Stype == SLU_NC); + eigen_assert(((Flags&RowMajor)==RowMajor && sluMat.Stype == SLU_NR) + || ((Flags&ColMajor)==ColMajor && sluMat.Stype == SLU_NC)); Index outerSize = (Flags&RowMajor)==RowMajor ? sluMat.ncol : sluMat.nrow; @@ -352,7 +352,7 @@ class SuperLUBase : public SparseSolverBase /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. */ ComputationInfo info() const diff --git a/Eigen/src/UmfPackSupport/UmfPackSupport.h b/Eigen/src/UmfPackSupport/UmfPackSupport.h index 9568cc1d5..e3a333f80 100644 --- a/Eigen/src/UmfPackSupport/UmfPackSupport.h +++ b/Eigen/src/UmfPackSupport/UmfPackSupport.h @@ -10,6 +10,16 @@ #ifndef EIGEN_UMFPACKSUPPORT_H #define EIGEN_UMFPACKSUPPORT_H +// for compatibility with super old version of umfpack, +// not sure this is really needed, but this is harmless. +#ifndef SuiteSparse_long +#ifdef UF_long +#define SuiteSparse_long UF_long +#else +#error neither SuiteSparse_long nor UF_long are defined +#endif +#endif + namespace Eigen { /* TODO extract L, extract U, compute det, etc... 
*/ @@ -17,42 +27,85 @@ namespace Eigen { // generic double/complex wrapper functions: -inline void umfpack_defaults(double control[UMFPACK_CONTROL], double) + // Defaults +inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, int) { umfpack_di_defaults(control); } -inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex) +inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex, int) { umfpack_zi_defaults(control); } -inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double) +inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, SuiteSparse_long) +{ umfpack_dl_defaults(control); } + +inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex, SuiteSparse_long) +{ umfpack_zl_defaults(control); } + +// Report info +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, int) { umfpack_di_report_info(control, info);} -inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex) +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex, int) { umfpack_zi_report_info(control, info);} -inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double) +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, SuiteSparse_long) +{ umfpack_dl_report_info(control, info);} + +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex, SuiteSparse_long) +{ umfpack_zl_report_info(control, info);} + +// Report status +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, int) { umfpack_di_report_status(control, status);} -inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex) +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex, int) { umfpack_zi_report_status(control, status);} -inline void umfpack_report_control(double control[UMFPACK_CONTROL], double) +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, SuiteSparse_long) +{ umfpack_dl_report_status(control, status);} + +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex, SuiteSparse_long) +{ umfpack_zl_report_status(control, status);} + +// report control +inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, int) { umfpack_di_report_control(control);} -inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex) +inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex, int) { umfpack_zi_report_control(control);} -inline void umfpack_free_numeric(void **Numeric, double) +inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, SuiteSparse_long) +{ umfpack_dl_report_control(control);} + +inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex, SuiteSparse_long) +{ umfpack_zl_report_control(control);} + +// Free numeric +inline void umfpack_free_numeric(void **Numeric, double, int) { umfpack_di_free_numeric(Numeric); *Numeric = 0; } -inline void umfpack_free_numeric(void **Numeric, std::complex) +inline void umfpack_free_numeric(void **Numeric, std::complex, int) { umfpack_zi_free_numeric(Numeric); *Numeric = 0; } -inline void umfpack_free_symbolic(void **Symbolic, double) +inline void 
umfpack_free_numeric(void **Numeric, double, SuiteSparse_long) +{ umfpack_dl_free_numeric(Numeric); *Numeric = 0; } + +inline void umfpack_free_numeric(void **Numeric, std::complex, SuiteSparse_long) +{ umfpack_zl_free_numeric(Numeric); *Numeric = 0; } + +// Free symbolic +inline void umfpack_free_symbolic(void **Symbolic, double, int) { umfpack_di_free_symbolic(Symbolic); *Symbolic = 0; } -inline void umfpack_free_symbolic(void **Symbolic, std::complex) +inline void umfpack_free_symbolic(void **Symbolic, std::complex, int) { umfpack_zi_free_symbolic(Symbolic); *Symbolic = 0; } +inline void umfpack_free_symbolic(void **Symbolic, double, SuiteSparse_long) +{ umfpack_dl_free_symbolic(Symbolic); *Symbolic = 0; } + +inline void umfpack_free_symbolic(void **Symbolic, std::complex, SuiteSparse_long) +{ umfpack_zl_free_symbolic(Symbolic); *Symbolic = 0; } + +// Symbolic inline int umfpack_symbolic(int n_row,int n_col, const int Ap[], const int Ai[], const double Ax[], void **Symbolic, const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO]) @@ -66,7 +119,21 @@ inline int umfpack_symbolic(int n_row,int n_col, { return umfpack_zi_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info); } +inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col, + const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[], void **Symbolic, + const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO]) +{ + return umfpack_dl_symbolic(n_row,n_col,Ap,Ai,Ax,Symbolic,Control,Info); +} +inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col, + const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex Ax[], void **Symbolic, + const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO]) +{ + return umfpack_zl_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info); +} + +// Numeric inline int umfpack_numeric( const int Ap[], const int Ai[], const double Ax[], void *Symbolic, void **Numeric, const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO]) @@ -80,7 +147,21 @@ inline int umfpack_numeric( const int Ap[], const int Ai[], const std::complex Ax[], + void *Symbolic, void **Numeric, + const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO]) +{ + return umfpack_zl_numeric(Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Numeric,Control,Info); +} + +// solve inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const double Ax[], double X[], const double B[], void *Numeric, const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) @@ -95,6 +176,21 @@ inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const std::co return umfpack_zi_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info); } +inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[], + double X[], const double B[], void *Numeric, + const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) +{ + return umfpack_dl_solve(sys,Ap,Ai,Ax,X,B,Numeric,Control,Info); +} + +inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex Ax[], + std::complex X[], const std::complex B[], void *Numeric, + const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) +{ + return 
umfpack_zl_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info); +} + +// Get Lunz inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_udiag, void *Numeric, double) { return umfpack_di_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); @@ -105,6 +201,19 @@ inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_ return umfpack_zi_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); } +inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col, + SuiteSparse_long *nz_udiag, void *Numeric, double) +{ + return umfpack_dl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); +} + +inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col, + SuiteSparse_long *nz_udiag, void *Numeric, std::complex) +{ + return umfpack_zl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); +} + +// Get Numeric inline int umfpack_get_numeric(int Lp[], int Lj[], double Lx[], int Up[], int Ui[], double Ux[], int P[], int Q[], double Dx[], int *do_recip, double Rs[], void *Numeric) { @@ -120,18 +229,45 @@ inline int umfpack_get_numeric(int Lp[], int Lj[], std::complex Lx[], in return umfpack_zi_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q, Dx?&dx0_real:0,0,do_recip,Rs,Numeric); } +inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], double Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], double Ux[], + SuiteSparse_long P[], SuiteSparse_long Q[], double Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric) +{ + return umfpack_dl_get_numeric(Lp,Lj,Lx,Up,Ui,Ux,P,Q,Dx,do_recip,Rs,Numeric); +} -inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO]) +inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], std::complex Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], std::complex Ux[], + SuiteSparse_long P[], SuiteSparse_long Q[], std::complex Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric) +{ + double& lx0_real = numext::real_ref(Lx[0]); + double& ux0_real = numext::real_ref(Ux[0]); + double& dx0_real = numext::real_ref(Dx[0]); + return umfpack_zl_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q, + Dx?&dx0_real:0,0,do_recip,Rs,Numeric); +} + +// Get Determinant +inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int) { return umfpack_di_get_determinant(Mx,Ex,NumericHandle,User_Info); } -inline int umfpack_get_determinant(std::complex *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO]) +inline int umfpack_get_determinant(std::complex *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int) { double& mx_real = numext::real_ref(*Mx); return umfpack_zi_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info); } +inline SuiteSparse_long umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long) +{ + return umfpack_dl_get_determinant(Mx,Ex,NumericHandle,User_Info); +} + +inline SuiteSparse_long umfpack_get_determinant(std::complex *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long) +{ + double& mx_real = numext::real_ref(*Mx); + return umfpack_zl_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info); 
+} + /** \ingroup UmfPackSupport_Module * \brief A sparse LU factorization and solver based on UmfPack @@ -164,7 +300,7 @@ class UmfPackLU : public SparseSolverBase > typedef Matrix IntRowVectorType; typedef Matrix IntColVectorType; typedef SparseMatrix LUMatrixType; - typedef SparseMatrix UmfpackMatrixType; + typedef SparseMatrix UmfpackMatrixType; typedef Ref UmfpackMatrixRef; enum { ColsAtCompileTime = MatrixType::ColsAtCompileTime, @@ -192,8 +328,8 @@ class UmfPackLU : public SparseSolverBase > ~UmfPackLU() { - if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar()); - if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar()); + if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(), StorageIndex()); + if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(), StorageIndex()); } inline Index rows() const { return mp_matrix.rows(); } @@ -201,7 +337,7 @@ class UmfPackLU : public SparseSolverBase > /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was succesful, + * \returns \c Success if computation was successful, * \c NumericalIssue if the matrix.appears to be negative. */ ComputationInfo info() const @@ -241,8 +377,8 @@ class UmfPackLU : public SparseSolverBase > template void compute(const InputMatrixType& matrix) { - if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar()); - if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar()); + if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex()); + if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex()); grab(matrix.derived()); analyzePattern_impl(); factorize_impl(); @@ -257,8 +393,8 @@ class UmfPackLU : public SparseSolverBase > template void analyzePattern(const InputMatrixType& matrix) { - if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar()); - if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar()); + if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex()); + if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex()); grab(matrix.derived()); @@ -309,7 +445,7 @@ class UmfPackLU : public SparseSolverBase > { eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()"); if(m_numeric) - umfpack_free_numeric(&m_numeric,Scalar()); + umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex()); grab(matrix.derived()); @@ -322,7 +458,7 @@ class UmfPackLU : public SparseSolverBase > */ void printUmfpackControl() { - umfpack_report_control(m_control.data(), Scalar()); + umfpack_report_control(m_control.data(), Scalar(),StorageIndex()); } /** Prints statistics collected by UmfPack. @@ -332,7 +468,7 @@ class UmfPackLU : public SparseSolverBase > void printUmfpackInfo() { eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()"); - umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar()); + umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar(),StorageIndex()); } /** Prints the status of the previous factorization operation performed by UmfPack (symbolic or numerical factorization). 
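// Aside (not part of the patch): every UmfPackLU call site above passes extra
// Scalar(), StorageIndex() arguments purely for overload resolution; the dummy
// values route to the umfpack_di_/zi_/dl_/zl_* C entry points at compile time.
// The idiom in isolation, with hypothetical names ('long' stands in for
// SuiteSparse_long here):
#include <complex>
#include <cstdio>

inline void backendDefaults(double, int)                { std::puts("di"); }
inline void backendDefaults(std::complex<double>, int)  { std::puts("zi"); }
inline void backendDefaults(double, long)               { std::puts("dl"); }
inline void backendDefaults(std::complex<double>, long) { std::puts("zl"); }

template <typename Scalar, typename StorageIndex>
void initSketch()
{
  // A default-constructed value of each type selects the right overload.
  backendDefaults(Scalar(), StorageIndex());
}

int main()
{
  initSketch<double, int>();                 // -> di
  initSketch<std::complex<double>, long>();  // -> zl
}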
@@ -341,7 +477,7 @@ class UmfPackLU : public SparseSolverBase > */ void printUmfpackStatus() { eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()"); - umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar()); + umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar(),StorageIndex()); } /** \internal */ @@ -362,13 +498,13 @@ class UmfPackLU : public SparseSolverBase > m_symbolic = 0; m_extractedDataAreDirty = true; - umfpack_defaults(m_control.data(), Scalar()); + umfpack_defaults(m_control.data(), Scalar(),StorageIndex()); } void analyzePattern_impl() { - m_fact_errorCode = umfpack_symbolic(internal::convert_index(mp_matrix.rows()), - internal::convert_index(mp_matrix.cols()), + m_fact_errorCode = umfpack_symbolic(internal::convert_index(mp_matrix.rows()), + internal::convert_index(mp_matrix.cols()), mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), &m_symbolic, m_control.data(), m_umfpackInfo.data()); @@ -408,7 +544,7 @@ class UmfPackLU : public SparseSolverBase > // cached data to reduce reallocation, etc. mutable LUMatrixType m_l; - int m_fact_errorCode; + StorageIndex m_fact_errorCode; UmfpackControl m_control; mutable UmfpackInfo m_umfpackInfo; @@ -438,7 +574,7 @@ void UmfPackLU::extractData() const if (m_extractedDataAreDirty) { // get size of the data - int lnz, unz, rows, cols, nz_udiag; + StorageIndex lnz, unz, rows, cols, nz_udiag; umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar()); // allocate data @@ -464,7 +600,7 @@ template typename UmfPackLU::Scalar UmfPackLU::determinant() const { Scalar det; - umfpack_get_determinant(&det, 0, m_numeric, 0); + umfpack_get_determinant(&det, 0, m_numeric, 0, StorageIndex()); return det; } @@ -477,7 +613,6 @@ bool UmfPackLU::_solve_impl(const MatrixBase &b, MatrixBas eigen_assert((XDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major result yet"); eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve"); - int errorCode; Scalar* x_ptr = 0; Matrix x_tmp; if(x.innerStride()!=1) @@ -489,9 +624,10 @@ bool UmfPackLU::_solve_impl(const MatrixBase &b, MatrixBas { if(x.innerStride()==1) x_ptr = &x.col(j).coeffRef(0); - errorCode = umfpack_solve(UMFPACK_A, - mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), - x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, m_control.data(), m_umfpackInfo.data()); + StorageIndex errorCode = umfpack_solve(UMFPACK_A, + mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), + x_ptr, &b.const_cast_derived().col(j).coeffRef(0), + m_numeric, m_control.data(), m_umfpackInfo.data()); if(x.innerStride()!=1) x.col(j) = x_tmp; if (errorCode!=0) diff --git a/Eigen/src/misc/lapacke.h b/Eigen/src/misc/lapacke.h index 8c7e79b03..3d8e24f5a 100755 --- a/Eigen/src/misc/lapacke.h +++ b/Eigen/src/misc/lapacke.h @@ -43,10 +43,6 @@ #include "lapacke_config.h" #endif -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - #include #ifndef lapack_int @@ -108,6 +104,11 @@ lapack_complex_double lapack_make_complex_double( double re, double im ); #endif + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + #ifndef LAPACKE_malloc #define LAPACKE_malloc( size ) malloc( size ) #endif diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index 43615bd56..e928db467 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ 
-21,6 +21,7 @@ typedef CwiseUnaryOp, const Derived> AcosReturn typedef CwiseUnaryOp, const Derived> AsinReturnType; typedef CwiseUnaryOp, const Derived> AtanReturnType; typedef CwiseUnaryOp, const Derived> TanhReturnType; +typedef CwiseUnaryOp, const Derived> LogisticReturnType; typedef CwiseUnaryOp, const Derived> SinhReturnType; typedef CwiseUnaryOp, const Derived> CoshReturnType; typedef CwiseUnaryOp, const Derived> SquareReturnType; @@ -335,6 +336,15 @@ cosh() const return CoshReturnType(derived()); } +/** \returns an expression of the coefficient-wise logistic of *this. + */ +EIGEN_DEVICE_FUNC +inline const LogisticReturnType +logistic() const +{ + return LogisticReturnType(derived()); +} + /** \returns an expression of the coefficient-wise inverse of *this. * * Example: \include Cwise_inverse.cpp diff --git a/Eigen/src/plugins/BlockMethods.h b/Eigen/src/plugins/BlockMethods.h index 5caf14469..67fdebc6f 100644 --- a/Eigen/src/plugins/BlockMethods.h +++ b/Eigen/src/plugins/BlockMethods.h @@ -40,6 +40,14 @@ typedef const VectorBlock ConstSegmentReturnType; template struct FixedSegmentReturnType { typedef VectorBlock Type; }; template struct ConstFixedSegmentReturnType { typedef const VectorBlock Type; }; +/// \internal inner-vector +typedef Block InnerVectorReturnType; +typedef Block ConstInnerVectorReturnType; + +/// \internal set of inner-vectors +typedef Block InnerVectorsReturnType; +typedef Block ConstInnerVectorsReturnType; + #endif // not EIGEN_PARSED_BY_DOXYGEN /// \returns an expression of a block in \c *this with either dynamic or fixed sizes. @@ -1036,7 +1044,7 @@ inline const typename ConstFixedBlockXpr::Type block(Index startRow /// \a NRows is \a Dynamic, and the same for the number of columns. /// /// Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp -/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.cpp +/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.out /// /// \note The usage of of this overload is discouraged from %Eigen 3.4, better used the generic /// block(Index,Index,NRowsType,NColsType), here is the one-to-one complete equivalence: @@ -1053,6 +1061,7 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa block(Index,Index,NRowsType,NColsType), class Block /// template +EIGEN_DEVICE_FUNC inline typename FixedBlockXpr::Type block(Index startRow, Index startCol, Index blockRows, Index blockCols) { @@ -1354,3 +1363,39 @@ inline typename ConstFixedSegmentReturnType::Type tail(Index n = N) const EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename ConstFixedSegmentReturnType::Type(derived(), size() - n); } + +/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this +/// is col-major (resp. row-major). +/// +InnerVectorReturnType innerVector(Index outer) +{ return InnerVectorReturnType(derived(), outer); } + +/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this +/// is col-major (resp. row-major). Read-only. +/// +const ConstInnerVectorReturnType innerVector(Index outer) const +{ return ConstInnerVectorReturnType(derived(), outer); } + +/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this +/// is col-major (resp. row-major). +/// +InnerVectorsReturnType +innerVectors(Index outerStart, Index outerSize) +{ + return Block(derived(), + IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, + IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize); + +} + +/// \returns the \a outer -th column (resp. 
row) of the matrix \c *this if \c *this +/// is col-major (resp. row-major). Read-only. +/// +const ConstInnerVectorsReturnType +innerVectors(Index outerStart, Index outerSize) const +{ + return Block(derived(), + IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, + IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize); + +} diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h index 22c1666c5..5bfb19ac6 100644 --- a/Eigen/src/plugins/IndexedViewMethods.h +++ b/Eigen/src/plugins/IndexedViewMethods.h @@ -53,13 +53,6 @@ ivcSize(const Indices& indices) const { return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic(derived().size()),Specialized); } -template -struct valid_indexed_view_overload { - // Here we use is_convertible to Index instead of is_integral in order to treat enums as Index. - // In c++11 we could use is_integral && is_enum if is_convertible appears to be too permissive. - enum { value = !(internal::is_convertible::value && internal::is_convertible::value) }; -}; - public: #endif @@ -74,7 +67,7 @@ struct EIGEN_INDEXED_VIEW_METHOD_TYPE { // This is the generic version template -typename internal::enable_if::value +typename internal::enable_if::value && internal::traits::type>::ReturnAsIndexedView, typename EIGEN_INDEXED_VIEW_METHOD_TYPE::type >::type operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST @@ -86,7 +79,7 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND // The following overload returns a Block<> object template -typename internal::enable_if::value +typename internal::enable_if::value && internal::traits::type>::ReturnAsBlock, typename internal::traits::type>::BlockType>::type operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST @@ -104,7 +97,7 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND // The following overload returns a Scalar template -typename internal::enable_if::value +typename internal::enable_if::value && internal::traits::type>::ReturnAsScalar, CoeffReturnType >::type operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST @@ -114,7 +107,7 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND #if EIGEN_HAS_STATIC_ARRAY_TEMPLATE -// The folowing three overloads are needed to handle raw Index[N] arrays. +// The following three overloads are needed to handle raw Index[N] arrays. 
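// A hedged usage sketch for these indexed-view entry points, including the raw
// Index[N] arrays handled by the overloads that follow (assumes an Eigen
// 3.4-style build; the names A, rowIdx and colIdx are illustrative only):
#include <Eigen/Dense>
#include <vector>

inline void indexed_view_sketch() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(5, 5);
  std::vector<int> rowIdx{0, 2, 4};             // any STL-compatible index container
  Eigen::Index colIdx[] = {1, 3};               // raw Index[N] array
  auto sub = A(rowIdx, colIdx);                 // yields an IndexedView expression (no copy)
  double corner = A(Eigen::last, Eigen::last);  // symbolic indices resolve to a Scalar
  (void)sub; (void)corner;
}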
template IndexedView::type> @@ -146,7 +139,7 @@ operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndicesT (&col template typename internal::enable_if< - IsRowMajor && (!(internal::get_compile_time_incr::type>::value==1 || internal::is_integral::value)), + IsRowMajor && (!(internal::get_compile_time_incr::type>::value==1 || internal::is_valid_index_type::value)), IndexedView::type> >::type operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST { @@ -157,7 +150,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST template typename internal::enable_if< - (!IsRowMajor) && (!(internal::get_compile_time_incr::type>::value==1 || internal::is_integral::value)), + (!IsRowMajor) && (!(internal::get_compile_time_incr::type>::value==1 || internal::is_valid_index_type::value)), IndexedView::type,IvcIndex> >::type operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST { @@ -168,7 +161,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST template typename internal::enable_if< - (internal::get_compile_time_incr::type>::value==1) && (!internal::is_integral::value) && (!Symbolic::is_symbolic::value), + (internal::get_compile_time_incr::type>::value==1) && (!internal::is_valid_index_type::value) && (!symbolic::is_symbolic::value), VectorBlock::value> >::type operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST { @@ -179,7 +172,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST } template -typename internal::enable_if::value, CoeffReturnType >::type +typename internal::enable_if::value, CoeffReturnType >::type operator()(const IndexType& id) EIGEN_INDEXED_VIEW_METHOD_CONST { return Base::operator()(internal::eval_expr_given_size(id,size())); @@ -250,6 +243,8 @@ operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST * * For 1D vectors and arrays, you better use the operator()(const Indices&) overload, which behave the same way but taking a single parameter. * + * See also this question and its answer for an example of how to duplicate coefficients. + * * \sa operator()(const Indices&), class Block, class IndexedView, DenseBase::block(Index,Index,Index,Index) */ template diff --git a/Eigen/src/plugins/ReshapedMethods.h b/Eigen/src/plugins/ReshapedMethods.h index 2bb0b8623..9aeb7f3ee 100644 --- a/Eigen/src/plugins/ReshapedMethods.h +++ b/Eigen/src/plugins/ReshapedMethods.h @@ -25,7 +25,7 @@ /// AutoSize does preserve compile-time sizes when possible, i.e., when the sizes of the input are known at compile time \b and /// that the other size is passed at compile-time using Eigen::fix as above. /// -/// \sa operator()(placeholders::all), class Reshaped, fix, fix(int) +/// \sa operator()(all), class Reshaped, fix, fix(int) /// template EIGEN_DEVICE_FUNC @@ -50,7 +50,7 @@ reshaped(NRowsType nRows, NColsType nCols) const; /// \sa reshaped() EIGEN_DEVICE_FUNC inline Reshaped -operator()(placeholders::all); +operator()(all); #else diff --git a/README.md b/README.md index 4654a81c3..99c9e2933 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,7 @@ **Eigen is a C++ template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms.** For more information go to http://eigen.tuxfamily.org/. + +For ***pull request*** please only use the official repository at https://bitbucket.org/eigen/eigen. + +For ***bug reports*** and ***feature requests*** go to http://eigen.tuxfamily.org/bz. 
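The ReshapedMethods hunks above declare the public entry points of the reshaping API (note the placeholder moving from placeholders::all to plain all). A minimal usage sketch, assuming an Eigen 3.4-style build; the 4x4 matrix and the printed layouts are illustrative only:

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::Matrix4d M = Eigen::Matrix4d::Random();
  // reshaped(nRows, nCols): view the same 16 coefficients as a 2x8 matrix, without copying.
  std::cout << M.reshaped(2, 8) << "\n\n";
  // reshaped(): view all coefficients as a single column vector.
  std::cout << M.reshaped().transpose() << "\n";
  return 0;
}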
diff --git a/bench/analyze-blocking-sizes.cpp b/bench/analyze-blocking-sizes.cpp index d563a1d2d..6bc4aca3d 100644 --- a/bench/analyze-blocking-sizes.cpp +++ b/bench/analyze-blocking-sizes.cpp @@ -825,7 +825,7 @@ int main(int argc, char* argv[]) } for (int i = 1; i < argc; i++) { bool arg_handled = false; - // Step 1. Try to match action invokation names. + // Step 1. Try to match action invocation names. for (auto it = available_actions.begin(); it != available_actions.end(); ++it) { if (!strcmp(argv[i], (*it)->invokation_name())) { if (!action) { diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp index 8528c5587..688d99c4a 100644 --- a/bench/bench_gemm.cpp +++ b/bench/bench_gemm.cpp @@ -129,7 +129,7 @@ void matlab_cplx_real(const M& ar, const M& ai, const M& b, M& cr, M& ci) template EIGEN_DONT_INLINE void gemm(const A& a, const B& b, C& c) { - c.noalias() += a * b; + c.noalias() += a * b; } int main(int argc, char ** argv) diff --git a/bench/btl/README b/bench/btl/README index f3f5fb36f..ebed88960 100644 --- a/bench/btl/README +++ b/bench/btl/README @@ -36,7 +36,7 @@ For instance: You can also select a given set of actions defining the environment variable BTL_CONFIG this way: BTL_CONFIG="-a action1{:action2}*" ctest -V -An exemple: +An example: BTL_CONFIG="-a axpy:vector_matrix:trisolve:ata" ctest -V -R eigen2 Finally, if bench results already exist (the bench*.dat files) then they merges by keeping the best for each matrix size. If you want to overwrite the previous ones you can simply add the "--overwrite" option: diff --git a/bench/btl/generic_bench/bench.hh b/bench/btl/generic_bench/bench.hh index 7b7b951b5..0732940d5 100644 --- a/bench/btl/generic_bench/bench.hh +++ b/bench/btl/generic_bench/bench.hh @@ -159,7 +159,7 @@ BTL_DONT_INLINE void bench( int size_min, int size_max, int nb_point ){ // bench(size_min,size_max,nb_point); - // Only for small problem size. Otherwize it will be too long + // Only for small problem size. Otherwise it will be too long // bench(size_min,size_max,nb_point); // bench(size_min,size_max,nb_point); diff --git a/bench/btl/generic_bench/utils/size_log.hh b/bench/btl/generic_bench/utils/size_log.hh index 13a3da7a8..68945e7cc 100644 --- a/bench/btl/generic_bench/utils/size_log.hh +++ b/bench/btl/generic_bench/utils/size_log.hh @@ -23,7 +23,7 @@ #include "math.h" // The Vector class must satisfy the following part of STL vector concept : // resize() method -// [] operator for seting element +// [] operator for setting element // the vector element are int compatible. 
template void size_log(const int nb_point, const int size_min, const int size_max, Vector & X) diff --git a/bench/btl/generic_bench/utils/xy_file.hh b/bench/btl/generic_bench/utils/xy_file.hh index 4571bed8f..0492faf09 100644 --- a/bench/btl/generic_bench/utils/xy_file.hh +++ b/bench/btl/generic_bench/utils/xy_file.hh @@ -55,7 +55,7 @@ bool read_xy_file(const std::string & filename, std::vector & tab_sizes, // The Vector class must satisfy the following part of STL vector concept : // resize() method -// [] operator for seting element +// [] operator for setting element // the vector element must have the << operator define using namespace std; diff --git a/bench/btl/libs/ublas/ublas_interface.hh b/bench/btl/libs/ublas/ublas_interface.hh index 95cad5195..f59b7cf2f 100644 --- a/bench/btl/libs/ublas/ublas_interface.hh +++ b/bench/btl/libs/ublas/ublas_interface.hh @@ -100,7 +100,7 @@ public : Y+=coef*X; } - // alias free assignements + // alias free assignments static inline void matrix_vector_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){ X.assign(prod(A,B)); diff --git a/bench/eig33.cpp b/bench/eig33.cpp index 47947a9be..f003d8a53 100644 --- a/bench/eig33.cpp +++ b/bench/eig33.cpp @@ -101,7 +101,7 @@ void eigen33(const Matrix& mat, Matrix& evecs, Vector& evals) computeRoots(scaledMat,evals); // compute the eigen vectors - // **here we assume 3 differents eigenvalues** + // **here we assume 3 different eigenvalues** // "optimized version" which appears to be slower with gcc! // Vector base; diff --git a/bench/spbench/CMakeLists.txt b/bench/spbench/CMakeLists.txt index 8d53f4ae2..029ba6d6b 100644 --- a/bench/spbench/CMakeLists.txt +++ b/bench/spbench/CMakeLists.txt @@ -38,25 +38,32 @@ if(SUPERLU_FOUND AND BLAS_FOUND) endif() -find_package(Pastix) -find_package(Scotch) -find_package(Metis) -if(PASTIX_FOUND AND BLAS_FOUND) +find_package(PASTIX QUIET COMPONENTS METIS SCOTCH) +# check that the PASTIX found is a version without MPI +find_path(PASTIX_pastix_nompi.h_INCLUDE_DIRS + NAMES pastix_nompi.h + HINTS ${PASTIX_INCLUDE_DIRS} +) +if (NOT PASTIX_pastix_nompi.h_INCLUDE_DIRS) + message(STATUS "A version of Pastix has been found but pastix_nompi.h does not exist in the include directory." 
+ " Because Eigen tests require a version without MPI, we disable the Pastix backend.") +endif() +if(PASTIX_FOUND AND PASTIX_pastix_nompi.h_INCLUDE_DIRS AND BLAS_FOUND) add_definitions("-DEIGEN_PASTIX_SUPPORT") - include_directories(${PASTIX_INCLUDES}) + include_directories(${PASTIX_INCLUDE_DIRS_DEP}) if(SCOTCH_FOUND) - include_directories(${SCOTCH_INCLUDES}) + include_directories(${SCOTCH_INCLUDE_DIRS}) set(PASTIX_LIBRARIES ${PASTIX_LIBRARIES} ${SCOTCH_LIBRARIES}) elseif(METIS_FOUND) - include_directories(${METIS_INCLUDES}) + include_directories(${METIS_INCLUDE_DIRS}) set(PASTIX_LIBRARIES ${PASTIX_LIBRARIES} ${METIS_LIBRARIES}) endif(SCOTCH_FOUND) - set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES} ${ORDERING_LIBRARIES} ${BLAS_LIBRARIES}) - set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES} ${BLAS_LIBRARIES}) -endif(PASTIX_FOUND AND BLAS_FOUND) + set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES_DEP} ${ORDERING_LIBRARIES}) + set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES_DEP}) +endif() if(METIS_FOUND) - include_directories(${METIS_INCLUDES}) + include_directories(${METIS_INCLUDE_DIRS}) set (SPARSE_LIBS ${SPARSE_LIBS} ${METIS_LIBRARIES}) add_definitions("-DEIGEN_METIS_SUPPORT") endif(METIS_FOUND) diff --git a/bench/spbench/spbenchsolver.cpp b/bench/spbench/spbenchsolver.cpp index 4acd0039c..2a7351124 100644 --- a/bench/spbench/spbenchsolver.cpp +++ b/bench/spbench/spbenchsolver.cpp @@ -54,7 +54,7 @@ int main(int argc, char ** args) statbuf.close(); } else - std::cerr << "Unable to open the provided file for writting... \n"; + std::cerr << "Unable to open the provided file for writing... \n"; } // Get the maximum number of iterations and the tolerance diff --git a/bench/tensors/README b/bench/tensors/README index 3a5fdbe17..69342cc9c 100644 --- a/bench/tensors/README +++ b/bench/tensors/README @@ -14,8 +14,12 @@ nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -D last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu -To compile the benchmark for SYCL, using ComputeCpp you currently need 2 passes (only for translation units containing device code): +To compile and run the benchmark for SYCL, using ComputeCpp you currently need following passes (only for translation units containing device code): 1. The device compilation pass that generates the device code (SYCL kernels and referenced device functions) and glue code needed by the host compiler to reference the device code from host code. -{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc +{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc -DEIGEN_USE_SYCL=1 2. The host compilation pass that generates the final host binary. 
-clang++-3.7 -include tensor_benchmarks_sycl.sycl benchmark_main.cc tensor_benchmarks_sycl.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++11 -o tensor_benchmark_sycl +clang++ -O3 -c benchmark_main.cc -pthread -I ../../ -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 -o benchmark_main.o +clang++ -O3 tensor_benchmarks_sycl_include_headers.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 benchmark_main.o -o tensor_benchmark_sycl +export LD_LIBRARY_PATH={ComputeCpp_ROOT}/lib +3. Run the benchmark +./tensor_benchmark_sycl diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index c2fb3dede..3a640ede4 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -35,6 +35,11 @@ template class BenchmarkSuite { void memcpy(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + device_.memcpy(c_, a_, m_ * m_ * sizeof(T)); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { device_.memcpy(c_, a_, m_ * m_ * sizeof(T)); @@ -55,7 +60,11 @@ template class BenchmarkSuite { } const TensorMap, Eigen::Aligned> A((int*)a_, sizes); TensorMap, Eigen::Aligned> B(b_, sizes); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + B.device(device_) = A.template cast(); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { B.device(device_) = A.template cast(); @@ -70,7 +79,6 @@ template class BenchmarkSuite { sizes[0] = m_; sizes[1] = m_; TensorMap, Eigen::Aligned> C(c_, sizes); - StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = C.random(); @@ -93,7 +101,18 @@ template class BenchmarkSuite { const Eigen::DSizes second_quadrant(0, m_/2); const Eigen::DSizes third_quadrant(m_/2, 0); const Eigen::DSizes fourth_quadrant(m_/2, m_/2); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.slice(first_quadrant, quarter_sizes).device(device_) = + A.slice(first_quadrant, quarter_sizes); + C.slice(second_quadrant, quarter_sizes).device(device_) = + B.slice(second_quadrant, quarter_sizes); + C.slice(third_quadrant, quarter_sizes).device(device_) = + A.slice(third_quadrant, quarter_sizes); + C.slice(fourth_quadrant, quarter_sizes).device(device_) = + B.slice(fourth_quadrant, quarter_sizes); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.slice(first_quadrant, quarter_sizes).device(device_) = @@ -118,7 +137,11 @@ template class BenchmarkSuite { Eigen::array output_size; output_size[0] = n_; TensorMap, Eigen::Aligned> C(c_, output_size); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = B.chip(iter % k_, 0); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = B.chip(iter % k_, 0); @@ -135,7 +158,11 @@ template class BenchmarkSuite { Eigen::array output_size; output_size[0] = n_; TensorMap, Eigen::Aligned> C(c_, output_size); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = B.chip(iter % n_, 1); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = B.chip(iter % n_, 1); @@ 
-158,7 +185,11 @@ template class BenchmarkSuite { Eigen::array shuffle; shuffle[0] = 1; shuffle[1] = 0; - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + B.device(device_) = A.shuffle(shuffle); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { B.device(device_) = A.shuffle(shuffle); @@ -186,7 +217,11 @@ template class BenchmarkSuite { paddings[0] = Eigen::IndexPair(0, 0); paddings[1] = Eigen::IndexPair(2, 1); #endif - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + B.device(device_) = A.pad(paddings); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { B.device(device_) = A.pad(paddings); @@ -216,6 +251,11 @@ template class BenchmarkSuite { Eigen::IndexList, Eigen::type2index<2> > strides; #endif +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + B.device(device_) = A.stride(strides); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { B.device(device_) = A.stride(strides); @@ -245,6 +285,11 @@ template class BenchmarkSuite { broadcast.set(1, n_); #endif +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.broadcast(broadcast); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.broadcast(broadcast); @@ -261,7 +306,11 @@ template class BenchmarkSuite { const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A * A.constant(static_cast(3.14)) + B * B.constant(static_cast(2.7)); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A * A.constant(static_cast(3.14)) + B * B.constant(static_cast(2.7)); @@ -280,6 +329,11 @@ template class BenchmarkSuite { const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); +#ifdef EIGEN_USE_SYCL // warmup for sycl +for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.rsqrt() + B.sqrt() * B.square(); +} +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.rsqrt() + B.sqrt() * B.square(); @@ -297,7 +351,11 @@ template class BenchmarkSuite { const TensorMap, Eigen::Aligned> A(a_, sizes); const TensorMap, Eigen::Aligned> B(b_, sizes); TensorMap, Eigen::Aligned> C(c_, sizes); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.exp() + B.log(); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.exp() + B.log(); @@ -325,7 +383,11 @@ template class BenchmarkSuite { // optimize the code. Eigen::IndexList> sum_along_dim; #endif - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = B.sum(sum_along_dim); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = B.sum(sum_along_dim); @@ -355,7 +417,11 @@ template class BenchmarkSuite { // optimize the code. 
Eigen::IndexList> sum_along_dim; #endif - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = B.sum(sum_along_dim); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = B.sum(sum_along_dim); @@ -375,7 +441,11 @@ template class BenchmarkSuite { Eigen::array output_size; TensorMap, Eigen::Aligned> C( c_, output_size); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = B.sum(); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = B.sum(); @@ -404,7 +474,11 @@ template class BenchmarkSuite { typedef typename Tensor::DimensionPair DimPair; Eigen::array dims; dims[0] = DimPair(1, 0); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.contract(B, dims); @@ -430,7 +504,11 @@ template class BenchmarkSuite { Eigen::array dims; dims[0] = 0; dims[1] = 1; - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.convolve(B, dims); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.convolve(B, dims); @@ -461,6 +539,11 @@ template class BenchmarkSuite { if (Eigen::internal::is_same::value) { device_.synchronize(); } +#elif defined(EIGEN_USE_SYCL) + if (Eigen::internal::is_same::value) { + device_.synchronize(); + } + #endif StopBenchmarkTiming(); SetBenchmarkFlopsProcessed(num_items); diff --git a/bench/tensors/tensor_benchmarks_sycl.cc b/bench/tensors/tensor_benchmarks_sycl.cc index 6df190869..cb6daac15 100644 --- a/bench/tensors/tensor_benchmarks_sycl.cc +++ b/bench/tensors/tensor_benchmarks_sycl.cc @@ -1,20 +1,73 @@ -#define EIGEN_USE_SYCL +#ifdef EIGEN_USE_SYCL #include #include #include "tensor_benchmarks.h" -#define BM_FuncGPU(FUNC) \ - static void BM_##FUNC(int iters, int N) { \ - StopBenchmarkTiming(); \ - cl::sycl::gpu_selector selector; \ - Eigen::QueueInterface queue(selector); \ - Eigen::SyclDevice device(&queue); \ - BenchmarkSuite suite(device, N); \ - suite.FUNC(iters); \ - } \ +#define BM_FuncGPU(FUNC) \ + static void BM_##FUNC(int iters, int N) { \ + StopBenchmarkTiming(); \ + cl::sycl::gpu_selector selector; \ + Eigen::QueueInterface queue(selector); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, N); \ + suite.FUNC(iters); \ + } \ BENCHMARK_RANGE(BM_##FUNC, 10, 5000); +BM_FuncGPU(memcpy); +BM_FuncGPU(typeCasting); +BM_FuncGPU(slicing); +BM_FuncGPU(rowChip); +BM_FuncGPU(colChip); +BM_FuncGPU(shuffling); +BM_FuncGPU(padding); +BM_FuncGPU(striding); BM_FuncGPU(broadcasting); BM_FuncGPU(coeffWiseOp); +BM_FuncGPU(algebraicFunc); +BM_FuncGPU(transcendentalFunc); +BM_FuncGPU(rowReduction); +BM_FuncGPU(colReduction); +BM_FuncGPU(fullReduction); + + +// Contractions +#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ + static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ + StopBenchmarkTiming(); \ + cl::sycl::gpu_selector selector; \ + Eigen::QueueInterface queue(selector); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, D1, D2, D3); \ + suite.FUNC(iters); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000); + + +BM_FuncWithInputDimsGPU(contraction, N, N, N); +BM_FuncWithInputDimsGPU(contraction, 64, N, N); 
+BM_FuncWithInputDimsGPU(contraction, N, 64, N); +BM_FuncWithInputDimsGPU(contraction, N, N, 64); + + +// Convolutions +#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ + static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ + StopBenchmarkTiming(); \ + cl::sycl::gpu_selector selector; \ + Eigen::QueueInterface queue(selector); \ + Eigen::SyclDevice device(&queue); \ + BenchmarkSuite suite(device, N); \ + suite.FUNC(iters, DIM1, DIM2); \ + } \ + BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000); + +BM_FuncWithKernelDimsGPU(convolution, 7, 1); +BM_FuncWithKernelDimsGPU(convolution, 1, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 4); +BM_FuncWithKernelDimsGPU(convolution, 4, 7); +BM_FuncWithKernelDimsGPU(convolution, 7, 64); +BM_FuncWithKernelDimsGPU(convolution, 64, 7); +#endif diff --git a/bench/tensors/tensor_benchmarks_sycl_include_headers.cc b/bench/tensors/tensor_benchmarks_sycl_include_headers.cc new file mode 100644 index 000000000..bcc3c4c79 --- /dev/null +++ b/bench/tensors/tensor_benchmarks_sycl_include_headers.cc @@ -0,0 +1,2 @@ +#include "tensor_benchmarks_sycl.cc" +#include "tensor_benchmarks_sycl.sycl" diff --git a/blas/common.h b/blas/common.h index 61d8344d9..960c09cc6 100644 --- a/blas/common.h +++ b/blas/common.h @@ -10,6 +10,14 @@ #ifndef EIGEN_BLAS_COMMON_H #define EIGEN_BLAS_COMMON_H +#ifdef __GNUC__ +# if __GNUC__<5 +// GCC < 5.0 does not like the global Scalar typedef +// we just keep shadow-warnings disabled permanently +# define EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS +# endif +#endif + #include "../Eigen/Core" #include "../Eigen/Jacobi" diff --git a/blas/f2c/ctbmv.c b/blas/f2c/ctbmv.c index 790fd581f..a6e0dae80 100644 --- a/blas/f2c/ctbmv.c +++ b/blas/f2c/ctbmv.c @@ -147,7 +147,7 @@ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ -/* tranformed vector x. */ +/* transformed vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ diff --git a/blas/f2c/dtbmv.c b/blas/f2c/dtbmv.c index fdf73ebb5..aa67d19da 100644 --- a/blas/f2c/dtbmv.c +++ b/blas/f2c/dtbmv.c @@ -143,7 +143,7 @@ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ -/* tranformed vector x. */ +/* transformed vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ diff --git a/blas/f2c/stbmv.c b/blas/f2c/stbmv.c index fcf9ce336..b5a68b545 100644 --- a/blas/f2c/stbmv.c +++ b/blas/f2c/stbmv.c @@ -143,7 +143,7 @@ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ -/* tranformed vector x. */ +/* transformed vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ diff --git a/blas/f2c/ztbmv.c b/blas/f2c/ztbmv.c index 4cdcd7f88..3bf0beb01 100644 --- a/blas/f2c/ztbmv.c +++ b/blas/f2c/ztbmv.c @@ -147,7 +147,7 @@ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ -/* tranformed vector x. */ +/* transformed vector x. */ /* INCX - INTEGER. 
*/ /* On entry, INCX specifies the increment for the elements of */ diff --git a/blas/level1_impl.h b/blas/level1_impl.h index f857bfa20..6e7f8c976 100644 --- a/blas/level1_impl.h +++ b/blas/level1_impl.h @@ -33,7 +33,7 @@ int EIGEN_BLAS_FUNC(copy)(int *n, RealScalar *px, int *incx, RealScalar *py, int Scalar* x = reinterpret_cast(px); Scalar* y = reinterpret_cast(py); - // be carefull, *incx==0 is allowed !! + // be careful, *incx==0 is allowed !! if(*incx==1 && *incy==1) make_vector(y,*n) = make_vector(x,*n); else diff --git a/blas/testing/cblat1.f b/blas/testing/cblat1.f index 8ca67fb19..73015f5a9 100644 --- a/blas/testing/cblat1.f +++ b/blas/testing/cblat1.f @@ -619,7 +619,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/blas/testing/dblat1.f b/blas/testing/dblat1.f index 30691f9bf..03d9f1345 100644 --- a/blas/testing/dblat1.f +++ b/blas/testing/dblat1.f @@ -990,7 +990,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/blas/testing/sblat1.f b/blas/testing/sblat1.f index 6657c2693..4d43d9b48 100644 --- a/blas/testing/sblat1.f +++ b/blas/testing/sblat1.f @@ -946,7 +946,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/blas/testing/zblat1.f b/blas/testing/zblat1.f index d30112c63..c00b67dc8 100644 --- a/blas/testing/zblat1.f +++ b/blas/testing/zblat1.f @@ -619,7 +619,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/cmake/Eigen3Config.cmake.in b/cmake/Eigen3Config.cmake.in index c5c546887..0a1ac61c9 100644 --- a/cmake/Eigen3Config.cmake.in +++ b/cmake/Eigen3Config.cmake.in @@ -3,7 +3,9 @@ @PACKAGE_INIT@ -include ("${CMAKE_CURRENT_LIST_DIR}/Eigen3Targets.cmake") +if (NOT TARGET eigen) + include ("${CMAKE_CURRENT_LIST_DIR}/Eigen3Targets.cmake") +endif () # Legacy variables, do *not* use. May be removed in the future. 
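The guard added to Eigen3Config.cmake.in above makes the config file tolerant of being loaded several times (e.g. from multiple subprojects): Eigen3Targets.cmake is only included when the eigen target does not exist yet. Consumers keep linking the exported imported target as usual, e.g. target_link_libraries(myapp Eigen3::Eigen), where myapp is an illustrative target name. A minimal consumer translation unit for such a setup, relying only on the include directories propagated by the imported target:

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::Vector3d v(1.0, 2.0, 3.0);
  std::cout << "squared norm: " << v.squaredNorm() << std::endl;  // prints 14
  return 0;
}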
diff --git a/cmake/EigenConfigureTesting.cmake b/cmake/EigenConfigureTesting.cmake index afc24b5e9..ba88228a0 100644 --- a/cmake/EigenConfigureTesting.cmake +++ b/cmake/EigenConfigureTesting.cmake @@ -11,16 +11,18 @@ add_custom_target(buildtests) add_custom_target(check COMMAND "ctest") add_dependencies(check buildtests) -# check whether /bin/bash exists -find_file(EIGEN_BIN_BASH_EXISTS "/bin/bash" PATHS "/" NO_DEFAULT_PATH) +# check whether /bin/bash exists (disabled as not used anymore) +# find_file(EIGEN_BIN_BASH_EXISTS "/bin/bash" PATHS "/" NO_DEFAULT_PATH) # This call activates testing and generates the DartConfiguration.tcl include(CTest) set(EIGEN_TEST_BUILD_FLAGS "" CACHE STRING "Options passed to the build command of unit tests") +set(EIGEN_DASHBOARD_BUILD_TARGET "buildtests" CACHE STRING "Target to be built in dashboard mode, default is buildtests") +set(EIGEN_CTEST_ERROR_EXCEPTION "" CACHE STRING "Regular expression for build error messages to be filtered out") # Overwrite default DartConfiguration.tcl such that ctest can build our unit tests. -# Recall that our unit tests are not in the "all" target, so we have to explicitely ask ctest to build our custom 'buildtests' target. +# Recall that our unit tests are not in the "all" target, so we have to explicitly ask ctest to build our custom 'buildtests' target. # At this stage, we can also add custom flags to the build tool through the user defined EIGEN_TEST_BUILD_FLAGS variable. file(READ "${CMAKE_CURRENT_BINARY_DIR}/DartConfiguration.tcl" EIGEN_DART_CONFIG_FILE) # try to grab the default flags @@ -28,7 +30,7 @@ string(REGEX MATCH "MakeCommand:.*-- (.*)\nDefaultCTestConfigurationType" EIGEN_ if(NOT CMAKE_MATCH_1) string(REGEX MATCH "MakeCommand:.*[^c]make (.*)\nDefaultCTestConfigurationType" EIGEN_DUMMY ${EIGEN_DART_CONFIG_FILE}) endif() -string(REGEX REPLACE "MakeCommand:.*DefaultCTestConfigurationType" "MakeCommand: ${CMAKE_COMMAND} --build . --target buildtests --config \"\${CTEST_CONFIGURATION_TYPE}\" -- ${CMAKE_MATCH_1} ${EIGEN_TEST_BUILD_FLAGS}\nDefaultCTestConfigurationType" +string(REGEX REPLACE "MakeCommand:.*DefaultCTestConfigurationType" "MakeCommand: ${CMAKE_COMMAND} --build . 
--target ${EIGEN_DASHBOARD_BUILD_TARGET} --config \"\${CTEST_CONFIGURATION_TYPE}\" -- ${CMAKE_MATCH_1} ${EIGEN_TEST_BUILD_FLAGS}\nDefaultCTestConfigurationType" EIGEN_DART_CONFIG_FILE2 ${EIGEN_DART_CONFIG_FILE}) file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/DartConfiguration.tcl" ${EIGEN_DART_CONFIG_FILE2}) @@ -39,7 +41,7 @@ ei_init_testing() # configure Eigen related testing options option(EIGEN_NO_ASSERTION_CHECKING "Disable checking of assertions using exceptions" OFF) -option(EIGEN_DEBUG_ASSERTS "Enable advanced debuging of assertions" OFF) +option(EIGEN_DEBUG_ASSERTS "Enable advanced debugging of assertions" OFF) if(CMAKE_COMPILER_IS_GNUCXX) option(EIGEN_COVERAGE_TESTING "Enable/disable gcov" OFF) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index a92a2978b..35deed509 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -19,19 +19,28 @@ macro(ei_add_test_internal testname testname_with_suffix) endif() if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu) - if(EIGEN_TEST_CUDA_CLANG) + if(EIGEN_TEST_HIP) + hip_reset_flags() + hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP ${ARGV2}") + elseif(EIGEN_TEST_CUDA_CLANG) set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX) - if(CUDA_64_BIT_DEVICE_CODE) + + if(CUDA_64_BIT_DEVICE_CODE AND (EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64")) link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64") else() link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib") endif() + if (${ARGC} GREATER 2) add_executable(${targetname} ${filename}) else() add_executable(${targetname} ${filename} OPTIONS ${ARGV2}) endif() - target_link_libraries(${targetname} "cudart_static" "cuda" "dl" "rt" "pthread") + set(CUDA_CLANG_LINK_LIBRARIES "cudart_static" "cuda" "dl" "pthread") + if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(CUDA_CLANG_LINK_LIBRARIES ${CUDA_CLANG_LINK_LIBRARIES} "rt") + endif() + target_link_libraries(${targetname} ${CUDA_CLANG_LINK_LIBRARIES}) else() if (${ARGC} GREATER 2) cuda_add_executable(${targetname} ${filename} OPTIONS ${ARGV2}) @@ -59,8 +68,6 @@ macro(ei_add_test_internal testname testname_with_suffix) ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}") - ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_FUNC=${testname}") - if(MSVC) ei_add_target_property(${targetname} COMPILE_FLAGS "/bigobj") endif() @@ -99,7 +106,7 @@ macro(ei_add_test_internal testname testname_with_suffix) add_test(${testname_with_suffix} "${targetname}") - # Specify target and test labels accoirding to EIGEN_CURRENT_SUBPROJECT + # Specify target and test labels according to EIGEN_CURRENT_SUBPROJECT get_property(current_subproject GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT) if ((current_subproject) AND (NOT (current_subproject STREQUAL ""))) set_property(TARGET ${targetname} PROPERTY LABELS "Build${current_subproject}") @@ -111,7 +118,6 @@ endmacro(ei_add_test_internal) # SYCL macro(ei_add_test_internal_sycl testname testname_with_suffix) - include_directories( SYSTEM ${COMPUTECPP_PACKAGE_ROOT_DIR}/include) set(targetname ${testname_with_suffix}) if(EIGEN_ADD_TEST_FILENAME_EXTENSION) @@ -120,23 +126,31 @@ macro(ei_add_test_internal_sycl testname testname_with_suffix) set(filename ${testname}.cpp) endif() - set( include_file ${CMAKE_CURRENT_BINARY_DIR}/inc_${filename}) - set( bc_file ${CMAKE_CURRENT_BINARY_DIR}/${filename}) - set( host_file ${CMAKE_CURRENT_SOURCE_DIR}/${filename}) + set( include_file "${CMAKE_CURRENT_BINARY_DIR}/inc_${filename}") + set( bc_file 
"${CMAKE_CURRENT_BINARY_DIR}/${filename}.sycl") + set( host_file "${CMAKE_CURRENT_SOURCE_DIR}/${filename}") - ADD_CUSTOM_COMMAND( - OUTPUT ${include_file} - COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${host_file}\\\"" > ${include_file} - COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${bc_file}.sycl\\\"" >> ${include_file} - DEPENDS ${filename} ${bc_file}.sycl - COMMENT "Building ComputeCpp integration header file ${include_file}" - ) - # Add a custom target for the generated integration header - add_custom_target(${testname}_integration_header_sycl DEPENDS ${include_file}) + if(NOT EIGEN_SYCL_TRISYCL) + include_directories( SYSTEM ${COMPUTECPP_PACKAGE_ROOT_DIR}/include) - add_executable(${targetname} ${include_file}) - add_dependencies(${targetname} ${testname}_integration_header_sycl) - add_sycl_to_target(${targetname} ${filename} ${CMAKE_CURRENT_BINARY_DIR}) + ADD_CUSTOM_COMMAND( + OUTPUT ${include_file} + COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${host_file}\\\"" > ${include_file} + COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\\"${bc_file}\\\"" >> ${include_file} + DEPENDS ${filename} ${bc_file} + COMMENT "Building ComputeCpp integration header file ${include_file}" + ) + + # Add a custom target for the generated integration header + add_custom_target("${testname}_integration_header_sycl" DEPENDS ${include_file}) + + add_executable(${targetname} ${include_file}) + add_dependencies(${targetname} "${testname}_integration_header_sycl") + else() + add_executable(${targetname} ${host_file}) + endif() + + add_sycl_to_target(${targetname} ${CMAKE_CURRENT_BINARY_DIR} ${filename}) if (targetname MATCHES "^eigen2_") add_dependencies(eigen2_buildtests ${targetname}) @@ -154,8 +168,6 @@ macro(ei_add_test_internal_sycl testname testname_with_suffix) ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}") - ei_add_target_property(${targetname} COMPILE_FLAGS "-DEIGEN_TEST_FUNC=${testname}") - if(MSVC AND NOT EIGEN_SPLIT_LARGE_TESTS) ei_add_target_property(${targetname} COMPILE_FLAGS "/bigobj") endif() @@ -240,7 +252,7 @@ endmacro(ei_add_test_internal_sycl) # # If EIGEN_SPLIT_LARGE_TESTS is ON, the test is split into multiple executables # test__ -# where N runs from 1 to the greatest occurence found in the source file. Each of these +# where N runs from 1 to the greatest occurrence found in the source file. Each of these # executables is built passing -DEIGEN_TEST_PART_N. This allows to split large tests # into smaller executables. # @@ -260,26 +272,28 @@ macro(ei_add_test testname) endif() file(READ "${filename}" test_source) - set(parts 0) string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+|EIGEN_SUFFIXES(;[0-9]+)+" - occurences "${test_source}") - string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurences}") + occurrences "${test_source}") + string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurrences}") list(REMOVE_DUPLICATES suffixes) - if(EIGEN_SPLIT_LARGE_TESTS AND suffixes) + set(explicit_suffixes "") + if( (NOT EIGEN_SPLIT_LARGE_TESTS) AND suffixes) + # Check whether we have EIGEN_TEST_PART_* statements, in which case we likely must enforce splitting. + # For instance, indexed_view activate a different c++ version for each part. 
+ string(REGEX MATCHALL "EIGEN_TEST_PART_[0-9]+" occurrences "${test_source}") + string(REGEX REPLACE "EIGEN_TEST_PART_" "" explicit_suffixes "${occurrences}") + list(REMOVE_DUPLICATES explicit_suffixes) + endif() + if( (EIGEN_SPLIT_LARGE_TESTS AND suffixes) OR explicit_suffixes) add_custom_target(${testname}) foreach(suffix ${suffixes}) ei_add_test_internal(${testname} ${testname}_${suffix} "${ARGV1} -DEIGEN_TEST_PART_${suffix}=1" "${ARGV2}") add_dependencies(${testname} ${testname}_${suffix}) endforeach(suffix) - else(EIGEN_SPLIT_LARGE_TESTS AND suffixes) - set(symbols_to_enable_all_parts "") - foreach(suffix ${suffixes}) - set(symbols_to_enable_all_parts - "${symbols_to_enable_all_parts} -DEIGEN_TEST_PART_${suffix}=1") - endforeach(suffix) - ei_add_test_internal(${testname} ${testname} "${ARGV1} ${symbols_to_enable_all_parts}" "${ARGV2}") - endif(EIGEN_SPLIT_LARGE_TESTS AND suffixes) + else() + ei_add_test_internal(${testname} ${testname} "${ARGV1} -DEIGEN_TEST_PART_ALL=1" "${ARGV2}") + endif() endmacro(ei_add_test) macro(ei_add_test_sycl testname) @@ -296,8 +310,8 @@ macro(ei_add_test_sycl testname) file(READ "${filename}" test_source) set(parts 0) string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+|EIGEN_TEST_PART_[0-9]+|EIGEN_SUFFIXES(;[0-9]+)+" - occurences "${test_source}") - string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurences}") + occurrences "${test_source}") + string(REGEX REPLACE "CALL_SUBTEST_|EIGEN_TEST_PART_|EIGEN_SUFFIXES" "" suffixes "${occurrences}") list(REMOVE_DUPLICATES suffixes) if(EIGEN_SPLIT_LARGE_TESTS AND suffixes) add_custom_target(${testname}) @@ -442,6 +456,12 @@ macro(ei_testing_print_summary) message(STATUS "VSX: Using architecture defaults") endif() + if(EIGEN_TEST_MSA) + message(STATUS "MIPS MSA: ON") + else() + message(STATUS "MIPS MSA: Using architecture defaults") + endif() + if(EIGEN_TEST_NEON) message(STATUS "ARM NEON: ON") else() @@ -467,7 +487,11 @@ macro(ei_testing_print_summary) endif() if(EIGEN_TEST_SYCL) - message(STATUS "SYCL: ON") + if(EIGEN_SYCL_TRISYCL) + message(STATUS "SYCL: ON (using triSYCL)") + else() + message(STATUS "SYCL: ON (using computeCPP)") + endif() else() message(STATUS "SYCL: OFF") endif() @@ -480,6 +504,11 @@ macro(ei_testing_print_summary) else() message(STATUS "CUDA: OFF") endif() + if(EIGEN_TEST_HIP) + message(STATUS "HIP: ON (using hipcc)") + else() + message(STATUS "HIP: OFF") + endif() endif() # vectorization / alignment options @@ -538,6 +567,8 @@ macro(ei_get_compilerver VAR) else() set(${VAR} "na") endif() + elseif(${CMAKE_CXX_COMPILER_ID} MATCHES "PGI") + set(${VAR} "${CMAKE_CXX_COMPILER_ID}-${CMAKE_CXX_COMPILER_VERSION}") else() # on all other system we rely on ${CMAKE_CXX_COMPILER} # supporting a "--version" or "/version" flag @@ -634,6 +665,8 @@ macro(ei_get_cxxflags VAR) set(${VAR} SSE3) elseif(EIGEN_TEST_SSE2 OR IS_64BIT_ENV) set(${VAR} SSE2) + elseif(EIGEN_TEST_MSA) + set(${VAR} MSA) endif() if(EIGEN_TEST_OPENMP) @@ -666,6 +699,10 @@ macro(ei_set_build_string) set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-${LOCAL_COMPILER_FLAGS}) endif() + if(EIGEN_TEST_EXTERNAL_BLAS) + set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-external_blas) + endif() + ei_is_64bit_env(IS_64BIT_ENV) if(NOT IS_64BIT_ENV) set(TMP_BUILD_STRING ${TMP_BUILD_STRING}-32bit) diff --git a/cmake/FindBLAS.cmake b/cmake/FindBLAS.cmake index 68c4e0724..e3395bc10 100644 --- a/cmake/FindBLAS.cmake +++ b/cmake/FindBLAS.cmake @@ -1,385 +1,1363 @@ -# Find BLAS library +### # -# This module finds an installed library that implements 
the BLAS +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2016 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# - Find BLAS library +# This module finds an installed fortran library that implements the BLAS # linear-algebra interface (see http://www.netlib.org/blas/). -# The list of libraries searched for is mainly taken +# The list of libraries searched for is taken # from the autoconf macro file, acx_blas.m4 (distributed at # http://ac-archive.sourceforge.net/ac-archive/acx_blas.html). # # This module sets the following variables: # BLAS_FOUND - set to true if a library implementing the BLAS interface # is found -# BLAS_INCLUDE_DIR - Directories containing the BLAS header files -# BLAS_DEFINITIONS - Compilation options to use BLAS -# BLAS_LINKER_FLAGS - Linker flags to use BLAS (excluding -l +# BLAS_LINKER_FLAGS - uncached list of required linker flags (excluding -l # and -L). -# BLAS_LIBRARIES_DIR - Directories containing the BLAS libraries. -# May be null if BLAS_LIBRARIES contains libraries name using full path. -# BLAS_LIBRARIES - List of libraries to link against BLAS interface. -# May be null if the compiler supports auto-link (e.g. VC++). -# BLAS_USE_FILE - The name of the cmake module to include to compile -# applications or libraries using BLAS. +# BLAS_COMPILER_FLAGS - uncached list of required compiler flags (including -I for mkl headers). +# BLAS_LIBRARIES - uncached list of libraries (using full path name) to +# link against to use BLAS +# BLAS95_LIBRARIES - uncached list of libraries (using full path name) +# to link against to use BLAS95 interface +# BLAS95_FOUND - set to true if a library implementing the BLAS f95 interface +# is found +# BLA_STATIC if set on this determines what kind of linkage we do (static) +# BLA_VENDOR if set checks only the specified vendor, if not set checks +# all the possibilities +# BLAS_VENDOR_FOUND stores the BLAS vendor found +# BLA_F95 if set on tries to find the f95 interfaces for BLAS/LAPACK +# The user can give specific paths where to find the libraries adding cmake +# options at configure (ex: cmake path/to/project -DBLAS_DIR=path/to/blas): +# BLAS_DIR - Where to find the base directory of blas +# BLAS_INCDIR - Where to find the header files +# BLAS_LIBDIR - Where to find the library files +# The module can also look for the following environment variables if paths +# are not given as cmake variable: BLAS_DIR, BLAS_INCDIR, BLAS_LIBDIR +# For MKL case and if no paths are given as hints, we will try to use the MKLROOT +# environment variable +# BLAS_VERBOSE Print some additional information during BLAS libraries detection +########## +### List of vendors (BLA_VENDOR) valid in this module +########## List of vendors (BLA_VENDOR) valid in this module +## Open (for OpenBlas), Eigen (for EigenBlas), Goto, ATLAS PhiPACK, +##  CXML, DXML, SunPerf, SCSL, SGIMATH, IBMESSL, IBMESSLMT +## Intel10_32 (intel mkl v10 32 bit), Intel10_64lp (intel mkl v10 64 bit,lp thread model, lp64 model), +## Intel10_64lp_seq (intel mkl v10 64 bit,sequential code, lp64 model), +## Intel( older versions of mkl 32 and 64 bit), +##  ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic +# C/CXX should be enabled to use Intel mkl +### +# We handle different modes to find the dependency # -# This module was modified by CGAL team: -# - find libraries for a C++ compiler, 
instead of Fortran
-# - added BLAS_INCLUDE_DIR, BLAS_DEFINITIONS and BLAS_LIBRARIES_DIR
-# - removed BLAS95_LIBRARIES
+# - Detection of a version already installed on the system
+# - BLAS libraries can be detected in different ways
+# Here is the order of precedence:
+# 1) we look in cmake variable BLAS_LIBDIR or BLAS_DIR (we guess the libdirs) if defined
+# 2) we look in environment variable BLAS_LIBDIR or BLAS_DIR (we guess the libdirs) if defined
+# 3) we look in common environment variables depending on the system (INCLUDE, C_INCLUDE_PATH, CPATH - LIB, DYLD_LIBRARY_PATH, LD_LIBRARY_PATH)
+# 4) we look in common system paths depending on the system, see for example paths contained in the following cmake variables:
+#   - CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES, CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES
+#   - CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES, CMAKE_C_IMPLICIT_LINK_DIRECTORIES
+#
+
+#=============================================================================
+# Copyright 2007-2009 Kitware, Inc.
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+## Some macros to print status when searching for headers and libs
+# This macro informs why the _lib_to_find file has not been found
+macro(Print_Find_Library_Blas_Status _libname _lib_to_find)
+
+  # save _libname upper/lower case
+  string(TOUPPER ${_libname} LIBNAME)
+  string(TOLOWER ${_libname} libname)
+
+  # print status
+  #message(" ")
+  if(${LIBNAME}_LIBDIR)
+    message("${Yellow}${LIBNAME}_LIBDIR is defined but ${_lib_to_find}"
+      "has not been found in ${ARGN}${ColourReset}")
+  else()
+    if(${LIBNAME}_DIR)
+      message("${Yellow}${LIBNAME}_DIR is defined but ${_lib_to_find}"
+        "has not been found in ${ARGN}${ColourReset}")
+    else()
+      message("${Yellow}${_lib_to_find} not found."
+        "Neither ${LIBNAME}_DIR nor ${LIBNAME}_LIBDIR"
+        "is defined, so we look for ${_lib_to_find} in"
+        "system paths (Linux: LD_LIBRARY_PATH, Windows: LIB,"
+        "Mac: DYLD_LIBRARY_PATH,"
+        "CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES,"
+        "CMAKE_C_IMPLICIT_LINK_DIRECTORIES)${ColourReset}")
+      if(_lib_env)
+        message("${Yellow}${_lib_to_find} has not been found in"
+          "${_lib_env}${ColourReset}")
+      endif()
+    endif()
+  endif()
+  message("${BoldYellow}Please indicate where to find ${_lib_to_find}. You have four options:\n"
+    "- Option 1: Provide the installation directory of BLAS library with cmake option: -D${LIBNAME}_DIR=your/path/to/${libname}/\n"
+    "- Option 2: Provide the directory where to find the library with cmake option: -D${LIBNAME}_LIBDIR=your/path/to/${libname}/lib/\n"
+    "- Option 3: Update your environment variable (Linux: LD_LIBRARY_PATH, Windows: LIB, Mac: DYLD_LIBRARY_PATH)\n"
+    "- Option 4: If your library provides a PkgConfig file, make sure pkg-config finds your library${ColourReset}")
+
+endmacro()
+
+# This macro informs why the check of the symbol _name failed
+macro(Print_Find_Library_Blas_CheckFunc_Status _name)
+
+  # save _name upper/lower case
+  string(TOUPPER ${_name} FUNCNAME)
+  string(TOLOWER ${_name} funcname)
+
+  # print status
+  #message(" ")
+  message("${Red}Libs have been found but the check of symbol ${_name} failed "
+    "with the following libraries ${ARGN}${ColourReset}")
+  message("${BoldRed}Please open your error file CMakeFiles/CMakeError.log"
+    "to figure out why it fails${ColourReset}")
+  #message(" ")
+
+endmacro()
+
+if (NOT BLAS_FOUND)
+  set(BLAS_DIR "" CACHE PATH "Installation directory of BLAS library")
+  if (NOT BLAS_FIND_QUIETLY)
+    message(STATUS "A cache variable, namely BLAS_DIR, has been set to specify the install directory of BLAS")
+  endif()
+endif()
+
+option(BLAS_VERBOSE "Print some additional information during BLAS libraries detection" OFF)
+mark_as_advanced(BLAS_VERBOSE)
 
 include(CheckFunctionExists)
+include(CheckFortranFunctionExists)
+set(_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
 
-# This macro checks for the existence of the combination of fortran libraries
-# given by _list. If the combination is found, this macro checks (using the
-# check_function_exists macro) whether can link against that library
-# combination using the name of a routine given by _name using the linker
-# flags given by _flags. If the combination of libraries is found and passes
-# the link test, LIBRARIES is set to the list of complete library paths that
-# have been found and DEFINITIONS to the required definitions.
-# Otherwise, LIBRARIES is set to FALSE.
-# N.B. _prefix is the prefix applied to the names of all cached variables that
-# are generated internally and marked advanced by this macro.
-macro(check_fortran_libraries DEFINITIONS LIBRARIES _prefix _name _flags _list _path)
-  #message("DEBUG: check_fortran_libraries(${_list} in ${_path})")
+# Check the language being used
+get_property( _LANGUAGES_ GLOBAL PROPERTY ENABLED_LANGUAGES )
+if( _LANGUAGES_ MATCHES Fortran AND CMAKE_Fortran_COMPILER)
+  set( _CHECK_FORTRAN TRUE )
+elseif( (_LANGUAGES_ MATCHES C) OR (_LANGUAGES_ MATCHES CXX) )
+  set( _CHECK_FORTRAN FALSE )
+else()
+  if(BLAS_FIND_REQUIRED)
+    message(FATAL_ERROR "FindBLAS requires Fortran, C, or C++ to be enabled.")
+  else()
+    message(STATUS "Looking for BLAS... - NOT found (Unsupported languages)")
+    return()
+  endif()
+endif()
 
-  # Check for the existence of the libraries given by _list
-  set(_libraries_found TRUE)
-  set(_libraries_work FALSE)
-  set(${DEFINITIONS} "")
-  set(${LIBRARIES} "")
+macro(Check_Fortran_Libraries LIBRARIES _prefix _name _flags _list _thread)
+  # This macro checks for the existence of the combination of fortran libraries
+  # given by _list. If the combination is found, this macro checks (using the
+  # Check_Fortran_Function_Exists macro) whether one can link against that library
+  # combination using the name of a routine given by _name using the linker
+  # flags given by _flags.
If the combination of libraries is found and passes + # the link test, LIBRARIES is set to the list of complete library paths that + # have been found. Otherwise, LIBRARIES is set to FALSE. + + # N.B. _prefix is the prefix applied to the names of all cached variables that + # are generated internally and marked advanced by this macro. + + set(_libdir ${ARGN}) + + set(_libraries_work TRUE) + set(${LIBRARIES}) set(_combined_name) + set(ENV_MKLROOT "$ENV{MKLROOT}") + set(ENV_BLAS_DIR "$ENV{BLAS_DIR}") + set(ENV_BLAS_LIBDIR "$ENV{BLAS_LIBDIR}") + if (NOT _libdir) + if (BLAS_LIBDIR) + list(APPEND _libdir "${BLAS_LIBDIR}") + elseif (BLAS_DIR) + list(APPEND _libdir "${BLAS_DIR}") + list(APPEND _libdir "${BLAS_DIR}/lib") + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + list(APPEND _libdir "${BLAS_DIR}/lib64") + list(APPEND _libdir "${BLAS_DIR}/lib/intel64") + else() + list(APPEND _libdir "${BLAS_DIR}/lib32") + list(APPEND _libdir "${BLAS_DIR}/lib/ia32") + endif() + elseif(ENV_BLAS_LIBDIR) + list(APPEND _libdir "${ENV_BLAS_LIBDIR}") + elseif(ENV_BLAS_DIR) + list(APPEND _libdir "${ENV_BLAS_DIR}") + list(APPEND _libdir "${ENV_BLAS_DIR}/lib") + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + list(APPEND _libdir "${ENV_BLAS_DIR}/lib64") + list(APPEND _libdir "${ENV_BLAS_DIR}/lib/intel64") + else() + list(APPEND _libdir "${ENV_BLAS_DIR}/lib32") + list(APPEND _libdir "${ENV_BLAS_DIR}/lib/ia32") + endif() + else() + if (ENV_MKLROOT) + list(APPEND _libdir "${ENV_MKLROOT}/lib") + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + list(APPEND _libdir "${ENV_MKLROOT}/lib64") + list(APPEND _libdir "${ENV_MKLROOT}/lib/intel64") + else() + list(APPEND _libdir "${ENV_MKLROOT}/lib32") + list(APPEND _libdir "${ENV_MKLROOT}/lib/ia32") + endif() + endif() + if (WIN32) + string(REPLACE ":" ";" _libdir2 "$ENV{LIB}") + elseif (APPLE) + string(REPLACE ":" ";" _libdir2 "$ENV{DYLD_LIBRARY_PATH}") + else () + string(REPLACE ":" ";" _libdir2 "$ENV{LD_LIBRARY_PATH}") + endif () + list(APPEND _libdir "${_libdir2}") + list(APPEND _libdir "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _libdir "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + endif() + endif () + + if (BLAS_VERBOSE) + message("${Cyan}Try to find BLAS libraries: ${_list}") + endif () + foreach(_library ${_list}) set(_combined_name ${_combined_name}_${_library}) - if(_libraries_found) - # search first in ${_path} - find_library(${_prefix}_${_library}_LIBRARY - NAMES ${_library} - PATHS ${_path} NO_DEFAULT_PATH - ) - # if not found, search in environment variables and system - if ( WIN32 ) - find_library(${_prefix}_${_library}_LIBRARY - NAMES ${_library} - PATHS ENV LIB - ) - elseif ( APPLE ) - find_library(${_prefix}_${_library}_LIBRARY - NAMES ${_library} - PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV DYLD_LIBRARY_PATH - ) + if(_libraries_work) + if (BLA_STATIC) + if (WIN32) + set(CMAKE_FIND_LIBRARY_SUFFIXES .lib ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif () + if (APPLE) + set(CMAKE_FIND_LIBRARY_SUFFIXES .lib ${CMAKE_FIND_LIBRARY_SUFFIXES}) + else () + set(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif () else () - find_library(${_prefix}_${_library}_LIBRARY - NAMES ${_library} - PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV LD_LIBRARY_PATH - ) - endif() + if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + # for ubuntu's libblas3gf and liblapack3gf packages + set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES} .so.3gf) + endif () + endif () + find_library(${_prefix}_${_library}_LIBRARY + NAMES ${_library} + HINTS ${_libdir} + 
NO_DEFAULT_PATH + ) mark_as_advanced(${_prefix}_${_library}_LIBRARY) + # Print status if not found + # ------------------------- + if (NOT ${_prefix}_${_library}_LIBRARY AND NOT BLAS_FIND_QUIETLY AND BLAS_VERBOSE) + Print_Find_Library_Blas_Status(blas ${_library} ${_libdir}) + endif () set(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY}) - set(_libraries_found ${${_prefix}_${_library}_LIBRARY}) - endif(_libraries_found) + set(_libraries_work ${${_prefix}_${_library}_LIBRARY}) + endif(_libraries_work) endforeach(_library ${_list}) - if(_libraries_found) - set(_libraries_found ${${LIBRARIES}}) - endif() - # Test this combination of libraries with the Fortran/f2c interface. - # We test the Fortran interface first as it is well standardized. - if(_libraries_found AND NOT _libraries_work) - set(${DEFINITIONS} "-D${_prefix}_USE_F2C") - set(${LIBRARIES} ${_libraries_found}) - # Some C++ linkers require the f2c library to link with Fortran libraries. - # I do not know which ones, thus I just add the f2c library if it is available. - find_package( F2C QUIET ) - if ( F2C_FOUND ) - set(${DEFINITIONS} ${${DEFINITIONS}} ${F2C_DEFINITIONS}) - set(${LIBRARIES} ${${LIBRARIES}} ${F2C_LIBRARIES}) + if(_libraries_work) + # Test this combination of libraries. + if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND BLA_STATIC) + list(INSERT ${LIBRARIES} 0 "-Wl,--start-group") + list(APPEND ${LIBRARIES} "-Wl,--end-group") endif() - set(CMAKE_REQUIRED_DEFINITIONS ${${DEFINITIONS}}) - set(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}}) - #message("DEBUG: CMAKE_REQUIRED_DEFINITIONS = ${CMAKE_REQUIRED_DEFINITIONS}") - #message("DEBUG: CMAKE_REQUIRED_LIBRARIES = ${CMAKE_REQUIRED_LIBRARIES}") - # Check if function exists with f2c calling convention (ie a trailing underscore) - check_function_exists(${_name}_ ${_prefix}_${_name}_${_combined_name}_f2c_WORKS) - set(CMAKE_REQUIRED_DEFINITIONS} "") - set(CMAKE_REQUIRED_LIBRARIES "") - mark_as_advanced(${_prefix}_${_name}_${_combined_name}_f2c_WORKS) - set(_libraries_work ${${_prefix}_${_name}_${_combined_name}_f2c_WORKS}) - endif(_libraries_found AND NOT _libraries_work) - - # If not found, test this combination of libraries with a C interface. - # A few implementations (ie ACML) provide a C interface. Unfortunately, there is no standard. - if(_libraries_found AND NOT _libraries_work) - set(${DEFINITIONS} "") - set(${LIBRARIES} ${_libraries_found}) - set(CMAKE_REQUIRED_DEFINITIONS "") - set(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}}) - #message("DEBUG: CMAKE_REQUIRED_LIBRARIES = ${CMAKE_REQUIRED_LIBRARIES}") - check_function_exists(${_name} ${_prefix}_${_name}${_combined_name}_WORKS) - set(CMAKE_REQUIRED_LIBRARIES "") - mark_as_advanced(${_prefix}_${_name}${_combined_name}_WORKS) - set(_libraries_work ${${_prefix}_${_name}${_combined_name}_WORKS}) - endif(_libraries_found AND NOT _libraries_work) - - # on failure - if(NOT _libraries_work) - set(${DEFINITIONS} "") - set(${LIBRARIES} FALSE) + set(CMAKE_REQUIRED_LIBRARIES "${_flags};${${LIBRARIES}};${_thread}") + set(CMAKE_REQUIRED_FLAGS "${BLAS_COMPILER_FLAGS}") + if (BLAS_VERBOSE) + message("${Cyan}BLAS libs found for BLA_VENDOR ${BLA_VENDOR}." 
+ "Try to compile symbol ${_name} with following libraries:" + "${CMAKE_REQUIRED_LIBRARIES}") + endif () + if(NOT BLAS_FOUND) + unset(${_prefix}${_combined_name}_WORKS CACHE) + endif() + if (_CHECK_FORTRAN) + if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") + string(REPLACE "mkl_intel_lp64" "mkl_gf_lp64" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + string(REPLACE "mkl_intel_ilp64" "mkl_gf_ilp64" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + endif() + check_fortran_function_exists("${_name}" ${_prefix}${_combined_name}_WORKS) + else() + check_function_exists("${_name}_" ${_prefix}${_combined_name}_WORKS) + endif() + mark_as_advanced(${_prefix}${_combined_name}_WORKS) + set(_libraries_work ${${_prefix}${_combined_name}_WORKS}) + # Print status if not found + # ------------------------- + if (NOT _libraries_work AND NOT BLAS_FIND_QUIETLY AND BLAS_VERBOSE) + Print_Find_Library_Blas_CheckFunc_Status(${_name} ${CMAKE_REQUIRED_LIBRARIES}) + endif () + set(CMAKE_REQUIRED_LIBRARIES) endif() - #message("DEBUG: ${DEFINITIONS} = ${${DEFINITIONS}}") - #message("DEBUG: ${LIBRARIES} = ${${LIBRARIES}}") -endmacro(check_fortran_libraries) + + if(_libraries_work) + set(${LIBRARIES} ${${LIBRARIES}} ${_thread}) + else(_libraries_work) + set(${LIBRARIES} FALSE) + endif(_libraries_work) + +endmacro(Check_Fortran_Libraries) -# -# main -# +set(BLAS_LINKER_FLAGS) +set(BLAS_LIBRARIES) +set(BLAS95_LIBRARIES) +if ($ENV{BLA_VENDOR} MATCHES ".+") + set(BLA_VENDOR $ENV{BLA_VENDOR}) +else () + if(NOT BLA_VENDOR) + set(BLA_VENDOR "All") + endif() +endif () -# Is it already configured? -if (BLAS_LIBRARIES_DIR OR BLAS_LIBRARIES) +#BLAS in intel mkl 10 library? (em64t 64bit) +if (BLA_VENDOR MATCHES "Intel*" OR BLA_VENDOR STREQUAL "All") - set(BLAS_FOUND TRUE) + if(NOT BLAS_LIBRARIES OR BLA_VENDOR MATCHES "Intel*") + # Looking for include + # ------------------- -else() + # Add system include paths to search include + # ------------------------------------------ + unset(_inc_env) + set(ENV_MKLROOT "$ENV{MKLROOT}") + set(ENV_BLAS_DIR "$ENV{BLAS_DIR}") + set(ENV_BLAS_INCDIR "$ENV{BLAS_INCDIR}") + if(ENV_BLAS_INCDIR) + list(APPEND _inc_env "${ENV_BLAS_INCDIR}") + elseif(ENV_BLAS_DIR) + list(APPEND _inc_env "${ENV_BLAS_DIR}") + list(APPEND _inc_env "${ENV_BLAS_DIR}/include") + else() + if (ENV_MKLROOT) + list(APPEND _inc_env "${ENV_MKLROOT}/include") + endif() + # system variables + if(WIN32) + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() + endif() + list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") + list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") + list(REMOVE_DUPLICATES _inc_env) - # reset variables - set( BLAS_INCLUDE_DIR "" ) - set( BLAS_DEFINITIONS "" ) - set( BLAS_LINKER_FLAGS "" ) - set( BLAS_LIBRARIES "" ) - set( BLAS_LIBRARIES_DIR "" ) + # set paths where to look for + set(PATH_TO_LOOK_FOR "${_inc_env}") - # - # If Unix, search for BLAS function in possible libraries - # + # Try to find the fftw header in the given paths + # ------------------------------------------------- + # call cmake macro to find the header path + 
if(BLAS_INCDIR) + set(BLAS_mkl.h_DIRS "BLAS_mkl.h_DIRS-NOTFOUND") + find_path(BLAS_mkl.h_DIRS + NAMES mkl.h + HINTS ${BLAS_INCDIR}) + else() + if(BLAS_DIR) + set(BLAS_mkl.h_DIRS "BLAS_mkl.h_DIRS-NOTFOUND") + find_path(BLAS_mkl.h_DIRS + NAMES mkl.h + HINTS ${BLAS_DIR} + PATH_SUFFIXES "include") + else() + set(BLAS_mkl.h_DIRS "BLAS_mkl.h_DIRS-NOTFOUND") + find_path(BLAS_mkl.h_DIRS + NAMES mkl.h + HINTS ${PATH_TO_LOOK_FOR}) + endif() + endif() + mark_as_advanced(BLAS_mkl.h_DIRS) - # BLAS in ATLAS library? (http://math-atlas.sourceforge.net/) - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + # If found, add path to cmake variable + # ------------------------------------ + if (BLAS_mkl.h_DIRS) + set(BLAS_INCLUDE_DIRS "${BLAS_mkl.h_DIRS}") + else () + set(BLAS_INCLUDE_DIRS "BLAS_INCLUDE_DIRS-NOTFOUND") + if(NOT BLAS_FIND_QUIETLY) + message(STATUS "Looking for BLAS -- mkl.h not found") + endif() + endif() + + if (WIN32) + string(REPLACE ":" ";" _libdir "$ENV{LIB}") + elseif (APPLE) + string(REPLACE ":" ";" _libdir "$ENV{DYLD_LIBRARY_PATH}") + else () + string(REPLACE ":" ";" _libdir "$ENV{LD_LIBRARY_PATH}") + endif () + list(APPEND _libdir "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _libdir "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + # libiomp5 + # -------- + set(OMP_iomp5_LIBRARY "OMP_iomp5_LIBRARY-NOTFOUND") + find_library(OMP_iomp5_LIBRARY + NAMES iomp5 + HINTS ${_libdir} + ) + mark_as_advanced(OMP_iomp5_LIBRARY) + set(OMP_LIB "") + # libgomp + # ------- + set(OMP_gomp_LIBRARY "OMP_gomp_LIBRARY-NOTFOUND") + find_library(OMP_gomp_LIBRARY + NAMES gomp + HINTS ${_libdir} + ) + mark_as_advanced(OMP_gomp_LIBRARY) + # choose one or the other depending on the compiler + if (CMAKE_C_COMPILER_ID STREQUAL "GNU") + if (OMP_gomp_LIBRARY) + set(OMP_LIB "${OMP_gomp_LIBRARY}") + endif() + else() # non-GNU compilers (e.g. Intel) fall back to iomp5 + if (OMP_iomp5_LIBRARY) + set(OMP_LIB "${OMP_iomp5_LIBRARY}") + endif() + endif() + + if (UNIX AND NOT WIN32) + # m + find_library(M_LIBRARY + NAMES m + HINTS ${_libdir}) + mark_as_advanced(M_LIBRARY) + if(M_LIBRARY) + set(LM "-lm") + else() + set(LM "") + endif() + # Fortran + set(LGFORTRAN "") + if (CMAKE_C_COMPILER_ID MATCHES "GNU") + find_library( + FORTRAN_gfortran_LIBRARY + NAMES gfortran + HINTS ${_libdir} + ) + mark_as_advanced(FORTRAN_gfortran_LIBRARY) + if (FORTRAN_gfortran_LIBRARY) + set(LGFORTRAN "${FORTRAN_gfortran_LIBRARY}") + endif() + elseif (CMAKE_C_COMPILER_ID MATCHES "Intel") + find_library( + FORTRAN_ifcore_LIBRARY + NAMES ifcore + HINTS ${_libdir} + ) + mark_as_advanced(FORTRAN_ifcore_LIBRARY) + if (FORTRAN_ifcore_LIBRARY) + set(LGFORTRAN "${FORTRAN_ifcore_LIBRARY}") + endif() + endif() + set(BLAS_COMPILER_FLAGS "") + if (NOT BLA_VENDOR STREQUAL "Intel10_64lp_seq") + if (CMAKE_C_COMPILER_ID STREQUAL "Intel") + list(APPEND BLAS_COMPILER_FLAGS "-openmp") + endif() + if (CMAKE_C_COMPILER_ID STREQUAL "GNU") + list(APPEND BLAS_COMPILER_FLAGS "-fopenmp") + endif() + endif() + if (CMAKE_C_COMPILER_ID STREQUAL "GNU") + if (BLA_VENDOR STREQUAL "Intel10_32") + list(APPEND BLAS_COMPILER_FLAGS "-m32") + else() + list(APPEND BLAS_COMPILER_FLAGS "-m64") + endif() + if (NOT BLA_VENDOR STREQUAL "Intel10_64lp_seq") + list(APPEND OMP_LIB "-ldl") + endif() + if (ENV_MKLROOT) + list(APPEND BLAS_COMPILER_FLAGS "-I${ENV_MKLROOT}/include") + endif() + endif() + + set(additional_flags "") + if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(additional_flags "-Wl,--no-as-needed") + endif() + endif () + + if 
(_LANGUAGES_ MATCHES C OR _LANGUAGES_ MATCHES CXX) + if(BLAS_FIND_QUIETLY OR NOT BLAS_FIND_REQUIRED) + find_package(Threads) + else() + find_package(Threads REQUIRED) + endif() + + set(BLAS_SEARCH_LIBS "") + + if(BLA_F95) + + set(BLAS_mkl_SEARCH_SYMBOL SGEMM) + set(_LIBRARIES BLAS95_LIBRARIES) + if (WIN32) + if (BLA_STATIC) + set(BLAS_mkl_DLL_SUFFIX "") + else() + set(BLAS_mkl_DLL_SUFFIX "_dll") + endif() + + # Find the main file (32-bit or 64-bit) + set(BLAS_SEARCH_LIBS_WIN_MAIN "") + if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + "mkl_blas95${BLAS_mkl_DLL_SUFFIX} mkl_intel_c${BLAS_mkl_DLL_SUFFIX}") + endif() + if (BLA_VENDOR STREQUAL "Intel10_64lp*" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + "mkl_blas95_lp64${BLAS_mkl_DLL_SUFFIX} mkl_intel_lp64${BLAS_mkl_DLL_SUFFIX}") + endif () + + # Add threading/sequential libs + set(BLAS_SEARCH_LIBS_WIN_THREAD "") + if (BLA_VENDOR STREQUAL "*_seq" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "mkl_sequential${BLAS_mkl_DLL_SUFFIX}") + endif() + if (NOT BLA_VENDOR STREQUAL "*_seq" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libguide40 mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + # mkl >= 10.3 + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libiomp5md mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + endif() + + # Cartesian product of the above + foreach (MAIN ${BLAS_SEARCH_LIBS_WIN_MAIN}) + foreach (THREAD ${BLAS_SEARCH_LIBS_WIN_THREAD}) + list(APPEND BLAS_SEARCH_LIBS + "${MAIN} ${THREAD} mkl_core${BLAS_mkl_DLL_SUFFIX}") + endforeach() + endforeach() + else (WIN32) + if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95 mkl_intel mkl_intel_thread mkl_core guide") + endif () + if (BLA_VENDOR STREQUAL "Intel10_64lp" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95 mkl_intel_lp64 mkl_intel_thread mkl_core guide") + # mkl >= 10.3 + if (CMAKE_C_COMPILER_ID STREQUAL "Intel") + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95_lp64 mkl_intel_lp64 mkl_intel_thread mkl_core") + endif() + if (CMAKE_C_COMPILER_ID STREQUAL "GNU") + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95_lp64 mkl_intel_lp64 mkl_gnu_thread mkl_core") + endif() + endif () + if (BLA_VENDOR STREQUAL "Intel10_64lp_seq" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl_intel_lp64 mkl_sequential mkl_core") + if (BLA_VENDOR STREQUAL "Intel10_64lp_seq") + set(OMP_LIB "") + endif() + endif () + endif (WIN32) + + else (BLA_F95) + + set(BLAS_mkl_SEARCH_SYMBOL sgemm) + set(_LIBRARIES BLAS_LIBRARIES) + if (WIN32) + if (BLA_STATIC) + set(BLAS_mkl_DLL_SUFFIX "") + else() + set(BLAS_mkl_DLL_SUFFIX "_dll") + endif() + + # Find the main file (32-bit or 64-bit) + set(BLAS_SEARCH_LIBS_WIN_MAIN "") + if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + "mkl_intel_c${BLAS_mkl_DLL_SUFFIX}") + endif() + if (BLA_VENDOR STREQUAL "Intel10_64lp*" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + "mkl_intel_lp64${BLAS_mkl_DLL_SUFFIX}") + endif () + + # Add threading/sequential libs + set(BLAS_SEARCH_LIBS_WIN_THREAD "") + if (NOT BLA_VENDOR STREQUAL "*_seq" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libguide40 mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + # mkl >= 10.3 + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libiomp5md 
mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + endif() + if (BLA_VENDOR STREQUAL "*_seq" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "mkl_sequential${BLAS_mkl_DLL_SUFFIX}") + endif() + + # Cartesian product of the above + foreach (MAIN ${BLAS_SEARCH_LIBS_WIN_MAIN}) + foreach (THREAD ${BLAS_SEARCH_LIBS_WIN_THREAD}) + list(APPEND BLAS_SEARCH_LIBS + "${MAIN} ${THREAD} mkl_core${BLAS_mkl_DLL_SUFFIX}") + endforeach() + endforeach() + else (WIN32) + if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl_intel mkl_intel_thread mkl_core guide") + endif () + if (BLA_VENDOR STREQUAL "Intel10_64lp" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS + "mkl_intel_lp64 mkl_intel_thread mkl_core guide") + # mkl >= 10.3 + if (CMAKE_C_COMPILER_ID STREQUAL "Intel") + list(APPEND BLAS_SEARCH_LIBS + "mkl_intel_lp64 mkl_intel_thread mkl_core") + endif() + if (CMAKE_C_COMPILER_ID STREQUAL "GNU") + list(APPEND BLAS_SEARCH_LIBS + "mkl_intel_lp64 mkl_gnu_thread mkl_core") + endif() + endif () + if (BLA_VENDOR STREQUAL "Intel10_64lp_seq" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl_intel_lp64 mkl_sequential mkl_core") + if (BLA_VENDOR STREQUAL "Intel10_64lp_seq") + set(OMP_LIB "") + endif() + endif () + #older versions of intel mkl libs + if (BLA_VENDOR STREQUAL "Intel" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl") + list(APPEND BLAS_SEARCH_LIBS + "mkl_ia32") + list(APPEND BLAS_SEARCH_LIBS + "mkl_em64t") + endif () + endif (WIN32) + + endif (BLA_F95) + + foreach (IT ${BLAS_SEARCH_LIBS}) + string(REPLACE " " ";" SEARCH_LIBS ${IT}) + if (NOT ${_LIBRARIES}) + check_fortran_libraries( + ${_LIBRARIES} + BLAS + ${BLAS_mkl_SEARCH_SYMBOL} + "${additional_flags}" + "${SEARCH_LIBS}" + "${OMP_LIB};${CMAKE_THREAD_LIBS_INIT};${LM}" + ) + if(${_LIBRARIES}) + set(BLAS_LINKER_FLAGS "${additional_flags}") + endif() + endif() + endforeach () + if(NOT BLAS_FIND_QUIETLY) + if(${_LIBRARIES}) + message(STATUS "Looking for MKL BLAS: found") + else() + message(STATUS "Looking for MKL BLAS: not found") + endif() + endif() + if (${_LIBRARIES} AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Intel MKL") + endif() + endif (_LANGUAGES_ MATCHES C OR _LANGUAGES_ MATCHES CXX) + endif(NOT BLAS_LIBRARIES OR BLA_VENDOR MATCHES "Intel*") +endif (BLA_VENDOR MATCHES "Intel*" OR BLA_VENDOR STREQUAL "All") + + +if (BLA_VENDOR STREQUAL "Goto" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + # gotoblas (http://www.tacc.utexas.edu/tacc-projects/gotoblas2) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" - "cblas;f77blas;atlas" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "goto2" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for Goto BLAS: found") + else() + message(STATUS "Looking for Goto BLAS: not found") + endif() endif() + endif() + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Goto") + endif() - # BLAS in PhiPACK libraries? 
(requires generic BLAS lib, too) - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS +endif (BLA_VENDOR STREQUAL "Goto" OR BLA_VENDOR STREQUAL "All") + + +# OpenBlas +if (BLA_VENDOR STREQUAL "Open" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + # openblas (http://www.openblas.net/) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "openblas" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for Open BLAS: found") + else() + message(STATUS "Looking for Open BLAS: not found") + endif() + endif() + endif() + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Openblas") + endif() + +endif (BLA_VENDOR STREQUAL "Open" OR BLA_VENDOR STREQUAL "All") + + +# EigenBlas +if (BLA_VENDOR STREQUAL "Eigen" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + # eigenblas (http://eigen.tuxfamily.org/index.php?title=Main_Page) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "eigen_blas" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + message(STATUS "Looking for Eigen BLAS: found") + else() + message(STATUS "Looking for Eigen BLAS: not found") + endif() + endif() + endif() + + if(NOT BLAS_LIBRARIES) + # eigenblas (http://eigen.tuxfamily.org/index.php?title=Main_Page) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "eigen_blas_static" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for Eigen BLAS: found") + else() + message(STATUS "Looking for Eigen BLAS: not found") + endif() + endif() + endif() + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Eigen") + endif() + +endif (BLA_VENDOR STREQUAL "Eigen" OR BLA_VENDOR STREQUAL "All") + + +if (BLA_VENDOR STREQUAL "ATLAS" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + # BLAS in ATLAS library? (http://math-atlas.sourceforge.net/) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + dgemm + "" + "f77blas;atlas" + "" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for Atlas BLAS: found") + else() + message(STATUS "Looking for Atlas BLAS: not found") + endif() + endif() + endif() + + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Atlas") + endif() + +endif (BLA_VENDOR STREQUAL "ATLAS" OR BLA_VENDOR STREQUAL "All") + + +# BLAS in PhiPACK libraries? (requires generic BLAS lib, too) +if (BLA_VENDOR STREQUAL "PhiPACK" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" "sgemm;dgemm;blas" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for PhiPACK BLAS: found") + else() + message(STATUS "Looking for PhiPACK BLAS: not found") + endif() endif() + endif() - # BLAS in Alpha CXML library? - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "PhiPACK") + endif() + +endif (BLA_VENDOR STREQUAL "PhiPACK" OR BLA_VENDOR STREQUAL "All") + + +# BLAS in Alpha CXML library? 
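Each of the vendor branches above (Goto, OpenBLAS, Eigen, ATLAS, PhiPACK) instantiates the same template around check_fortran_libraries, and the CXML branch that follows does too. For reference, here is a minimal sketch of that template for a hypothetical vendor "Foo" shipping a single library libfoo_blas; both names are illustrative assumptions, not libraries this module actually knows about:

if (BLA_VENDOR STREQUAL "Foo" OR BLA_VENDOR STREQUAL "All")
  if(NOT BLAS_LIBRARIES)
    check_fortran_libraries(
      BLAS_LIBRARIES   # LIBRARIES: receives the full library paths, or FALSE on failure
      BLAS             # _prefix for the cached *_WORKS test variables
      sgemm            # _name: routine whose linkability validates the combination
      ""               # _flags: extra linker flags
      "foo_blas"       # _list: the library combination to try
      ""               # _thread: threading libraries appended on success
      )
  endif()
  if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND)
    set(BLAS_VENDOR_FOUND "Foo")
  endif()
endif()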
+if (BLA_VENDOR STREQUAL "CXML" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" "cxml" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for CXML BLAS: found") + else() + message(STATUS "Looking for CXML BLAS: not found") + endif() endif() + endif() - # BLAS in Alpha DXML library? (now called CXML, see above) - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "CXML") + endif() + +endif (BLA_VENDOR STREQUAL "CXML" OR BLA_VENDOR STREQUAL "All") + + +# BLAS in Alpha DXML library? (now called CXML, see above) +if (BLA_VENDOR STREQUAL "DXML" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" "dxml" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for DXML BLAS: found") + else() + message(STATUS "Looking for DXML BLAS: not found") + endif() endif() + endif() - # BLAS in Sun Performance library? - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "DXML") + endif() + +endif (BLA_VENDOR STREQUAL "DXML" OR BLA_VENDOR STREQUAL "All") + + +# BLAS in Sun Performance library? +if (BLA_VENDOR STREQUAL "SunPerf" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "-xlic_lib=sunperf" "sunperf;sunmath" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "" ) + if(BLAS_LIBRARIES) + set(BLAS_LINKER_FLAGS "-xlic_lib=sunperf") + endif() + if(NOT BLAS_FIND_QUIETLY) if(BLAS_LIBRARIES) - # Extra linker flag - set(BLAS_LINKER_FLAGS "-xlic_lib=sunperf") + message(STATUS "Looking for SunPerf BLAS: found") + else() + message(STATUS "Looking for SunPerf BLAS: not found") endif() endif() + endif() - # BLAS in SCSL library? (SGI/Cray Scientific Library) - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "SunPerf") + endif() + +endif () + + +# BLAS in SCSL library? (SGI/Cray Scientific Library) +if (BLA_VENDOR STREQUAL "SCSL" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" "scsl" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for SCSL BLAS: found") + else() + message(STATUS "Looking for SCSL BLAS: not found") + endif() endif() + endif() - # BLAS in SGIMATH library? - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "SunPerf") + endif() + +endif () + + +# BLAS in SGIMATH library? +if (BLA_VENDOR STREQUAL "SGIMATH" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" "complib.sgimath" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for SGIMATH BLAS: found") + else() + message(STATUS "Looking for SGIMATH BLAS: not found") + endif() endif() + endif() - # BLAS in IBM ESSL library? 
(requires generic BLAS lib, too) - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "SGIMATH") + endif() + +endif () + + +# BLAS in IBM ESSL library (requires generic BLAS lib, too) +if (BLA_VENDOR STREQUAL "IBMESSL" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" - "essl;blas" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "essl;xlfmath;xlf90_r;blas" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for IBM ESSL BLAS: found") + else() + message(STATUS "Looking for IBM ESSL BLAS: not found") + endif() endif() + endif() - #BLAS in intel mkl 10 library? (em64t 64bit) - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "IBM ESSL") + endif() + +endif () + +# BLAS in IBM ESSL_MT library (requires generic BLAS lib, too) +if (BLA_VENDOR STREQUAL "IBMESSLMT" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" - "mkl_intel_lp64;mkl_intel_thread;mkl_core;guide;pthread" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" - ) - endif() - - ### windows version of intel mkl 10? - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS - BLAS_LIBRARIES - BLAS - SGEMM + "esslsmp;xlsmp;xlfmath;xlf90_r;blas" "" - "mkl_c_dll;mkl_intel_thread_dll;mkl_core_dll;libguide40" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for IBM ESSL MT BLAS: found") + else() + message(STATUS "Looking for IBM ESSL MT BLAS: not found") + endif() endif() + endif() - #older versions of intel mkl libs + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "IBM ESSL MT") + endif() - # BLAS in intel mkl library? (shared) - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS +endif () + + +#BLAS in acml library? 
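The ACML branch that follows locates an installed ACML tree by globbing for its EULA file, then composes a compiler- and word-size-specific library directory. To make the composition concrete, here is a hedged trace under assumed conditions; the install root /opt/acml5.3.1, a gfortran toolchain, and a 32-bit default integer are all assumptions:

# Assumed: file(GLOB "/opt/acml*/ACML-EULA.txt") matched /opt/acml5.3.1/ACML-EULA.txt
#   _ACML_ROOT        -> /opt/acml5.3.1   (after get_filename_component(... PATH))
#   _ACML_COMPILER64  -> gfortran64       (default compiler branch)
#   _ACML_PATH_SUFFIX -> ""               (SIZEOF_INTEGER is not 8)
# With BLA_VENDOR=ACML_MP the probed directory is therefore
#   ${_ACML_ROOT}/${_ACML_COMPILER64}_mp${_ACML_PATH_SUFFIX}/lib
#   = /opt/acml5.3.1/gfortran64_mp/lib
set(BLA_VENDOR "ACML_MP")
find_package(BLAS)   # re-enters this module with the ACML_MP branch active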
+if (BLA_VENDOR MATCHES "ACML.*" OR BLA_VENDOR STREQUAL "All") + + if( ((BLA_VENDOR STREQUAL "ACML") AND (NOT BLAS_ACML_LIB_DIRS)) OR + ((BLA_VENDOR STREQUAL "ACML_MP") AND (NOT BLAS_ACML_MP_LIB_DIRS)) OR + ((BLA_VENDOR STREQUAL "ACML_GPU") AND (NOT BLAS_ACML_GPU_LIB_DIRS))) + + # try to find acml in "standard" paths + if( WIN32 ) + file( GLOB _ACML_ROOT "C:/AMD/acml*/ACML-EULA.txt" ) + else() + file( GLOB _ACML_ROOT "/opt/acml*/ACML-EULA.txt" ) + endif() + if( WIN32 ) + file( GLOB _ACML_GPU_ROOT "C:/AMD/acml*/GPGPUexamples" ) + else() + file( GLOB _ACML_GPU_ROOT "/opt/acml*/GPGPUexamples" ) + endif() + list(GET _ACML_ROOT 0 _ACML_ROOT) + list(GET _ACML_GPU_ROOT 0 _ACML_GPU_ROOT) + + if( _ACML_ROOT ) + + get_filename_component( _ACML_ROOT ${_ACML_ROOT} PATH ) + if( SIZEOF_INTEGER EQUAL 8 ) + set( _ACML_PATH_SUFFIX "_int64" ) + else() + set( _ACML_PATH_SUFFIX "" ) + endif() + if( CMAKE_Fortran_COMPILER_ID STREQUAL "Intel" ) + set( _ACML_COMPILER32 "ifort32" ) + set( _ACML_COMPILER64 "ifort64" ) + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "SunPro" ) + set( _ACML_COMPILER32 "sun32" ) + set( _ACML_COMPILER64 "sun64" ) + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "PGI" ) + set( _ACML_COMPILER32 "pgi32" ) + if( WIN32 ) + set( _ACML_COMPILER64 "win64" ) + else() + set( _ACML_COMPILER64 "pgi64" ) + endif() + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "Open64" ) + # 32 bit builds not supported on Open64 but for code simplicity + # We'll just use the same directory twice + set( _ACML_COMPILER32 "open64_64" ) + set( _ACML_COMPILER64 "open64_64" ) + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "NAG" ) + set( _ACML_COMPILER32 "nag32" ) + set( _ACML_COMPILER64 "nag64" ) + else() + set( _ACML_COMPILER32 "gfortran32" ) + set( _ACML_COMPILER64 "gfortran64" ) + endif() + + if( BLA_VENDOR STREQUAL "ACML_MP" ) + set(_ACML_MP_LIB_DIRS + "${_ACML_ROOT}/${_ACML_COMPILER32}_mp${_ACML_PATH_SUFFIX}/lib" + "${_ACML_ROOT}/${_ACML_COMPILER64}_mp${_ACML_PATH_SUFFIX}/lib" ) + else() + set(_ACML_LIB_DIRS + "${_ACML_ROOT}/${_ACML_COMPILER32}${_ACML_PATH_SUFFIX}/lib" + "${_ACML_ROOT}/${_ACML_COMPILER64}${_ACML_PATH_SUFFIX}/lib" ) + endif() + + endif(_ACML_ROOT) + + elseif(BLAS_${BLA_VENDOR}_LIB_DIRS) + + set(_${BLA_VENDOR}_LIB_DIRS ${BLAS_${BLA_VENDOR}_LIB_DIRS}) + + endif() + + if( BLA_VENDOR STREQUAL "ACML_MP" ) + foreach( BLAS_ACML_MP_LIB_DIRS ${_ACML_MP_LIB_DIRS}) + check_fortran_libraries ( + BLAS_LIBRARIES + BLAS + sgemm + "" "acml_mp;acml_mv" "" ${BLAS_ACML_MP_LIB_DIRS} + ) + if( BLAS_LIBRARIES ) + break() + endif() + endforeach() + elseif( BLA_VENDOR STREQUAL "ACML_GPU" ) + foreach( BLAS_ACML_GPU_LIB_DIRS ${_ACML_GPU_LIB_DIRS}) + check_fortran_libraries ( + BLAS_LIBRARIES + BLAS + sgemm + "" "acml;acml_mv;CALBLAS" "" ${BLAS_ACML_GPU_LIB_DIRS} + ) + if( BLAS_LIBRARIES ) + break() + endif() + endforeach() + else() + foreach( BLAS_ACML_LIB_DIRS ${_ACML_LIB_DIRS} ) + check_fortran_libraries ( + BLAS_LIBRARIES + BLAS + sgemm + "" "acml;acml_mv" "" ${BLAS_ACML_LIB_DIRS} + ) + if( BLAS_LIBRARIES ) + break() + endif() + endforeach() + endif() + + # Either acml or acml_mp should be in LD_LIBRARY_PATH but not both + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" - "mkl;guide;pthread" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "acml;acml_mv" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for ACML BLAS: found") + else() + message(STATUS "Looking for ACML BLAS: not found") + endif() endif() + endif() - #BLAS in intel mkl library? 
(static, 32bit) - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" - "mkl_ia32;guide;pthread" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "acml_mp;acml_mv" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for ACML BLAS: found") + else() + message(STATUS "Looking for ACML BLAS: not found") + endif() endif() + endif() - #BLAS in intel mkl library? (static, em64t 64bit) - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS sgemm "" - "mkl_em64t;guide;pthread" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" - ) - endif() - - #BLAS in acml library? - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS - BLAS_LIBRARIES - BLAS - sgemm + "acml;acml_mv;CALBLAS" "" - "acml" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for ACML BLAS: found") + else() + message(STATUS "Looking for ACML BLAS: not found") + endif() endif() + endif() - # Apple BLAS library? - if(NOT BLAS_LIBRARIES) - check_fortran_libraries( - BLAS_DEFINITIONS + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "ACML") + endif() + +endif (BLA_VENDOR MATCHES "ACML.*" OR BLA_VENDOR STREQUAL "All") # ACML + + +# Apple BLAS library? +if (BLA_VENDOR STREQUAL "Apple" OR BLA_VENDOR STREQUAL "All") + + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( BLAS_LIBRARIES BLAS - sgemm + dgemm "" "Accelerate" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" + "" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for Apple BLAS: found") + else() + message(STATUS "Looking for Apple BLAS: not found") + endif() endif() + endif() - if ( NOT BLAS_LIBRARIES ) - check_fortran_libraries( - BLAS_DEFINITIONS + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Apple Accelerate") + endif() + +endif (BLA_VENDOR STREQUAL "Apple" OR BLA_VENDOR STREQUAL "All") + + +if (BLA_VENDOR STREQUAL "NAS" OR BLA_VENDOR STREQUAL "All") + + if ( NOT BLAS_LIBRARIES ) + check_fortran_libraries( BLAS_LIBRARIES BLAS - sgemm + dgemm "" "vecLib" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" - ) - endif ( NOT BLAS_LIBRARIES ) - - # Generic BLAS library? - # This configuration *must* be the last try as this library is notably slow. - if ( NOT BLAS_LIBRARIES ) - check_fortran_libraries( - BLAS_DEFINITIONS - BLAS_LIBRARIES - BLAS - sgemm "" - "blas" - "${CGAL_TAUCS_LIBRARIES_DIR} ENV BLAS_LIB_DIR" ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for NAS BLAS: found") + else() + message(STATUS "Looking for NAS BLAS: not found") + endif() endif() + endif () - if(BLAS_LIBRARIES_DIR OR BLAS_LIBRARIES) + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "NAS") + endif() + +endif (BLA_VENDOR STREQUAL "NAS" OR BLA_VENDOR STREQUAL "All") + + +# Generic BLAS library? 
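The removed comment above made the rationale explicit: the generic reference BLAS must stay the last try because it is notably slow, and the Generic branch below preserves that ordering. A parent project can still force it deliberately; a minimal sketch, where the myapp target and main.c source are hypothetical:

set(BLA_VENDOR "Generic")      # bypass all tuned vendor branches
find_package(BLAS REQUIRED)
add_executable(myapp main.c)   # main.c: placeholder source
target_link_libraries(myapp PRIVATE ${BLAS_LIBRARIES})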
+if (BLA_VENDOR STREQUAL "Generic" OR BLA_VENDOR STREQUAL "All") + + set(BLAS_SEARCH_LIBS "blas;blas_LINUX;blas_MAC;blas_WINDOWS;refblas") + foreach (SEARCH_LIB ${BLAS_SEARCH_LIBS}) + if (BLAS_LIBRARIES) + else () + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "${SEARCH_LIB}" + "${LGFORTRAN}" + ) + if(NOT BLAS_FIND_QUIETLY) + if(BLAS_LIBRARIES) + message(STATUS "Looking for Generic BLAS: found") + else() + message(STATUS "Looking for Generic BLAS: not found") + endif() + endif() + endif() + endforeach () + + if (BLAS_LIBRARIES AND NOT BLAS_VENDOR_FOUND) + set (BLAS_VENDOR_FOUND "Netlib or other Generic libblas") + endif() + +endif (BLA_VENDOR STREQUAL "Generic" OR BLA_VENDOR STREQUAL "All") + + +if(BLA_F95) + + if(BLAS95_LIBRARIES) + set(BLAS95_FOUND TRUE) + else() + set(BLAS95_FOUND FALSE) + endif() + + if(NOT BLAS_FIND_QUIETLY) + if(BLAS95_FOUND) + message(STATUS "A library with BLAS95 API found.") + message(STATUS "BLAS_LIBRARIES ${BLAS_LIBRARIES}") + else(BLAS95_FOUND) + message(WARNING "BLA_VENDOR has been set to ${BLA_VENDOR} but blas 95 libraries could not be found or check of symbols failed." + "\nPlease indicate where to find blas libraries. You have three options:\n" + "- Option 1: Provide the installation directory of BLAS library with cmake option: -DBLAS_DIR=your/path/to/blas\n" + "- Option 2: Provide the directory where to find BLAS libraries with cmake option: -DBLAS_LIBDIR=your/path/to/blas/libs\n" + "- Option 3: Update your environment variable (Linux: LD_LIBRARY_PATH, Windows: LIB, Mac: DYLD_LIBRARY_PATH)\n" + "\nTo follow libraries detection more precisely you can activate a verbose mode with -DBLAS_VERBOSE=ON at cmake configure." + "\nYou could also specify a BLAS vendor to look for by setting -DBLA_VENDOR=blas_vendor_name." + "\nList of possible BLAS vendor: Goto, ATLAS PhiPACK, CXML, DXML, SunPerf, SCSL, SGIMATH, IBMESSL, Intel10_32 (intel mkl v10 32 bit)," + "Intel10_64lp (intel mkl v10 64 bit, lp thread model, lp64 model), Intel10_64lp_seq (intel mkl v10 64 bit, sequential code, lp64 model)," + "Intel( older versions of mkl 32 and 64 bit), ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic") + if(BLAS_FIND_REQUIRED) + message(FATAL_ERROR + "A required library with BLAS95 API not found. Please specify library location.") + else() + message(STATUS + "A library with BLAS95 API not found. Please specify library location.") + endif() + endif(BLAS95_FOUND) + endif(NOT BLAS_FIND_QUIETLY) + + set(BLAS_FOUND TRUE) + set(BLAS_LIBRARIES "${BLAS95_LIBRARIES}") + +else(BLA_F95) + + if(BLAS_LIBRARIES) set(BLAS_FOUND TRUE) else() set(BLAS_FOUND FALSE) @@ -388,32 +1366,41 @@ else() if(NOT BLAS_FIND_QUIETLY) if(BLAS_FOUND) message(STATUS "A library with BLAS API found.") + message(STATUS "BLAS_LIBRARIES ${BLAS_LIBRARIES}") else(BLAS_FOUND) + message(WARNING "BLA_VENDOR has been set to ${BLA_VENDOR} but blas libraries could not be found or check of symbols failed." + "\nPlease indicate where to find blas libraries. You have three options:\n" + "- Option 1: Provide the installation directory of BLAS library with cmake option: -DBLAS_DIR=your/path/to/blas\n" + "- Option 2: Provide the directory where to find BLAS libraries with cmake option: -DBLAS_LIBDIR=your/path/to/blas/libs\n" + "- Option 3: Update your environment variable (Linux: LD_LIBRARY_PATH, Windows: LIB, Mac: DYLD_LIBRARY_PATH)\n" + "\nTo follow libraries detection more precisely you can activate a verbose mode with -DBLAS_VERBOSE=ON at cmake configure." 
+ "\nYou could also specify a BLAS vendor to look for by setting -DBLA_VENDOR=blas_vendor_name." + "\nList of possible BLAS vendor: Goto, ATLAS PhiPACK, CXML, DXML, SunPerf, SCSL, SGIMATH, IBMESSL, Intel10_32 (intel mkl v10 32 bit)," + "Intel10_64lp (intel mkl v10 64 bit, lp thread model, lp64 model), Intel10_64lp_seq (intel mkl v10 64 bit, sequential code, lp64 model)," + "Intel( older versions of mkl 32 and 64 bit), ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic") if(BLAS_FIND_REQUIRED) - message(FATAL_ERROR "A required library with BLAS API not found. Please specify library location.") + message(FATAL_ERROR + "A required library with BLAS API not found. Please specify library location.") else() - message(STATUS "A library with BLAS API not found. Please specify library location.") + message(STATUS + "A library with BLAS API not found. Please specify library location.") endif() endif(BLAS_FOUND) endif(NOT BLAS_FIND_QUIETLY) - # Add variables to cache - set( BLAS_INCLUDE_DIR "${BLAS_INCLUDE_DIR}" - CACHE PATH "Directories containing the BLAS header files" FORCE ) - set( BLAS_DEFINITIONS "${BLAS_DEFINITIONS}" - CACHE STRING "Compilation options to use BLAS" FORCE ) - set( BLAS_LINKER_FLAGS "${BLAS_LINKER_FLAGS}" - CACHE STRING "Linker flags to use BLAS" FORCE ) - set( BLAS_LIBRARIES "${BLAS_LIBRARIES}" - CACHE FILEPATH "BLAS libraries name" FORCE ) - set( BLAS_LIBRARIES_DIR "${BLAS_LIBRARIES_DIR}" - CACHE PATH "Directories containing the BLAS libraries" FORCE ) +endif(BLA_F95) - #message("DEBUG: BLAS_INCLUDE_DIR = ${BLAS_INCLUDE_DIR}") - #message("DEBUG: BLAS_DEFINITIONS = ${BLAS_DEFINITIONS}") - #message("DEBUG: BLAS_LINKER_FLAGS = ${BLAS_LINKER_FLAGS}") - #message("DEBUG: BLAS_LIBRARIES = ${BLAS_LIBRARIES}") - #message("DEBUG: BLAS_LIBRARIES_DIR = ${BLAS_LIBRARIES_DIR}") - #message("DEBUG: BLAS_FOUND = ${BLAS_FOUND}") +set(CMAKE_FIND_LIBRARY_SUFFIXES ${_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}) -endif(BLAS_LIBRARIES_DIR OR BLAS_LIBRARIES) +if (BLAS_FOUND) + list(GET BLAS_LIBRARIES 0 first_lib) + get_filename_component(first_lib_path "${first_lib}" PATH) + if (${first_lib_path} MATCHES "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)") + string(REGEX REPLACE "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)" "" not_cached_dir "${first_lib_path}") + set(BLAS_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of BLAS library" FORCE) + else() + set(BLAS_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of BLAS library" FORCE) + endif() +endif() +mark_as_advanced(BLAS_DIR) +mark_as_advanced(BLAS_DIR_FOUND) diff --git a/cmake/FindBLASEXT.cmake b/cmake/FindBLASEXT.cmake new file mode 100644 index 000000000..0fe7fb849 --- /dev/null +++ b/cmake/FindBLASEXT.cmake @@ -0,0 +1,380 @@ +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2016 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# - Find BLAS EXTENDED for MORSE projects: find include dirs and libraries +# +# This module allows to find BLAS libraries by calling the official FindBLAS module +# and handles the creation of different library lists whether the user wishes to link +# with a sequential BLAS or a multihreaded (BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES). 
+# BLAS is detected with a FindBLAS call; if the BLAS vendor is Intel10_64lp, ACML +# or IBMESSLMT, the module then attempts to find the corresponding multithreaded libraries. +# +# The following variables have been added to manage links with sequential or multithreaded +# versions: +# BLAS_INCLUDE_DIRS - BLAS include directories +# BLAS_LIBRARY_DIRS - Link directories for BLAS libraries +# BLAS_SEQ_LIBRARIES - BLAS component libraries to be linked (sequential) +# BLAS_PAR_LIBRARIES - BLAS component libraries to be linked (multithreaded) + +#============================================================================= +# Copyright 2012-2013 Inria +# Copyright 2012-2013 Emmanuel Agullo +# Copyright 2012-2013 Mathieu Faverge +# Copyright 2012 Cedric Castagnede +# Copyright 2013-2016 Florent Pruvost +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file MORSE-Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of Morse, substitute the full +# License text for the above reference.) + +# macro to factor out this find_package(BLAS) call +macro(find_package_blas) + if(BLASEXT_FIND_REQUIRED) + if(BLASEXT_FIND_QUIETLY) + find_package(BLAS REQUIRED QUIET) + else() + find_package(BLAS REQUIRED) + endif() + else() + if(BLASEXT_FIND_QUIETLY) + find_package(BLAS QUIET) + else() + find_package(BLAS) + endif() + endif() +endmacro() + +# add a cache variable to let the user specify the BLAS vendor +set(BLA_VENDOR "" CACHE STRING "list of possible BLAS vendors: + Open, Eigen, Goto, ATLAS, PhiPACK, CXML, DXML, SunPerf, SCSL, SGIMATH, IBMESSL, IBMESSLMT, + Intel10_32 (intel mkl v10 32 bit), + Intel10_64lp (intel mkl v10 64 bit, lp thread model, lp64 model), + Intel10_64lp_seq (intel mkl v10 64 bit, sequential code, lp64 model), + Intel (older versions of mkl 32 and 64 bit), + ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic") + +if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "In FindBLASEXT") + message(STATUS "If you want to force the use of one specific library, " + "\n please specify the BLAS vendor by setting -DBLA_VENDOR=blas_vendor_name" + "\n at cmake configure.") + message(STATUS "List of possible BLAS vendors: Goto, ATLAS, PhiPACK, CXML, " + "\n DXML, SunPerf, SCSL, SGIMATH, IBMESSL, IBMESSLMT, Intel10_32 (intel mkl v10 32 bit)," + "\n Intel10_64lp (intel mkl v10 64 bit, lp thread model, lp64 model)," + "\n Intel10_64lp_seq (intel mkl v10 64 bit, sequential code, lp64 model)," + "\n Intel (older versions of mkl 32 and 64 bit)," + "\n ACML, ACML_MP, ACML_GPU, Apple, NAS, Generic") +endif() + +if (NOT BLAS_FOUND) + # First try to detect two cases: + # 1: only SEQ libs are handled + # 2: both SEQ and PAR libs are handled + find_package_blas() +endif () + +# detect the cases where SEQ and PAR libs are handled +if(BLA_VENDOR STREQUAL "All" AND + (BLAS_mkl_core_LIBRARY OR BLAS_mkl_core_dll_LIBRARY) + ) + set(BLA_VENDOR "Intel") + if(BLAS_mkl_intel_LIBRARY) + set(BLA_VENDOR "Intel10_32") + endif() + if(BLAS_mkl_intel_lp64_LIBRARY) + set(BLA_VENDOR "Intel10_64lp") + endif() + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "A BLAS library has been found (${BLAS_LIBRARIES}) but we" + "\n have also potentially detected some multithreaded BLAS libraries from the MKL." 
+ "\n We try to find both libraries lists (Sequential/Multithreaded).") + endif() + set(BLAS_FOUND "") +elseif(BLA_VENDOR STREQUAL "All" AND BLAS_acml_LIBRARY) + set(BLA_VENDOR "ACML") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "A BLAS library has been found (${BLAS_LIBRARIES}) but we" + "\n have also potentially detected some multithreaded BLAS libraries from the ACML." + "\n We try to find both libraries lists (Sequential/Multithreaded).") + endif() + set(BLAS_FOUND "") +elseif(BLA_VENDOR STREQUAL "All" AND BLAS_essl_LIBRARY) + set(BLA_VENDOR "IBMESSL") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "A BLAS library has been found (${BLAS_LIBRARIES}) but we" + "\n have also potentially detected some multithreaded BLAS libraries from the ESSL." + "\n We try to find both libraries lists (Sequential/Multithreaded).") + endif() + set(BLAS_FOUND "") +endif() + +# Intel case +if(BLA_VENDOR MATCHES "Intel*") + + ### + # look for include path if the BLAS vendor is Intel + ### + + # gather system include paths + unset(_inc_env) + if(WIN32) + string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() + list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") + list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") + set(ENV_MKLROOT "$ENV{MKLROOT}") + if (ENV_MKLROOT) + list(APPEND _inc_env "${ENV_MKLROOT}/include") + endif() + list(REMOVE_DUPLICATES _inc_env) + + # find mkl.h inside known include paths + set(BLAS_mkl.h_INCLUDE_DIRS "BLAS_mkl.h_INCLUDE_DIRS-NOTFOUND") + if(BLAS_INCDIR) + set(BLAS_mkl.h_INCLUDE_DIRS "BLAS_mkl.h_INCLUDE_DIRS-NOTFOUND") + find_path(BLAS_mkl.h_INCLUDE_DIRS + NAMES mkl.h + HINTS ${BLAS_INCDIR}) + else() + if(BLAS_DIR) + set(BLAS_mkl.h_INCLUDE_DIRS "BLAS_mkl.h_INCLUDE_DIRS-NOTFOUND") + find_path(BLAS_mkl.h_INCLUDE_DIRS + NAMES mkl.h + HINTS ${BLAS_DIR} + PATH_SUFFIXES include) + else() + set(BLAS_mkl.h_INCLUDE_DIRS "BLAS_mkl.h_INCLUDE_DIRS-NOTFOUND") + find_path(BLAS_mkl.h_INCLUDE_DIRS + NAMES mkl.h + HINTS ${_inc_env}) + endif() + endif() + mark_as_advanced(BLAS_mkl.h_INCLUDE_DIRS) + ## Print status if not found + ## ------------------------- + #if (NOT BLAS_mkl.h_INCLUDE_DIRS AND MORSE_VERBOSE) + # Print_Find_Header_Status(blas mkl.h) + #endif () + set(BLAS_INCLUDE_DIRS "") + if(BLAS_mkl.h_INCLUDE_DIRS) + list(APPEND BLAS_INCLUDE_DIRS "${BLAS_mkl.h_INCLUDE_DIRS}" ) + endif() + + ### + # look for libs + ### + # if Intel 10 64 bit -> look for sequential and multithreaded versions + if(BLA_VENDOR MATCHES "Intel10_64lp*") + + ## look for the sequential version + set(BLA_VENDOR "Intel10_64lp_seq") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "Look for the sequential version Intel10_64lp_seq") + endif() + find_package_blas() + if(BLAS_FOUND) + set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}") + endif() + + ## look for the multithreaded version + set(BLA_VENDOR "Intel10_64lp") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "Look for the multithreaded version Intel10_64lp") + endif() + find_package_blas() + if(BLAS_FOUND) + set(BLAS_PAR_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_PAR_LIBRARIES 
"${BLAS_PAR_LIBRARIES-NOTFOUND}") + endif() + + else() + + if(BLAS_FOUND) + set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}") + endif() + + endif() + + # ACML case +elseif(BLA_VENDOR MATCHES "ACML*") + + ## look for the sequential version + set(BLA_VENDOR "ACML") + find_package_blas() + if(BLAS_FOUND) + set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}") + endif() + + ## look for the multithreaded version + set(BLA_VENDOR "ACML_MP") + find_package_blas() + if(BLAS_FOUND) + set(BLAS_PAR_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_PAR_LIBRARIES "${BLAS_PAR_LIBRARIES-NOTFOUND}") + endif() + + # IBMESSL case +elseif(BLA_VENDOR MATCHES "IBMESSL*") + + ## look for the sequential version + set(BLA_VENDOR "IBMESSL") + find_package_blas() + if(BLAS_FOUND) + set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}") + endif() + + ## look for the multithreaded version + set(BLA_VENDOR "IBMESSLMT") + find_package_blas() + if(BLAS_FOUND) + set(BLAS_PAR_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_PAR_LIBRARIES "${BLAS_PAR_LIBRARIES-NOTFOUND}") + endif() + +else() + + if(BLAS_FOUND) + # define the SEQ libs as the BLAS_LIBRARIES + set(BLAS_SEQ_LIBRARIES "${BLAS_LIBRARIES}") + else() + set(BLAS_SEQ_LIBRARIES "${BLAS_SEQ_LIBRARIES-NOTFOUND}") + endif() + set(BLAS_PAR_LIBRARIES "${BLAS_PAR_LIBRARIES-NOTFOUND}") + +endif() + + +if(BLAS_SEQ_LIBRARIES) + set(BLAS_LIBRARIES "${BLAS_SEQ_LIBRARIES}") +endif() + +# extract libs paths +# remark: because it is not given by find_package(BLAS) +set(BLAS_LIBRARY_DIRS "") +string(REPLACE " " ";" BLAS_LIBRARIES "${BLAS_LIBRARIES}") +foreach(blas_lib ${BLAS_LIBRARIES}) + if (EXISTS "${blas_lib}") + get_filename_component(a_blas_lib_dir "${blas_lib}" PATH) + list(APPEND BLAS_LIBRARY_DIRS "${a_blas_lib_dir}" ) + else() + string(REPLACE "-L" "" blas_lib "${blas_lib}") + if (EXISTS "${blas_lib}") + list(APPEND BLAS_LIBRARY_DIRS "${blas_lib}" ) + else() + get_filename_component(a_blas_lib_dir "${blas_lib}" PATH) + if (EXISTS "${a_blas_lib_dir}") + list(APPEND BLAS_LIBRARY_DIRS "${a_blas_lib_dir}" ) + endif() + endif() + endif() +endforeach() +if (BLAS_LIBRARY_DIRS) + list(REMOVE_DUPLICATES BLAS_LIBRARY_DIRS) +endif () + +# check that BLAS has been found +# --------------------------------- +include(FindPackageHandleStandardArgs) +if(BLA_VENDOR MATCHES "Intel*") + if(BLA_VENDOR MATCHES "Intel10_64lp*") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS found is Intel MKL:" + "\n we manage two lists of libs, one sequential and one parallel if found" + "\n (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)") + message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") + endif() + find_package_handle_standard_args(BLAS DEFAULT_MSG + BLAS_SEQ_LIBRARIES + BLAS_LIBRARY_DIRS + BLAS_INCLUDE_DIRS) + if(BLAS_PAR_LIBRARIES) + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES") + endif() + find_package_handle_standard_args(BLAS DEFAULT_MSG + BLAS_PAR_LIBRARIES) + endif() + else() + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") + endif() + find_package_handle_standard_args(BLAS DEFAULT_MSG + BLAS_SEQ_LIBRARIES + BLAS_LIBRARY_DIRS + BLAS_INCLUDE_DIRS) + endif() +elseif(BLA_VENDOR MATCHES "ACML*") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS found is ACML:" + "\n we manage two 
lists of libs, one sequential and one parallel if found" + "\n (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)") + message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") + endif() + find_package_handle_standard_args(BLAS DEFAULT_MSG + BLAS_SEQ_LIBRARIES + BLAS_LIBRARY_DIRS) + if(BLAS_PAR_LIBRARIES) + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES") + endif() + find_package_handle_standard_args(BLAS DEFAULT_MSG + BLAS_PAR_LIBRARIES) + endif() +elseif(BLA_VENDOR MATCHES "IBMESSL*") + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS found is ESSL:" + "\n we manage two lists of libs, one sequential and one parallel if found" + "\n (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)") + message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") + endif() + find_package_handle_standard_args(BLAS DEFAULT_MSG + BLAS_SEQ_LIBRARIES + BLAS_LIBRARY_DIRS) + if(BLAS_PAR_LIBRARIES) + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES") + endif() + find_package_handle_standard_args(BLAS DEFAULT_MSG + BLAS_PAR_LIBRARIES) + endif() +else() + if(NOT BLASEXT_FIND_QUIETLY) + message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") + endif() + find_package_handle_standard_args(BLAS DEFAULT_MSG + BLAS_SEQ_LIBRARIES + BLAS_LIBRARY_DIRS) +endif() diff --git a/cmake/FindComputeCpp.cmake b/cmake/FindComputeCpp.cmake index 27e5c9b1f..29f2a5007 100644 --- a/cmake/FindComputeCpp.cmake +++ b/cmake/FindComputeCpp.cmake @@ -38,11 +38,6 @@ if(CMAKE_COMPILER_IS_GNUCXX) if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8) message(FATAL_ERROR "host compiler - Not found! (gcc version must be at least 4.8)") - # Require the GCC dual ABI to be disabled for 5.1 or higher - elseif (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.1) - set(COMPUTECPP_DISABLE_GCC_DUAL_ABI "True") - message(STATUS - "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION} (note pre 5.1 gcc ABI enabled)") else() message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}") endif() @@ -64,6 +59,12 @@ option(COMPUTECPP_64_BIT_CODE "Compile device code in 64 bit mode" ${COMPUTECPP_64_BIT_DEFAULT}) mark_as_advanced(COMPUTECPP_64_BIT_CODE) +option(COMPUTECPP_DISABLE_GCC_DUAL_ABI "Compile with pre-5.1 ABI" OFF) +mark_as_advanced(COMPUTECPP_DISABLE_GCC_DUAL_ABI) + +set(COMPUTECPP_USER_FLAGS "" CACHE STRING "User flags for compute++") +mark_as_advanced(COMPUTECPP_USER_FLAGS) + # Find OpenCL package find_package(OpenCL REQUIRED) @@ -74,7 +75,6 @@ if(NOT COMPUTECPP_PACKAGE_ROOT_DIR) else() message(STATUS "ComputeCpp package - Found") endif() -option(COMPUTECPP_PACKAGE_ROOT_DIR "Path to the ComputeCpp Package") # Obtain the path to compute++ find_program(COMPUTECPP_DEVICE_COMPILER compute++ PATHS @@ -138,8 +138,6 @@ else() message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}") endif() -set(COMPUTECPP_DEVICE_COMPILER_FLAGS ${COMPUTECPP_DEVICE_COMPILER_FLAGS} -sycl-compress-name -Wall -no-serial-memop -DEIGEN_NO_ASSERTION_CHECKING=1) - # Check if the platform is supported execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported" OUTPUT_VARIABLE COMPUTECPP_PLATFORM_IS_SUPPORTED @@ -155,6 +153,13 @@ else() endif() endif() +set(COMPUTECPP_USER_FLAGS + -sycl-compress-name + -Wall + -no-serial-memop + -DEIGEN_NO_ASSERTION_CHECKING=1 + ) + #################### # __build_sycl #################### @@ -165,8 +170,11 @@ endif() # targetName : Name of the target. # sourceFile : Source file to be compiled. 
# binaryDir : Intermediate directory to output the integration header. +# fileCounter : Counter included in name of custom target. Different counter +# values prevent duplicated names of custom target when source files with the same name, +# but located in different directories, are used for the same target. # -function(__build_spir targetName sourceFile binaryDir) +function(__build_spir targetName sourceFile binaryDir fileCounter) # Retrieve source file name. get_filename_component(sourceFileName ${sourceFile} NAME) @@ -175,12 +183,16 @@ function(__build_spir targetName sourceFile binaryDir) set(outputSyclFile ${binaryDir}/${sourceFileName}.sycl) # Add any user-defined include to the device compiler + set(device_compiler_includes "") get_property(includeDirectories DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES) - set(device_compiler_includes "") foreach(directory ${includeDirectories}) set(device_compiler_includes "-I${directory}" ${device_compiler_includes}) endforeach() + get_target_property(targetIncludeDirectories ${targetName} INCLUDE_DIRECTORIES) + foreach(directory ${targetIncludeDirectories}) + set(device_compiler_includes "-I${directory}" ${device_compiler_includes}) + endforeach() if (CMAKE_INCLUDE_PATH) foreach(directory ${CMAKE_INCLUDE_PATH}) set(device_compiler_includes "-I${directory}" @@ -188,6 +200,9 @@ function(__build_spir targetName sourceFile binaryDir) endforeach() endif() + set(COMPUTECPP_DEVICE_COMPILER_FLAGS + ${COMPUTECPP_DEVICE_COMPILER_FLAGS} + ${COMPUTECPP_USER_FLAGS}) # Convert argument list format separate_arguments(COMPUTECPP_DEVICE_COMPILER_FLAGS) @@ -201,9 +216,10 @@ function(__build_spir targetName sourceFile binaryDir) ${device_compiler_includes} -o ${outputSyclFile} -c ${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile} - DEPENDS ${sourceFile} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile} + IMPLICIT_DEPENDS CXX "${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile}" WORKING_DIRECTORY ${binaryDir} - COMMENT "Building ComputeCpp integration header file ${outputSyclFile}") + COMMENT "Building ComputeCpp integration header file ${outputSyclFile}") # Add a custom target for the generated integration header add_custom_target(${targetName}_integration_header DEPENDS ${outputSyclFile}) @@ -227,16 +243,21 @@ endfunction() ####################### # # Adds a SYCL compilation custom command associated with an existing -# target and sets a dependancy on that new command. +# target and sets a dependency on that new command. # # targetName : Name of the target to add a SYCL to. -# sourceFile : Source file to be compiled for SYCL. # binaryDir : Intermediate directory to output the integration header. +# sourceFiles : Source files to be compiled for SYCL. 
# -function(add_sycl_to_target targetName sourceFile binaryDir) +function(add_sycl_to_target targetName binaryDir sourceFiles) + set(sourceFiles ${sourceFiles} ${ARGN}) + set(fileCounter 0) # Add custom target to run compute++ and generate the integration header - __build_spir(${targetName} ${sourceFile} ${binaryDir}) + foreach(sourceFile ${sourceFiles}) + __build_spir(${targetName} ${sourceFile} ${binaryDir} ${fileCounter}) + MATH(EXPR fileCounter "${fileCounter} + 1") + endforeach() # Link with the ComputeCpp runtime library target_link_libraries(${targetName} PUBLIC ${COMPUTECPP_RUNTIME_LIBRARY} diff --git a/cmake/FindEigen3.cmake b/cmake/FindEigen3.cmake index 9e9697860..4d9bc1a52 100644 --- a/cmake/FindEigen3.cmake +++ b/cmake/FindEigen3.cmake @@ -10,8 +10,12 @@ # EIGEN3_INCLUDE_DIR - the eigen include directory # EIGEN3_VERSION - eigen version # +# and the following imported target: +# +# Eigen3::Eigen - The header-only Eigen library +# # This module reads hints about search locations from -# the following enviroment variables: +# the following environment variables: # # EIGEN3_ROOT # EIGEN3_ROOT_DIR @@ -64,6 +68,7 @@ if (EIGEN3_INCLUDE_DIR) # in cache already _eigen3_check_version() set(EIGEN3_FOUND ${EIGEN3_VERSION_OK}) + set(Eigen3_FOUND ${EIGEN3_VERSION_OK}) else (EIGEN3_INCLUDE_DIR) @@ -95,3 +100,8 @@ else (EIGEN3_INCLUDE_DIR) endif(EIGEN3_INCLUDE_DIR) +if(EIGEN3_FOUND AND NOT TARGET Eigen3::Eigen) + add_library(Eigen3::Eigen INTERFACE IMPORTED) + set_target_properties(Eigen3::Eigen PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${EIGEN3_INCLUDE_DIR}") +endif() diff --git a/cmake/FindHWLOC.cmake b/cmake/FindHWLOC.cmake new file mode 100644 index 000000000..a831b5c72 --- /dev/null +++ b/cmake/FindHWLOC.cmake @@ -0,0 +1,331 @@ +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2014 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# - Find HWLOC include dirs and libraries +# Use this module by invoking find_package with the form: +# find_package(HWLOC +# [REQUIRED]) # Fail with error if hwloc is not found +# +# This module finds headers and hwloc library. +# Results are reported in variables: +# HWLOC_FOUND - True if headers and requested libraries were found +# HWLOC_INCLUDE_DIRS - hwloc include directories +# HWLOC_LIBRARY_DIRS - Link directories for hwloc libraries +# HWLOC_LIBRARIES - hwloc component libraries to be linked +# +# The user can give specific paths where to find the libraries adding cmake +# options at configure (ex: cmake path/to/project -DHWLOC_DIR=path/to/hwloc): +# HWLOC_DIR - Where to find the base directory of hwloc +# HWLOC_INCDIR - Where to find the header files +# HWLOC_LIBDIR - Where to find the library files +# The module can also look for the following environment variables if paths +# are not given as cmake variable: HWLOC_DIR, HWLOC_INCDIR, HWLOC_LIBDIR + +#============================================================================= +# Copyright 2012-2013 Inria +# Copyright 2012-2013 Emmanuel Agullo +# Copyright 2012-2013 Mathieu Faverge +# Copyright 2012 Cedric Castagnede +# Copyright 2013 Florent Pruvost +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file MORSE-Copyright.txt for details. 
+# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of Morse, substitute the full +# License text for the above reference.) + +include(CheckStructHasMember) +include(CheckCSourceCompiles) + +if (NOT HWLOC_FOUND) + set(HWLOC_DIR "" CACHE PATH "Installation directory of HWLOC library") + if (NOT HWLOC_FIND_QUIETLY) + message(STATUS "A cache variable, namely HWLOC_DIR, has been set to specify the install directory of HWLOC") + endif() +endif() + +set(ENV_HWLOC_DIR "$ENV{HWLOC_DIR}") +set(ENV_HWLOC_INCDIR "$ENV{HWLOC_INCDIR}") +set(ENV_HWLOC_LIBDIR "$ENV{HWLOC_LIBDIR}") +set(HWLOC_GIVEN_BY_USER "FALSE") +if ( HWLOC_DIR OR ( HWLOC_INCDIR AND HWLOC_LIBDIR) OR ENV_HWLOC_DIR OR (ENV_HWLOC_INCDIR AND ENV_HWLOC_LIBDIR) ) + set(HWLOC_GIVEN_BY_USER "TRUE") +endif() + +# Optionally use pkg-config to detect include/library dirs (if pkg-config is available) +# ------------------------------------------------------------------------------------- +include(FindPkgConfig) +find_package(PkgConfig QUIET) +if( PKG_CONFIG_EXECUTABLE AND NOT HWLOC_GIVEN_BY_USER ) + + pkg_search_module(HWLOC hwloc) + if (NOT HWLOC_FIND_QUIETLY) + if (HWLOC_FOUND AND HWLOC_LIBRARIES) + message(STATUS "Looking for HWLOC - found using PkgConfig") + #if(NOT HWLOC_INCLUDE_DIRS) + # message("${Magenta}HWLOC_INCLUDE_DIRS is empty using PkgConfig." + # "Perhaps the path to hwloc headers is already present in your" + # "C(PLUS)_INCLUDE_PATH environment variable.${ColourReset}") + #endif() + else() + message(STATUS "${Magenta}Looking for HWLOC - not found using PkgConfig." 
+ "\n Perhaps you should add the directory containing hwloc.pc to" + "\n the PKG_CONFIG_PATH environment variable.${ColourReset}") + endif() + endif() + +endif( PKG_CONFIG_EXECUTABLE AND NOT HWLOC_GIVEN_BY_USER ) + +if( (NOT PKG_CONFIG_EXECUTABLE) OR (PKG_CONFIG_EXECUTABLE AND NOT HWLOC_FOUND) OR (HWLOC_GIVEN_BY_USER) ) + + if (NOT HWLOC_FIND_QUIETLY) + message(STATUS "Looking for HWLOC - PkgConfig not used") + endif() + + # Looking for include + # ------------------- + + # Add system include paths to search include + # ------------------------------------------ + unset(_inc_env) + if(ENV_HWLOC_INCDIR) + list(APPEND _inc_env "${ENV_HWLOC_INCDIR}") + elseif(ENV_HWLOC_DIR) + list(APPEND _inc_env "${ENV_HWLOC_DIR}") + list(APPEND _inc_env "${ENV_HWLOC_DIR}/include") + list(APPEND _inc_env "${ENV_HWLOC_DIR}/include/hwloc") + else() + if(WIN32) + string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() + endif() + list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") + list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") + list(REMOVE_DUPLICATES _inc_env) + + # set paths where to look for + set(PATH_TO_LOOK_FOR "${_inc_env}") + + # Try to find the hwloc header in the given paths + # ------------------------------------------------- + # call cmake macro to find the header path + if(HWLOC_INCDIR) + set(HWLOC_hwloc.h_DIRS "HWLOC_hwloc.h_DIRS-NOTFOUND") + find_path(HWLOC_hwloc.h_DIRS + NAMES hwloc.h + HINTS ${HWLOC_INCDIR}) + else() + if(HWLOC_DIR) + set(HWLOC_hwloc.h_DIRS "HWLOC_hwloc.h_DIRS-NOTFOUND") + find_path(HWLOC_hwloc.h_DIRS + NAMES hwloc.h + HINTS ${HWLOC_DIR} + PATH_SUFFIXES "include" "include/hwloc") + else() + set(HWLOC_hwloc.h_DIRS "HWLOC_hwloc.h_DIRS-NOTFOUND") + find_path(HWLOC_hwloc.h_DIRS + NAMES hwloc.h + HINTS ${PATH_TO_LOOK_FOR} + PATH_SUFFIXES "hwloc") + endif() + endif() + mark_as_advanced(HWLOC_hwloc.h_DIRS) + + # Add path to cmake variable + # ------------------------------------ + if (HWLOC_hwloc.h_DIRS) + set(HWLOC_INCLUDE_DIRS "${HWLOC_hwloc.h_DIRS}") + else () + set(HWLOC_INCLUDE_DIRS "HWLOC_INCLUDE_DIRS-NOTFOUND") + if(NOT HWLOC_FIND_QUIETLY) + message(STATUS "Looking for hwloc -- hwloc.h not found") + endif() + endif () + + if (HWLOC_INCLUDE_DIRS) + list(REMOVE_DUPLICATES HWLOC_INCLUDE_DIRS) + endif () + + + # Looking for lib + # --------------- + + # Add system library paths to search lib + # -------------------------------------- + unset(_lib_env) + if(ENV_HWLOC_LIBDIR) + list(APPEND _lib_env "${ENV_HWLOC_LIBDIR}") + elseif(ENV_HWLOC_DIR) + list(APPEND _lib_env "${ENV_HWLOC_DIR}") + list(APPEND _lib_env "${ENV_HWLOC_DIR}/lib") + else() + if(WIN32) + string(REPLACE ":" ";" _lib_env "$ENV{LIB}") + else() + if(APPLE) + string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}") + else() + string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}") + endif() + list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + endif() + endif() + list(REMOVE_DUPLICATES _lib_env) + + # set paths where to look for + set(PATH_TO_LOOK_FOR "${_lib_env}") + + # Try to find the hwloc lib in the given paths + # 
---------------------------------------------- + + # call cmake macro to find the lib path + if(HWLOC_LIBDIR) + set(HWLOC_hwloc_LIBRARY "HWLOC_hwloc_LIBRARY-NOTFOUND") + find_library(HWLOC_hwloc_LIBRARY + NAMES hwloc + HINTS ${HWLOC_LIBDIR}) + else() + if(HWLOC_DIR) + set(HWLOC_hwloc_LIBRARY "HWLOC_hwloc_LIBRARY-NOTFOUND") + find_library(HWLOC_hwloc_LIBRARY + NAMES hwloc + HINTS ${HWLOC_DIR} + PATH_SUFFIXES lib lib32 lib64) + else() + set(HWLOC_hwloc_LIBRARY "HWLOC_hwloc_LIBRARY-NOTFOUND") + find_library(HWLOC_hwloc_LIBRARY + NAMES hwloc + HINTS ${PATH_TO_LOOK_FOR}) + endif() + endif() + mark_as_advanced(HWLOC_hwloc_LIBRARY) + + # If found, add path to cmake variable + # ------------------------------------ + if (HWLOC_hwloc_LIBRARY) + get_filename_component(hwloc_lib_path ${HWLOC_hwloc_LIBRARY} PATH) + # set cmake variables (respects naming convention) + set(HWLOC_LIBRARIES "${HWLOC_hwloc_LIBRARY}") + set(HWLOC_LIBRARY_DIRS "${hwloc_lib_path}") + else () + set(HWLOC_LIBRARIES "HWLOC_LIBRARIES-NOTFOUND") + set(HWLOC_LIBRARY_DIRS "HWLOC_LIBRARY_DIRS-NOTFOUND") + if(NOT HWLOC_FIND_QUIETLY) + message(STATUS "Looking for hwloc -- lib hwloc not found") + endif() + endif () + + if (HWLOC_LIBRARY_DIRS) + list(REMOVE_DUPLICATES HWLOC_LIBRARY_DIRS) + endif () + + # check a function to validate the find + if(HWLOC_LIBRARIES) + + set(REQUIRED_INCDIRS) + set(REQUIRED_LIBDIRS) + set(REQUIRED_LIBS) + + # HWLOC + if (HWLOC_INCLUDE_DIRS) + set(REQUIRED_INCDIRS "${HWLOC_INCLUDE_DIRS}") + endif() + if (HWLOC_LIBRARY_DIRS) + set(REQUIRED_LIBDIRS "${HWLOC_LIBRARY_DIRS}") + endif() + set(REQUIRED_LIBS "${HWLOC_LIBRARIES}") + + # set required libraries for link + set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}") + set(CMAKE_REQUIRED_LIBRARIES) + foreach(lib_dir ${REQUIRED_LIBDIRS}) + list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}") + endforeach() + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}") + string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + + # test link + unset(HWLOC_WORKS CACHE) + include(CheckFunctionExists) + check_function_exists(hwloc_topology_init HWLOC_WORKS) + mark_as_advanced(HWLOC_WORKS) + + if(NOT HWLOC_WORKS) + if(NOT HWLOC_FIND_QUIETLY) + message(STATUS "Looking for hwloc : test of hwloc_topology_init with hwloc library fails") + message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}") + message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}") + message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails") + endif() + endif() + set(CMAKE_REQUIRED_INCLUDES) + set(CMAKE_REQUIRED_FLAGS) + set(CMAKE_REQUIRED_LIBRARIES) + endif(HWLOC_LIBRARIES) + +endif( (NOT PKG_CONFIG_EXECUTABLE) OR (PKG_CONFIG_EXECUTABLE AND NOT HWLOC_FOUND) OR (HWLOC_GIVEN_BY_USER) ) + +if (HWLOC_LIBRARIES) + if (HWLOC_LIBRARY_DIRS) + list(GET HWLOC_LIBRARY_DIRS 0 first_lib_path) + else() + list(GET HWLOC_LIBRARIES 0 first_lib) + get_filename_component(first_lib_path "${first_lib}" PATH) + endif() + if (${first_lib_path} MATCHES "/lib(32|64)?$") + string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}") + set(HWLOC_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of HWLOC library" FORCE) + else() + set(HWLOC_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of HWLOC library" FORCE) + endif() +endif() +mark_as_advanced(HWLOC_DIR) +mark_as_advanced(HWLOC_DIR_FOUND) + +# check that HWLOC has been found +# ------------------------------- +include(FindPackageHandleStandardArgs) +if 
(PKG_CONFIG_EXECUTABLE AND HWLOC_FOUND) + find_package_handle_standard_args(HWLOC DEFAULT_MSG + HWLOC_LIBRARIES) +else() + find_package_handle_standard_args(HWLOC DEFAULT_MSG + HWLOC_LIBRARIES + HWLOC_WORKS) +endif() + +if (HWLOC_FOUND) + set(HWLOC_SAVE_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES}) + list(APPEND CMAKE_REQUIRED_INCLUDES ${HWLOC_INCLUDE_DIRS}) + + # test headers to guess the version + check_struct_has_member( "struct hwloc_obj" parent hwloc.h HAVE_HWLOC_PARENT_MEMBER ) + check_struct_has_member( "struct hwloc_cache_attr_s" size hwloc.h HAVE_HWLOC_CACHE_ATTR ) + check_c_source_compiles( "#include <hwloc.h> + int main(void) { hwloc_obj_t o; o->type = HWLOC_OBJ_PU; return 0;}" HAVE_HWLOC_OBJ_PU) + include(CheckLibraryExists) + check_library_exists(${HWLOC_LIBRARIES} hwloc_bitmap_free "" HAVE_HWLOC_BITMAP) + + set(CMAKE_REQUIRED_INCLUDES ${HWLOC_SAVE_CMAKE_REQUIRED_INCLUDES}) +endif() diff --git a/cmake/FindKLU.cmake b/cmake/FindKLU.cmake new file mode 100644 index 000000000..4a8f8e0b0 --- /dev/null +++ b/cmake/FindKLU.cmake @@ -0,0 +1,48 @@ +# KLU lib usually requires linking to a blas library. +# It is up to the user of this module to find a BLAS and link to it. + +if (KLU_INCLUDES AND KLU_LIBRARIES) + set(KLU_FIND_QUIETLY TRUE) +endif (KLU_INCLUDES AND KLU_LIBRARIES) + +find_path(KLU_INCLUDES + NAMES + klu.h + PATHS + $ENV{KLUDIR} + ${INCLUDE_INSTALL_DIR} + PATH_SUFFIXES + suitesparse + ufsparse +) + +find_library(KLU_LIBRARIES klu PATHS $ENV{KLUDIR} ${LIB_INSTALL_DIR}) + +if(KLU_LIBRARIES) + + if(NOT KLU_LIBDIR) + get_filename_component(KLU_LIBDIR ${KLU_LIBRARIES} PATH) + endif(NOT KLU_LIBDIR) + + find_library(COLAMD_LIBRARY colamd PATHS ${KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR}) + if(COLAMD_LIBRARY) + set(KLU_LIBRARIES ${KLU_LIBRARIES} ${COLAMD_LIBRARY}) + endif () + + find_library(AMD_LIBRARY amd PATHS ${KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR}) + if(AMD_LIBRARY) + set(KLU_LIBRARIES ${KLU_LIBRARIES} ${AMD_LIBRARY}) + endif () + + find_library(BTF_LIBRARY btf PATHS $ENV{KLU_LIBDIR} $ENV{KLUDIR} ${LIB_INSTALL_DIR}) + if(BTF_LIBRARY) + set(KLU_LIBRARIES ${KLU_LIBRARIES} ${BTF_LIBRARY}) + endif() + +endif(KLU_LIBRARIES) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(KLU DEFAULT_MSG + KLU_INCLUDES KLU_LIBRARIES) + +mark_as_advanced(KLU_INCLUDES KLU_LIBRARIES AMD_LIBRARY COLAMD_LIBRARY BTF_LIBRARY) diff --git a/cmake/FindMetis.cmake b/cmake/FindMetis.cmake index 6a0ce790c..da2f1f1d7 100644 --- a/cmake/FindMetis.cmake +++ b/cmake/FindMetis.cmake @@ -1,59 +1,264 @@ -# Pastix requires METIS or METIS (partitioning and reordering tools) +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2014 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# - Find METIS include dirs and libraries +# Use this module by invoking find_package with the form: +# find_package(METIS +# [REQUIRED] # Fail with error if metis is not found +# ) +# +# This module finds headers and metis library.
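+# As an illustrative sketch only (the consumer target name "mysolver" is hypothetical, not part of this module), a project could consume the result variables documented below as: +# find_package(METIS) +# if (METIS_FOUND) +# include_directories(${METIS_INCLUDE_DIRS}) +# target_link_libraries(mysolver ${METIS_LIBRARIES}) +# endif()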
+# Results are reported in variables: +# METIS_FOUND - True if headers and requested libraries were found +# METIS_INCLUDE_DIRS - metis include directories +# METIS_LIBRARY_DIRS - Link directories for metis libraries +# METIS_LIBRARIES - metis component libraries to be linked +# +# The user can give specific paths where to find the libraries adding cmake +# options at configure (ex: cmake path/to/project -DMETIS_DIR=path/to/metis): +# METIS_DIR - Where to find the base directory of metis +# METIS_INCDIR - Where to find the header files +# METIS_LIBDIR - Where to find the library files +# The module can also look for the following environment variables if paths +# are not given as cmake variable: METIS_DIR, METIS_INCDIR, METIS_LIBDIR -if (METIS_INCLUDES AND METIS_LIBRARIES) - set(METIS_FIND_QUIETLY TRUE) -endif (METIS_INCLUDES AND METIS_LIBRARIES) +#============================================================================= +# Copyright 2012-2013 Inria +# Copyright 2012-2013 Emmanuel Agullo +# Copyright 2012-2013 Mathieu Faverge +# Copyright 2012 Cedric Castagnede +# Copyright 2013 Florent Pruvost +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file MORSE-Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of Morse, substitute the full +# License text for the above reference.) -find_path(METIS_INCLUDES - NAMES - metis.h - PATHS - $ENV{METISDIR} - ${INCLUDE_INSTALL_DIR} - PATH_SUFFIXES - . - metis - include -) - -macro(_metis_check_version) - file(READ "${METIS_INCLUDES}/metis.h" _metis_version_header) - - string(REGEX MATCH "define[ \t]+METIS_VER_MAJOR[ \t]+([0-9]+)" _metis_major_version_match "${_metis_version_header}") - set(METIS_MAJOR_VERSION "${CMAKE_MATCH_1}") - string(REGEX MATCH "define[ \t]+METIS_VER_MINOR[ \t]+([0-9]+)" _metis_minor_version_match "${_metis_version_header}") - set(METIS_MINOR_VERSION "${CMAKE_MATCH_1}") - string(REGEX MATCH "define[ \t]+METIS_VER_SUBMINOR[ \t]+([0-9]+)" _metis_subminor_version_match "${_metis_version_header}") - set(METIS_SUBMINOR_VERSION "${CMAKE_MATCH_1}") - if(NOT METIS_MAJOR_VERSION) - message(STATUS "Could not determine Metis version. 
Assuming version 4.0.0") - set(METIS_VERSION 4.0.0) - else() - set(METIS_VERSION ${METIS_MAJOR_VERSION}.${METIS_MINOR_VERSION}.${METIS_SUBMINOR_VERSION}) +if (NOT METIS_FOUND) + set(METIS_DIR "" CACHE PATH "Installation directory of METIS library") + if (NOT METIS_FIND_QUIETLY) + message(STATUS "A cache variable, namely METIS_DIR, has been set to specify the install directory of METIS") endif() - if(${METIS_VERSION} VERSION_LESS ${Metis_FIND_VERSION}) - set(METIS_VERSION_OK FALSE) +endif() + +# Looking for include +# ------------------- + +# Add system include paths to search include +# ------------------------------------------ +unset(_inc_env) +set(ENV_METIS_DIR "$ENV{METIS_DIR}") +set(ENV_METIS_INCDIR "$ENV{METIS_INCDIR}") +if(ENV_METIS_INCDIR) + list(APPEND _inc_env "${ENV_METIS_INCDIR}") +elseif(ENV_METIS_DIR) + list(APPEND _inc_env "${ENV_METIS_DIR}") + list(APPEND _inc_env "${ENV_METIS_DIR}/include") + list(APPEND _inc_env "${ENV_METIS_DIR}/include/metis") +else() + if(WIN32) + string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}") else() - set(METIS_VERSION_OK TRUE) + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() +endif() +list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") +list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") +list(REMOVE_DUPLICATES _inc_env) + + +# Try to find the metis header in the given paths +# ------------------------------------------------- +# call cmake macro to find the header path +if(METIS_INCDIR) + set(METIS_metis.h_DIRS "METIS_metis.h_DIRS-NOTFOUND") + find_path(METIS_metis.h_DIRS + NAMES metis.h + HINTS ${METIS_INCDIR}) +else() + if(METIS_DIR) + set(METIS_metis.h_DIRS "METIS_metis.h_DIRS-NOTFOUND") + find_path(METIS_metis.h_DIRS + NAMES metis.h + HINTS ${METIS_DIR} + PATH_SUFFIXES "include" "include/metis") + else() + set(METIS_metis.h_DIRS "METIS_metis.h_DIRS-NOTFOUND") + find_path(METIS_metis.h_DIRS + NAMES metis.h + HINTS ${_inc_env}) + endif() +endif() +mark_as_advanced(METIS_metis.h_DIRS) + + +# If found, add path to cmake variable +# ------------------------------------ +if (METIS_metis.h_DIRS) + set(METIS_INCLUDE_DIRS "${METIS_metis.h_DIRS}") +else () + set(METIS_INCLUDE_DIRS "METIS_INCLUDE_DIRS-NOTFOUND") + if(NOT METIS_FIND_QUIETLY) + message(STATUS "Looking for metis -- metis.h not found") + endif() +endif() + + +# Looking for lib +# --------------- + +# Add system library paths to search lib +# -------------------------------------- +unset(_lib_env) +set(ENV_METIS_LIBDIR "$ENV{METIS_LIBDIR}") +if(ENV_METIS_LIBDIR) + list(APPEND _lib_env "${ENV_METIS_LIBDIR}") +elseif(ENV_METIS_DIR) + list(APPEND _lib_env "${ENV_METIS_DIR}") + list(APPEND _lib_env "${ENV_METIS_DIR}/lib") +else() + if(WIN32) + string(REPLACE ":" ";" _lib_env "$ENV{LIB}") + else() + if(APPLE) + string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}") + else() + string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}") + endif() + list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + endif() +endif() +list(REMOVE_DUPLICATES _lib_env) + +# Try to find the metis lib in the given paths +# ---------------------------------------------- +# call 
cmake macro to find the lib path +if(METIS_LIBDIR) + set(METIS_metis_LIBRARY "METIS_metis_LIBRARY-NOTFOUND") + find_library(METIS_metis_LIBRARY + NAMES metis + HINTS ${METIS_LIBDIR}) +else() + if(METIS_DIR) + set(METIS_metis_LIBRARY "METIS_metis_LIBRARY-NOTFOUND") + find_library(METIS_metis_LIBRARY + NAMES metis + HINTS ${METIS_DIR} + PATH_SUFFIXES lib lib32 lib64) + else() + set(METIS_metis_LIBRARY "METIS_metis_LIBRARY-NOTFOUND") + find_library(METIS_metis_LIBRARY + NAMES metis + HINTS ${_lib_env}) + endif() +endif() +mark_as_advanced(METIS_metis_LIBRARY) + + +# If found, add path to cmake variable +# ------------------------------------ +if (METIS_metis_LIBRARY) + get_filename_component(metis_lib_path "${METIS_metis_LIBRARY}" PATH) + # set cmake variables + set(METIS_LIBRARIES "${METIS_metis_LIBRARY}") + set(METIS_LIBRARY_DIRS "${metis_lib_path}") +else () + set(METIS_LIBRARIES "METIS_LIBRARIES-NOTFOUND") + set(METIS_LIBRARY_DIRS "METIS_LIBRARY_DIRS-NOTFOUND") + if(NOT METIS_FIND_QUIETLY) + message(STATUS "Looking for metis -- lib metis not found") + endif() +endif () + +# check a function to validate the find +if(METIS_LIBRARIES) + + set(REQUIRED_INCDIRS) + set(REQUIRED_LIBDIRS) + set(REQUIRED_LIBS) + + # METIS + if (METIS_INCLUDE_DIRS) + set(REQUIRED_INCDIRS "${METIS_INCLUDE_DIRS}") + endif() + if (METIS_LIBRARY_DIRS) + set(REQUIRED_LIBDIRS "${METIS_LIBRARY_DIRS}") + endif() + set(REQUIRED_LIBS "${METIS_LIBRARIES}") + # m + find_library(M_LIBRARY NAMES m) + mark_as_advanced(M_LIBRARY) + if(M_LIBRARY) + list(APPEND REQUIRED_LIBS "-lm") endif() - if(NOT METIS_VERSION_OK) - message(STATUS "Metis version ${METIS_VERSION} found in ${METIS_INCLUDES}, " - "but at least version ${Metis_FIND_VERSION} is required") - endif(NOT METIS_VERSION_OK) -endmacro(_metis_check_version) + # set required libraries for link + set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}") + set(CMAKE_REQUIRED_LIBRARIES) + foreach(lib_dir ${REQUIRED_LIBDIRS}) + list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}") + endforeach() + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}") + string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") - if(METIS_INCLUDES AND Metis_FIND_VERSION) - _metis_check_version() - else() - set(METIS_VERSION_OK TRUE) + # test link + unset(METIS_WORKS CACHE) + include(CheckFunctionExists) + check_function_exists(METIS_NodeND METIS_WORKS) + mark_as_advanced(METIS_WORKS) + + if(NOT METIS_WORKS) + if(NOT METIS_FIND_QUIETLY) + message(STATUS "Looking for METIS : test of METIS_NodeND with METIS library fails") + message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}") + message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}") + message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails") + endif() endif() + set(CMAKE_REQUIRED_INCLUDES) + set(CMAKE_REQUIRED_FLAGS) + set(CMAKE_REQUIRED_LIBRARIES) +endif(METIS_LIBRARIES) +if (METIS_LIBRARIES) + list(GET METIS_LIBRARIES 0 first_lib) + get_filename_component(first_lib_path "${first_lib}" PATH) + if (${first_lib_path} MATCHES "/lib(32|64)?$") + string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}") + set(METIS_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of METIS library" FORCE) + else() + set(METIS_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of METIS library" FORCE) + endif() +endif() +mark_as_advanced(METIS_DIR) +mark_as_advanced(METIS_DIR_FOUND) -find_library(METIS_LIBRARIES metis PATHS $ENV{METISDIR} 
${LIB_INSTALL_DIR} PATH_SUFFIXES lib) - +# check that METIS has been found +# --------------------------------- include(FindPackageHandleStandardArgs) find_package_handle_standard_args(METIS DEFAULT_MSG - METIS_INCLUDES METIS_LIBRARIES METIS_VERSION_OK) - -mark_as_advanced(METIS_INCLUDES METIS_LIBRARIES) + METIS_LIBRARIES + METIS_WORKS) +# +# TODO: Add possibility to check for specific functions in the library +# diff --git a/cmake/FindPTSCOTCH.cmake b/cmake/FindPTSCOTCH.cmake new file mode 100644 index 000000000..1396d0582 --- /dev/null +++ b/cmake/FindPTSCOTCH.cmake @@ -0,0 +1,423 @@ +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2016 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# - Find PTSCOTCH include dirs and libraries +# Use this module by invoking find_package with the form: +# find_package(PTSCOTCH +# [REQUIRED] # Fail with error if ptscotch is not found +# [COMPONENTS ...] # dependencies +# ) +# +# PTSCOTCH depends on the following libraries: +# - Threads +# - MPI +# +# COMPONENTS can be some of the following: +# - ESMUMPS: to activate detection of PT-Scotch with the esmumps interface +# +# This module finds headers and ptscotch library. +# Results are reported in variables: +# PTSCOTCH_FOUND - True if headers and requested libraries were found +# PTSCOTCH_LINKER_FLAGS - list of required linker flags (excluding -l and -L) +# PTSCOTCH_INCLUDE_DIRS - ptscotch include directories +# PTSCOTCH_LIBRARY_DIRS - Link directories for ptscotch libraries +# PTSCOTCH_LIBRARIES - ptscotch component libraries to be linked +# PTSCOTCH_INCLUDE_DIRS_DEP - ptscotch + dependencies include directories +# PTSCOTCH_LIBRARY_DIRS_DEP - ptscotch + dependencies link directories +# PTSCOTCH_LIBRARIES_DEP - ptscotch libraries + dependencies +# PTSCOTCH_INTSIZE - Number of octets occupied by a SCOTCH_Num +# +# The user can give specific paths where to find the libraries adding cmake +# options at configure (ex: cmake path/to/project -DPTSCOTCH=path/to/ptscotch): +# PTSCOTCH_DIR - Where to find the base directory of ptscotch +# PTSCOTCH_INCDIR - Where to find the header files +# PTSCOTCH_LIBDIR - Where to find the library files +# The module can also look for the following environment variables if paths +# are not given as cmake variable: PTSCOTCH_DIR, PTSCOTCH_INCDIR, PTSCOTCH_LIBDIR + +#============================================================================= +# Copyright 2012-2013 Inria +# Copyright 2012-2013 Emmanuel Agullo +# Copyright 2012-2013 Mathieu Faverge +# Copyright 2012 Cedric Castagnede +# Copyright 2013-2016 Florent Pruvost +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file MORSE-Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of Morse, substitute the full +# License text for the above reference.) 
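+ +# Illustrative usage sketch (a hedged example, not part of the module contract; the target name "myapp" is hypothetical): detect PT-Scotch with the esmumps interface, then link against the dependency-resolved lists documented above: +# find_package(PTSCOTCH COMPONENTS ESMUMPS) +# if (PTSCOTCH_FOUND) +# include_directories(${PTSCOTCH_INCLUDE_DIRS_DEP}) +# target_link_libraries(myapp ${PTSCOTCH_LIBRARIES_DEP}) +# endif()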
+ +if (NOT PTSCOTCH_FOUND) + set(PTSCOTCH_DIR "" CACHE PATH "Installation directory of PTSCOTCH library") + if (NOT PTSCOTCH_FIND_QUIETLY) + message(STATUS "A cache variable, namely PTSCOTCH_DIR, has been set to specify the install directory of PTSCOTCH") + endif() +endif() + +# Set the version to find +set(PTSCOTCH_LOOK_FOR_ESMUMPS OFF) + +if( PTSCOTCH_FIND_COMPONENTS ) + foreach( component ${PTSCOTCH_FIND_COMPONENTS} ) + if (${component} STREQUAL "ESMUMPS") + # means we look for esmumps library + set(PTSCOTCH_LOOK_FOR_ESMUMPS ON) + endif() + endforeach() +endif() + +# PTSCOTCH depends on Threads, try to find it +if (NOT THREADS_FOUND) + if (PTSCOTCH_FIND_REQUIRED) + find_package(Threads REQUIRED) + else() + find_package(Threads) + endif() +endif() + +# PTSCOTCH depends on MPI, try to find it +if (NOT MPI_FOUND) + if (PTSCOTCH_FIND_REQUIRED) + find_package(MPI REQUIRED) + else() + find_package(MPI) + endif() +endif() + +# Looking for include +# ------------------- + +# Add system include paths to search include +# ------------------------------------------ +unset(_inc_env) +set(ENV_PTSCOTCH_DIR "$ENV{PTSCOTCH_DIR}") +set(ENV_PTSCOTCH_INCDIR "$ENV{PTSCOTCH_INCDIR}") +if(ENV_PTSCOTCH_INCDIR) + list(APPEND _inc_env "${ENV_PTSCOTCH_INCDIR}") +elseif(ENV_PTSCOTCH_DIR) + list(APPEND _inc_env "${ENV_PTSCOTCH_DIR}") + list(APPEND _inc_env "${ENV_PTSCOTCH_DIR}/include") + list(APPEND _inc_env "${ENV_PTSCOTCH_DIR}/include/ptscotch") +else() + if(WIN32) + string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() +endif() +list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") +list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") +list(REMOVE_DUPLICATES _inc_env) + + +# Try to find the ptscotch header in the given paths +# ------------------------------------------------- + +set(PTSCOTCH_hdrs_to_find "ptscotch.h;scotch.h") + +# call cmake macro to find the header path +if(PTSCOTCH_INCDIR) + foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) + set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND") + find_path(PTSCOTCH_${ptscotch_hdr}_DIRS + NAMES ${ptscotch_hdr} + HINTS ${PTSCOTCH_INCDIR}) + mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS) + endforeach() +else() + if(PTSCOTCH_DIR) + foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) + set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND") + find_path(PTSCOTCH_${ptscotch_hdr}_DIRS + NAMES ${ptscotch_hdr} + HINTS ${PTSCOTCH_DIR} + PATH_SUFFIXES "include" "include/scotch") + mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS) + endforeach() + else() + foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) + set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND") + find_path(PTSCOTCH_${ptscotch_hdr}_DIRS + NAMES ${ptscotch_hdr} + HINTS ${_inc_env} + PATH_SUFFIXES "scotch") + mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS) + endforeach() + endif() +endif() + +# If found, add path to cmake variable +# ------------------------------------ +foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) + if (PTSCOTCH_${ptscotch_hdr}_DIRS) + list(APPEND PTSCOTCH_INCLUDE_DIRS "${PTSCOTCH_${ptscotch_hdr}_DIRS}") + else () + 
set(PTSCOTCH_INCLUDE_DIRS "PTSCOTCH_INCLUDE_DIRS-NOTFOUND") + if (NOT PTSCOTCH_FIND_QUIETLY) + message(STATUS "Looking for ptscotch -- ${ptscotch_hdr} not found") + endif() + endif() +endforeach() +list(REMOVE_DUPLICATES PTSCOTCH_INCLUDE_DIRS) + +# Looking for lib +# --------------- + +# Add system library paths to search lib +# -------------------------------------- +unset(_lib_env) +set(ENV_PTSCOTCH_LIBDIR "$ENV{PTSCOTCH_LIBDIR}") +if(ENV_PTSCOTCH_LIBDIR) + list(APPEND _lib_env "${ENV_PTSCOTCH_LIBDIR}") +elseif(ENV_PTSCOTCH_DIR) + list(APPEND _lib_env "${ENV_PTSCOTCH_DIR}") + list(APPEND _lib_env "${ENV_PTSCOTCH_DIR}/lib") +else() + if(WIN32) + string(REPLACE ":" ";" _lib_env "$ENV{LIB}") + else() + if(APPLE) + string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}") + else() + string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}") + endif() + list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + endif() +endif() +list(REMOVE_DUPLICATES _lib_env) + +# Try to find the ptscotch lib in the given paths +# ---------------------------------------------- + +set(PTSCOTCH_libs_to_find "ptscotch;ptscotcherr") +if (PTSCOTCH_LOOK_FOR_ESMUMPS) + list(INSERT PTSCOTCH_libs_to_find 0 "ptesmumps") + list(APPEND PTSCOTCH_libs_to_find "esmumps" ) +endif() +list(APPEND PTSCOTCH_libs_to_find "scotch;scotcherr") + +# call cmake macro to find the lib path +if(PTSCOTCH_LIBDIR) + foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) + set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND") + find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY + NAMES ${ptscotch_lib} + HINTS ${PTSCOTCH_LIBDIR}) + endforeach() +else() + if(PTSCOTCH_DIR) + foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) + set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND") + find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY + NAMES ${ptscotch_lib} + HINTS ${PTSCOTCH_DIR} + PATH_SUFFIXES lib lib32 lib64) + endforeach() + else() + foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) + set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND") + find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY + NAMES ${ptscotch_lib} + HINTS ${_lib_env}) + endforeach() + endif() +endif() + +set(PTSCOTCH_LIBRARIES "") +set(PTSCOTCH_LIBRARY_DIRS "") +# If found, add path to cmake variable +# ------------------------------------ +foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) + + if (PTSCOTCH_${ptscotch_lib}_LIBRARY) + get_filename_component(${ptscotch_lib}_lib_path "${PTSCOTCH_${ptscotch_lib}_LIBRARY}" PATH) + # set cmake variables + list(APPEND PTSCOTCH_LIBRARIES "${PTSCOTCH_${ptscotch_lib}_LIBRARY}") + list(APPEND PTSCOTCH_LIBRARY_DIRS "${${ptscotch_lib}_lib_path}") + else () + list(APPEND PTSCOTCH_LIBRARIES "${PTSCOTCH_${ptscotch_lib}_LIBRARY}") + if (NOT PTSCOTCH_FIND_QUIETLY) + message(STATUS "Looking for ptscotch -- lib ${ptscotch_lib} not found") + endif() + endif () + + mark_as_advanced(PTSCOTCH_${ptscotch_lib}_LIBRARY) + +endforeach() +list(REMOVE_DUPLICATES PTSCOTCH_LIBRARY_DIRS) + +# check a function to validate the find +if(PTSCOTCH_LIBRARIES) + + set(REQUIRED_LDFLAGS) + set(REQUIRED_INCDIRS) + set(REQUIRED_LIBDIRS) + set(REQUIRED_LIBS) + + # PTSCOTCH + if (PTSCOTCH_INCLUDE_DIRS) + set(REQUIRED_INCDIRS "${PTSCOTCH_INCLUDE_DIRS}") + endif() + if (PTSCOTCH_LIBRARY_DIRS) + set(REQUIRED_LIBDIRS "${PTSCOTCH_LIBRARY_DIRS}") + endif() + set(REQUIRED_LIBS "${PTSCOTCH_LIBRARIES}") + # MPI + if (MPI_FOUND) + if (MPI_C_INCLUDE_PATH) + 
list(APPEND CMAKE_REQUIRED_INCLUDES "${MPI_C_INCLUDE_PATH}") + endif() + if (MPI_C_LINK_FLAGS) + if (${MPI_C_LINK_FLAGS} MATCHES " -") + string(REGEX REPLACE " -" "-" MPI_C_LINK_FLAGS ${MPI_C_LINK_FLAGS}) + endif() + list(APPEND REQUIRED_LDFLAGS "${MPI_C_LINK_FLAGS}") + endif() + list(APPEND REQUIRED_LIBS "${MPI_C_LIBRARIES}") + endif() + # THREADS + if(CMAKE_THREAD_LIBS_INIT) + list(APPEND REQUIRED_LIBS "${CMAKE_THREAD_LIBS_INIT}") + endif() + set(Z_LIBRARY "Z_LIBRARY-NOTFOUND") + find_library(Z_LIBRARY NAMES z) + mark_as_advanced(Z_LIBRARY) + if(Z_LIBRARY) + list(APPEND REQUIRED_LIBS "-lz") + endif() + set(M_LIBRARY "M_LIBRARY-NOTFOUND") + find_library(M_LIBRARY NAMES m) + mark_as_advanced(M_LIBRARY) + if(M_LIBRARY) + list(APPEND REQUIRED_LIBS "-lm") + endif() + set(RT_LIBRARY "RT_LIBRARY-NOTFOUND") + find_library(RT_LIBRARY NAMES rt) + mark_as_advanced(RT_LIBRARY) + if(RT_LIBRARY) + list(APPEND REQUIRED_LIBS "-lrt") + endif() + + # set required libraries for link + set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}") + set(CMAKE_REQUIRED_LIBRARIES) + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LDFLAGS}") + foreach(lib_dir ${REQUIRED_LIBDIRS}) + list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}") + endforeach() + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}") + list(APPEND CMAKE_REQUIRED_FLAGS "${REQUIRED_FLAGS}") + string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + + # test link + unset(PTSCOTCH_WORKS CACHE) + include(CheckFunctionExists) + check_function_exists(SCOTCH_dgraphInit PTSCOTCH_WORKS) + mark_as_advanced(PTSCOTCH_WORKS) + + if(PTSCOTCH_WORKS) + # save link with dependencies + set(PTSCOTCH_LIBRARIES_DEP "${REQUIRED_LIBS}") + set(PTSCOTCH_LIBRARY_DIRS_DEP "${REQUIRED_LIBDIRS}") + set(PTSCOTCH_INCLUDE_DIRS_DEP "${REQUIRED_INCDIRS}") + set(PTSCOTCH_LINKER_FLAGS "${REQUIRED_LDFLAGS}") + list(REMOVE_DUPLICATES PTSCOTCH_LIBRARY_DIRS_DEP) + list(REMOVE_DUPLICATES PTSCOTCH_INCLUDE_DIRS_DEP) + list(REMOVE_DUPLICATES PTSCOTCH_LINKER_FLAGS) + else() + if(NOT PTSCOTCH_FIND_QUIETLY) + message(STATUS "Looking for PTSCOTCH : test of SCOTCH_dgraphInit with PTSCOTCH library fails") + message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}") + message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}") + message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails") + endif() + endif() + set(CMAKE_REQUIRED_INCLUDES) + set(CMAKE_REQUIRED_FLAGS) + set(CMAKE_REQUIRED_LIBRARIES) +endif(PTSCOTCH_LIBRARIES) + +if (PTSCOTCH_LIBRARIES) + list(GET PTSCOTCH_LIBRARIES 0 first_lib) + get_filename_component(first_lib_path "${first_lib}" PATH) + if (${first_lib_path} MATCHES "/lib(32|64)?$") + string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}") + set(PTSCOTCH_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of PTSCOTCH library" FORCE) + else() + set(PTSCOTCH_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of PTSCOTCH library" FORCE) + endif() +endif() +mark_as_advanced(PTSCOTCH_DIR) +mark_as_advanced(PTSCOTCH_DIR_FOUND) + +# Check the size of SCOTCH_Num +# --------------------------------- +set(CMAKE_REQUIRED_INCLUDES ${PTSCOTCH_INCLUDE_DIRS}) + +include(CheckCSourceRuns) +#stdio.h and stdint.h should be included by scotch.h directly +set(PTSCOTCH_C_TEST_SCOTCH_Num_4 " +#include <stdio.h> +#include <stdint.h> +#include <ptscotch.h> +int main(int argc, char **argv) { + if (sizeof(SCOTCH_Num) == 4) + return 0; + else + return 1; +} +") + +set(PTSCOTCH_C_TEST_SCOTCH_Num_8 " +#include <stdio.h> +#include <stdint.h>
+#include <ptscotch.h> +int main(int argc, char **argv) { + if (sizeof(SCOTCH_Num) == 8) + return 0; + else + return 1; +} +") +check_c_source_runs("${PTSCOTCH_C_TEST_SCOTCH_Num_4}" PTSCOTCH_Num_4) +if(NOT PTSCOTCH_Num_4) + check_c_source_runs("${PTSCOTCH_C_TEST_SCOTCH_Num_8}" PTSCOTCH_Num_8) + if(NOT PTSCOTCH_Num_8) + set(PTSCOTCH_INTSIZE -1) + else() + set(PTSCOTCH_INTSIZE 8) + endif() +else() + set(PTSCOTCH_INTSIZE 4) +endif() +set(CMAKE_REQUIRED_INCLUDES "") + +# check that PTSCOTCH has been found +# --------------------------------- +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(PTSCOTCH DEFAULT_MSG + PTSCOTCH_LIBRARIES + PTSCOTCH_WORKS) +# +# TODO: Add possibility to check for specific functions in the library +# diff --git a/cmake/FindPastix.cmake b/cmake/FindPastix.cmake index e2e6c810d..470477fdc 100644 --- a/cmake/FindPastix.cmake +++ b/cmake/FindPastix.cmake @@ -1,25 +1,704 @@ -# Pastix lib requires linking to a blas library. -# It is up to the user of this module to find a BLAS and link to it. -# Pastix requires SCOTCH or METIS (partitioning and reordering tools) as well +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2014 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# - Find PASTIX include dirs and libraries +# Use this module by invoking find_package with the form: +# find_package(PASTIX +# [REQUIRED] # Fail with error if pastix is not found +# [COMPONENTS <comp1> <comp2> ...] # dependencies +# ) +# +# PASTIX depends on the following libraries: +# - Threads, m, rt +# - MPI +# - HWLOC +# - BLAS +# +# COMPONENTS are optional libraries PASTIX could be linked with. +# Use it to drive detection of a specific compilation chain +# COMPONENTS can be some of the following: +# - MPI: to activate detection of the parallel MPI version (default) +# it looks for Threads, HWLOC, BLAS, MPI and ScaLAPACK libraries +# - SEQ: to activate detection of the sequential version (exclude MPI version) +# - STARPU: to activate detection of StarPU version +# it looks for MPI version of StarPU (default behaviour) +# if SEQ and STARPU are given, it looks for a StarPU without MPI +# - STARPU_CUDA: to activate detection of StarPU with CUDA +# - STARPU_FXT: to activate detection of StarPU with FxT +# - SCOTCH: to activate detection of PASTIX linked with SCOTCH +# - PTSCOTCH: to activate detection of PASTIX linked with PTSCOTCH +# - METIS: to activate detection of PASTIX linked with METIS +# +# This module finds headers and pastix library.
+# Results are reported in variables: +# PASTIX_FOUND - True if headers and requested libraries were found +# PASTIX_LINKER_FLAGS - list of required linker flags (excluding -l and -L) +# PASTIX_INCLUDE_DIRS - pastix include directories +# PASTIX_LIBRARY_DIRS - Link directories for pastix libraries +# PASTIX_LIBRARIES - pastix libraries +# PASTIX_INCLUDE_DIRS_DEP - pastix + dependencies include directories +# PASTIX_LIBRARY_DIRS_DEP - pastix + dependencies link directories +# PASTIX_LIBRARIES_DEP - pastix libraries + dependencies +# +# The user can give specific paths where to find the libraries adding cmake +# options at configure (ex: cmake path/to/project -DPASTIX_DIR=path/to/pastix): +# PASTIX_DIR - Where to find the base directory of pastix +# PASTIX_INCDIR - Where to find the header files +# PASTIX_LIBDIR - Where to find the library files +# The module can also look for the following environment variables if paths +# are not given as cmake variable: PASTIX_DIR, PASTIX_INCDIR, PASTIX_LIBDIR -if (PASTIX_INCLUDES AND PASTIX_LIBRARIES) - set(PASTIX_FIND_QUIETLY TRUE) -endif (PASTIX_INCLUDES AND PASTIX_LIBRARIES) - -find_path(PASTIX_INCLUDES - NAMES - pastix_nompi.h - PATHS - $ENV{PASTIXDIR} - ${INCLUDE_INSTALL_DIR} -) - -find_library(PASTIX_LIBRARIES pastix PATHS $ENV{PASTIXDIR} ${LIB_INSTALL_DIR}) +#============================================================================= +# Copyright 2012-2013 Inria +# Copyright 2012-2013 Emmanuel Agullo +# Copyright 2012-2013 Mathieu Faverge +# Copyright 2012 Cedric Castagnede +# Copyright 2013 Florent Pruvost +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file MORSE-Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of Morse, substitute the full +# License text for the above reference.) 
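+ +# Illustrative usage sketch (a hedged example, not normative; the target name "myapp" is hypothetical): request the sequential PaStiX linked with METIS, then use the dependency-resolved variables documented above: +# find_package(PASTIX COMPONENTS SEQ METIS) +# if (PASTIX_FOUND) +# include_directories(${PASTIX_INCLUDE_DIRS_DEP}) +# target_link_libraries(myapp ${PASTIX_LIBRARIES_DEP}) +# endif()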
+if (NOT PASTIX_FOUND) + set(PASTIX_DIR "" CACHE PATH "Installation directory of PASTIX library") + if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "A cache variable, namely PASTIX_DIR, has been set to specify the install directory of PASTIX") + endif() +endif() +# Set the version to find +set(PASTIX_LOOK_FOR_MPI ON) +set(PASTIX_LOOK_FOR_SEQ OFF) +set(PASTIX_LOOK_FOR_STARPU OFF) +set(PASTIX_LOOK_FOR_STARPU_CUDA OFF) +set(PASTIX_LOOK_FOR_STARPU_FXT OFF) +set(PASTIX_LOOK_FOR_SCOTCH ON) +set(PASTIX_LOOK_FOR_PTSCOTCH OFF) +set(PASTIX_LOOK_FOR_METIS OFF) + +if( PASTIX_FIND_COMPONENTS ) + foreach( component ${PASTIX_FIND_COMPONENTS} ) + if (${component} STREQUAL "SEQ") + # means we look for the sequential version of PaStiX (without MPI) + set(PASTIX_LOOK_FOR_SEQ ON) + set(PASTIX_LOOK_FOR_MPI OFF) + endif() + if (${component} STREQUAL "MPI") + # means we look for the MPI version of PaStiX (default) + set(PASTIX_LOOK_FOR_SEQ OFF) + set(PASTIX_LOOK_FOR_MPI ON) + endif() + if (${component} STREQUAL "STARPU") + # means we look for PaStiX with StarPU + set(PASTIX_LOOK_FOR_STARPU ON) + endif() + if (${component} STREQUAL "STARPU_CUDA") + # means we look for PaStiX with StarPU + CUDA + set(PASTIX_LOOK_FOR_STARPU ON) + set(PASTIX_LOOK_FOR_STARPU_CUDA ON) + endif() + if (${component} STREQUAL "STARPU_FXT") + # means we look for PaStiX with StarPU + FxT + set(PASTIX_LOOK_FOR_STARPU_FXT ON) + endif() + if (${component} STREQUAL "SCOTCH") + set(PASTIX_LOOK_FOR_SCOTCH ON) + endif() + if (${component} STREQUAL "PTSCOTCH") + set(PASTIX_LOOK_FOR_PTSCOTCH ON) + endif() + if (${component} STREQUAL "METIS") + set(PASTIX_LOOK_FOR_METIS ON) + endif() + endforeach() +endif() + +# Dependencies detection +# ---------------------- + + +# Required dependencies +# --------------------- + +if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect pthread") +endif() +if (PASTIX_FIND_REQUIRED) + find_package(Threads REQUIRED QUIET) +else() + find_package(Threads QUIET) +endif() +set(PASTIX_EXTRA_LIBRARIES "") +if( THREADS_FOUND ) + list(APPEND PASTIX_EXTRA_LIBRARIES ${CMAKE_THREAD_LIBS_INIT}) +endif () + +# Add math library to the list of extra +# it normally exists on all common systems provided with a C compiler +if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect libm") +endif() +set(PASTIX_M_LIBRARIES "") +if(UNIX OR WIN32) + find_library( + PASTIX_M_m_LIBRARY + NAMES m + ) + mark_as_advanced(PASTIX_M_m_LIBRARY) + if (PASTIX_M_m_LIBRARY) + list(APPEND PASTIX_M_LIBRARIES "${PASTIX_M_m_LIBRARY}") + list(APPEND PASTIX_EXTRA_LIBRARIES "${PASTIX_M_m_LIBRARY}") + else() + if (PASTIX_FIND_REQUIRED) + message(FATAL_ERROR "Could NOT find libm on your system."
+ "Are you sure to a have a C compiler installed?") + endif() + endif() +endif() + +# Try to find librt (libposix4 - POSIX.1b Realtime Extensions library) +# on Unix systems except Apple ones because it does not exist on it +if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect librt") +endif() +set(PASTIX_RT_LIBRARIES "") +if(UNIX AND NOT APPLE) + find_library( + PASTIX_RT_rt_LIBRARY + NAMES rt + ) + mark_as_advanced(PASTIX_RT_rt_LIBRARY) + if (PASTIX_RT_rt_LIBRARY) + list(APPEND PASTIX_RT_LIBRARIES "${PASTIX_RT_rt_LIBRARY}") + list(APPEND PASTIX_EXTRA_LIBRARIES "${PASTIX_RT_rt_LIBRARY}") + else() + if (PASTIX_FIND_REQUIRED) + message(FATAL_ERROR "Could NOT find librt on your system") + endif() + endif() +endif() + +# PASTIX depends on HWLOC +#------------------------ +if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect HWLOC") +endif() +if (PASTIX_FIND_REQUIRED) + find_package(HWLOC REQUIRED QUIET) +else() + find_package(HWLOC QUIET) +endif() + +# PASTIX depends on BLAS +#----------------------- +if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect BLAS") +endif() +if (PASTIX_FIND_REQUIRED) + find_package(BLASEXT REQUIRED QUIET) +else() + find_package(BLASEXT QUIET) +endif() + +# Optional dependencies +# --------------------- + +# PASTIX may depend on MPI +#------------------------- +if (NOT MPI_FOUND AND PASTIX_LOOK_FOR_MPI) + if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect MPI") + endif() + # allows to use an external mpi compilation by setting compilers with + # -DMPI_C_COMPILER=path/to/mpicc -DMPI_Fortran_COMPILER=path/to/mpif90 + # at cmake configure + if(NOT MPI_C_COMPILER) + set(MPI_C_COMPILER mpicc) + endif() + if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_MPI) + find_package(MPI REQUIRED QUIET) + else() + find_package(MPI QUIET) + endif() + if (MPI_FOUND) + mark_as_advanced(MPI_LIBRARY) + mark_as_advanced(MPI_EXTRA_LIBRARY) + endif() +endif (NOT MPI_FOUND AND PASTIX_LOOK_FOR_MPI) + +# PASTIX may depend on STARPU +#---------------------------- +if( NOT STARPU_FOUND AND PASTIX_LOOK_FOR_STARPU) + + if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect StarPU") + endif() + + set(PASTIX_STARPU_VERSION "1.1" CACHE STRING "oldest STARPU version desired") + + # create list of components in order to make a single call to find_package(starpu...) 
+ # we explicitly need a StarPU version built with hwloc + set(STARPU_COMPONENT_LIST "HWLOC") + + # StarPU may depend on MPI + # allows using an externally built MPI by setting the compilers with + # -DMPI_C_COMPILER=path/to/mpicc -DMPI_Fortran_COMPILER=path/to/mpif90 + # at cmake configure + if (PASTIX_LOOK_FOR_MPI) + if(NOT MPI_C_COMPILER) + set(MPI_C_COMPILER mpicc) + endif() + list(APPEND STARPU_COMPONENT_LIST "MPI") + endif() + if (PASTIX_LOOK_FOR_STARPU_CUDA) + list(APPEND STARPU_COMPONENT_LIST "CUDA") + endif() + if (PASTIX_LOOK_FOR_STARPU_FXT) + list(APPEND STARPU_COMPONENT_LIST "FXT") + endif() + # set the list of optional dependencies we may discover + if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_STARPU) + find_package(STARPU ${PASTIX_STARPU_VERSION} REQUIRED + COMPONENTS ${STARPU_COMPONENT_LIST}) + else() + find_package(STARPU ${PASTIX_STARPU_VERSION} + COMPONENTS ${STARPU_COMPONENT_LIST}) + endif() + +endif( NOT STARPU_FOUND AND PASTIX_LOOK_FOR_STARPU) + +# PASTIX may depend on SCOTCH +#----------------------------- +if (NOT SCOTCH_FOUND AND PASTIX_LOOK_FOR_SCOTCH) + if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect SCOTCH") + endif() + if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_SCOTCH) + find_package(SCOTCH REQUIRED QUIET) + else() + find_package(SCOTCH QUIET) + endif() +endif() + +# PASTIX may depend on PTSCOTCH +#------------------------------- +if (NOT PTSCOTCH_FOUND AND PASTIX_LOOK_FOR_PTSCOTCH) + if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect PTSCOTCH") + endif() + if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_PTSCOTCH) + find_package(PTSCOTCH REQUIRED QUIET) + else() + find_package(PTSCOTCH QUIET) + endif() +endif() + +# PASTIX may depend on METIS +#---------------------------- +if (NOT METIS_FOUND AND PASTIX_LOOK_FOR_METIS) + if (NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX - Try to detect METIS") + endif() + if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_METIS) + find_package(METIS REQUIRED QUIET) + else() + find_package(METIS QUIET) + endif() +endif() + +# Error if pastix required and no partitioning lib found +if (PASTIX_FIND_REQUIRED AND NOT SCOTCH_FOUND AND NOT PTSCOTCH_FOUND AND NOT METIS_FOUND) + message(FATAL_ERROR "Could NOT find any partitioning library on your system" + " (install scotch, ptscotch or metis)") +endif() + + +# Looking for PaStiX +# ------------------ + +# Looking for include +# ------------------- + +# Add system include paths to search include +# ------------------------------------------ +unset(_inc_env) +set(ENV_PASTIX_DIR "$ENV{PASTIX_DIR}") +set(ENV_PASTIX_INCDIR "$ENV{PASTIX_INCDIR}") +if(ENV_PASTIX_INCDIR) + list(APPEND _inc_env "${ENV_PASTIX_INCDIR}") +elseif(ENV_PASTIX_DIR) + list(APPEND _inc_env "${ENV_PASTIX_DIR}") + list(APPEND _inc_env "${ENV_PASTIX_DIR}/include") + list(APPEND _inc_env "${ENV_PASTIX_DIR}/include/pastix") +else() + if(WIN32) + string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() +endif() +list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") +list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}")
+list(REMOVE_DUPLICATES _inc_env) + + +# Try to find the pastix header in the given paths +# --------------------------------------------------- +# call cmake macro to find the header path +if(PASTIX_INCDIR) + set(PASTIX_pastix.h_DIRS "PASTIX_pastix.h_DIRS-NOTFOUND") + find_path(PASTIX_pastix.h_DIRS + NAMES pastix.h + HINTS ${PASTIX_INCDIR}) +else() + if(PASTIX_DIR) + set(PASTIX_pastix.h_DIRS "PASTIX_pastix.h_DIRS-NOTFOUND") + find_path(PASTIX_pastix.h_DIRS + NAMES pastix.h + HINTS ${PASTIX_DIR} + PATH_SUFFIXES "include" "include/pastix") + else() + set(PASTIX_pastix.h_DIRS "PASTIX_pastix.h_DIRS-NOTFOUND") + find_path(PASTIX_pastix.h_DIRS + NAMES pastix.h + HINTS ${_inc_env} + PATH_SUFFIXES "pastix") + endif() +endif() +mark_as_advanced(PASTIX_pastix.h_DIRS) + +# If found, add path to cmake variable +# ------------------------------------ +if (PASTIX_pastix.h_DIRS) + set(PASTIX_INCLUDE_DIRS "${PASTIX_pastix.h_DIRS}") +else () + set(PASTIX_INCLUDE_DIRS "PASTIX_INCLUDE_DIRS-NOTFOUND") + if(NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for pastix -- pastix.h not found") + endif() +endif() + + +# Looking for lib +# --------------- + +# Add system library paths to search lib +# -------------------------------------- +unset(_lib_env) +set(ENV_PASTIX_LIBDIR "$ENV{PASTIX_LIBDIR}") +if(ENV_PASTIX_LIBDIR) + list(APPEND _lib_env "${ENV_PASTIX_LIBDIR}") +elseif(ENV_PASTIX_DIR) + list(APPEND _lib_env "${ENV_PASTIX_DIR}") + list(APPEND _lib_env "${ENV_PASTIX_DIR}/lib") +else() + if(WIN32) + string(REPLACE ":" ";" _lib_env "$ENV{LIB}") + else() + if(APPLE) + string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}") + else() + string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}") + endif() + list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + endif() +endif() +list(REMOVE_DUPLICATES _lib_env) + +# Try to find the pastix lib in the given paths +# ------------------------------------------------ + +# create list of libs to find +set(PASTIX_libs_to_find "pastix_murge;pastix") + +# call cmake macro to find the lib path +if(PASTIX_LIBDIR) + foreach(pastix_lib ${PASTIX_libs_to_find}) + set(PASTIX_${pastix_lib}_LIBRARY "PASTIX_${pastix_lib}_LIBRARY-NOTFOUND") + find_library(PASTIX_${pastix_lib}_LIBRARY + NAMES ${pastix_lib} + HINTS ${PASTIX_LIBDIR}) + endforeach() +else() + if(PASTIX_DIR) + foreach(pastix_lib ${PASTIX_libs_to_find}) + set(PASTIX_${pastix_lib}_LIBRARY "PASTIX_${pastix_lib}_LIBRARY-NOTFOUND") + find_library(PASTIX_${pastix_lib}_LIBRARY + NAMES ${pastix_lib} + HINTS ${PASTIX_DIR} + PATH_SUFFIXES lib lib32 lib64) + endforeach() + else() + foreach(pastix_lib ${PASTIX_libs_to_find}) + set(PASTIX_${pastix_lib}_LIBRARY "PASTIX_${pastix_lib}_LIBRARY-NOTFOUND") + find_library(PASTIX_${pastix_lib}_LIBRARY + NAMES ${pastix_lib} + HINTS ${_lib_env}) + endforeach() + endif() +endif() + +# If found, add path to cmake variable +# ------------------------------------ +foreach(pastix_lib ${PASTIX_libs_to_find}) + + get_filename_component(${pastix_lib}_lib_path ${PASTIX_${pastix_lib}_LIBRARY} PATH) + # set cmake variables (respects naming convention) + if (PASTIX_LIBRARIES) + list(APPEND PASTIX_LIBRARIES "${PASTIX_${pastix_lib}_LIBRARY}") + else() + set(PASTIX_LIBRARIES "${PASTIX_${pastix_lib}_LIBRARY}") + endif() + if (PASTIX_LIBRARY_DIRS) + list(APPEND PASTIX_LIBRARY_DIRS "${${pastix_lib}_lib_path}") + else() + set(PASTIX_LIBRARY_DIRS "${${pastix_lib}_lib_path}") + endif() + mark_as_advanced(PASTIX_${pastix_lib}_LIBRARY) + 
+endforeach(pastix_lib ${PASTIX_libs_to_find}) + +# check a function to validate the find +if(PASTIX_LIBRARIES) + + set(REQUIRED_LDFLAGS) + set(REQUIRED_INCDIRS) + set(REQUIRED_LIBDIRS) + set(REQUIRED_LIBS) + + # PASTIX + if (PASTIX_INCLUDE_DIRS) + set(REQUIRED_INCDIRS "${PASTIX_INCLUDE_DIRS}") + endif() + foreach(libdir ${PASTIX_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + set(REQUIRED_LIBS "${PASTIX_LIBRARIES}") + # STARPU + if (PASTIX_LOOK_FOR_STARPU AND STARPU_FOUND) + if (STARPU_INCLUDE_DIRS_DEP) + list(APPEND REQUIRED_INCDIRS "${STARPU_INCLUDE_DIRS_DEP}") + elseif (STARPU_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${STARPU_INCLUDE_DIRS}") + endif() + if(STARPU_LIBRARY_DIRS_DEP) + list(APPEND REQUIRED_LIBDIRS "${STARPU_LIBRARY_DIRS_DEP}") + elseif(STARPU_LIBRARY_DIRS) + list(APPEND REQUIRED_LIBDIRS "${STARPU_LIBRARY_DIRS}") + endif() + if (STARPU_LIBRARIES_DEP) + list(APPEND REQUIRED_LIBS "${STARPU_LIBRARIES_DEP}") + elseif (STARPU_LIBRARIES) + foreach(lib ${STARPU_LIBRARIES}) + if (EXISTS ${lib} OR ${lib} MATCHES "^-") + list(APPEND REQUIRED_LIBS "${lib}") + else() + list(APPEND REQUIRED_LIBS "-l${lib}") + endif() + endforeach() + endif() + endif() + # CUDA + if (PASTIX_LOOK_FOR_STARPU_CUDA AND CUDA_FOUND) + if (CUDA_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${CUDA_INCLUDE_DIRS}") + endif() + foreach(libdir ${CUDA_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + list(APPEND REQUIRED_LIBS "${CUDA_CUBLAS_LIBRARIES};${CUDA_LIBRARIES}") + endif() + # MPI + if (PASTIX_LOOK_FOR_MPI AND MPI_FOUND) + if (MPI_C_INCLUDE_PATH) + list(APPEND REQUIRED_INCDIRS "${MPI_C_INCLUDE_PATH}") + endif() + if (MPI_C_LINK_FLAGS) + if (${MPI_C_LINK_FLAGS} MATCHES " -") + string(REGEX REPLACE " -" "-" MPI_C_LINK_FLAGS ${MPI_C_LINK_FLAGS}) + endif() + list(APPEND REQUIRED_LDFLAGS "${MPI_C_LINK_FLAGS}") + endif() + list(APPEND REQUIRED_LIBS "${MPI_C_LIBRARIES}") + endif() + # HWLOC + if (HWLOC_FOUND) + if (HWLOC_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${HWLOC_INCLUDE_DIRS}") + endif() + foreach(libdir ${HWLOC_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + foreach(lib ${HWLOC_LIBRARIES}) + if (EXISTS ${lib} OR ${lib} MATCHES "^-") + list(APPEND REQUIRED_LIBS "${lib}") + else() + list(APPEND REQUIRED_LIBS "-l${lib}") + endif() + endforeach() + endif() + # BLAS + if (BLAS_FOUND) + if (BLAS_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${BLAS_INCLUDE_DIRS}") + endif() + foreach(libdir ${BLAS_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + list(APPEND REQUIRED_LIBS "${BLAS_LIBRARIES}") + if (BLAS_LINKER_FLAGS) + list(APPEND REQUIRED_LDFLAGS "${BLAS_LINKER_FLAGS}") + endif() + endif() + # SCOTCH + if (PASTIX_LOOK_FOR_SCOTCH AND SCOTCH_FOUND) + if (SCOTCH_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${SCOTCH_INCLUDE_DIRS}") + endif() + foreach(libdir ${SCOTCH_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + list(APPEND REQUIRED_LIBS "${SCOTCH_LIBRARIES}") + endif() + # PTSCOTCH + if (PASTIX_LOOK_FOR_PTSCOTCH AND PTSCOTCH_FOUND) + if (PTSCOTCH_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${PTSCOTCH_INCLUDE_DIRS}") + endif() + foreach(libdir ${PTSCOTCH_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + list(APPEND REQUIRED_LIBS "${PTSCOTCH_LIBRARIES}") + endif() + # METIS + if (PASTIX_LOOK_FOR_METIS 
AND METIS_FOUND) + if (METIS_INCLUDE_DIRS) + list(APPEND REQUIRED_INCDIRS "${METIS_INCLUDE_DIRS}") + endif() + foreach(libdir ${METIS_LIBRARY_DIRS}) + if (libdir) + list(APPEND REQUIRED_LIBDIRS "${libdir}") + endif() + endforeach() + list(APPEND REQUIRED_LIBS "${METIS_LIBRARIES}") + endif() + # Fortran + if (CMAKE_C_COMPILER_ID MATCHES "GNU") + find_library( + FORTRAN_gfortran_LIBRARY + NAMES gfortran + HINTS ${_lib_env} + ) + mark_as_advanced(FORTRAN_gfortran_LIBRARY) + if (FORTRAN_gfortran_LIBRARY) + list(APPEND REQUIRED_LIBS "${FORTRAN_gfortran_LIBRARY}") + endif() + elseif (CMAKE_C_COMPILER_ID MATCHES "Intel") + find_library( + FORTRAN_ifcore_LIBRARY + NAMES ifcore + HINTS ${_lib_env} + ) + mark_as_advanced(FORTRAN_ifcore_LIBRARY) + if (FORTRAN_ifcore_LIBRARY) + list(APPEND REQUIRED_LIBS "${FORTRAN_ifcore_LIBRARY}") + endif() + endif() + # EXTRA LIBS such that pthread, m, rt + list(APPEND REQUIRED_LIBS ${PASTIX_EXTRA_LIBRARIES}) + + # set required libraries for link + set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}") + set(CMAKE_REQUIRED_LIBRARIES) + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LDFLAGS}") + foreach(lib_dir ${REQUIRED_LIBDIRS}) + list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}") + endforeach() + list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}") + list(APPEND CMAKE_REQUIRED_FLAGS "${REQUIRED_FLAGS}") + string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}") + + # test link + unset(PASTIX_WORKS CACHE) + include(CheckFunctionExists) + check_function_exists(pastix PASTIX_WORKS) + mark_as_advanced(PASTIX_WORKS) + + if(PASTIX_WORKS) + # save link with dependencies + set(PASTIX_LIBRARIES_DEP "${REQUIRED_LIBS}") + set(PASTIX_LIBRARY_DIRS_DEP "${REQUIRED_LIBDIRS}") + set(PASTIX_INCLUDE_DIRS_DEP "${REQUIRED_INCDIRS}") + set(PASTIX_LINKER_FLAGS "${REQUIRED_LDFLAGS}") + list(REMOVE_DUPLICATES PASTIX_LIBRARY_DIRS_DEP) + list(REMOVE_DUPLICATES PASTIX_INCLUDE_DIRS_DEP) + list(REMOVE_DUPLICATES PASTIX_LINKER_FLAGS) + else() + if(NOT PASTIX_FIND_QUIETLY) + message(STATUS "Looking for PASTIX : test of pastix() fails") + message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}") + message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}") + message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails") + message(STATUS "Maybe PASTIX is linked with specific libraries. " + "Have you tried with COMPONENTS (MPI/SEQ, STARPU, STARPU_CUDA, SCOTCH, PTSCOTCH, METIS)? 
" + "See the explanation in FindPASTIX.cmake.") + endif() + endif() + set(CMAKE_REQUIRED_INCLUDES) + set(CMAKE_REQUIRED_FLAGS) + set(CMAKE_REQUIRED_LIBRARIES) +endif(PASTIX_LIBRARIES) + +if (PASTIX_LIBRARIES) + list(GET PASTIX_LIBRARIES 0 first_lib) + get_filename_component(first_lib_path "${first_lib}" PATH) + if (${first_lib_path} MATCHES "/lib(32|64)?$") + string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}") + set(PASTIX_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of PASTIX library" FORCE) + else() + set(PASTIX_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of PASTIX library" FORCE) + endif() +endif() +mark_as_advanced(PASTIX_DIR) +mark_as_advanced(PASTIX_DIR_FOUND) + +# check that PASTIX has been found +# --------------------------------- include(FindPackageHandleStandardArgs) find_package_handle_standard_args(PASTIX DEFAULT_MSG - PASTIX_INCLUDES PASTIX_LIBRARIES) - -mark_as_advanced(PASTIX_INCLUDES PASTIX_LIBRARIES) + PASTIX_LIBRARIES + PASTIX_WORKS) diff --git a/cmake/FindScotch.cmake b/cmake/FindScotch.cmake index 530340b16..89d295ac2 100644 --- a/cmake/FindScotch.cmake +++ b/cmake/FindScotch.cmake @@ -1,24 +1,369 @@ -# Pastix requires SCOTCH or METIS (partitioning and reordering tools) +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2014 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# - Find SCOTCH include dirs and libraries +# Use this module by invoking find_package with the form: +# find_package(SCOTCH +# [REQUIRED] # Fail with error if scotch is not found +# [COMPONENTS ...] # dependencies +# ) +# +# COMPONENTS can be some of the following: +# - ESMUMPS: to activate detection of Scotch with the esmumps interface +# +# This module finds headers and scotch library. +# Results are reported in variables: +# SCOTCH_FOUND - True if headers and requested libraries were found +# SCOTCH_INCLUDE_DIRS - scotch include directories +# SCOTCH_LIBRARY_DIRS - Link directories for scotch libraries +# SCOTCH_LIBRARIES - scotch component libraries to be linked +# SCOTCH_INTSIZE - Number of octets occupied by a SCOTCH_Num +# +# The user can give specific paths where to find the libraries adding cmake +# options at configure (ex: cmake path/to/project -DSCOTCH=path/to/scotch): +# SCOTCH_DIR - Where to find the base directory of scotch +# SCOTCH_INCDIR - Where to find the header files +# SCOTCH_LIBDIR - Where to find the library files +# The module can also look for the following environment variables if paths +# are not given as cmake variable: SCOTCH_DIR, SCOTCH_INCDIR, SCOTCH_LIBDIR -if (SCOTCH_INCLUDES AND SCOTCH_LIBRARIES) - set(SCOTCH_FIND_QUIETLY TRUE) -endif (SCOTCH_INCLUDES AND SCOTCH_LIBRARIES) +#============================================================================= +# Copyright 2012-2013 Inria +# Copyright 2012-2013 Emmanuel Agullo +# Copyright 2012-2013 Mathieu Faverge +# Copyright 2012 Cedric Castagnede +# Copyright 2013 Florent Pruvost +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file MORSE-Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. 
+#============================================================================= +# (To distribute this file outside of Morse, substitute the full +# License text for the above reference.) -find_path(SCOTCH_INCLUDES - NAMES - scotch.h - PATHS - $ENV{SCOTCHDIR} - ${INCLUDE_INSTALL_DIR} - PATH_SUFFIXES - scotch -) +if (NOT SCOTCH_FOUND) + set(SCOTCH_DIR "" CACHE PATH "Installation directory of SCOTCH library") + if (NOT SCOTCH_FIND_QUIETLY) + message(STATUS "A cache variable, namely SCOTCH_DIR, has been set to specify the install directory of SCOTCH") + endif() +endif() + +# Set the version to find +set(SCOTCH_LOOK_FOR_ESMUMPS OFF) + +if( SCOTCH_FIND_COMPONENTS ) + foreach( component ${SCOTCH_FIND_COMPONENTS} ) + if (${component} STREQUAL "ESMUMPS") + # means we look for esmumps library + set(SCOTCH_LOOK_FOR_ESMUMPS ON) + endif() + endforeach() +endif() + +# SCOTCH may depend on Threads, try to find it +if (NOT THREADS_FOUND) + if (SCOTCH_FIND_REQUIRED) + find_package(Threads REQUIRED) + else() + find_package(Threads) + endif() +endif() + +# Looking for include +# ------------------- + +# Add system include paths to search include +# ------------------------------------------ +unset(_inc_env) +set(ENV_SCOTCH_DIR "$ENV{SCOTCH_DIR}") +set(ENV_SCOTCH_INCDIR "$ENV{SCOTCH_INCDIR}") +if(ENV_SCOTCH_INCDIR) + list(APPEND _inc_env "${ENV_SCOTCH_INCDIR}") +elseif(ENV_SCOTCH_DIR) + list(APPEND _inc_env "${ENV_SCOTCH_DIR}") + list(APPEND _inc_env "${ENV_SCOTCH_DIR}/include") + list(APPEND _inc_env "${ENV_SCOTCH_DIR}/include/scotch") +else() + if(WIN32) + string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}") + else() + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{CPATH}") + list(APPEND _inc_env "${_path_env}") + string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}") + list(APPEND _inc_env "${_path_env}") + endif() +endif() +list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}") +list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}") +list(REMOVE_DUPLICATES _inc_env) -find_library(SCOTCH_LIBRARIES scotch PATHS $ENV{SCOTCHDIR} ${LIB_INSTALL_DIR}) +# Try to find the scotch header in the given paths +# ------------------------------------------------- +# call cmake macro to find the header path +if(SCOTCH_INCDIR) + set(SCOTCH_scotch.h_DIRS "SCOTCH_scotch.h_DIRS-NOTFOUND") + find_path(SCOTCH_scotch.h_DIRS + NAMES scotch.h + HINTS ${SCOTCH_INCDIR}) +else() + if(SCOTCH_DIR) + set(SCOTCH_scotch.h_DIRS "SCOTCH_scotch.h_DIRS-NOTFOUND") + find_path(SCOTCH_scotch.h_DIRS + NAMES scotch.h + HINTS ${SCOTCH_DIR} + PATH_SUFFIXES "include" "include/scotch") + else() + set(SCOTCH_scotch.h_DIRS "SCOTCH_scotch.h_DIRS-NOTFOUND") + find_path(SCOTCH_scotch.h_DIRS + NAMES scotch.h + HINTS ${_inc_env} + PATH_SUFFIXES "scotch") + endif() +endif() +mark_as_advanced(SCOTCH_scotch.h_DIRS) +# If found, add path to cmake variable +# ------------------------------------ +if (SCOTCH_scotch.h_DIRS) + set(SCOTCH_INCLUDE_DIRS "${SCOTCH_scotch.h_DIRS}") +else () + set(SCOTCH_INCLUDE_DIRS "SCOTCH_INCLUDE_DIRS-NOTFOUND") + if (NOT SCOTCH_FIND_QUIETLY) + message(STATUS "Looking for scotch -- scotch.h not found") + endif() +endif() +list(REMOVE_DUPLICATES SCOTCH_INCLUDE_DIRS) + +# Looking for lib +# --------------- + +# Add system library paths to search lib +# -------------------------------------- +unset(_lib_env) +set(ENV_SCOTCH_LIBDIR 
"$ENV{SCOTCH_LIBDIR}") +if(ENV_SCOTCH_LIBDIR) + list(APPEND _lib_env "${ENV_SCOTCH_LIBDIR}") +elseif(ENV_SCOTCH_DIR) + list(APPEND _lib_env "${ENV_SCOTCH_DIR}") + list(APPEND _lib_env "${ENV_SCOTCH_DIR}/lib") +else() + if(WIN32) + string(REPLACE ":" ";" _lib_env "$ENV{LIB}") + else() + if(APPLE) + string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}") + else() + string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}") + endif() + list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}") + list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + endif() +endif() +list(REMOVE_DUPLICATES _lib_env) + +# Try to find the scotch lib in the given paths +# ---------------------------------------------- + +set(SCOTCH_libs_to_find "scotch;scotcherrexit") +if (SCOTCH_LOOK_FOR_ESMUMPS) + list(INSERT SCOTCH_libs_to_find 0 "esmumps") +endif() + +# call cmake macro to find the lib path +if(SCOTCH_LIBDIR) + foreach(scotch_lib ${SCOTCH_libs_to_find}) + set(SCOTCH_${scotch_lib}_LIBRARY "SCOTCH_${scotch_lib}_LIBRARY-NOTFOUND") + find_library(SCOTCH_${scotch_lib}_LIBRARY + NAMES ${scotch_lib} + HINTS ${SCOTCH_LIBDIR}) + endforeach() +else() + if(SCOTCH_DIR) + foreach(scotch_lib ${SCOTCH_libs_to_find}) + set(SCOTCH_${scotch_lib}_LIBRARY "SCOTCH_${scotch_lib}_LIBRARY-NOTFOUND") + find_library(SCOTCH_${scotch_lib}_LIBRARY + NAMES ${scotch_lib} + HINTS ${SCOTCH_DIR} + PATH_SUFFIXES lib lib32 lib64) + endforeach() + else() + foreach(scotch_lib ${SCOTCH_libs_to_find}) + set(SCOTCH_${scotch_lib}_LIBRARY "SCOTCH_${scotch_lib}_LIBRARY-NOTFOUND") + find_library(SCOTCH_${scotch_lib}_LIBRARY + NAMES ${scotch_lib} + HINTS ${_lib_env}) + endforeach() + endif() +endif() + +set(SCOTCH_LIBRARIES "") +set(SCOTCH_LIBRARY_DIRS "") +# If found, add path to cmake variable +# ------------------------------------ +foreach(scotch_lib ${SCOTCH_libs_to_find}) + + if (SCOTCH_${scotch_lib}_LIBRARY) + get_filename_component(${scotch_lib}_lib_path "${SCOTCH_${scotch_lib}_LIBRARY}" PATH) + # set cmake variables + list(APPEND SCOTCH_LIBRARIES "${SCOTCH_${scotch_lib}_LIBRARY}") + list(APPEND SCOTCH_LIBRARY_DIRS "${${scotch_lib}_lib_path}") + else () + list(APPEND SCOTCH_LIBRARIES "${SCOTCH_${scotch_lib}_LIBRARY}") + if (NOT SCOTCH_FIND_QUIETLY) + message(STATUS "Looking for scotch -- lib ${scotch_lib} not found") + endif() + endif () + + mark_as_advanced(SCOTCH_${scotch_lib}_LIBRARY) + +endforeach() +list(REMOVE_DUPLICATES SCOTCH_LIBRARY_DIRS) + +# check a function to validate the find +if(SCOTCH_LIBRARIES) + + set(REQUIRED_INCDIRS) + set(REQUIRED_LIBDIRS) + set(REQUIRED_LIBS) + + # SCOTCH + if (SCOTCH_INCLUDE_DIRS) + set(REQUIRED_INCDIRS "${SCOTCH_INCLUDE_DIRS}") + endif() + if (SCOTCH_LIBRARY_DIRS) + set(REQUIRED_LIBDIRS "${SCOTCH_LIBRARY_DIRS}") + endif() + set(REQUIRED_LIBS "${SCOTCH_LIBRARIES}") + # THREADS + if(CMAKE_THREAD_LIBS_INIT) + list(APPEND REQUIRED_LIBS "${CMAKE_THREAD_LIBS_INIT}") + endif() + set(Z_LIBRARY "Z_LIBRARY-NOTFOUND") + find_library(Z_LIBRARY NAMES z) + mark_as_advanced(Z_LIBRARY) + if(Z_LIBRARY) + list(APPEND REQUIRED_LIBS "-lz") + endif() + set(M_LIBRARY "M_LIBRARY-NOTFOUND") + find_library(M_LIBRARY NAMES m) + mark_as_advanced(M_LIBRARY) + if(M_LIBRARY) + list(APPEND REQUIRED_LIBS "-lm") + endif() + set(RT_LIBRARY "RT_LIBRARY-NOTFOUND") + find_library(RT_LIBRARY NAMES rt) + mark_as_advanced(RT_LIBRARY) + if(RT_LIBRARY) + list(APPEND REQUIRED_LIBS "-lrt") + endif() + + # set required libraries for link + set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}") + set(CMAKE_REQUIRED_LIBRARIES) + 
foreach(lib_dir ${REQUIRED_LIBDIRS})
+    list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}")
+  endforeach()
+  list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}")
+  string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}")
+
+  # test link
+  unset(SCOTCH_WORKS CACHE)
+  include(CheckFunctionExists)
+  check_function_exists(SCOTCH_graphInit SCOTCH_WORKS)
+  mark_as_advanced(SCOTCH_WORKS)
+
+  if(SCOTCH_WORKS)
+    # save link with dependencies
+    set(SCOTCH_LIBRARIES "${REQUIRED_LIBS}")
+  else()
+    if(NOT SCOTCH_FIND_QUIETLY)
+      message(STATUS "Looking for SCOTCH : test of SCOTCH_graphInit with SCOTCH library fails")
+      message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}")
+      message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}")
+      message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails")
+    endif()
+  endif()
+  set(CMAKE_REQUIRED_INCLUDES)
+  set(CMAKE_REQUIRED_FLAGS)
+  set(CMAKE_REQUIRED_LIBRARIES)
+endif(SCOTCH_LIBRARIES)
+
+if (SCOTCH_LIBRARIES)
+  list(GET SCOTCH_LIBRARIES 0 first_lib)
+  get_filename_component(first_lib_path "${first_lib}" PATH)
+  if (${first_lib_path} MATCHES "/lib(32|64)?$")
+    string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}")
+    set(SCOTCH_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of SCOTCH library" FORCE)
+  else()
+    set(SCOTCH_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of SCOTCH library" FORCE)
+  endif()
+endif()
+mark_as_advanced(SCOTCH_DIR)
+mark_as_advanced(SCOTCH_DIR_FOUND)
+
+# Check the size of SCOTCH_Num
+# ---------------------------------
+set(CMAKE_REQUIRED_INCLUDES ${SCOTCH_INCLUDE_DIRS})
+
+include(CheckCSourceRuns)
+#stdio.h and stdint.h should be included by scotch.h directly
+set(SCOTCH_C_TEST_SCOTCH_Num_4 "
+#include <stdio.h>
+#include <stdint.h>
+#include <scotch.h>
+int main(int argc, char **argv) {
+  if (sizeof(SCOTCH_Num) == 4)
+    return 0;
+  else
+    return 1;
+}
+")
+
+set(SCOTCH_C_TEST_SCOTCH_Num_8 "
+#include <stdio.h>
+#include <stdint.h>
+#include <scotch.h>
+int main(int argc, char **argv) {
+  if (sizeof(SCOTCH_Num) == 8)
+    return 0;
+  else
+    return 1;
+}
+")
+check_c_source_runs("${SCOTCH_C_TEST_SCOTCH_Num_4}" SCOTCH_Num_4)
+if(NOT SCOTCH_Num_4)
+  check_c_source_runs("${SCOTCH_C_TEST_SCOTCH_Num_8}" SCOTCH_Num_8)
+  if(NOT SCOTCH_Num_8)
+    set(SCOTCH_INTSIZE -1)
+  else()
+    set(SCOTCH_INTSIZE 8)
+  endif()
+else()
+  set(SCOTCH_INTSIZE 4)
+endif()
+set(CMAKE_REQUIRED_INCLUDES "")
+
+# check that SCOTCH has been found
+# ---------------------------------
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(SCOTCH DEFAULT_MSG
-  SCOTCH_INCLUDES SCOTCH_LIBRARIES)
-
-mark_as_advanced(SCOTCH_INCLUDES SCOTCH_LIBRARIES)
+  SCOTCH_LIBRARIES
+  SCOTCH_WORKS)
+#
+# TODO: Add possibility to check for specific functions in the library
+#
diff --git a/cmake/FindTriSYCL.cmake b/cmake/FindTriSYCL.cmake
new file mode 100644
index 000000000..cb2154192
--- /dev/null
+++ b/cmake/FindTriSYCL.cmake
@@ -0,0 +1,152 @@
+#.rst:
+# FindTriSYCL
+#---------------
+#
+# TODO : insert Copyright and licence
+
+#########################
+#  FindTriSYCL.cmake
+#########################
+#
+#  Tools for finding and building with TriSYCL.
+#
+#  User must define TRISYCL_INCLUDE_DIR pointing to the triSYCL
+#  include directory.
+#
+# Latest version of this file can be found at:
+#     https://github.com/triSYCL/triSYCL
+
+# Require CMake version 3.5 or higher
+cmake_minimum_required (VERSION 3.5)
+
+# Check that a supported host compiler can be found
+if(CMAKE_COMPILER_IS_GNUCXX)
+  # Require at least gcc 5.4
+  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.4)
+    message(FATAL_ERROR
+      "host compiler - Not found! (gcc version must be at least 5.4)")
+  else()
+    message(STATUS "host compiler - gcc ${CMAKE_CXX_COMPILER_VERSION}")
+  endif()
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+  # Require at least clang 3.9
+  if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.9)
+    message(FATAL_ERROR
+      "host compiler - Not found! (clang version must be at least 3.9)")
+  else()
+    message(STATUS "host compiler - clang ${CMAKE_CXX_COMPILER_VERSION}")
+  endif()
+else()
+  message(WARNING
+    "host compiler - Not found! (triSYCL supports GCC and Clang)")
+endif()
+
+#triSYCL options
+option(TRISYCL_OPENMP "triSYCL multi-threading with OpenMP" ON)
+option(TRISYCL_OPENCL "triSYCL OpenCL interoperability mode" OFF)
+option(TRISYCL_NO_ASYNC "triSYCL use synchronous kernel execution" OFF)
+option(TRISYCL_DEBUG "triSYCL use debug mode" OFF)
+option(TRISYCL_DEBUG_STRUCTORS "triSYCL trace of object lifetimes" OFF)
+option(TRISYCL_TRACE_KERNEL "triSYCL trace of kernel execution" OFF)
+
+mark_as_advanced(TRISYCL_OPENMP)
+mark_as_advanced(TRISYCL_OPENCL)
+mark_as_advanced(TRISYCL_NO_ASYNC)
+mark_as_advanced(TRISYCL_DEBUG)
+mark_as_advanced(TRISYCL_DEBUG_STRUCTORS)
+mark_as_advanced(TRISYCL_TRACE_KERNEL)
+
+#triSYCL definitions
+set(CL_SYCL_LANGUAGE_VERSION 220 CACHE VERSION
+  "Host language version to be used by triSYCL (default is: 220)")
+set(TRISYCL_CL_LANGUAGE_VERSION 220 CACHE VERSION
+  "Device language version to be used by triSYCL (default is: 220)")
+#set(TRISYCL_COMPILE_OPTIONS "-std=c++1z -Wall -Wextra")
+set(CMAKE_CXX_STANDARD 14)
+set(CXX_STANDARD_REQUIRED ON)
+
+
+# Find OpenCL package
+if(TRISYCL_OPENCL)
+  find_package(OpenCL REQUIRED)
+  if(UNIX)
+    set(BOOST_COMPUTE_INCPATH /usr/include/compute CACHE PATH
+      "Path to Boost.Compute headers (default is: /usr/include/compute)")
+  endif(UNIX)
+endif()
+
+# Find OpenMP package
+if(TRISYCL_OPENMP)
+  find_package(OpenMP REQUIRED)
+endif()
+
+# Find Boost
+find_package(Boost 1.58 REQUIRED COMPONENTS chrono log)
+
+# If debug or trace we need boost log
+if(TRISYCL_DEBUG OR TRISYCL_DEBUG_STRUCTORS OR TRISYCL_TRACE_KERNEL)
+  set(LOG_NEEDED ON)
+else()
+  set(LOG_NEEDED OFF)
+endif()
+
+find_package(Threads REQUIRED)
+
+# Find triSYCL directory
+if(NOT TRISYCL_INCLUDE_DIR)
+  message(FATAL_ERROR
+    "triSYCL include directory - Not found! (please set TRISYCL_INCLUDE_DIR)")
+else()
+  message(STATUS "triSYCL include directory - Found ${TRISYCL_INCLUDE_DIR}")
+endif()
+
+#######################
+#  add_sycl_to_target
+#######################
+#
+#  Sets the proper flags and includes for the target compilation.
+#
+#  targetName : Name of the target to add a SYCL to.
+#  sourceFile : Source file to be compiled for SYCL.
+#  binaryDir : Intermediate directory to output the integration header.
+#
+function(add_sycl_to_target targetName sourceFile binaryDir)
+
+  # Add include directories to the "#include <>" paths
+  target_include_directories (${targetName} PUBLIC
+    ${TRISYCL_INCLUDE_DIR}
+    ${Boost_INCLUDE_DIRS}
+    $<$<BOOL:${TRISYCL_OPENCL}>:${OpenCL_INCLUDE_DIRS}>
+    $<$<BOOL:${TRISYCL_OPENCL}>:${BOOST_COMPUTE_INCPATH}>)
+
+
+  # Link dependencies
+  target_link_libraries(${targetName} PUBLIC
+    $<$<BOOL:${TRISYCL_OPENCL}>:${OpenCL_LIBRARIES}>
+    Threads::Threads
+    $<$<BOOL:${LOG_NEEDED}>:Boost::log>
+    Boost::chrono)
+
+
+  # Compile definitions
+  target_compile_definitions(${targetName} PUBLIC
+    $<$<BOOL:${TRISYCL_NO_ASYNC}>:TRISYCL_NO_ASYNC>
+    $<$<BOOL:${TRISYCL_OPENCL}>:TRISYCL_OPENCL>
+    $<$<BOOL:${TRISYCL_DEBUG}>:TRISYCL_DEBUG>
+    $<$<BOOL:${TRISYCL_DEBUG_STRUCTORS}>:TRISYCL_DEBUG_STRUCTORS>
+    $<$<BOOL:${TRISYCL_TRACE_KERNEL}>:TRISYCL_TRACE_KERNEL>
+    $<$<BOOL:${LOG_NEEDED}>:BOOST_LOG_DYN_LINK>)
+
+  # C++ and OpenMP requirements
+  target_compile_options(${targetName} PUBLIC
+    ${TRISYCL_COMPILE_OPTIONS}
+    $<$<BOOL:${TRISYCL_OPENMP}>:${OpenMP_CXX_FLAGS}>)
+
+  if(${TRISYCL_OPENMP} AND (NOT WIN32))
+    # Does not support generator expressions
+    set_target_properties(${targetName}
+      PROPERTIES
+      LINK_FLAGS ${OpenMP_CXX_FLAGS})
+  endif(${TRISYCL_OPENMP} AND (NOT WIN32))
+
+endfunction(add_sycl_to_target)
diff --git a/cmake/language_support.cmake b/cmake/language_support.cmake
index 2f14f30b8..ddba50945 100644
--- a/cmake/language_support.cmake
+++ b/cmake/language_support.cmake
@@ -26,7 +26,7 @@ function(workaround_9220 language language_works)
     cmake_minimum_required(VERSION 2.8.0)
     set (CMAKE_Fortran_FLAGS \"${CMAKE_Fortran_FLAGS}\")
     set (CMAKE_EXE_LINKER_FLAGS \"${CMAKE_EXE_LINKER_FLAGS}\")
-    enable_language(${language} OPTIONAL)
+    enable_language(${language})
   ")
   file(REMOVE_RECURSE ${CMAKE_BINARY_DIR}/language_tests/${language})
   file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/language_tests/${language})
diff --git a/debug/msvc/eigen_autoexp_part.dat b/debug/msvc/eigen_autoexp_part.dat
index 07aa43739..35ef5807c 100644
--- a/debug/msvc/eigen_autoexp_part.dat
+++ b/debug/msvc/eigen_autoexp_part.dat
@@ -14,7 +14,7 @@
 ; * - Eigen::Matrix<*,-1,+,*,*,*>
 ; * - Eigen::Matrix<*,+,+,*,*,*>
 ; *
-; * Matrices are displayed properly independantly of the memory
+; * Matrices are displayed properly independently of the memory
 ; * alignment (RowMajor vs. ColMajor).
 ; *
 ; * This file is distributed WITHOUT ANY WARRANTY. Please ensure
diff --git a/doc/AsciiQuickReference.txt b/doc/AsciiQuickReference.txt
index 8409f8850..0ca54cef3 100644
--- a/doc/AsciiQuickReference.txt
+++ b/doc/AsciiQuickReference.txt
@@ -140,7 +140,7 @@ R.array().abs()          // abs(P)
 R.cwiseAbs2()            // abs(P.^2)
 R.array().abs2()         // abs(P.^2)
 (R.array() < s).select(P,Q );  // (R < s ? P : Q)
-R = (Q.array()==0).select(P,A) // R(Q==0) = P(Q==0)
+R = (Q.array()==0).select(P,R) // R(Q==0) = P(Q==0)
 R = P.unaryExpr(ptr_fun(func)) // R = arrayfun(func, P)   // with: scalar func(const scalar &x);
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index 2109978fe..49b9fba39 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -1764,7 +1764,7 @@ UML_LOOK                = YES
 # the class node. If there are many fields or methods and many nodes the
 # graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS
 # threshold limits the number of items for each type to make the size more
-# managable. Set this to 0 for no limit. Note that the threshold may be
+# manageable. Set this to 0 for no limit. Note that the threshold may be
 # exceeded by 50% before the limit is enforced.
UML_LIMIT_NUM_FIELDS = 10 diff --git a/doc/FunctionsTakingEigenTypes.dox b/doc/FunctionsTakingEigenTypes.dox index 152dda47d..6b4e49214 100644 --- a/doc/FunctionsTakingEigenTypes.dox +++ b/doc/FunctionsTakingEigenTypes.dox @@ -79,7 +79,7 @@ These examples are just intended to give the reader a first impression of how fu \section TopicUsingRefClass How to write generic, but non-templated function? -In all the previous examples, the functions had to be template functions. This approach allows to write very generic code, but it is often desirable to write non templated function and still keep some level of genericity to avoid stupid copies of the arguments. The typical example is to write functions accepting both a MatrixXf or a block of a MatrixXf. This exactly the purpose of the Ref class. Here is a simple example: +In all the previous examples, the functions had to be template functions. This approach allows to write very generic code, but it is often desirable to write non templated functions and still keep some level of genericity to avoid stupid copies of the arguments. The typical example is to write functions accepting both a MatrixXf or a block of a MatrixXf. This is exactly the purpose of the Ref class. Here is a simple example: @@ -133,7 +133,7 @@ In this special case, the example is fine and will be working because both param \section TopicPlainFunctionsFailing In which cases do functions taking a plain Matrix or Array argument fail? -Here, we consider a slightly modified version of the function given above. This time, we do not want to return the result but pass an additional non-const paramter which allows us to store the result. A first naive implementation might look as follows. +Here, we consider a slightly modified version of the function given above. This time, we do not want to return the result but pass an additional non-const parameter which allows us to store the result. A first naive implementation might look as follows. \code // Note: This code is flawed! void cov(const MatrixXf& x, const MatrixXf& y, MatrixXf& C) @@ -176,7 +176,7 @@ The implementation above does now not only work with temporary expressions but i \section TopicResizingInGenericImplementations How to resize matrices in generic implementations? -One might think we are done now, right? This is not completely true because in order for our covariance function to be generically applicable, we want the follwing code to work +One might think we are done now, right? This is not completely true because in order for our covariance function to be generically applicable, we want the following code to work \code MatrixXf x = MatrixXf::Random(100,3); MatrixXf y = MatrixXf::Random(100,3); diff --git a/doc/LeastSquares.dox b/doc/LeastSquares.dox index e2191a22f..24dfe4b4f 100644 --- a/doc/LeastSquares.dox +++ b/doc/LeastSquares.dox @@ -16,7 +16,7 @@ equations is the fastest but least accurate, and the QR decomposition is in betw \section LeastSquaresSVD Using the SVD decomposition -The \link JacobiSVD::solve() solve() \endlink method in the JacobiSVD class can be directly used to +The \link BDCSVD::solve() solve() \endlink method in the BDCSVD class can be directly used to solve linear squares systems. 
It is not enough to compute only the singular values (the default for this class); you also need the singular vectors, but the thin SVD decomposition suffices for computing least squares solutions:
diff --git a/doc/Pitfalls.dox b/doc/Pitfalls.dox
index cf42effef..3f395053d 100644
--- a/doc/Pitfalls.dox
+++ b/doc/Pitfalls.dox
@@ -2,10 +2,16 @@ namespace Eigen {
 
 /** \page TopicPitfalls Common pitfalls
 
+
 \section TopicPitfalls_template_keyword Compilation error with template methods
 
 See this \link TopicTemplateKeyword page \endlink.
 
+\section TopicPitfalls_aliasing Aliasing
+
+Don't miss this \link TopicAliasing page \endlink on aliasing,
+especially if you got wrong results in statements where the destination appears on the right hand side of the expression.
+
 \section TopicPitfalls_auto_keyword C++11 and the auto keyword
 
 In short: do not use the auto keyword with Eigen's expressions, unless you are 100% sure about what you are doing. In particular, do not use the auto keyword as a replacement for a Matrix<> type. Here is an example:
diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox
index f01b39aec..d801798c9 100644
--- a/doc/PreprocessorDirectives.dox
+++ b/doc/PreprocessorDirectives.dox
@@ -51,7 +51,7 @@ are doing.
 
 \section TopicPreprocessorDirectivesCppVersion C++ standard features
 
-By default, %Eigen strive to automatically detect and enable langage features at compile-time based on
+By default, %Eigen strives to automatically detect and enable language features at compile-time based on
 the information provided by the compiler.
 
  - \b EIGEN_MAX_CPP_VER - disables usage of C++ features requiring a version greater than EIGEN_MAX_CPP_VER.
@@ -66,7 +66,7 @@ functions by defining EIGEN_HAS_C99_MATH=1.
   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
 - \b EIGEN_HAS_CXX11_MATH - controls the implementation of some functions such as round, logp1, isinf, isnan, etc.
   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
- - \b EIGEN_HAS_RVALUE_REFERENCES - defines whetehr rvalue references are supported
+ - \b EIGEN_HAS_RVALUE_REFERENCES - defines whether rvalue references are supported
   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
 - \b EIGEN_HAS_STD_RESULT_OF - defines whether std::result_of is supported
   Automatic detection disabled if EIGEN_MAX_CPP_VER<11.
@@ -120,6 +120,12 @@ run time. However, these assertions do cost time and can thus be turned off.
 - \b \c EIGEN_STACK_ALLOCATION_LIMIT - defines the maximum bytes for a buffer to be allocated on the stack. For internal
   temporary buffers, dynamic memory allocation is employed as a fall back. For fixed-size matrices or arrays, exceeding
   this threshold raises a compile time assertion. Use 0 to set no limit. Default is 128 KB.
+ - \b \c EIGEN_NO_CUDA - disables CUDA support when defined. Might be useful in .cu files for which Eigen is used on the host only,
+   and never called from device code.
+ - \b \c EIGEN_STRONG_INLINE - This macro is used to qualify critical functions and methods that we expect the compiler to inline.
+   By default it is defined to \c __forceinline for MSVC and ICC, and to \c inline for other compilers. A typical usage is to
+   define it to \c inline for MSVC users wanting faster compilation times, at the risk of performance degradations in some rare
+   cases for which the MSVC inliner fails to do a good job.
 - \c EIGEN_DONT_ALIGN - Deprecated, it is a synonym for \c EIGEN_MAX_ALIGN_BYTES=0. It disables alignment completely.
%Eigen will not try to align its objects and does not expect that any objects passed to it are aligned. This will turn off vectorization if \b EIGEN_UNALIGNED_VECTORIZE=1. Not defined by default. diff --git a/doc/QuickReference.dox b/doc/QuickReference.dox index 44f5410db..18c90a2a9 100644 --- a/doc/QuickReference.dox +++ b/doc/QuickReference.dox @@ -68,7 +68,7 @@ Array <=> Array4f Conversion between the matrix and array worlds: \code -Array44f a1, a1; +Array44f a1, a2; Matrix4f m1, m2; m1 = a1 * a2; // coeffwise product, implicit conversion from array to matrix. a1 = m1 * m2; // matrix product, implicit conversion from matrix to array. @@ -261,6 +261,8 @@ x.setIdentity(); Vector3f::UnitX() // 1 0 0 Vector3f::UnitY() // 0 1 0 Vector3f::UnitZ() // 0 0 1 +Vector4f::Unit(i) +x.setUnit(i); \endcode
Example: | Output:
@@ -278,6 +280,7 @@ N/A VectorXf::Unit(size,i) +x.setUnit(size,i); VectorXf::Unit(4,1) == Vector4f(0,1,0,0) == Vector4f::UnitY() \endcode @@ -285,7 +288,12 @@ VectorXf::Unit(4,1) == Vector4f(0,1,0,0)
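A minimal illustration of the two flavors documented above (a hedged sketch; it assumes an Eigen version that provides the new \c setUnit members):

\code
#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::Vector4f a;
  a.setUnit(1);                         // in-place: a = (0, 1, 0, 0)
  std::cout << (a == Eigen::Vector4f::Unit(1)) << "\n";   // prints 1

  Eigen::VectorXf b;
  b.setUnit(6, 2);                      // resizes b to size 6 and sets it to the canonical basis vector e_2
  std::cout << b.transpose() << "\n";   // 0 0 1 0 0 0
}
\endcode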
- +Note that it is allowed to call any of the \c set* functions to a dynamic-sized vector or matrix without passing new sizes. +For instance: +\code +MatrixXi M(3,3); +M.setIdentity(); +\endcode \subsection QuickRef_Map Mapping external arrays diff --git a/doc/QuickStartGuide.dox b/doc/QuickStartGuide.dox index ea32c3b3d..23bb2981b 100644 --- a/doc/QuickStartGuide.dox +++ b/doc/QuickStartGuide.dox @@ -68,7 +68,7 @@ The output is as follows: The second example starts by declaring a 3-by-3 matrix \c m which is initialized using the \link DenseBase::Random(Index,Index) Random() \endlink method with random values between -1 and 1. The next line applies a linear mapping such that the values are between 10 and 110. The function call \link DenseBase::Constant(Index,Index,const Scalar&) MatrixXd::Constant\endlink(3,3,1.2) returns a 3-by-3 matrix expression having all coefficients equal to 1.2. The rest is standard arithmetics. -The next line of the \c main function introduces a new type: \c VectorXd. This represents a (column) vector of arbitrary size. Here, the vector \c v is created to contain \c 3 coefficients which are left unitialized. The one but last line uses the so-called comma-initializer, explained in \ref TutorialAdvancedInitialization, to set all coefficients of the vector \c v to be as follows: +The next line of the \c main function introduces a new type: \c VectorXd. This represents a (column) vector of arbitrary size. Here, the vector \c v is created to contain \c 3 coefficients which are left uninitialized. The one but last line uses the so-called comma-initializer, explained in \ref TutorialAdvancedInitialization, to set all coefficients of the vector \c v to be as follows: \f[ v = diff --git a/doc/SparseLinearSystems.dox b/doc/SparseLinearSystems.dox index fc33b93e7..38754e4af 100644 --- a/doc/SparseLinearSystems.dox +++ b/doc/SparseLinearSystems.dox @@ -70,6 +70,9 @@ They are summarized in the following tables: UmfPackLU\link UmfPackSupport_Module UmfPackSupport \endlinkDirect LU factorizationSquareFill-in reducing, Leverage fast dense algebra Requires the SuiteSparse package, \b GPL +KLU\link KLUSupport_Module KLUSupport \endlinkDirect LU factorizationSquareFill-in reducing, suitted for circuit simulation + Requires the SuiteSparse package, \b GPL + SuperLU\link SuperLUSupport_Module SuperLUSupport \endlinkDirect LU factorizationSquareFill-in reducing, Leverage fast dense algebra Requires the SuperLU library, (BSD-like) diff --git a/doc/SparseQuickReference.dox b/doc/SparseQuickReference.dox index a25622e80..81a73eec2 100644 --- a/doc/SparseQuickReference.dox +++ b/doc/SparseQuickReference.dox @@ -80,7 +80,7 @@ sm1.setZero(); \section SparseBasicInfos Matrix properties -Beyond the basic functions rows() and cols(), there are some useful functions that are available to easily get some informations from the matrix. +Beyond the basic functions rows() and cols(), there are some useful functions that are available to easily get some information from the matrix. + + + + + + + + + + + + @@ -248,7 +260,7 @@ To get an overview of the true relative speed of the different decomposition, ch
Blocking
Means the algorithm can work per block, hence guaranteeing a good scaling of the performance for large matrices.
Implicit Multi Threading (MT)
-
Means the algorithm can take advantage of multicore processors via OpenMP. "Implicit" means the algortihm itself is not parallelized, but that it relies on parallelized matrix-matrix product rountines.
+
Means the algorithm can take advantage of multicore processors via OpenMP. "Implicit" means the algorithm itself is not parallelized, but that it relies on parallelized matrix-matrix product routines.
Explicit Multi Threading (MT)
Means the algorithm is explicitly parallelized to take advantage of multicore processors via OpenMP.
Meta-unroller
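The "Implicit Multi Threading (MT)" entry above refers to Eigen's OpenMP-backed matrix-matrix product routines, which can be throttled at run time. A minimal sketch, assuming Eigen was compiled with OpenMP enabled (e.g. \c -fopenmp):

\code
#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::initParallel();    // recommended before using Eigen from several threads
  Eigen::setNbThreads(4);   // cap the number of threads used by Eigen's parallel kernels
  std::cout << "Eigen uses " << Eigen::nbThreads() << " threads\n";

  Eigen::MatrixXd A = Eigen::MatrixXd::Random(1024, 1024);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(1024, 1024);
  Eigen::MatrixXd C = A * B;  // the matrix-matrix product kernel is the main beneficiary of implicit MT
  std::cout << C.norm() << std::endl;
}
\endcode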
diff --git a/doc/TopicMultithreading.dox b/doc/TopicMultithreading.dox index 47c9b261f..bc394f484 100644 --- a/doc/TopicMultithreading.dox +++ b/doc/TopicMultithreading.dox @@ -47,7 +47,7 @@ int main(int argc, char** argv) \warning note that all functions generating random matrices are \b not re-entrant nor thread-safe. Those include DenseBase::Random(), and DenseBase::setRandom() despite a call to Eigen::initParallel(). This is because these functions are based on std::rand which is not re-entrant. For thread-safe random generator, we recommend the use of boost::random or c++11 random feature. -In the case your application is parallelized with OpenMP, you might want to disable Eigen's own parallization as detailed in the previous section. +In the case your application is parallelized with OpenMP, you might want to disable Eigen's own parallelization as detailed in the previous section. */ diff --git a/doc/TutorialGeometry.dox b/doc/TutorialGeometry.dox index 2e1420f98..86e9f1e72 100644 --- a/doc/TutorialGeometry.dox +++ b/doc/TutorialGeometry.dox @@ -111,7 +111,7 @@ rot3 = rot1.slerp(alpha,rot2);\endcodetop\section TutorialGeoTransform Affine transformations -Generic affine transformations are represented by the Transform class which internaly +Generic affine transformations are represented by the Transform class which internally is a (Dim+1)^2 matrix. In Eigen we have chosen to not distinghish between points and vectors such that all points are actually represented by displacement vectors from the origin ( \f$ \mathbf{p} \equiv \mathbf{p}-0 \f$ ). With that in mind, real points and diff --git a/doc/TutorialLinearAlgebra.dox b/doc/TutorialLinearAlgebra.dox index cb92ceeae..a72724143 100644 --- a/doc/TutorialLinearAlgebra.dox +++ b/doc/TutorialLinearAlgebra.dox @@ -73,7 +73,7 @@ depending on your matrix and the trade-off you want to make: - + @@ -85,6 +85,14 @@ depending on your matrix and the trade-off you want to make: + + + + + + + + @@ -101,15 +109,24 @@ depending on your matrix and the trade-off you want to make: + + + + + + + + - +
\code diff --git a/doc/TemplateKeyword.dox b/doc/TemplateKeyword.dox index b84cfdae9..fbf2c7081 100644 --- a/doc/TemplateKeyword.dox +++ b/doc/TemplateKeyword.dox @@ -76,7 +76,7 @@ point where the template is defined, without knowing the actual value of the tem and \c Derived2 in the example). That means that the compiler cannot know that dst.triangularView is a member template and that the following < symbol is part of the delimiter for the template parameter. Another possibility would be that dst.triangularView is a member variable with the < -symbol refering to the operator<() function. In fact, the compiler should choose the second +symbol referring to the operator<() function. In fact, the compiler should choose the second possibility, according to the standard. If dst.triangularView is a member template (as in our case), the programmer should specify this explicitly with the \c template keyword and write dst.template triangularView. diff --git a/doc/TopicLazyEvaluation.dox b/doc/TopicLazyEvaluation.dox index 101ef8c72..b7820e3e6 100644 --- a/doc/TopicLazyEvaluation.dox +++ b/doc/TopicLazyEvaluation.dox @@ -58,7 +58,7 @@ the product matrix3 * matrix4 gets evaluated immediately into a tempora \code matrix1 = matrix2 * (matrix3 + matrix4); \endcode -Here, provided the matrices have at least 2 rows and 2 columns, each coefficienct of the expression matrix3 + matrix4 is going to be used several times in the matrix product. Instead of computing the sum everytime, it is much better to compute it once and store it in a temporary variable. Eigen understands this and evaluates matrix3 + matrix4 into a temporary variable before evaluating the product. +Here, provided the matrices have at least 2 rows and 2 columns, each coefficienct of the expression matrix3 + matrix4 is going to be used several times in the matrix product. Instead of computing the sum every time, it is much better to compute it once and store it in a temporary variable. Eigen understands this and evaluates matrix3 + matrix4 into a temporary variable before evaluating the product. */ diff --git a/doc/TopicLinearAlgebraDecompositions.dox b/doc/TopicLinearAlgebraDecompositions.dox index 491470627..0965da872 100644 --- a/doc/TopicLinearAlgebraDecompositions.dox +++ b/doc/TopicLinearAlgebraDecompositions.dox @@ -4,7 +4,7 @@ namespace Eigen { This page presents a catalogue of the dense matrix decompositions offered by Eigen. For an introduction on linear solvers and decompositions, check this \link TutorialLinearAlgebra page \endlink. -To get an overview of the true relative speed of the different decomposition, check this \link DenseDecompositionBenchmark benchmark \endlink. +To get an overview of the true relative speed of the different decompositions, check this \link DenseDecompositionBenchmark benchmark \endlink. \section TopicLinAlgBigTable Catalogue of decompositions offered by Eigen @@ -113,6 +113,18 @@ To get an overview of the true relative speed of the different decomposition, ch
\n Singular values and eigenvalues decompositions
+BDCSVD (divide \& conquer) | - | One of the fastest SVD algorithms | Excellent | Yes | Singular values/vectors, least squares | Yes (and does least squares) | Excellent | Blocked bidiagonalization
JacobiSVD (two-sided) | -
ColPivHouseholderQR | colPivHouseholderQr() | None | + | - | +++
+CompleteOrthogonalDecomposition | completeOrthogonalDecomposition() | None | + | - | +++
LLT | llt() | Positive definite | +++ | +++ | +
+BDCSVD | bdcSvd() | None | - | - | +++
JacobiSVD | jacobiSvd() | None | - | - - - | +++
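To make the ratings above concrete, here is a hedged sketch contrasting the newly documented BDCSVD solver with a QR-based alternative on a least-squares problem (both calls are standard Eigen API):

\code
#include <Eigen/Dense>
#include <iostream>

int main()
{
  // Overdetermined system: 100 equations, 3 unknowns.
  Eigen::MatrixXf A = Eigen::MatrixXf::Random(100, 3);
  Eigen::VectorXf b = Eigen::VectorXf::Random(100);

  // BDCSVD: rated slow on small problems but maximally accurate.
  Eigen::VectorXf x = A.bdcSvd(Eigen::ComputeThinU | Eigen::ComputeThinV).solve(b);

  // ColPivHouseholderQR: faster, still rank-revealing.
  Eigen::VectorXf y = A.colPivHouseholderQr().solve(b);

  std::cout << (x - y).norm() << std::endl;  // the two least-squares solutions should agree closely
}
\endcode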
+To get an overview of the true relative speed of the different decompositions, check this \link DenseDecompositionBenchmark benchmark \endlink.
 
 All of these decompositions offer a solve() method that works as in the above example.
 
@@ -183,8 +200,11 @@ Here is an example:
 
 \section TutorialLinAlgLeastsquares Least squares solving
 
-The most accurate method to do least squares solving is with a SVD decomposition. Eigen provides one
-as the JacobiSVD class, and its solve() is doing least-squares solving.
+The most accurate method to do least squares solving is with an SVD decomposition.
+Eigen provides two implementations.
+The recommended one is the BDCSVD class, which scales well for large problems
+and automatically falls back to the JacobiSVD class for smaller problems.
+For both classes, the solve() method does least-squares solving.
 
 Here is an example:
diff --git a/doc/TutorialMapClass.dox b/doc/TutorialMapClass.dox
index f8fb0fd2f..caa2539d8 100644
--- a/doc/TutorialMapClass.dox
+++ b/doc/TutorialMapClass.dox
@@ -29,9 +29,9 @@ Map<const Vector4i> mi(pi);
 \endcode
 where \c pi is an \c int \c *. In this case the size does not have to be passed to the constructor, because it is already specified by the Matrix/Array type.
 
-Note that Map does not have a default constructor; you \em must pass a pointer to intialize the object. However, you can work around this requirement (see \ref TutorialMapPlacementNew).
+Note that Map does not have a default constructor; you \em must pass a pointer to initialize the object. However, you can work around this requirement (see \ref TutorialMapPlacementNew).
 
-Map is flexible enough to accomodate a variety of different data representations. There are two other (optional) template parameters:
+Map is flexible enough to accommodate a variety of different data representations. There are two other (optional) template parameters:
 \code
 Map mat(rows,cols); // default is column major 2: mat.reserve(VectorXi::Constant(cols,6));
diff --git a/doc/UnalignedArrayAssert.dox b/doc/UnalignedArrayAssert.dox
index 0f7022973..8676faa1b 100644
--- a/doc/UnalignedArrayAssert.dox
+++ b/doc/UnalignedArrayAssert.dox
@@ -117,8 +117,8 @@ It doesn't disable 16-byte alignment, because that would mean that vectorized an
 
 \section checkmycode How can I check my code is safe regarding alignment issues?
 
-Unfortunately, there is no possibility in C++ to detect any of the aformentioned shortcoming at compile time (though static analysers are becoming more and more powerful and could detect some of them).
-Even at runtime, all we can do is to catch invalid unaligned allocation and trigger the explicit assertion mentioned at the begining of this page.
+Unfortunately, there is no possibility in C++ to detect any of the aforementioned shortcomings at compile time (though static analysers are becoming more and more powerful and could detect some of them).
+Even at runtime, all we can do is to catch invalid unaligned allocation and trigger the explicit assertion mentioned at the beginning of this page.
 Therefore, if your program runs fine on a given system with some given compilation flags, then this does not guarantee that your code is safe. For instance, on most 64 bits systems buffers are aligned on 16 bytes boundary and so, if you do not enable AVX instruction set, then your code will run fine. On the other hand, the same code may assert if moving to a more exotic platform, or enabling AVX instructions that required 32 bytes alignment by default.
 
 The situation is not hopeless though.
Assuming your code is well covered by unit test, then you can check its alignment safety by linking it to a custom malloc library returning 8 bytes aligned buffers only. This way all alignment shortcomings should pop-up. To this end, you must also compile your program with \link TopicPreprocessorDirectivesPerformance EIGEN_MALLOC_ALREADY_ALIGNED=0 \endlink. diff --git a/doc/UsingIntelMKL.dox b/doc/UsingIntelMKL.dox index a1a3a18f2..fc35c3cf0 100644 --- a/doc/UsingIntelMKL.dox +++ b/doc/UsingIntelMKL.dox @@ -63,6 +63,12 @@ In addition you can choose which parts will be substituted by defining one or mu
\c EIGEN_USE_MKL_ALL Defines \c EIGEN_USE_BLAS, \c EIGEN_USE_LAPACKE, and \c EIGEN_USE_MKL_VML
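The substitution macros listed above are plain preprocessor switches that must be defined before any Eigen header is included. A minimal sketch, assuming MKL's headers and libraries are already on the include and link paths:

\code
// Hypothetical translation unit opting in to the MKL backends.
#define EIGEN_USE_MKL_ALL   // implies EIGEN_USE_BLAS, EIGEN_USE_LAPACKE and EIGEN_USE_MKL_VML
#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(512, 512);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(512, 512);
  Eigen::MatrixXd C = A * B;   // sufficiently large products are forwarded to MKL's dgemm
  std::cout << C(0, 0) << std::endl;
}
\endcode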
+The \c EIGEN_USE_BLAS and \c EIGEN_USE_LAPACKE* macros can be combined with \c EIGEN_USE_MKL to explicitly tell Eigen that the underlying BLAS/Lapack implementation is Intel MKL.
+The main effect is to enable the MKL direct call feature (\c MKL_DIRECT_CALL).
+This may help to increase the performance of some MKL BLAS (?GEMM, ?GEMV, ?TRSM, ?AXPY and ?DOT) and LAPACK (LU, Cholesky and QR) routines for very small matrices.
+MKL direct call can be disabled by defining \c EIGEN_MKL_NO_DIRECT_CALL.
+
+
 Note that the BLAS and LAPACKE backends can be enabled for any F77 compatible BLAS and LAPACK libraries. See this \link TopicUsingBlasLapack page \endlink for the details.
 
 Finally, the PARDISO sparse solver shipped with Intel MKL can be used through the \ref PardisoLU, \ref PardisoLLT and \ref PardisoLDLT classes of the \ref PardisoSupport_Module.
diff --git a/doc/UsingNVCC.dox b/doc/UsingNVCC.dox
index f8e755b79..36beb2ddd 100644
--- a/doc/UsingNVCC.dox
+++ b/doc/UsingNVCC.dox
@@ -3,18 +3,16 @@ namespace Eigen {
 
 /** \page TopicCUDA Using Eigen in CUDA kernels
 
-\b Disclaimer: this page is about an \b experimental feature in %Eigen.
-
-Staring from CUDA 5.0, the CUDA compiler, \c nvcc, is able to properly parse %Eigen's code (almost).
-A few adaptations of the %Eigen's code already allows to use some parts of %Eigen in your own CUDA kernels.
-To this end you need the devel branch of %Eigen, CUDA 5.0 or greater with GCC.
+Starting from CUDA 5.5 and Eigen 3.3, it is possible to use Eigen's matrices, vectors, and arrays of fixed size within CUDA kernels. This is especially useful when working on numerous but small problems. By default, when Eigen's headers are included within a .cu file compiled by nvcc, most of Eigen's functions and methods are prefixed by the \c __device__ \c __host__ keywords, making them callable from both host and device code.
+This support can be disabled by defining \c EIGEN_NO_CUDA before including any Eigen header.
+This might be useful to disable some warnings when a .cu file makes use of Eigen on the host side only.
+However, in both cases, the host's SIMD vectorization has to be disabled in .cu files.
+It is thus \b strongly \b recommended to properly move all costly host computation from your .cu files to regular .cpp files.
 
 Known issues:
 
  - \c nvcc with MS Visual Studio does not work (patch welcome)
 
- - \c nvcc with \c clang does not work (patch welcome)
-
 - \c nvcc 5.5 with gcc-4.7 (or greater) has issues with the standard \c \<limits\> header file.
To workaround this, you can add the following before including any other files: \code // workaround issue between gcc >= 4.7 and cuda 5.5 diff --git a/doc/eigen_navtree_hacks.js b/doc/eigen_navtree_hacks.js index bd7e02b38..39c59f73c 100644 --- a/doc/eigen_navtree_hacks.js +++ b/doc/eigen_navtree_hacks.js @@ -65,6 +65,10 @@ function getNode(o, po) function resizeHeight() { var toc = $("#nav-toc"); + var header = $("#header"); + var content = $("#doc-content"); + var navtree = $("#nav-path"); + var sidenav = $("#side-nav"); var tocHeight = toc.height(); // <- we added this line var headerHeight = header.height(); var footerHeight = footer.height(); diff --git a/doc/eigendoxy.css b/doc/eigendoxy.css index 6ce2b839b..427b128ba 100644 --- a/doc/eigendoxy.css +++ b/doc/eigendoxy.css @@ -93,7 +93,7 @@ table th.inter { border-color: #cccccc; } -/** class for exemple / output tables **/ +/** class for example / output tables **/ table.example { } @@ -219,3 +219,8 @@ h3.version { td.width20em p.endtd { width: 20em; } + +/* needed for huge screens */ +.ui-resizable-e { + background-repeat: repeat-y; +} \ No newline at end of file diff --git a/doc/eigendoxy_footer.html.in b/doc/eigendoxy_footer.html.in index 878244a19..9ac0596cb 100644 --- a/doc/eigendoxy_footer.html.in +++ b/doc/eigendoxy_footer.html.in @@ -5,14 +5,14 @@ $navpath + doxygen $doxygenversion diff --git a/doc/eigendoxy_header.html.in b/doc/eigendoxy_header.html.in index 0f3859f40..bb149f8f0 100644 --- a/doc/eigendoxy_header.html.in +++ b/doc/eigendoxy_header.html.in @@ -4,25 +4,23 @@ + $projectname: $title $title - - - + + + $treeview $search $mathjax - + - -
-
@@ -30,10 +28,10 @@ $mathjax - Logo + Logo - +
$projectname  $projectnumber
@@ -42,7 +40,7 @@ $mathjax - +
$projectbrief
diff --git a/doc/examples/Cwise_lgamma.cpp b/doc/examples/Cwise_lgamma.cpp index f1c4f503e..6bfaccbce 100644 --- a/doc/examples/Cwise_lgamma.cpp +++ b/doc/examples/Cwise_lgamma.cpp @@ -6,4 +6,4 @@ int main() { Array4d v(0.5,10,0,-1); std::cout << v.lgamma() << std::endl; -} \ No newline at end of file +} diff --git a/doc/examples/TutorialLinAlgSVDSolve.cpp b/doc/examples/TutorialLinAlgSVDSolve.cpp index 9fbc031de..f109f04e5 100644 --- a/doc/examples/TutorialLinAlgSVDSolve.cpp +++ b/doc/examples/TutorialLinAlgSVDSolve.cpp @@ -11,5 +11,5 @@ int main() VectorXf b = VectorXf::Random(3); cout << "Here is the right hand side b:\n" << b << endl; cout << "The least-squares solution is:\n" - << A.jacobiSvd(ComputeThinU | ComputeThinV).solve(b) << endl; + << A.bdcSvd(ComputeThinU | ComputeThinV).solve(b) << endl; } diff --git a/doc/examples/Tutorial_simple_example_dynamic_size.cpp b/doc/examples/Tutorial_simple_example_dynamic_size.cpp index 0f0280e0e..defcb1ee4 100644 --- a/doc/examples/Tutorial_simple_example_dynamic_size.cpp +++ b/doc/examples/Tutorial_simple_example_dynamic_size.cpp @@ -10,7 +10,7 @@ int main() MatrixXi m(size,size+1); // a (size)x(size+1)-matrix of int's for (int j=0; j CwiseNullaryOp, typename indexing_functor::MatrixType> -indexing(const Eigen::MatrixBase& arg, const RowIndexType& row_indices, const ColIndexType& col_indices) +mat_indexing(const Eigen::MatrixBase& arg, const RowIndexType& row_indices, const ColIndexType& col_indices) { typedef indexing_functor Func; typedef typename Func::MatrixType MatrixType; @@ -45,7 +45,7 @@ int main() Eigen::MatrixXi A = Eigen::MatrixXi::Random(4,4); Array3i ri(1,2,1); ArrayXi ci(6); ci << 3,2,1,0,0,2; - Eigen::MatrixXi B = indexing(A, ri, ci); + Eigen::MatrixXi B = mat_indexing(A, ri, ci); std::cout << "A =" << std::endl; std::cout << A << std::endl << std::endl; std::cout << "A([" << ri.transpose() << "], [" << ci.transpose() << "]) =" << std::endl; @@ -53,11 +53,11 @@ int main() std::cout << "[main1]\n"; std::cout << "[main2]\n"; - B = indexing(A, ri+1, ci); + B = mat_indexing(A, ri+1, ci); std::cout << "A(ri+1,ci) =" << std::endl; std::cout << B << std::endl << std::endl; #if __cplusplus >= 201103L - B = indexing(A, ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3)); + B = mat_indexing(A, ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3)); std::cout << "A(ArrayXi::LinSpaced(13,0,12).unaryExpr([](int x){return x%4;}), ArrayXi::LinSpaced(4,0,3)) =" << std::endl; std::cout << B << std::endl << std::endl; #endif diff --git a/doc/snippets/DirectionWise_hnormalized.cpp b/doc/snippets/DirectionWise_hnormalized.cpp index 3410790a8..2451f6e7b 100644 --- a/doc/snippets/DirectionWise_hnormalized.cpp +++ b/doc/snippets/DirectionWise_hnormalized.cpp @@ -1,7 +1,6 @@ -typedef Matrix Matrix4Xd; Matrix4Xd M = Matrix4Xd::Random(4,5); Projective3d P(Matrix4d::Random()); cout << "The matrix M is:" << endl << M << endl << endl; cout << "M.colwise().hnormalized():" << endl << M.colwise().hnormalized() << endl << endl; cout << "P*M:" << endl << P*M << endl << endl; -cout << "(P*M).colwise().hnormalized():" << endl << (P*M).colwise().hnormalized() << endl << endl; \ No newline at end of file +cout << "(P*M).colwise().hnormalized():" << endl << (P*M).colwise().hnormalized() << endl << endl; diff --git a/doc/snippets/MatrixBase_cwiseEqual.cpp b/doc/snippets/MatrixBase_cwiseEqual.cpp index eb3656f4c..469af642c 100644 --- a/doc/snippets/MatrixBase_cwiseEqual.cpp +++ 
b/doc/snippets/MatrixBase_cwiseEqual.cpp
@@ -3,5 +3,5 @@ m << 1, 0, 1, 1;
 cout << "Comparing m with identity matrix:" << endl;
 cout << m.cwiseEqual(MatrixXi::Identity(2,2)) << endl;
-int count = m.cwiseEqual(MatrixXi::Identity(2,2)).count();
+Index count = m.cwiseEqual(MatrixXi::Identity(2,2)).count();
 cout << "Number of coefficients that are equal: " << count << endl;
diff --git a/doc/snippets/MatrixBase_cwiseNotEqual.cpp b/doc/snippets/MatrixBase_cwiseNotEqual.cpp
index 6a2e4fb6c..7f0a105d6 100644
--- a/doc/snippets/MatrixBase_cwiseNotEqual.cpp
+++ b/doc/snippets/MatrixBase_cwiseNotEqual.cpp
@@ -3,5 +3,5 @@ m << 1, 0, 1, 1;
 cout << "Comparing m with identity matrix:" << endl;
 cout << m.cwiseNotEqual(MatrixXi::Identity(2,2)) << endl;
-int count = m.cwiseNotEqual(MatrixXi::Identity(2,2)).count();
+Index count = m.cwiseNotEqual(MatrixXi::Identity(2,2)).count();
 cout << "Number of coefficients that are not equal: " << count << endl;
diff --git a/doc/snippets/MatrixBase_reshaped_all.cpp b/doc/snippets/MatrixBase_reshaped_all.cpp
index 501f6276f..a4841834f 100644
--- a/doc/snippets/MatrixBase_reshaped_all.cpp
+++ b/doc/snippets/MatrixBase_reshaped_all.cpp
@@ -1,4 +1,3 @@
-using Eigen::placeholders::all;
 Matrix4i m = Matrix4i::Random();
 cout << "Here is the matrix m:" << endl << m << endl;
 cout << "Here is m(all).transpose():" << endl << m(all).transpose() << endl;
diff --git a/doc/snippets/Matrix_Map_stride.cpp b/doc/snippets/Matrix_Map_stride.cpp
new file mode 100644
index 000000000..ae42a127a
--- /dev/null
+++ b/doc/snippets/Matrix_Map_stride.cpp
@@ -0,0 +1,7 @@
+Matrix4i A;
+A << 1,  2,  3,  4,
+     5,  6,  7,  8,
+     9, 10, 11, 12,
+    13, 14, 15, 16;
+
+std::cout << Matrix2i::Map(&A(1,1),Stride<8,2>()) << std::endl;
diff --git a/doc/snippets/VectorwiseOp_homogeneous.cpp b/doc/snippets/VectorwiseOp_homogeneous.cpp
index aba4fed0e..67cf5737d 100644
--- a/doc/snippets/VectorwiseOp_homogeneous.cpp
+++ b/doc/snippets/VectorwiseOp_homogeneous.cpp
@@ -1,7 +1,6 @@
-typedef Matrix<double,3,Dynamic> Matrix3Xd;
 Matrix3Xd M = Matrix3Xd::Random(3,5);
 Projective3d P(Matrix4d::Random());
 cout << "The matrix M is:" << endl << M << endl << endl;
 cout << "M.colwise().homogeneous():" << endl << M.colwise().homogeneous() << endl << endl;
 cout << "P * M.colwise().homogeneous():" << endl << P * M.colwise().homogeneous() << endl << endl;
-cout << "P * M.colwise().homogeneous().hnormalized(): " << endl << (P * M.colwise().homogeneous()).colwise().hnormalized() << endl << endl;
\ No newline at end of file
+cout << "P * M.colwise().homogeneous().hnormalized(): " << endl << (P * M.colwise().homogeneous()).colwise().hnormalized() << endl << endl;
diff --git a/doc/special_examples/Tutorial_sparse_example.cpp b/doc/special_examples/Tutorial_sparse_example.cpp
index 830e196ea..8850db052 100644
--- a/doc/special_examples/Tutorial_sparse_example.cpp
+++ b/doc/special_examples/Tutorial_sparse_example.cpp
@@ -1,5 +1,6 @@
 #include <Eigen/Sparse>
 #include <vector>
+#include <iostream>
 
 typedef Eigen::SparseMatrix<double> SpMat; // declares a column-major sparse matrix type of double
 typedef Eigen::Triplet<double> T;
@@ -9,10 +10,13 @@ void saveAsBitmap(const Eigen::VectorXd& x, int n, const char* filename);
 
 int main(int argc, char** argv)
 {
-  assert(argc==2);
+  if(argc!=2) {
+    std::cerr << "Error: expected one and only one argument.\n";
+    return -1;
+  }
 
   int n = 300;  // size of the image
-  int m = n*n;  // number of unknows (=number of pixels)
+  int m = n*n;  // number of unknowns (=number of pixels)
 
   // Assembly:
 std::vector<T> coefficients;            // list of non-zeros coefficients
diff --git a/lapack/CMakeLists.txt
b/lapack/CMakeLists.txt index 9883d4c72..522ba8a2b 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -35,7 +35,7 @@ set(EigenLapack_SRCS ${EigenLapack_SRCS} second_NONE.f dsecnd_NONE.f ) -option(EIGEN_ENABLE_LAPACK_TESTS OFF "Enbale the Lapack unit tests") +option(EIGEN_ENABLE_LAPACK_TESTS OFF "Enable the Lapack unit tests") if(EIGEN_ENABLE_LAPACK_TESTS) @@ -49,7 +49,7 @@ if(EIGEN_ENABLE_LAPACK_TESTS) INACTIVITY_TIMEOUT 15 TIMEOUT 240 STATUS download_status - EXPECTED_MD5 5758ce55afcf79da98de8b9de1615ad5 + EXPECTED_MD5 ab5742640617e3221a873aba44bbdc93 SHOW_PROGRESS) message(STATUS ${download_status}) @@ -59,7 +59,7 @@ if(EIGEN_ENABLE_LAPACK_TESTS) message(STATUS "Setup lapack reference and lapack unit tests") execute_process(COMMAND tar xzf "lapack_addons_3.4.1.tgz" WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) else() - message(STATUS "Download of lapack_addons_3.4.1.tgz failed, LAPACK unit tests wont be enabled") + message(STATUS "Download of lapack_addons_3.4.1.tgz failed, LAPACK unit tests won't be enabled") set(EIGEN_ENABLE_LAPACK_TESTS false) endif() diff --git a/scripts/buildtests.in b/scripts/buildtests.in index 526d5b74b..ab9c18fb1 100755 --- a/scripts/buildtests.in +++ b/scripts/buildtests.in @@ -10,7 +10,7 @@ then fi TESTSLIST="@EIGEN_TESTS_LIST@" -targets_to_make=`echo "$TESTSLIST" | egrep "$1" | xargs echo` +targets_to_make=$(echo "$TESTSLIST" | grep -E "$1" | xargs echo) if [ -n "${EIGEN_MAKE_ARGS:+x}" ] then diff --git a/scripts/eigen_gen_split_test_help.cmake b/scripts/eigen_gen_split_test_help.cmake new file mode 100644 index 000000000..e43f5aabe --- /dev/null +++ b/scripts/eigen_gen_split_test_help.cmake @@ -0,0 +1,11 @@ +#!cmake -P +file(WRITE split_test_helper.h "") +foreach(i RANGE 1 999) + file(APPEND split_test_helper.h + "#if defined(EIGEN_TEST_PART_${i}) || defined(EIGEN_TEST_PART_ALL)\n" + "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n" + "#else\n" + "#define CALL_SUBTEST_${i}(FUNC)\n" + "#endif\n\n" + ) +endforeach() \ No newline at end of file diff --git a/scripts/eigen_monitor_perf.sh b/scripts/eigen_monitor_perf.sh index 39f8e7ecd..8f3425daf 100755 --- a/scripts/eigen_monitor_perf.sh +++ b/scripts/eigen_monitor_perf.sh @@ -12,9 +12,9 @@ export CXX_FLAGS="-mfma -w" #### BENCH_PATH=$EIGEN_SOURCE_PATH/bench/perf_monitoring/$PREFIX -PREVPATH=`pwd` -cd $EIGEN_SOURCE_PATH/bench/perf_monitoring && ./runall.sh "Haswell 2.6GHz, FMA, Apple's clang" $* -cd $PREVPATH +PREVPATH=$(pwd) +cd $EIGEN_SOURCE_PATH/bench/perf_monitoring && ./runall.sh "Haswell 2.6GHz, FMA, Apple's clang" "$@" +cd $PREVPATH || exit 1 ALLFILES="$BENCH_PATH/*.png $BENCH_PATH/*.html $BENCH_PATH/index.html $BENCH_PATH/s1.js $BENCH_PATH/s2.js" diff --git a/test/AnnoyingScalar.h b/test/AnnoyingScalar.h new file mode 100644 index 000000000..2b6544a6a --- /dev/null +++ b/test/AnnoyingScalar.h @@ -0,0 +1,154 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2011-2018 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
diff --git a/test/AnnoyingScalar.h b/test/AnnoyingScalar.h
new file mode 100644
index 000000000..2b6544a6a
--- /dev/null
+++ b/test/AnnoyingScalar.h
@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011-2018 Gael Guennebaud
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_H
+#define EIGEN_TEST_ANNOYING_SCALAR_H
+
+#include <ostream>
+
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+struct my_exception
+{
+  my_exception() {}
+  ~my_exception() {}
+};
+#endif
+
+// An AnnoyingScalar is a pseudo scalar type that:
+// - can randomly throw an exception in operator +
+// - randomly allocates on the heap or initializes a reference to itself, making it neither trivially copyable, nor movable, nor relocatable
+
+class AnnoyingScalar
+{
+  public:
+    AnnoyingScalar()                { init(); *v = 0;  }
+    AnnoyingScalar(long double _v)  { init(); *v = _v; }
+    AnnoyingScalar(double _v)       { init(); *v = _v; }
+    AnnoyingScalar(float _v)        { init(); *v = _v; }
+    AnnoyingScalar(int _v)          { init(); *v = _v; }
+    AnnoyingScalar(long _v)         { init(); *v = _v; }
+    #if EIGEN_HAS_CXX11
+    AnnoyingScalar(long long _v)    { init(); *v = _v; }
+    #endif
+    AnnoyingScalar(const AnnoyingScalar& other) { init(); *v = *(other.v); }
+    ~AnnoyingScalar() {
+      if(v!=&data)
+        delete v;
+      instances--;
+    }
+
+    void init() {
+      if(internal::random<bool>())
+        v = new float;
+      else
+        v = &data;
+      instances++;
+    }
+
+    AnnoyingScalar operator+(const AnnoyingScalar& other) const
+    {
+      #ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+      countdown--;
+      if(countdown<=0 && !dont_throw)
+        throw my_exception();
+      #endif
+      return AnnoyingScalar(*v+*other.v);
+    }
+
+    AnnoyingScalar operator-() const
+    { return AnnoyingScalar(-*v); }
+
+    AnnoyingScalar operator-(const AnnoyingScalar& other) const
+    { return AnnoyingScalar(*v-*other.v); }
+
+    AnnoyingScalar operator*(const AnnoyingScalar& other) const
+    { return AnnoyingScalar((*v)*(*other.v)); }
+
+    AnnoyingScalar operator/(const AnnoyingScalar& other) const
+    { return AnnoyingScalar((*v)/(*other.v)); }
+
+    AnnoyingScalar& operator+=(const AnnoyingScalar& other) { *v += *other.v; return *this; }
+    AnnoyingScalar& operator-=(const AnnoyingScalar& other) { *v -= *other.v; return *this; }
+    AnnoyingScalar& operator*=(const AnnoyingScalar& other) { *v *= *other.v; return *this; }
+    AnnoyingScalar& operator/=(const AnnoyingScalar& other) { *v /= *other.v; return *this; }
+    AnnoyingScalar& operator= (const AnnoyingScalar& other) { *v = *other.v; return *this; }
+
+    bool operator==(const AnnoyingScalar& other) const { return *v == *other.v; }
+    bool operator!=(const AnnoyingScalar& other) const { return *v != *other.v; }
+    bool operator<=(const AnnoyingScalar& other) const { return *v <= *other.v; }
+    bool operator< (const AnnoyingScalar& other) const { return *v <  *other.v; }
+    bool operator>=(const AnnoyingScalar& other) const { return *v >= *other.v; }
+    bool operator> (const AnnoyingScalar& other) const { return *v >  *other.v; }
+
+    float* v;
+    float data;
+    static int instances;
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+    static int countdown;
+    static bool dont_throw;
+#endif
+};
+
+AnnoyingScalar real(const AnnoyingScalar &x) { return x; }
+AnnoyingScalar imag(const AnnoyingScalar & ) { return 0; }
+AnnoyingScalar conj(const AnnoyingScalar &x) { return x; }
+AnnoyingScalar sqrt(const AnnoyingScalar &x) { return std::sqrt(*x.v); }
+AnnoyingScalar abs (const AnnoyingScalar &x) { return std::abs(*x.v); }
+AnnoyingScalar cos (const AnnoyingScalar &x) { return std::cos(*x.v); }
+AnnoyingScalar sin (const AnnoyingScalar &x) { return std::sin(*x.v); }
+AnnoyingScalar acos(const AnnoyingScalar &x) { return std::acos(*x.v); }
+AnnoyingScalar atan2(const AnnoyingScalar &y,const AnnoyingScalar &x) { return std::atan2(*y.v,*x.v); }
+
+std::ostream& operator<<(std::ostream& stream, const AnnoyingScalar& x) {
+  stream << (*(x.v));
+  return stream;
+}
+
+int AnnoyingScalar::instances = 0;
+
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
+int AnnoyingScalar::countdown = 0;
+bool AnnoyingScalar::dont_throw = false;
+#endif
+
+namespace Eigen {
+template<>
+struct NumTraits<AnnoyingScalar> : NumTraits<float>
+{
+  enum {
+    RequireInitialization = true
+  };
+  typedef AnnoyingScalar Real;
+  typedef AnnoyingScalar Nested;
+  typedef AnnoyingScalar Literal;
+  typedef AnnoyingScalar NonInteger;
+};
+
+template<> inline AnnoyingScalar test_precision<AnnoyingScalar>() { return test_precision<float>(); }
+
+namespace internal {
+  template<> double cast<AnnoyingScalar,double>(const AnnoyingScalar& x) { return double(*x.v); }
+  template<> float  cast<AnnoyingScalar,float> (const AnnoyingScalar& x) { return *x.v; }
+}
+
+}
+
+AnnoyingScalar get_test_precision(const AnnoyingScalar&)
+{ return Eigen::test_precision<AnnoyingScalar>(); }
+
+AnnoyingScalar test_relative_error(const AnnoyingScalar &a, const AnnoyingScalar &b)
+{ return test_relative_error(*a.v, *b.v); }
+
+inline bool test_isApprox(const AnnoyingScalar &a, const AnnoyingScalar &b)
+{ return internal::isApprox(*a.v, *b.v, test_precision<float>()); }
+
+inline bool test_isMuchSmallerThan(const AnnoyingScalar &a, const AnnoyingScalar &b)
+{ return test_isMuchSmallerThan(*a.v, *b.v); }
+
+#endif // EIGEN_TEST_ANNOYING_SCALAR_H
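A quick sketch of how a test can build on this header (illustration only, with a hypothetical test name; it assumes the usual harness context where main.h provides VERIFY, the subtest macros, and a using-directive for the Eigen namespace):

    #include "main.h"
    #include "AnnoyingScalar.h"

    template<int>
    void annoying_scalar_demo()
    {
      AnnoyingScalar::dont_throw = true;     // keep operator+ from throwing here
      Eigen::Matrix<AnnoyingScalar,Eigen::Dynamic,1> v(10);
      v.setZero();
      VERIFY(AnnoyingScalar::instances > 0); // live scalars are being counted
    }

    EIGEN_DECLARE_TEST(annoying_scalar_demo)
    {
      CALL_SUBTEST_1( annoying_scalar_demo<0>() );
      VERIFY(AnnoyingScalar::instances == 0 && "leak detected");
    }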
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index ed5aed1c8..45e7abbd1 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,16 +1,7 @@
-# generate split test header file only if it does not yet exist
-# in order to prevent a rebuild everytime cmake is configured
-if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
-  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "")
-  foreach(i RANGE 1 999)
-    file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h
-      "#ifdef EIGEN_TEST_PART_${i}\n"
-      "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n"
-      "#else\n"
-      "#define CALL_SUBTEST_${i}(FUNC)\n"
-      "#endif\n\n"
-    )
-  endforeach()
+# The file split_test_helper.h was generated at the first run;
+# it is now included in test/
+if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
+  file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
 endif()
 
 # check if we have a Fortran compiler
@@ -27,9 +18,18 @@ endif()
 
 if(NOT EIGEN_Fortran_COMPILER_WORKS)
   # search for a default Lapack library to complete Eigen's one
-  find_package(LAPACK)
+  find_package(LAPACK QUIET)
 endif()
 
+# TODO do the same for EXTERNAL_LAPACK
+option(EIGEN_TEST_EXTERNAL_BLAS "Use external BLAS library for testsuite" OFF)
+if(EIGEN_TEST_EXTERNAL_BLAS)
+  find_package(BLAS REQUIRED)
+  message(STATUS "BLAS_COMPILER_FLAGS: ${BLAS_COMPILER_FLAGS}")
+  add_definitions("-DEIGEN_USE_BLAS") # is adding ${BLAS_COMPILER_FLAGS} necessary?
+  list(APPEND EXTERNAL_LIBS "${BLAS_LIBRARIES}")
+endif(EIGEN_TEST_EXTERNAL_BLAS)
+
 # configure blas/lapack (use Eigen's ones)
 set(EIGEN_BLAS_LIBRARIES eigen_blas)
 set(EIGEN_LAPACK_LIBRARIES eigen_lapack)
 
@@ -68,6 +68,17 @@ else()
   ei_add_property(EIGEN_MISSING_BACKENDS "UmfPack, ")
 endif()
 
+find_package(KLU)
+if(KLU_FOUND)
+  add_definitions("-DEIGEN_KLU_SUPPORT")
+  include_directories(${KLU_INCLUDES})
+  set(SPARSE_LIBS ${SPARSE_LIBS} ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
+  set(KLU_ALL_LIBS ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
+  ei_add_property(EIGEN_TESTED_BACKENDS "KLU, ")
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS "KLU, ")
+endif()
+
 find_package(SuperLU 4.0)
 if(SUPERLU_FOUND)
   add_definitions("-DEIGEN_SUPERLU_SUPPORT")
@@ -80,23 +91,30 @@ else()
 endif()
 
-find_package(Pastix)
-find_package(Scotch)
-find_package(Metis 5.0 REQUIRED)
-if(PASTIX_FOUND)
+find_package(PASTIX QUIET COMPONENTS METIS SEQ)
+# check that the PASTIX found is a version without MPI
+find_path(PASTIX_pastix_nompi.h_INCLUDE_DIRS
+  NAMES pastix_nompi.h
+  HINTS ${PASTIX_INCLUDE_DIRS}
+)
+if (NOT PASTIX_pastix_nompi.h_INCLUDE_DIRS)
+  message(STATUS "A version of Pastix has been found but pastix_nompi.h does not exist in the include directory."
+                 " Because Eigen tests require a version without MPI, we disable the Pastix backend.")
+endif()
+if(PASTIX_FOUND AND PASTIX_pastix_nompi.h_INCLUDE_DIRS)
   add_definitions("-DEIGEN_PASTIX_SUPPORT")
-  include_directories(${PASTIX_INCLUDES})
+  include_directories(${PASTIX_INCLUDE_DIRS_DEP})
   if(SCOTCH_FOUND)
-    include_directories(${SCOTCH_INCLUDES})
+    include_directories(${SCOTCH_INCLUDE_DIRS})
     set(PASTIX_LIBRARIES ${PASTIX_LIBRARIES} ${SCOTCH_LIBRARIES})
   elseif(METIS_FOUND)
-    include_directories(${METIS_INCLUDES})
+    include_directories(${METIS_INCLUDE_DIRS})
     set(PASTIX_LIBRARIES ${PASTIX_LIBRARIES} ${METIS_LIBRARIES})
   else(SCOTCH_FOUND)
     ei_add_property(EIGEN_MISSING_BACKENDS "PaStiX, ")
   endif(SCOTCH_FOUND)
-  set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES} ${ORDERING_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
-  set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES} ${EIGEN_BLAS_LIBRARIES})
+  set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES_DEP} ${ORDERING_LIBRARIES})
+  set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES_DEP})
   ei_add_property(EIGEN_TESTED_BACKENDS "PaStiX, ")
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS "PaStiX, ")
@@ -104,7 +122,7 @@ endif()
 
 if(METIS_FOUND)
   add_definitions("-DEIGEN_METIS_SUPPORT")
-  include_directories(${METIS_INCLUDES})
+  include_directories(${METIS_INCLUDE_DIRS})
   ei_add_property(EIGEN_TESTED_BACKENDS "METIS, ")
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS "METIS, ")
 
@@ -141,6 +159,7 @@ add_custom_target(BuildOfficial)
 
 ei_add_test(rand)
 ei_add_test(meta)
+ei_add_test(numext)
 ei_add_test(sizeof)
 ei_add_test(dynalloc)
 ei_add_test(nomalloc)
@@ -180,7 +199,7 @@ ei_add_test(smallvectors)
 ei_add_test(mapped_matrix)
 ei_add_test(mapstride)
 ei_add_test(mapstaticmethods)
-ei_add_test(array)
+ei_add_test(array_cwise)
 ei_add_test(array_for_matrix)
 ei_add_test(array_replicate)
 ei_add_test(array_reverse)
@@ -265,6 +284,7 @@ ei_add_test(mpl2only)
 ei_add_test(inplace_decomposition)
 ei_add_test(half_float)
 ei_add_test(array_of_string)
+ei_add_test(num_dimensions)
 
 add_executable(bug1213 bug1213.cpp bug1213_main.cpp)
 
@@ -290,6 +310,10 @@ if(UMFPACK_FOUND)
   ei_add_test(umfpack_support "" "${UMFPACK_ALL_LIBS}")
 endif()
 
+if(KLU_FOUND OR SuiteSparse_FOUND)
+  ei_add_test(klu_support "" "${KLU_ALL_LIBS}")
+endif()
+
 if(SUPERLU_FOUND)
   ei_add_test(superlu_support "" "${SUPERLU_ALL_LIBS}")
 endif()
@@ -364,10 +388,9 @@ if(CUDA_FOUND)
   if(EIGEN_TEST_CUDA_CLANG)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_30")
   endif()
-  cuda_include_directories(${CMAKE_CURRENT_BINARY_DIR})
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
 
-  ei_add_test(cuda_basic)
+  ei_add_test(gpu_basic)
 
   unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 
@@ -376,6 +399,47 @@ endif(CUDA_FOUND)
 
 endif(EIGEN_TEST_CUDA)
 
+# HIP unit tests
+option(EIGEN_TEST_HIP "Add HIP support." OFF)
+if (EIGEN_TEST_HIP)
+
+  set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
+
+  if (EXISTS ${HIP_PATH})
+
+    list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
+
+    find_package(HIP REQUIRED)
+    if (HIP_FOUND)
+
+      execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
+
+      if (${HIP_PLATFORM} STREQUAL "hcc")
+
+        include_directories(${HIP_PATH}/include)
+
+        set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+        ei_add_test(gpu_basic)
+        unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+
+      elseif (${HIP_PLATFORM} STREQUAL "nvcc")
+        message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
+      else ()
+        message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
+      endif()
+
+    endif(HIP_FOUND)
+
+  else ()
+
+    message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
+
+  endif()
+
+endif(EIGEN_TEST_HIP)
+
+
 file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests)
 add_test(NAME failtests WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests COMMAND ${CMAKE_COMMAND} ${Eigen_SOURCE_DIR} -G "${CMAKE_GENERATOR}" -DEIGEN_FAILTEST=ON)
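Most of the test-file diffs below replace the old `void test_xxx()` entry points with `EIGEN_DECLARE_TEST(xxx)`. The macro itself is defined in test/main.h, which this patch excerpt does not show; conceptually it boils down to something like the following sketch (the helper names are my assumptions, and the real macro may also register the function with the harness):

    // Hypothetical reduction of EIGEN_DECLARE_TEST, for orientation only.
    #define EIGEN_TEST_CAT2(a, b) a##b
    #define EIGEN_TEST_CAT(a, b)  EIGEN_TEST_CAT2(a, b)
    #define EIGEN_DECLARE_TEST(X) void EIGEN_TEST_CAT(test_, X)()

    EIGEN_DECLARE_TEST(adjoint) { /* body */ }   // defines void test_adjoint()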
diff --git a/test/adjoint.cpp b/test/adjoint.cpp
index bdea51c10..4e1e4b5e8 100644
--- a/test/adjoint.cpp
+++ b/test/adjoint.cpp
@@ -70,7 +70,6 @@ template<typename MatrixType> void adjoint(const MatrixType& m)
      Transpose.h Conjugate.h Dot.h
   */
   using std::abs;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
   typedef Matrix VectorType;
@@ -146,7 +145,35 @@ template<typename MatrixType> void adjoint(const MatrixType& m)
   VERIFY_IS_APPROX(rv1.template cast().dot(v1), rv1.dot(v1));
 }
 
-void test_adjoint()
+template<int>
+void adjoint_extra()
+{
+  MatrixXcf a(10,10), b(10,10);
+  VERIFY_RAISES_ASSERT(a = a.transpose());
+  VERIFY_RAISES_ASSERT(a = a.transpose() + b);
+  VERIFY_RAISES_ASSERT(a = b + a.transpose());
+  VERIFY_RAISES_ASSERT(a = a.conjugate().transpose());
+  VERIFY_RAISES_ASSERT(a = a.adjoint());
+  VERIFY_RAISES_ASSERT(a = a.adjoint() + b);
+  VERIFY_RAISES_ASSERT(a = b + a.adjoint());
+
+  // no assertion should be triggered for these cases:
+  a.transpose() = a.transpose();
+  a.transpose() += a.transpose();
+  a.transpose() += a.transpose() + b;
+  a.transpose() = a.adjoint();
+  a.transpose() += a.adjoint();
+  a.transpose() += a.adjoint() + b;
+
+  // regression tests for check_for_aliasing
+  MatrixXd c(10,10);
+  c = 1.0 * MatrixXd::Ones(10,10) + c;
+  c = MatrixXd::Ones(10,10) * 1.0 + c;
+  c = c + MatrixXd::Ones(10,10) .cwiseProduct( MatrixXd::Zero(10,10) );
+  c = MatrixXd::Ones(10,10) * MatrixXd::Zero(10,10);
+}
+
+EIGEN_DECLARE_TEST(adjoint)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( adjoint(Matrix()) );
@@ -169,32 +196,6 @@
   // test a large static matrix only once
   CALL_SUBTEST_7( adjoint(Matrix()) );
 
-#ifdef EIGEN_TEST_PART_13
-  {
-    MatrixXcf a(10,10), b(10,10);
-    VERIFY_RAISES_ASSERT(a = a.transpose());
-    VERIFY_RAISES_ASSERT(a = a.transpose() + b);
-    VERIFY_RAISES_ASSERT(a = b + a.transpose());
-    VERIFY_RAISES_ASSERT(a = a.conjugate().transpose());
-    VERIFY_RAISES_ASSERT(a = a.adjoint());
-    VERIFY_RAISES_ASSERT(a = a.adjoint() + b);
-    VERIFY_RAISES_ASSERT(a = b + a.adjoint());
-
-    // no assertion should be triggered for these cases:
-    a.transpose() = a.transpose();
-    a.transpose() += a.transpose();
-    a.transpose() += a.transpose() + b;
-    a.transpose() = a.adjoint();
-    a.transpose() += a.adjoint();
-    a.transpose() += a.adjoint() + b;
-
-    // regression tests for check_for_aliasing
-    MatrixXd c(10,10);
-    c = 1.0 * MatrixXd::Ones(10,10) + c;
-    c = MatrixXd::Ones(10,10) * 1.0 + c;
-    c = c + MatrixXd::Ones(10,10) .cwiseProduct( MatrixXd::Zero(10,10) );
-    c = MatrixXd::Ones(10,10) * MatrixXd::Zero(10,10);
-  }
-#endif
+  CALL_SUBTEST_13( adjoint_extra<0>() );
 }
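The assertions moved into adjoint_extra() guard against transpose aliasing, where the destination overlaps the source being read. For readers unfamiliar with the pitfall, a small illustration of the safe spellings (my example, not part of the test):

    #include <Eigen/Core>
    using namespace Eigen;

    int main()
    {
      MatrixXcf a(10, 10), b(10, 10);
      a.setRandom(); b.setRandom();
      // a = a.transpose() would read and write overlapping coefficients;
      // in debug builds it trips the assertions exercised above.
      a.transposeInPlace();   // safe in-place alternative
      a = b.transpose();      // safe: source and destination are distinct
      return 0;
    }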
diff --git a/test/array.cpp b/test/array_cwise.cpp
similarity index 98%
rename from test/array.cpp
rename to test/array_cwise.cpp
index f7f3ba780..84e46665b 100644
--- a/test/array.cpp
+++ b/test/array_cwise.cpp
@@ -11,7 +11,6 @@
 template<typename ArrayType> void array(const ArrayType& m)
 {
-  typedef typename ArrayType::Index Index;
   typedef typename ArrayType::Scalar Scalar;
   typedef typename ArrayType::RealScalar RealScalar;
   typedef Array ColVectorType;
@@ -130,7 +129,6 @@ template<typename ArrayType> void array(const ArrayType& m)
 template<typename ArrayType> void comparisons(const ArrayType& m)
 {
   using std::abs;
-  typedef typename ArrayType::Index Index;
   typedef typename ArrayType::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
 
@@ -197,7 +195,7 @@ template<typename ArrayType> void comparisons(const ArrayType& m)
   RealScalar a = m1.abs().mean();
   VERIFY( (m1<-a || m1>a).count() == (m1.abs()>a).count());
 
-  typedef Array ArrayOfIndices;
+  typedef Array ArrayOfIndices;
 
   // TODO allows colwise/rowwise for array
   VERIFY_IS_APPROX(((m1.abs()+1)>RealScalar(0.1)).colwise().count(), ArrayOfIndices::Constant(cols,rows).transpose());
@@ -208,7 +206,6 @@ template<typename ArrayType> void array_real(const ArrayType& m)
 {
   using std::abs;
   using std::sqrt;
-  typedef typename ArrayType::Index Index;
   typedef typename ArrayType::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
 
@@ -234,6 +231,7 @@ template<typename ArrayType> void array_real(const ArrayType& m)
   VERIFY_IS_APPROX(m1.sinh(), sinh(m1));
   VERIFY_IS_APPROX(m1.cosh(), cosh(m1));
   VERIFY_IS_APPROX(m1.tanh(), tanh(m1));
+  VERIFY_IS_APPROX(m1.logistic(), logistic(m1));
 
   VERIFY_IS_APPROX(m1.arg(), arg(m1));
   VERIFY_IS_APPROX(m1.round(), round(m1));
@@ -269,6 +267,7 @@ template<typename ArrayType> void array_real(const ArrayType& m)
   VERIFY_IS_APPROX(sinh(m1), 0.5*(exp(m1)-exp(-m1)));
   VERIFY_IS_APPROX(cosh(m1), 0.5*(exp(m1)+exp(-m1)));
   VERIFY_IS_APPROX(tanh(m1), (0.5*(exp(m1)-exp(-m1)))/(0.5*(exp(m1)+exp(-m1))));
+  VERIFY_IS_APPROX(logistic(m1), (1.0/(1.0+exp(-m1))));
   VERIFY_IS_APPROX(arg(m1), ((m1<0).template cast())*std::acos(-1.0));
   VERIFY((round(m1) <= ceil(m1) && round(m1) >= floor(m1)).all());
   VERIFY((Eigen::isnan)((m1*0.0)/0.0).all());
@@ -322,7 +321,6 @@ template<typename ArrayType> void array_real(const ArrayType& m)
 
 template<typename ArrayType> void array_complex(const ArrayType& m)
 {
-  typedef typename ArrayType::Index Index;
   typedef typename ArrayType::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
 
@@ -349,6 +347,7 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
   VERIFY_IS_APPROX(m1.sinh(), sinh(m1));
   VERIFY_IS_APPROX(m1.cosh(), cosh(m1));
   VERIFY_IS_APPROX(m1.tanh(), tanh(m1));
+  VERIFY_IS_APPROX(m1.logistic(), logistic(m1));
   VERIFY_IS_APPROX(m1.arg(), arg(m1));
   VERIFY((m1.isNaN() == (Eigen::isnan)(m1)).all());
   VERIFY((m1.isInf() == (Eigen::isinf)(m1)).all());
@@ -372,6 +371,7 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
   VERIFY_IS_APPROX(sinh(m1), 0.5*(exp(m1)-exp(-m1)));
   VERIFY_IS_APPROX(cosh(m1), 0.5*(exp(m1)+exp(-m1)));
   VERIFY_IS_APPROX(tanh(m1), (0.5*(exp(m1)-exp(-m1)))/(0.5*(exp(m1)+exp(-m1))));
+  VERIFY_IS_APPROX(logistic(m1), (1.0/(1.0 + exp(-m1))));
 
   for (Index i = 0; i < m.rows(); ++i)
     for (Index j = 0; j < m.cols(); ++j)
@@ -427,7 +427,6 @@ template<typename ArrayType> void array_complex(const ArrayType& m)
 
 template<typename ArrayType> void min_max(const ArrayType& m)
 {
-  typedef typename ArrayType::Index Index;
   typedef typename ArrayType::Scalar Scalar;
 
   Index rows = m.rows();
@@ -454,7 +453,7 @@ template<typename ArrayType> void min_max(const ArrayType& m)
 }
 
-void test_array()
+EIGEN_DECLARE_TEST(array_cwise)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( array(Array()) );
diff --git a/test/array_for_matrix.cpp b/test/array_for_matrix.cpp
index c1501947b..6b03abb10 100644
--- a/test/array_for_matrix.cpp
+++ b/test/array_for_matrix.cpp
@@ -11,7 +11,6 @@
 template<typename MatrixType> void array_for_matrix(const MatrixType& m)
 {
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef Matrix ColVectorType;
   typedef Matrix RowVectorType;
@@ -83,7 +82,6 @@ template<typename MatrixType> void array_for_matrix(const MatrixType& m)
 template<typename MatrixType> void comparisons(const MatrixType& m)
 {
   using std::abs;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
 
@@ -140,7 +138,7 @@ template<typename MatrixType> void comparisons(const MatrixType& m)
   RealScalar a = m1.cwiseAbs().mean();
   VERIFY( ((m1.array()<-a).matrix() || (m1.array()>a).matrix()).count() == (m1.cwiseAbs().array()>a).count());
 
-  typedef Matrix VectorOfIndices;
+  typedef Matrix VectorOfIndices;
 
   // TODO allows colwise/rowwise for array
   VERIFY_IS_APPROX(((m1.array().abs()+1)>RealScalar(0.1)).matrix().colwise().count(), VectorOfIndices::Constant(cols,rows).transpose());
@@ -172,7 +170,6 @@ template<typename VectorType> void lpNorm(const VectorType& v)
 template<typename MatrixType> void cwise_min_max(const MatrixType& m)
 {
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
 
   Index rows = m.rows();
@@ -211,7 +208,6 @@ template<typename MatrixType> void cwise_min_max(const MatrixType& m)
 template<typename MatrixTraits> void resize(const MatrixTraits& t)
 {
-  typedef typename MatrixTraits::Index Index;
   typedef typename MatrixTraits::Scalar Scalar;
   typedef Matrix MatrixType;
   typedef Array Array2DType;
@@ -235,13 +231,32 @@ template<typename MatrixTraits> void resize(const MatrixTraits& t)
   VERIFY(a1.size()==cols);
 }
 
+template<int>
 void regression_bug_654()
 {
   ArrayXf a = RowVectorXf(3);
   VectorXf v = Array(3);
 }
 
-void test_array_for_matrix()
+// Check propagation of LvalueBit through Array/Matrix-Wrapper
+template<int>
+void regression_bug_1410()
+{
+  const Matrix4i M;
+  const Array4i A;
+  ArrayWrapper<const Matrix4i> MA = M.array();
+  MA.row(0);
+  MatrixWrapper<const Array4i> AM = A.matrix();
+  AM.row(0);
+
+  VERIFY((internal::traits<ArrayWrapper<const Matrix4i> >::Flags&LvalueBit)==0);
+  VERIFY((internal::traits<MatrixWrapper<const Array4i> >::Flags&LvalueBit)==0);
+
+  VERIFY((internal::traits<ArrayWrapper<Matrix4i> >::Flags&LvalueBit)==LvalueBit);
+  VERIFY((internal::traits<MatrixWrapper<Array4i> >::Flags&LvalueBit)==LvalueBit);
+}
+
+EIGEN_DECLARE_TEST(array_for_matrix)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( array_for_matrix(Matrix()) );
@@ -280,5 +295,6 @@
     CALL_SUBTEST_5( resize(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_6( resize(MatrixXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) );
   }
-  CALL_SUBTEST_6( regression_bug_654() );
+  CALL_SUBTEST_6( regression_bug_654<0>() );
+  CALL_SUBTEST_6( regression_bug_1410<0>() );
 }
diff --git a/test/array_of_string.cpp b/test/array_of_string.cpp
index e23b7c59e..23e51529b 100644
--- a/test/array_of_string.cpp
+++ b/test/array_of_string.cpp
@@ -9,7 +9,7 @@
 
 #include "main.h"
 
-void test_array_of_string()
+EIGEN_DECLARE_TEST(array_of_string)
 {
   typedef Array<std::string,1,Dynamic> ArrayXs;
   ArrayXs a1(3), a2(3), a3(3), a3ref(3);
diff --git a/test/array_replicate.cpp b/test/array_replicate.cpp
index 779c8fc2f..057c3c77b 100644
--- a/test/array_replicate.cpp
+++ b/test/array_replicate.cpp
@@ -14,7 +14,6 @@ template<typename MatrixType> void replicate(const MatrixType& m)
   /* this test covers the following files:
      Replicate.cpp
   */
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef Matrix VectorType;
   typedef Matrix MatrixX;
@@ -69,7 +68,7 @@ template<typename MatrixType> void replicate(const MatrixType& m)
   VERIFY_IS_APPROX(vx1, v1.colwise().replicate(f2));
 }
 
-void test_array_replicate()
+EIGEN_DECLARE_TEST(array_replicate)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( replicate(Matrix()) );
diff --git a/test/array_reverse.cpp b/test/array_reverse.cpp
index c9d9f90c3..e23159def 100644
--- a/test/array_reverse.cpp
+++ b/test/array_reverse.cpp
@@ -15,7 +15,6 @@ using namespace std;
 
 template<typename MatrixType> void reverse(const MatrixType& m)
 {
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef Matrix VectorType;
 
@@ -124,7 +123,16 @@ template<typename MatrixType> void reverse(const MatrixType& m)
   VERIFY_IS_APPROX(x, m1(r, cols - 1 - c));
 }
 
-void test_array_reverse()
+template<int>
+void array_reverse_extra()
+{
+  Vector4f x; x << 1, 2, 3, 4;
+  Vector4f y; y << 4, 3, 2, 1;
+  VERIFY(x.reverse()[1] == 3);
+  VERIFY(x.reverse() == y);
+}
+
+EIGEN_DECLARE_TEST(array_reverse)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( reverse(Matrix()) );
@@ -137,10 +145,5 @@
     CALL_SUBTEST_8( reverse(Matrix()) );
     CALL_SUBTEST_9( reverse(Matrix(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) );
   }
-#ifdef EIGEN_TEST_PART_3
-  Vector4f x; x << 1, 2, 3, 4;
-  Vector4f y; y << 4, 3, 2, 1;
-  VERIFY(x.reverse()[1] == 3);
-  VERIFY(x.reverse() == y);
-#endif
+  CALL_SUBTEST_3( array_reverse_extra<0>() );
 }
diff --git a/test/bandmatrix.cpp b/test/bandmatrix.cpp
index f8c38f7c3..66a1b0db4 100644
--- a/test/bandmatrix.cpp
+++ b/test/bandmatrix.cpp
@@ -59,7 +59,7 @@ template<typename MatrixType> void bandmatrix(const MatrixType& _m)
 
 using Eigen::internal::BandMatrix;
 
-void test_bandmatrix()
+EIGEN_DECLARE_TEST(bandmatrix)
 {
   for(int i = 0; i < 10*g_repeat ; i++) {
     Index rows = internal::random(1,10);
diff --git a/test/basicstuff.cpp b/test/basicstuff.cpp
index c346ce6cb..85af603d8 100644
--- a/test/basicstuff.cpp
+++ b/test/basicstuff.cpp
@@ -13,7 +13,6 @@
 
 template<typename MatrixType> void basicStuff(const MatrixType& m)
 {
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef Matrix VectorType;
   typedef Matrix SquareMatrixType;
@@ -124,22 +123,22 @@ template<typename MatrixType> void basicStuff(const MatrixType& m)
 
   // check automatic transposition
   sm2.setZero();
-  for(typename MatrixType::Index i=0;i
 void basicStuff(const MatrixType& m)
 
 template<typename MatrixType> void basicStuffComplex(const MatrixType& m)
 {
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
   typedef Matrix RealMatrixType;
@@ -196,7 +194,7 @@ template<typename MatrixType> void basicStuffComplex(const MatrixType& m)
   VERIFY(!static_cast(cm).imag().isZero());
 }
 
-#ifdef EIGEN_TEST_PART_2
+template<int>
 void casting()
 {
   Matrix4f m = Matrix4f::Random(), m2;
@@ -205,7 +203,6 @@
   m2 = m.cast<float>(); // check the specialization when NewType == Type
   VERIFY(m.isApprox(m2));
 }
-#endif
 
 template
 void fixedSizeMatrixConstruction()
@@ -270,7 +267,7 @@
 }
 
-void test_basicstuff()
+EIGEN_DECLARE_TEST(basicstuff)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( basicStuff(Matrix()) );
@@ -292,5 +289,5 @@
   CALL_SUBTEST_1(fixedSizeMatrixConstruction());
   CALL_SUBTEST_1(fixedSizeMatrixConstruction());
 
-  CALL_SUBTEST_2(casting());
+  CALL_SUBTEST_2(casting<0>());
 }
diff --git a/test/bdcsvd.cpp b/test/bdcsvd.cpp
index f9f687aac..3065ff015 100644
--- a/test/bdcsvd.cpp
+++ b/test/bdcsvd.cpp
@@ -62,7 +62,7 @@ void compare_bdc_jacobi(const MatrixType& a = MatrixType(), unsigned int computa
   if(computationOptions & ComputeThinV) VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV());
 }
 
-void test_bdcsvd()
+EIGEN_DECLARE_TEST(bdcsvd)
 {
   CALL_SUBTEST_3(( svd_verify_assert<BDCSVD<Matrix3f> >(Matrix3f()) ));
   CALL_SUBTEST_4(( svd_verify_assert<BDCSVD<Matrix4d> >(Matrix4d()) ));
@@ -104,7 +104,8 @@
   CALL_SUBTEST_7( BDCSVD<MatrixXf>(10,10) );
 
   // Check that preallocation avoids subsequent mallocs
-  CALL_SUBTEST_9( svd_preallocate() );
+  // Disabled because not supported by BDCSVD
+  // CALL_SUBTEST_9( svd_preallocate() );
 
   CALL_SUBTEST_2( svd_underoverflow() );
 }
diff --git a/test/bicgstab.cpp b/test/bicgstab.cpp
index 4cc0dd31c..89d6a45ef 100644
--- a/test/bicgstab.cpp
+++ b/test/bicgstab.cpp
@@ -26,7 +26,7 @@ template<typename T, typename I> void test_bicgstab_T()
   //CALL_SUBTEST( check_sparse_square_solving(bicgstab_colmajor_ssor)     );
 }
 
-void test_bicgstab()
+EIGEN_DECLARE_TEST(bicgstab)
 {
   CALL_SUBTEST_1((test_bicgstab_T<double,int>()) );
   CALL_SUBTEST_2((test_bicgstab_T<std::complex<double>, int>()));
diff --git a/test/block.cpp b/test/block.cpp
index d61059874..ca13539a9 100644
--- a/test/block.cpp
+++ b/test/block.cpp
@@ -39,12 +39,11 @@ is_same_block(const T1& a, const T2& b)
 
 template<typename MatrixType> void block(const MatrixType& m)
 {
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
   typedef Matrix VectorType;
   typedef Matrix RowVectorType;
-  typedef Matrix DynamicMatrixType;
+  typedef Matrix DynamicMatrixType;
   typedef Matrix DynamicVectorType;
 
   Index rows = m.rows();
@@ -142,7 +141,7 @@ template<typename MatrixType> void block(const MatrixType& m)
   VERIFY(numext::real(ones.col(c1).dot(ones.col(c2))) == RealScalar(rows));
   VERIFY(numext::real(ones.row(r1).dot(ones.row(r2))) == RealScalar(cols));
 
-  // chekc that linear acccessors works on blocks
+  // check that linear accessors work on blocks
   m1 = m1_copy;
   if((MatrixType::Flags&RowMajorBit)==0)
     VERIFY_IS_EQUAL(m1.leftCols(c1).coeff(r1+c1*rows), m1(r1,c1));
@@ -162,9 +161,25 @@ template<typename MatrixType> void block(const MatrixType& m)
 
   // expressions without direct access
   VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,rows-r1,cols-c1).block(r2-r1,c2-c1,rows-r2,cols-c2)) , ((m1+m2).block(r2,c2,rows-r2,cols-c2)) );
   VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)) );
+  VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).row(0)) , ((m1+m2).eval().row(r1).segment(c1,c2-c1+1)) );
   VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).col(0)) , ((m1+m2).col(c1).segment(r1,r2-r1+1)) );
   VERIFY_IS_APPROX( ((m1+m2).block(r1,c1,r2-r1+1,c2-c1+1).transpose().col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() );
   VERIFY_IS_APPROX( ((m1+m2).transpose().block(c1,r1,c2-c1+1,r2-r1+1).col(0)) , ((m1+m2).row(r1).segment(c1,c2-c1+1)).transpose() );
+  VERIFY_IS_APPROX( ((m1+m2).template block<Dynamic,1>(r1,c1,r2-r1+1,1)) , ((m1+m2).eval().col(c1).eval().segment(r1,r2-r1+1)) );
+  VERIFY_IS_APPROX( ((m1+m2).template block<1,Dynamic>(r1,c1,1,c2-c1+1)) , ((m1+m2).eval().row(r1).eval().segment(c1,c2-c1+1)) );
+  VERIFY_IS_APPROX( ((m1+m2).transpose().template block<1,Dynamic>(c1,r1,1,r2-r1+1)) , ((m1+m2).eval().col(c1).eval().segment(r1,r2-r1+1)).transpose() );
+  VERIFY_IS_APPROX( (m1+m2).row(r1).eval(), (m1+m2).eval().row(r1) );
+  VERIFY_IS_APPROX( (m1+m2).adjoint().col(r1).eval(), (m1+m2).adjoint().eval().col(r1) );
+  VERIFY_IS_APPROX( (m1+m2).adjoint().row(c1).eval(), (m1+m2).adjoint().eval().row(c1) );
+  VERIFY_IS_APPROX( (m1*1).row(r1).segment(c1,c2-c1+1).eval(), m1.row(r1).eval().segment(c1,c2-c1+1).eval() );
+  VERIFY_IS_APPROX( m1.col(c1).reverse().segment(r1,r2-r1+1).eval(),m1.col(c1).reverse().eval().segment(r1,r2-r1+1).eval() );
+
+  VERIFY_IS_APPROX( (m1*1).topRows(r1), m1.topRows(r1) );
+  VERIFY_IS_APPROX( (m1*1).leftCols(c1), m1.leftCols(c1) );
+  VERIFY_IS_APPROX( (m1*1).transpose().topRows(c1), m1.transpose().topRows(c1) );
+  VERIFY_IS_APPROX( (m1*1).transpose().leftCols(r1), m1.transpose().leftCols(r1) );
+  VERIFY_IS_APPROX( (m1*1).transpose().middleRows(c1,c2-c1+1), m1.transpose().middleRows(c1,c2-c1+1) );
+  VERIFY_IS_APPROX( (m1*1).transpose().middleCols(r1,r2-r1+1), m1.transpose().middleCols(r1,r2-r1+1) );
 
   // evaluation into plain matrices from expressions with direct access (stress MapBase)
   DynamicMatrixType dm;
@@ -211,7 +226,6 @@ template<typename MatrixType> void block(const MatrixType& m)
 
 void compare_using_data_and_stride(const MatrixType& m)
 {
-  typedef typename MatrixType::Index Index;
   Index rows = m.rows();
   Index cols = m.cols();
   Index size = m.size();
@@ -245,7 +259,6 @@
 void data_and_stride(const MatrixType& m)
 {
-  typedef typename MatrixType::Index Index;
   Index rows = m.rows();
   Index cols = m.cols();
 
@@ -263,7 +276,7 @@
   compare_using_data_and_stride(m1.col(c1).transpose());
 }
 
-void test_block()
+EIGEN_DECLARE_TEST(block)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( block(Matrix()) );
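The new VERIFY_IS_APPROX lines in block.cpp all check the same invariant: a block taken from an unevaluated expression must equal the same block of the evaluated expression. A standalone distillation (my example):

    #include <Eigen/Core>
    using namespace Eigen;

    int main()
    {
      MatrixXf m1 = MatrixXf::Random(6, 6), m2 = MatrixXf::Random(6, 6);
      // Block of a lazy sum vs. block of the materialized sum:
      MatrixXf direct  = (m1 + m2).block(1, 1, 3, 3);
      MatrixXf viaEval = (m1 + m2).eval().block(1, 1, 3, 3);
      return direct.isApprox(viaEval) ? 0 : 1;
    }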
diff --git a/test/boostmultiprec.cpp b/test/boostmultiprec.cpp
index e06e9bdaf..579a6fd25 100644
--- a/test/boostmultiprec.cpp
+++ b/test/boostmultiprec.cpp
@@ -55,6 +55,10 @@
 #include "bdcsvd.cpp"
 #endif
 
+#ifdef EIGEN_TEST_PART_11
+#include "simplicial_cholesky.cpp"
+#endif
+
 #include <boost/multiprecision/cpp_dec_float.hpp>
 
 #undef min
@@ -141,7 +145,7 @@ namespace Eigen {
 
 }
 
-void test_boostmultiprec()
+EIGEN_DECLARE_TEST(boostmultiprec)
 {
   typedef Matrix<Real,Dynamic,Dynamic> Mat;
   typedef Matrix<std::complex<Real>,Dynamic,Dynamic> MatC;
 
@@ -152,7 +156,7 @@
   std::cout << "NumTraits<Real>::highest()  = " << NumTraits<Real>::highest() << std::endl;
   std::cout << "NumTraits<Real>::digits10() = " << NumTraits<Real>::digits10() << std::endl;
 
-  // chekc stream output
+  // check stream output
   {
     Mat A(10,10);
     A.setRandom();
@@ -197,5 +201,7 @@
   CALL_SUBTEST_9(( jacobisvd(Mat(internal::random(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE), internal::random(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) ));
   CALL_SUBTEST_10(( bdcsvd(Mat(internal::random(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE), internal::random(EIGEN_TEST_MAX_SIZE/4, EIGEN_TEST_MAX_SIZE/2))) ));
+
+  CALL_SUBTEST_11(( test_simplicial_cholesky_T() ));
 }
diff --git a/test/cholesky.cpp b/test/cholesky.cpp
index 8ad5ac639..b871351e0 100644
--- a/test/cholesky.cpp
+++ b/test/cholesky.cpp
@@ -57,7 +57,6 @@ template class CholType> void test_c
 
 template<typename MatrixType> void cholesky(const MatrixType& m)
 {
-  typedef typename MatrixType::Index Index;
   /* this test covers the following files:
      LLT.h LDLT.h
   */
@@ -289,8 +288,6 @@ template<typename MatrixType> void cholesky_cplx(const MatrixType& m)
 
   // test mixing real/scalar types
 
-  typedef typename MatrixType::Index Index;
-
   Index rows = m.rows();
   Index cols = m.cols();
 
@@ -373,6 +370,7 @@ template<typename MatrixType> void cholesky_definiteness(const MatrixType& m)
     VERIFY(ldlt.info()==Success);
     VERIFY(!ldlt.isNegative());
     VERIFY(!ldlt.isPositive());
+    VERIFY_IS_APPROX(mat,ldlt.reconstructedMatrix());
   }
   {
     mat << 1, 2, 2, 1;
@@ -380,6 +378,7 @@
     VERIFY(ldlt.info()==Success);
     VERIFY(!ldlt.isNegative());
     VERIFY(!ldlt.isPositive());
+    VERIFY_IS_APPROX(mat,ldlt.reconstructedMatrix());
   }
   {
     mat << 0, 0, 0, 0;
@@ -387,6 +386,7 @@
     VERIFY(ldlt.info()==Success);
     VERIFY(ldlt.isNegative());
     VERIFY(ldlt.isPositive());
+    VERIFY_IS_APPROX(mat,ldlt.reconstructedMatrix());
  }
   {
     mat << 0, 0, 0, 1;
@@ -394,6 +394,7 @@
     VERIFY(ldlt.info()==Success);
     VERIFY(!ldlt.isNegative());
     VERIFY(ldlt.isPositive());
+    VERIFY_IS_APPROX(mat,ldlt.reconstructedMatrix());
   }
   {
     mat << -1, 0, 0, 0;
@@ -401,6 +402,7 @@
     VERIFY(ldlt.info()==Success);
     VERIFY(ldlt.isNegative());
     VERIFY(!ldlt.isPositive());
+    VERIFY_IS_APPROX(mat,ldlt.reconstructedMatrix());
   }
 }
 
@@ -452,6 +454,18 @@
     VERIFY(ldlt.info()==NumericalIssue);
     VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
   }
+
+  // bug 1479
+  {
+    mat.resize(4,4);
+    mat << 1, 2, 0, 1,
+           2, 4, 0, 2,
+           0, 0, 0, 1,
+           1, 2, 1, 1;
+    ldlt.compute(mat);
+    VERIFY(ldlt.info()==NumericalIssue);
+    VERIFY_IS_NOT_APPROX(mat,ldlt.reconstructedMatrix());
+  }
 }
 
 template<typename MatrixType> void cholesky_verify_assert()
@@ -474,7 +488,7 @@
   VERIFY_RAISES_ASSERT(ldlt.solveInPlace(&tmp))
 }
 
-void test_cholesky()
+EIGEN_DECLARE_TEST(cholesky)
 {
   int s = 0;
   for(int i = 0; i < g_repeat; i++) {
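The bug-1479 matrix added above is rank-deficient in a way LDLT cannot factor reliably, so info() must report NumericalIssue instead of silently returning garbage. Standalone version of that check (taken directly from the test data):

    #include <Eigen/Dense>
    #include <iostream>
    using namespace Eigen;

    int main()
    {
      Matrix4d mat;
      mat << 1, 2, 0, 1,
             2, 4, 0, 2,
             0, 0, 0, 1,
             1, 2, 1, 1;
      LDLT<Matrix4d> ldlt(mat);
      // Expected output: NumericalIssue
      std::cout << (ldlt.info() == NumericalIssue ? "NumericalIssue" : "Success") << "\n";
      return 0;
    }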
diff --git a/test/cholmod_support.cpp b/test/cholmod_support.cpp
index 931207334..89b9cf41e 100644
--- a/test/cholmod_support.cpp
+++ b/test/cholmod_support.cpp
@@ -55,7 +55,7 @@ template void test_cholmod_T()
   test_cholmod_ST >();
 }
 
-void test_cholmod_support()
+EIGEN_DECLARE_TEST(cholmod_support)
 {
   CALL_SUBTEST_11( (test_cholmod_T()) );
   CALL_SUBTEST_12( (test_cholmod_T()) );
diff --git a/test/commainitializer.cpp b/test/commainitializer.cpp
index 9844adbd2..3cb94da62 100644
--- a/test/commainitializer.cpp
+++ b/test/commainitializer.cpp
@@ -65,7 +65,7 @@ struct test_block_recursion<-1>
   static void run() { }
 };
 
-void test_commainitializer()
+EIGEN_DECLARE_TEST(commainitializer)
 {
   Matrix3d m3;
   Matrix4d m4;
diff --git a/test/conjugate_gradient.cpp b/test/conjugate_gradient.cpp
index 9622fd86d..47a4ca707 100644
--- a/test/conjugate_gradient.cpp
+++ b/test/conjugate_gradient.cpp
@@ -26,7 +26,7 @@ template<typename T, typename I> void test_conjugate_gradient_T()
   CALL_SUBTEST( check_sparse_spd_solving(cg_colmajor_upper_I) );
 }
 
-void test_conjugate_gradient()
+EIGEN_DECLARE_TEST(conjugate_gradient)
 {
   CALL_SUBTEST_1(( test_conjugate_gradient_T<double,int>() ));
   CALL_SUBTEST_2(( test_conjugate_gradient_T<std::complex<double>, int>() ));
diff --git a/test/conservative_resize.cpp b/test/conservative_resize.cpp
index 498421b4c..5dc500068 100644
--- a/test/conservative_resize.cpp
+++ b/test/conservative_resize.cpp
@@ -10,6 +10,7 @@
 #include "main.h"
 
 #include <Eigen/Core>
+#include "AnnoyingScalar.h"
 
 using namespace Eigen;
 
@@ -17,7 +18,6 @@ template<typename Scalar, int Storage>
 void run_matrix_tests()
 {
   typedef Matrix MatrixType;
-  typedef typename MatrixType::Index Index;
 
   MatrixType m, n;
 
@@ -110,7 +110,31 @@
   }
 }
 
-void test_conservative_resize()
+// Basic memory leak check with a non-copyable scalar type
+template<int> void noncopyable()
+{
+  typedef Eigen::Matrix<AnnoyingScalar,Dynamic,1> VectorType;
+  typedef Eigen::Matrix<AnnoyingScalar,Dynamic,Dynamic> MatrixType;
+
+  {
+    AnnoyingScalar::dont_throw = true;
+    int n = 50;
+    VectorType v0(n), v1(n);
+    MatrixType m0(n,n), m1(n,n), m2(n,n);
+    v0.setOnes(); v1.setOnes();
+    m0.setOnes(); m1.setOnes(); m2.setOnes();
+    VERIFY(m0==m1);
+    m0.conservativeResize(2*n,2*n);
+    VERIFY(m0.topLeftCorner(n,n) == m1);
+
+    VERIFY(v0.head(n) == v1);
+    v0.conservativeResize(2*n);
+    VERIFY(v0.head(n) == v1);
+  }
+  VERIFY(AnnoyingScalar::instances==0 && "global memory leak detected in noncopyable");
+}
+
+EIGEN_DECLARE_TEST(conservative_resize)
 {
   for(int i=0; i, Eigen::RowMajor>()));
     CALL_SUBTEST_4((run_matrix_tests, Eigen::ColMajor>()));
     CALL_SUBTEST_5((run_matrix_tests, Eigen::RowMajor>()));
-    CALL_SUBTEST_6((run_matrix_tests, Eigen::ColMajor>()));
+    CALL_SUBTEST_5((run_matrix_tests, Eigen::ColMajor>()));
 
     CALL_SUBTEST_1((run_vector_tests()));
     CALL_SUBTEST_2((run_vector_tests()));
     CALL_SUBTEST_3((run_vector_tests()));
     CALL_SUBTEST_4((run_vector_tests >()));
     CALL_SUBTEST_5((run_vector_tests >()));
+
+    AnnoyingScalar::dont_throw = true;
+    CALL_SUBTEST_6(( run_vector_tests<AnnoyingScalar>() ));
+    CALL_SUBTEST_6(( noncopyable<0>() ));
   }
 }
diff --git a/test/constructor.cpp b/test/constructor.cpp
index eec9e2192..1dd3bc3c0 100644
--- a/test/constructor.cpp
+++ b/test/constructor.cpp
@@ -37,7 +37,7 @@ template<typename MatrixType> void ctor_init1(const MatrixType& m)
 }
 
-void test_constructor()
+EIGEN_DECLARE_TEST(constructor)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( ctor_init1(Matrix()) );
diff --git a/test/corners.cpp b/test/corners.cpp
index 3c64c32a1..73342a8dd 100644
--- a/test/corners.cpp
+++ b/test/corners.cpp
@@ -15,7 +15,6 @@
 
 template<typename MatrixType> void corners(const MatrixType& m)
 {
-  typedef typename MatrixType::Index Index;
   Index rows = m.rows();
   Index cols = m.cols();
 
@@ -102,7 +101,7 @@ template void c
   VERIFY_IS_EQUAL((const_matrix.template rightCols()), (const_matrix.template block(0,cols-c)));
 }
 
-void test_corners()
+EIGEN_DECLARE_TEST(corners)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( corners(Matrix()) );
diff --git a/test/ctorleak.cpp b/test/ctorleak.cpp
index c158f5e4e..7202e90dd 100644
--- a/test/ctorleak.cpp
+++ b/test/ctorleak.cpp
@@ -33,7 +33,7 @@ Index Foo::object_limit = 0;
 #undef EIGEN_TEST_MAX_SIZE
 #define EIGEN_TEST_MAX_SIZE 3
 
-void test_ctorleak()
+EIGEN_DECLARE_TEST(ctorleak)
 {
   typedef Matrix MatrixX;
   typedef Matrix VectorX;
diff --git a/test/cuda_common.h b/test/cuda_common.h
deleted file mode 100644
index 9737693ac..000000000
--- a/test/cuda_common.h
+++ /dev/null
@@ -1,101 +0,0 @@
-
-#ifndef EIGEN_TEST_CUDA_COMMON_H
-#define EIGEN_TEST_CUDA_COMMON_H
-
-#include
-#include
-#include
-#include
-
-#ifndef __CUDACC__
-dim3 threadIdx, blockDim, blockIdx;
-#endif
-
-template
-void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out)
-{
-  for(int i=0; i
-__global__
-void run_on_cuda_meta_kernel(const Kernel ker, int n, const Input* in, Output* out)
-{
-  int i = threadIdx.x + blockIdx.x*blockDim.x;
-  if(i
-void run_on_cuda(const Kernel& ker, int n, const Input& in, Output& out)
-{
-  typename Input::Scalar*  d_in;
-  typename Output::Scalar* d_out;
-  std::ptrdiff_t in_bytes  = in.size()  * sizeof(typename Input::Scalar);
-  std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar);
-
-  cudaMalloc((void**)(&d_in),  in_bytes);
-  cudaMalloc((void**)(&d_out), out_bytes);
-
-  cudaMemcpy(d_in,  in.data(),  in_bytes,  cudaMemcpyHostToDevice);
-  cudaMemcpy(d_out, out.data(), out_bytes, cudaMemcpyHostToDevice);
-
-  // Simple and non-optimal 1D mapping assuming n is not too large
-  // That's only for unit testing!
-  dim3 Blocks(128);
-  dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) );
-
-  cudaThreadSynchronize();
-  run_on_cuda_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
-  cudaThreadSynchronize();
-
-  // check inputs have not been modified
-  cudaMemcpy(const_cast(in.data()),  d_in,  in_bytes,  cudaMemcpyDeviceToHost);
-  cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost);
-
-  cudaFree(d_in);
-  cudaFree(d_out);
-}
-
-
-template
-void run_and_compare_to_cuda(const Kernel& ker, int n, const Input& in, Output& out)
-{
-  Input  in_ref,  in_cuda;
-  Output out_ref, out_cuda;
-  #ifndef __CUDA_ARCH__
-  in_ref = in_cuda = in;
-  out_ref = out_cuda = out;
-  #endif
-  run_on_cpu (ker, n, in_ref,  out_ref);
-  run_on_cuda(ker, n, in_cuda, out_cuda);
-  #ifndef __CUDA_ARCH__
-  VERIFY_IS_APPROX(in_ref, in_cuda);
-  VERIFY_IS_APPROX(out_ref, out_cuda);
-  #endif
-}
-
-
-void ei_test_init_cuda()
-{
-  int device = 0;
-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, device);
-  std::cout << "CUDA device info:\n";
-  std::cout << "  name:                        " << deviceProp.name << "\n";
-  std::cout << "  capability:                  " << deviceProp.major << "." << deviceProp.minor << "\n";
-  std::cout << "  multiProcessorCount:         " << deviceProp.multiProcessorCount << "\n";
-  std::cout << "  maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << "\n";
-  std::cout << "  warpSize:                    " << deviceProp.warpSize << "\n";
-  std::cout << "  regsPerBlock:                " << deviceProp.regsPerBlock << "\n";
-  std::cout << "  concurrentKernels:           " << deviceProp.concurrentKernels << "\n";
-  std::cout << "  clockRate:                   " << deviceProp.clockRate << "\n";
-  std::cout << "  canMapHostMemory:            " << deviceProp.canMapHostMemory << "\n";
-  std::cout << "  computeMode:                 " << deviceProp.computeMode << "\n";
-}
-
-#endif // EIGEN_TEST_CUDA_COMMON_H
diff --git a/test/denseLM.cpp b/test/denseLM.cpp
index 0aa736ea3..afb8004b1 100644
--- a/test/denseLM.cpp
+++ b/test/denseLM.cpp
@@ -182,7 +182,7 @@ void test_denseLM_T()
 }
 
-void test_denseLM()
+EIGEN_DECLARE_TEST(denseLM)
 {
   CALL_SUBTEST_2(test_denseLM_T());
diff --git a/test/dense_storage.cpp b/test/dense_storage.cpp
index e63712b1a..1150ec52b 100644
--- a/test/dense_storage.cpp
+++ b/test/dense_storage.cpp
@@ -52,7 +52,7 @@ void dense_storage_assignment()
     VERIFY_IS_EQUAL(raw_reference[i], raw_copied_reference[i]);
 }
 
-void test_dense_storage()
+EIGEN_DECLARE_TEST(dense_storage)
 {
   dense_storage_copy();
   dense_storage_copy();
diff --git a/test/determinant.cpp b/test/determinant.cpp
index 758f3afbb..7dd33c373 100644
--- a/test/determinant.cpp
+++ b/test/determinant.cpp
@@ -16,7 +16,6 @@ template<typename MatrixType> void determinant(const MatrixType& m)
   /* this test covers the following files:
      Determinant.h
   */
-  typedef typename MatrixType::Index Index;
   Index size = m.rows();
 
   MatrixType m1(size, size), m2(size, size);
@@ -51,7 +50,7 @@ template<typename MatrixType> void determinant(const MatrixType& m)
   VERIFY_IS_APPROX(m2.block(0,0,0,0).determinant(), Scalar(1));
 }
 
-void test_determinant()
+EIGEN_DECLARE_TEST(determinant)
 {
   for(int i = 0; i < g_repeat; i++) {
     int s = 0;
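A pattern running through all of these test diffs: the local `typedef typename MatrixType::Index Index;` lines are deleted because the tests now rely on the library-wide `Eigen::Index` type being in scope (the test harness pulls in the Eigen namespace). For illustration, the modern idiom looks like this (my example):

    #include <Eigen/Core>
    using Eigen::MatrixXd;
    using Eigen::Index;   // one global index type; no per-matrix typedef needed

    double trace_like(const MatrixXd& m)
    {
      double t = 0;
      for (Index i = 0; i < m.rows() && i < m.cols(); ++i)
        t += m(i, i);     // rows()/cols() already return Eigen::Index
      return t;
    }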
diff --git a/test/diagonal.cpp b/test/diagonal.cpp
index c1546e97d..4e8c4b3c9 100644
--- a/test/diagonal.cpp
+++ b/test/diagonal.cpp
@@ -11,7 +11,6 @@
 
 template<typename MatrixType> void diagonal(const MatrixType& m)
 {
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
 
   Index rows = m.rows();
@@ -66,6 +65,9 @@ template<typename MatrixType> void diagonal(const MatrixType& m)
       m2.diagonal(N2).coeffRef(0) = Scalar(2)*s1;
       VERIFY_IS_APPROX(m2.diagonal(N2).coeff(0), Scalar(2)*s1);
     }
+
+    VERIFY( m1.diagonal( cols).size()==0 );
+    VERIFY( m1.diagonal(-rows).size()==0 );
   }
 
 template<typename MatrixType> void diagonal_assert(const MatrixType& m) {
@@ -81,9 +83,12 @@ template<typename MatrixType> void diagonal_assert(const MatrixType& m) {
     VERIFY_RAISES_ASSERT( m1.array() *= m1.diagonal().array() );
     VERIFY_RAISES_ASSERT( m1.array() /= m1.diagonal().array() );
   }
+
+  VERIFY_RAISES_ASSERT( m1.diagonal(cols+1) );
+  VERIFY_RAISES_ASSERT( m1.diagonal(-(rows+1)) );
 }
 
-void test_diagonal()
+EIGEN_DECLARE_TEST(diagonal)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( diagonal(Matrix()) );
@@ -95,7 +100,6 @@
     CALL_SUBTEST_2( diagonal(MatrixXcd(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_1( diagonal(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_1( diagonal(Matrix(3, 4)) );
+    CALL_SUBTEST_1( diagonal_assert(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) );
   }
-
-  CALL_SUBTEST_1( diagonal_assert(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) );
 }
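The new checks pin down the boundary behavior of super- and sub-diagonal indices: an index equal to cols (or -rows) now yields a valid empty diagonal, while one step further still asserts. In standalone form (my example):

    #include <Eigen/Dense>
    #include <cassert>
    using namespace Eigen;

    int main()
    {
      MatrixXf m = MatrixXf::Random(3, 4);   // rows == 3, cols == 4
      assert(m.diagonal(4).size() == 0);     // index == cols: valid, empty
      assert(m.diagonal(-3).size() == 0);    // index == -rows: valid, empty
      // m.diagonal(5) or m.diagonal(-4) would trigger the assertions
      // added to diagonal_assert() above.
      return 0;
    }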
diff --git a/test/diagonalmatrices.cpp b/test/diagonalmatrices.cpp
index cd6dc8cf0..ba58ca8d1 100644
--- a/test/diagonalmatrices.cpp
+++ b/test/diagonalmatrices.cpp
@@ -11,7 +11,6 @@ using namespace std;
 template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
 {
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   enum { Rows = MatrixType::RowsAtCompileTime, Cols = MatrixType::ColsAtCompileTime };
   typedef Matrix VectorType;
@@ -30,6 +29,7 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
   v2 = VectorType::Random(rows);
   RowVectorType rv1 = RowVectorType::Random(cols),
     rv2 = RowVectorType::Random(cols);
+
   LeftDiagonalMatrix ldm1(v1), ldm2(v2);
   RightDiagonalMatrix rdm1(rv1), rdm2(rv2);
 
@@ -99,6 +99,38 @@ template<typename MatrixType> void diagonalmatrices(const MatrixType& m)
   VERIFY_IS_APPROX( (sq_m1 += (s1*v1).asDiagonal()), sq_m2 += (s1*v1).asDiagonal().toDenseMatrix() );
   VERIFY_IS_APPROX( (sq_m1 -= (s1*v1).asDiagonal()), sq_m2 -= (s1*v1).asDiagonal().toDenseMatrix() );
   VERIFY_IS_APPROX( (sq_m1 = (s1*v1).asDiagonal()), (s1*v1).asDiagonal().toDenseMatrix() );
+
+  sq_m1.setRandom();
+  sq_m2 = v1.asDiagonal();
+  sq_m2 = sq_m1 * sq_m2;
+  VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).col(i), sq_m2.col(i) );
+  VERIFY_IS_APPROX( (sq_m1*v1.asDiagonal()).row(i), sq_m2.row(i) );
+}
+
+template<typename MatrixType> void as_scalar_product(const MatrixType& m)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+  typedef Matrix<Scalar, Dynamic, Dynamic> DynMatrixType;
+  typedef Matrix<Scalar, Dynamic, 1> DynVectorType;
+  typedef Matrix<Scalar, 1, Dynamic> DynRowVectorType;
+
+  Index rows = m.rows();
+  Index depth = internal::random(1,EIGEN_TEST_MAX_SIZE);
+
+  VectorType v1 = VectorType::Random(rows);
+  DynVectorType dv1 = DynVectorType::Random(depth);
+  DynRowVectorType drv1 = DynRowVectorType::Random(depth);
+  DynMatrixType dm1 = dv1;
+  DynMatrixType drm1 = drv1;
+
+  Scalar s = v1(0);
+
+  VERIFY_IS_APPROX( v1.asDiagonal() * drv1, s*drv1 );
+  VERIFY_IS_APPROX( dv1 * v1.asDiagonal(), dv1*s );
+
+  VERIFY_IS_APPROX( v1.asDiagonal() * drm1, s*drm1 );
+  VERIFY_IS_APPROX( dm1 * v1.asDiagonal(), dm1*s );
 }
 
 template<int>
@@ -112,18 +144,23 @@
 void bug987()
 {
   VERIFY_IS_APPROX(( res1 = points.topLeftCorner<2,2>()*diag.asDiagonal()) , res2 = tmp2*diag.asDiagonal() );
 }
 
-void test_diagonalmatrices()
+EIGEN_DECLARE_TEST(diagonalmatrices)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( diagonalmatrices(Matrix()) );
+    CALL_SUBTEST_1( as_scalar_product(Matrix()) );
+
     CALL_SUBTEST_2( diagonalmatrices(Matrix3f()) );
     CALL_SUBTEST_3( diagonalmatrices(Matrix()) );
     CALL_SUBTEST_4( diagonalmatrices(Matrix4d()) );
     CALL_SUBTEST_5( diagonalmatrices(Matrix()) );
     CALL_SUBTEST_6( diagonalmatrices(MatrixXcf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_6( as_scalar_product(MatrixXcf(1,1)) );
     CALL_SUBTEST_7( diagonalmatrices(MatrixXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_8( diagonalmatrices(Matrix(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) );
     CALL_SUBTEST_9( diagonalmatrices(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) );
+    CALL_SUBTEST_9( diagonalmatrices(MatrixXf(1,1)) );
+    CALL_SUBTEST_9( as_scalar_product(MatrixXf(1,1)) );
   }
   CALL_SUBTEST_10( bug987<0>() );
 }
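The as_scalar_product() test exercises the degenerate case where the diagonal has a single entry, so the diagonal product degenerates to a scalar product. A minimal standalone sketch of what it asserts (my example):

    #include <Eigen/Dense>
    using namespace Eigen;

    int main()
    {
      // A 1x1 diagonal matrix behaves like a scalar factor in products:
      Matrix<float,1,1> v; v << 2.0f;
      RowVectorXf r = RowVectorXf::Ones(5);
      RowVectorXf lhs = v.asDiagonal() * r;    // (1x1 diagonal) * (1x5) == 2*r
      return lhs.isApprox(2.0f * r) ? 0 : 1;
    }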
diff --git a/test/dontalign.cpp b/test/dontalign.cpp
index 4643cfed6..2e4102b86 100644
--- a/test/dontalign.cpp
+++ b/test/dontalign.cpp
@@ -19,7 +19,6 @@
 template<typename MatrixType>
 void dontalign(const MatrixType& m)
 {
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef Matrix VectorType;
   typedef Matrix SquareMatrixType;
@@ -45,7 +44,7 @@ void dontalign(const MatrixType& m)
   internal::aligned_delete(array, rows);
 }
 
-void test_dontalign()
+EIGEN_DECLARE_TEST(dontalign)
 {
 #if defined EIGEN_TEST_PART_1 || defined EIGEN_TEST_PART_5
   dontalign(Matrix3d());
diff --git a/test/dynalloc.cpp b/test/dynalloc.cpp
index f1cc70bee..1c74866ba 100644
--- a/test/dynalloc.cpp
+++ b/test/dynalloc.cpp
@@ -15,6 +15,7 @@
 #define ALIGNMENT 1
 #endif
 
+typedef Matrix<float,16,1> Vector16f;
 typedef Matrix<float,8,1> Vector8f;
 
 void check_handmade_aligned_malloc()
@@ -70,7 +71,7 @@ struct MyStruct
 {
   EIGEN_MAKE_ALIGNED_OPERATOR_NEW
   char dummychar;
-  Vector8f avec;
+  Vector16f avec;
 };
 
 class MyClassA
@@ -78,7 +79,7 @@ class MyClassA
   public:
     EIGEN_MAKE_ALIGNED_OPERATOR_NEW
     char dummychar;
-    Vector8f avec;
+    Vector16f avec;
 };
 
 template<typename T> void check_dynaligned()
@@ -119,7 +120,7 @@ template<typename T> void check_custom_new_delete()
 #endif
 }
 
-void test_dynalloc()
+EIGEN_DECLARE_TEST(dynalloc)
 {
   // low level dynamic memory allocation
   CALL_SUBTEST(check_handmade_aligned_malloc());
@@ -145,6 +146,7 @@
     CALL_SUBTEST(check_dynaligned() );
     CALL_SUBTEST(check_dynaligned() );
    CALL_SUBTEST(check_dynaligned() );
+    CALL_SUBTEST(check_dynaligned<Vector16f>() );
   }
 
   {
diff --git a/test/eigen2support.cpp b/test/eigen2support.cpp
index ad1d98091..49d7328e9 100644
--- a/test/eigen2support.cpp
+++ b/test/eigen2support.cpp
@@ -13,7 +13,6 @@
 
 template<typename MatrixType> void eigen2support(const MatrixType& m)
 {
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
 
   Index rows = m.rows();
@@ -53,7 +52,7 @@ template<typename MatrixType> void eigen2support(const MatrixType& m)
   m1.minor(0,0);
 }
 
-void test_eigen2support()
+EIGEN_DECLARE_TEST(eigen2support)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( eigen2support(Matrix()) );
diff --git a/test/eigensolver_complex.cpp b/test/eigensolver_complex.cpp
index 293b1b265..c5373f420 100644
--- a/test/eigensolver_complex.cpp
+++ b/test/eigensolver_complex.cpp
@@ -47,7 +47,7 @@ template<typename MatrixType> bool find_pivot(typename MatrixType::Scalar tol, M
   return false;
 }
 
-/* Check that two column vectors are approximately equal upto permutations.
+/* Check that two column vectors are approximately equal up to permutations.
 * Initially, this method checked that the k-th power sums are equal for all k = 1, ..., vec1.rows(),
 * however this strategy is numerically inacurate because of numerical cancellation issues.
 */
@@ -71,7 +71,6 @@ void verify_is_approx_upto_permutation(const VectorType& vec1, const VectorType&
 
 template<typename MatrixType> void eigensolver(const MatrixType& m)
 {
-  typedef typename MatrixType::Index Index;
   /* this test covers the following files:
      ComplexEigenSolver.h, and indirectly ComplexSchur.h
   */
@@ -153,7 +152,7 @@ template<typename MatrixType> void eigensolver_verify_assert(const MatrixType& m)
   VERIFY_RAISES_ASSERT(eig.eigenvectors());
 }
 
-void test_eigensolver_complex()
+EIGEN_DECLARE_TEST(eigensolver_complex)
 {
   int s = 0;
   for(int i = 0; i < g_repeat; i++) {
diff --git a/test/eigensolver_generalized_real.cpp b/test/eigensolver_generalized_real.cpp
index 9c0838ba4..95ed431db 100644
--- a/test/eigensolver_generalized_real.cpp
+++ b/test/eigensolver_generalized_real.cpp
@@ -15,7 +15,6 @@
 
 template<typename MatrixType> void generalized_eigensolver_real(const MatrixType& m)
 {
-  typedef typename MatrixType::Index Index;
   /* this test covers the following files:
      GeneralizedEigenSolver.h
   */
@@ -77,9 +76,16 @@ template<typename MatrixType> void generalized_eigensolver_real(const MatrixType& m)
     GeneralizedEigenSolver<MatrixType> eig2(a.adjoint() * a,b.adjoint() * b);
     eig2.compute(a.adjoint() * a,b.adjoint() * b);
   }
+
+  // check without eigenvectors
+  {
+    GeneralizedEigenSolver<MatrixType> eig1(spdA, spdB, true);
+    GeneralizedEigenSolver<MatrixType> eig2(spdA, spdB, false);
+    VERIFY_IS_APPROX(eig1.eigenvalues(), eig2.eigenvalues());
+  }
 }
 
-void test_eigensolver_generalized_real()
+EIGEN_DECLARE_TEST(eigensolver_generalized_real)
 {
   for(int i = 0; i < g_repeat; i++) {
     int s = 0;
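The new "check without eigenvectors" block verifies that skipping eigenvector computation leaves the eigenvalues unchanged. The same check in standalone form (my sketch; any SPD-ish pair of matrices works):

    #include <Eigen/Eigenvalues>
    using namespace Eigen;

    int main()
    {
      MatrixXd a = MatrixXd::Random(4, 4), b = MatrixXd::Random(4, 4);
      MatrixXd spdA = a.adjoint() * a + MatrixXd::Identity(4, 4);
      MatrixXd spdB = b.adjoint() * b + MatrixXd::Identity(4, 4);
      GeneralizedEigenSolver<MatrixXd> eigFull(spdA, spdB, true);   // values + vectors
      GeneralizedEigenSolver<MatrixXd> eigVals(spdA, spdB, false);  // values only
      return eigFull.eigenvalues().isApprox(eigVals.eigenvalues()) ? 0 : 1;
    }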
diff --git a/test/eigensolver_generic.cpp b/test/eigensolver_generic.cpp
index d0e644d4b..e0e435151 100644
--- a/test/eigensolver_generic.cpp
+++ b/test/eigensolver_generic.cpp
@@ -14,7 +14,6 @@
 
 template<typename MatrixType> void eigensolver(const MatrixType& m)
 {
-  typedef typename MatrixType::Index Index;
   /* this test covers the following files:
      EigenSolver.h
   */
@@ -101,7 +100,35 @@ template<typename MatrixType> void eigensolver_verify_assert(const MatrixType& m)
   VERIFY_RAISES_ASSERT(eig.pseudoEigenvectors());
 }
 
-void test_eigensolver_generic()
+template<int>
+void eigensolver_generic_extra()
+{
+  {
+    // regression test for bug 793
+    MatrixXd a(3,3);
+    a << 0,  0,  1,
+         1,  1,  1,
+         1, 1e+200, 1;
+    Eigen::EigenSolver<MatrixXd> eig(a);
+    double scale = 1e-200; // scale to avoid overflow during the comparisons
+    VERIFY_IS_APPROX(a * eig.pseudoEigenvectors()*scale, eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()*scale);
+    VERIFY_IS_APPROX(a * eig.eigenvectors()*scale, eig.eigenvectors() * eig.eigenvalues().asDiagonal()*scale);
+  }
+  {
+    // check a case where all eigenvalues are null.
+    MatrixXd a(2,2);
+    a <<  1,  1,
+         -1, -1;
+    Eigen::EigenSolver<MatrixXd> eig(a);
+    VERIFY_IS_APPROX(eig.pseudoEigenvectors().squaredNorm(), 2.);
+    VERIFY_IS_APPROX((a * eig.pseudoEigenvectors()).norm()+1., 1.);
+    VERIFY_IS_APPROX((eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()).norm()+1., 1.);
+    VERIFY_IS_APPROX((a * eig.eigenvectors()).norm()+1., 1.);
+    VERIFY_IS_APPROX((eig.eigenvectors() * eig.eigenvalues().asDiagonal()).norm()+1., 1.);
+  }
+}
+
+EIGEN_DECLARE_TEST(eigensolver_generic)
 {
   int s = 0;
   for(int i = 0; i < g_repeat; i++) {
@@ -136,31 +163,7 @@
   }
   );
 
-#ifdef EIGEN_TEST_PART_2
-  {
-    // regression test for bug 793
-    MatrixXd a(3,3);
-    a << 0,  0,  1,
-         1,  1,  1,
-         1, 1e+200, 1;
-    Eigen::EigenSolver<MatrixXd> eig(a);
-    double scale = 1e-200; // scale to avoid overflow during the comparisons
-    VERIFY_IS_APPROX(a * eig.pseudoEigenvectors()*scale, eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()*scale);
-    VERIFY_IS_APPROX(a * eig.eigenvectors()*scale, eig.eigenvectors() * eig.eigenvalues().asDiagonal()*scale);
-  }
-  {
-    // check a case where all eigenvalues are null.
-    MatrixXd a(2,2);
-    a <<  1,  1,
-         -1, -1;
-    Eigen::EigenSolver<MatrixXd> eig(a);
-    VERIFY_IS_APPROX(eig.pseudoEigenvectors().squaredNorm(), 2.);
-    VERIFY_IS_APPROX((a * eig.pseudoEigenvectors()).norm()+1., 1.);
-    VERIFY_IS_APPROX((eig.pseudoEigenvectors() * eig.pseudoEigenvalueMatrix()).norm()+1., 1.);
-    VERIFY_IS_APPROX((a * eig.eigenvectors()).norm()+1., 1.);
-    VERIFY_IS_APPROX((eig.eigenvectors() * eig.eigenvalues().asDiagonal()).norm()+1., 1.);
-  }
-#endif
+  CALL_SUBTEST_2( eigensolver_generic_extra<0>() );
 
   TEST_SET_BUT_UNUSED_VARIABLE(s)
 }
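The second case in eigensolver_generic_extra() uses a nilpotent matrix (a*a == 0), so every eigenvalue is zero. Standalone illustration of why the test pins the norms near zero (my example):

    #include <Eigen/Eigenvalues>
    #include <iostream>
    using namespace Eigen;

    int main()
    {
      // Nilpotent: both eigenvalues are exactly zero.
      MatrixXd a(2, 2);
      a <<  1,  1,
           -1, -1;
      EigenSolver<MatrixXd> eig(a);
      std::cout << eig.eigenvalues().transpose() << "\n";  // ~ (0,0)
      return 0;
    }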
diff --git a/test/eigensolver_selfadjoint.cpp b/test/eigensolver_selfadjoint.cpp
index 39ad4130e..65b80c3fb 100644
--- a/test/eigensolver_selfadjoint.cpp
+++ b/test/eigensolver_selfadjoint.cpp
@@ -68,7 +68,6 @@ template<typename MatrixType> void selfadjointeigensolver_essential_check(const
 
 template<typename MatrixType> void selfadjointeigensolver(const MatrixType& m)
 {
-  typedef typename MatrixType::Index Index;
   /* this test covers the following files:
      EigenSolver.h, SelfAdjointEigenSolver.h (and indirectly: Tridiagonalization.h)
   */
@@ -231,7 +230,7 @@ void bug_1204()
   SelfAdjointEigenSolver > eig(A);
 }
 
-void test_eigensolver_selfadjoint()
+EIGEN_DECLARE_TEST(eigensolver_selfadjoint)
 {
   int s = 0;
   for(int i = 0; i < g_repeat; i++) {
diff --git a/test/evaluators.cpp b/test/evaluators.cpp
index aed5a05a7..f4fdaf053 100644
--- a/test/evaluators.cpp
+++ b/test/evaluators.cpp
@@ -101,7 +101,7 @@ using namespace std;
 #define VERIFY_IS_APPROX_EVALUATOR(DEST,EXPR) VERIFY_IS_APPROX(copy_using_evaluator(DEST,(EXPR)), (EXPR).eval());
 #define VERIFY_IS_APPROX_EVALUATOR2(DEST,EXPR,REF) VERIFY_IS_APPROX(copy_using_evaluator(DEST,(EXPR)), (REF).eval());
 
-void test_evaluators()
+EIGEN_DECLARE_TEST(evaluators)
 {
   // Testing Matrix evaluator and Transpose
   Vector2d v = Vector2d::Random();
diff --git a/test/exceptions.cpp b/test/exceptions.cpp
index b83fb82ba..3d93060ab 100644
--- a/test/exceptions.cpp
+++ b/test/exceptions.cpp
@@ -8,93 +8,34 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 
-// Various sanity tests with exceptions:
+// Various sanity tests with exceptions and a non-trivially-copyable scalar type.
 //  - no memory leak when a custom scalar type trow an exceptions
 //  - todo : complete the list of tests!
 
 #define EIGEN_STACK_ALLOCATION_LIMIT 100000000
 
 #include "main.h"
-
-struct my_exception
-{
-  my_exception() {}
-  ~my_exception() {}
-};
-
-class ScalarWithExceptions
-{
-  public:
-    ScalarWithExceptions() { init(); }
-    ScalarWithExceptions(const float& _v) { init(); *v = _v; }
-    ScalarWithExceptions(const ScalarWithExceptions& other) { init(); *v = *(other.v); }
-    ~ScalarWithExceptions() {
-      delete v;
-      instances--;
-    }
-
-    void init() {
-      v = new float;
-      instances++;
-    }
-
-    ScalarWithExceptions operator+(const ScalarWithExceptions& other) const
-    {
-      countdown--;
-      if(countdown<=0)
-        throw my_exception();
-      return ScalarWithExceptions(*v+*other.v);
-    }
-
-    ScalarWithExceptions operator-(const ScalarWithExceptions& other) const
-    { return ScalarWithExceptions(*v-*other.v); }
-
-    ScalarWithExceptions operator*(const ScalarWithExceptions& other) const
-    { return ScalarWithExceptions((*v)*(*other.v)); }
-
-    ScalarWithExceptions& operator+=(const ScalarWithExceptions& other)
-    { *v+=*other.v; return *this; }
-    ScalarWithExceptions& operator-=(const ScalarWithExceptions& other)
-    { *v-=*other.v; return *this; }
-    ScalarWithExceptions& operator=(const ScalarWithExceptions& other)
-    { *v = *(other.v); return *this; }
-
-    bool operator==(const ScalarWithExceptions& other) const
-    { return *v==*other.v; }
-    bool operator!=(const ScalarWithExceptions& other) const
-    { return *v!=*other.v; }
-
-    float* v;
-    static int instances;
-    static int countdown;
-};
-
-ScalarWithExceptions real(const ScalarWithExceptions &x) { return x; }
-ScalarWithExceptions imag(const ScalarWithExceptions & ) { return 0; }
-ScalarWithExceptions conj(const ScalarWithExceptions &x) { return x; }
-
-int ScalarWithExceptions::instances = 0;
-int ScalarWithExceptions::countdown = 0;
-
+#include "AnnoyingScalar.h"
 
 #define CHECK_MEMLEAK(OP) {                                 \
-    ScalarWithExceptions::countdown = 100;                  \
-    int before = ScalarWithExceptions::instances;           \
-    bool exception_thrown = false;                          \
-    try { OP; }                                             \
+    AnnoyingScalar::countdown = 100;                        \
+    int before = AnnoyingScalar::instances;                 \
+    bool exception_thrown = false;                          \
+    try { OP; }                                             \
     catch (my_exception) {                                  \
      exception_thrown = true;                              \
-      VERIFY(ScalarWithExceptions::instances==before && "memory leak detected in " && EIGEN_MAKESTRING(OP)); \
+      VERIFY(AnnoyingScalar::instances==before && "memory leak detected in " && EIGEN_MAKESTRING(OP)); \
    }                                                       \
-    VERIFY(exception_thrown && " no exception thrown in " && EIGEN_MAKESTRING(OP)); \
+    VERIFY( (AnnoyingScalar::dont_throw) || (exception_thrown && " no exception thrown in " && EIGEN_MAKESTRING(OP)) ); \
   }
 
-void memoryleak()
+EIGEN_DECLARE_TEST(exceptions)
 {
-  typedef Eigen::Matrix<ScalarWithExceptions,Dynamic,1> VectorType;
-  typedef Eigen::Matrix<ScalarWithExceptions,Dynamic,Dynamic> MatrixType;
+  typedef Eigen::Matrix<AnnoyingScalar,Dynamic,1> VectorType;
+  typedef Eigen::Matrix<AnnoyingScalar,Dynamic,Dynamic> MatrixType;
 
   {
+    AnnoyingScalar::dont_throw = false;
     int n = 50;
     VectorType v0(n), v1(n);
     MatrixType m0(n,n), m1(n,n), m2(n,n);
@@ -104,10 +45,5 @@ void memoryleak()
     CHECK_MEMLEAK(m2 = m0 * m1 * m2);
     CHECK_MEMLEAK((v0+v1).dot(v0+v1));
   }
-  VERIFY(ScalarWithExceptions::instances==0 && "global memory leak detected in " && EIGEN_MAKESTRING(OP)); \
-}
-
-void test_exceptions()
-{
-  CALL_SUBTEST( memoryleak() );
+  VERIFY(AnnoyingScalar::instances==0 && "global memory leak detected in " && EIGEN_MAKESTRING(OP));
 }
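For readers tracing the macro, here is what one CHECK_MEMLEAK(OP) expansion does, written out longhand as a sketch (it assumes the test-harness context of exceptions.cpp, i.e. main.h and AnnoyingScalar.h; the countdown value is tuned so the operation actually throws):

    #include "main.h"
    #include "AnnoyingScalar.h"

    typedef Eigen::Matrix<AnnoyingScalar,Eigen::Dynamic,1> VectorType;

    void check_memleak_sketch()
    {
      AnnoyingScalar::dont_throw = false;
      AnnoyingScalar::countdown = 20;       // the 20th operator+ will throw
      int before = AnnoyingScalar::instances;
      bool exception_thrown = false;
      try {
        VectorType v0(50), v1(50);
        v0 = v1 + v1;                       // throws mid-expression
      } catch (my_exception) {
        exception_thrown = true;
        // every temporary must have been destroyed during unwinding:
        VERIFY(AnnoyingScalar::instances == before && "memory leak detected");
      }
      VERIFY(exception_thrown || AnnoyingScalar::dont_throw);
    }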
\n\n"; check_inf_nan(true); std::cout << "*** double ***\n\n"; check_inf_nan(true); std::cout << "*** long double *** \n\n"; check_inf_nan(true); diff --git a/test/first_aligned.cpp b/test/first_aligned.cpp index ae2d4bc42..ed9945077 100644 --- a/test/first_aligned.cpp +++ b/test/first_aligned.cpp @@ -26,7 +26,7 @@ void test_none_aligned_helper(Scalar *array, int size) struct some_non_vectorizable_type { float x; }; -void test_first_aligned() +EIGEN_DECLARE_TEST(first_aligned) { EIGEN_ALIGN16 float array_float[100]; test_first_aligned_helper(array_float, 50); diff --git a/test/geo_alignedbox.cpp b/test/geo_alignedbox.cpp index 223ff5eea..c6c051ce4 100644 --- a/test/geo_alignedbox.cpp +++ b/test/geo_alignedbox.cpp @@ -33,7 +33,6 @@ template void alignedbox(const BoxType& _box) /* this test covers the following files: AlignedBox.h */ - typedef typename BoxType::Index Index; typedef typename BoxType::Scalar Scalar; typedef typename NumTraits::Real RealScalar; typedef Matrix VectorType; @@ -95,7 +94,6 @@ template void alignedboxCastTests(const BoxType& _box) { // casting - typedef typename BoxType::Index Index; typedef typename BoxType::Scalar Scalar; typedef Matrix VectorType; @@ -171,7 +169,7 @@ void specificTest2() } -void test_geo_alignedbox() +EIGEN_DECLARE_TEST(geo_alignedbox) { for(int i = 0; i < g_repeat; i++) { diff --git a/test/geo_eulerangles.cpp b/test/geo_eulerangles.cpp index 932ebe773..693c627a9 100644 --- a/test/geo_eulerangles.cpp +++ b/test/geo_eulerangles.cpp @@ -103,7 +103,7 @@ template void eulerangles() check_all_var(ea); } -void test_geo_eulerangles() +EIGEN_DECLARE_TEST(geo_eulerangles) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( eulerangles() ); diff --git a/test/geo_homogeneous.cpp b/test/geo_homogeneous.cpp index 2187c7bf9..9aebe6226 100644 --- a/test/geo_homogeneous.cpp +++ b/test/geo_homogeneous.cpp @@ -115,7 +115,7 @@ template void homogeneous(void) VERIFY_IS_APPROX( (t2.template triangularView() * v0.homogeneous()).eval(), (t2.template triangularView()*hv0) ); } -void test_geo_homogeneous() +EIGEN_DECLARE_TEST(geo_homogeneous) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1(( homogeneous() )); diff --git a/test/geo_hyperplane.cpp b/test/geo_hyperplane.cpp index 27892850d..a26709301 100644 --- a/test/geo_hyperplane.cpp +++ b/test/geo_hyperplane.cpp @@ -19,7 +19,6 @@ template void hyperplane(const HyperplaneType& _plane) Hyperplane.h */ using std::abs; - typedef typename HyperplaneType::Index Index; const Index dim = _plane.dim(); enum { Options = HyperplaneType::Options }; typedef typename HyperplaneType::Scalar Scalar; @@ -181,7 +180,7 @@ template void hyperplane_alignment() } -void test_geo_hyperplane() +EIGEN_DECLARE_TEST(geo_hyperplane) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( hyperplane(Hyperplane()) ); diff --git a/test/geo_orthomethods.cpp b/test/geo_orthomethods.cpp index e178df257..b7b660740 100644 --- a/test/geo_orthomethods.cpp +++ b/test/geo_orthomethods.cpp @@ -115,7 +115,7 @@ template void orthomethods(int size=Size) VERIFY_IS_APPROX(mcrossN3.row(i), matN3.row(i).cross(vec3)); } -void test_geo_orthomethods() +EIGEN_DECLARE_TEST(geo_orthomethods) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( orthomethods_3() ); diff --git a/test/geo_parametrizedline.cpp b/test/geo_parametrizedline.cpp index 29c1b105c..7135c8fa5 100644 --- a/test/geo_parametrizedline.cpp +++ b/test/geo_parametrizedline.cpp @@ -19,7 +19,6 @@ template void parametrizedline(const LineType& _line) ParametrizedLine.h */ using std::abs; - typedef typename 
LineType::Index Index; const Index dim = _line.dim(); typedef typename LineType::Scalar Scalar; typedef typename NumTraits::Real RealScalar; @@ -118,7 +117,7 @@ template void parametrizedline_alignment() #endif } -void test_geo_parametrizedline() +EIGEN_DECLARE_TEST(geo_parametrizedline) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( parametrizedline(ParametrizedLine()) ); diff --git a/test/geo_quaternion.cpp b/test/geo_quaternion.cpp index 96889e722..27219db10 100644 --- a/test/geo_quaternion.cpp +++ b/test/geo_quaternion.cpp @@ -12,6 +12,7 @@ #include #include #include +#include "AnnoyingScalar.h" template T bounded_acos(T v) { @@ -85,7 +86,7 @@ template void quaternion(void) if (refangle>Scalar(EIGEN_PI)) refangle = Scalar(2)*Scalar(EIGEN_PI) - refangle; - if((q1.coeffs()-q2.coeffs()).norm() > 10*largeEps) + if((q1.coeffs()-q2.coeffs()).norm() > Scalar(10)*largeEps) { VERIFY_IS_MUCH_SMALLER_THAN(abs(q1.angularDistance(q2) - refangle), Scalar(1)); } @@ -113,7 +114,7 @@ template void quaternion(void) // Do not execute the test if the rotation angle is almost zero, or // the rotation axis and v1 are almost parallel. - if (abs(aa.angle()) > 5*test_precision() + if (abs(aa.angle()) > Scalar(5)*test_precision() && (aa.axis() - v1.normalized()).norm() < Scalar(1.99) && (aa.axis() + v1.normalized()).norm() < Scalar(1.99)) { @@ -231,6 +232,19 @@ template void mapQuaternion(void){ VERIFY_IS_APPROX(mq3*mq2, q3*q2); VERIFY_IS_APPROX(mcq1*mq2, q1*q2); VERIFY_IS_APPROX(mcq3*mq2, q3*q2); + + // Bug 1461, compilation issue with Map::w(), and other reference/constness checks: + VERIFY_IS_APPROX(mcq3.coeffs().x() + mcq3.coeffs().y() + mcq3.coeffs().z() + mcq3.coeffs().w(), mcq3.coeffs().sum()); + VERIFY_IS_APPROX(mcq3.x() + mcq3.y() + mcq3.z() + mcq3.w(), mcq3.coeffs().sum()); + mq3.w() = 1; + const Quaternionx& cq3(q3); + VERIFY( &cq3.x() == &q3.x() ); + const MQuaternionUA& cmq3(mq3); + VERIFY( &cmq3.x() == &mq3.x() ); + // FIXME the following should be ok. The problem is that currently the LValueBit flag + // is used to determine whether we can return a coeff by reference or not, which is not enough for Map. + //const MCQuaternionUA& cmcq3(mcq3); + //VERIFY( &cmcq3.x() == &mcq3.x() ); } template void quaternionAlignment(void){ @@ -272,18 +286,38 @@ template void check_const_correctness(const PlainObjec VERIFY( !(Map::Flags & LvalueBit) ); } -void test_geo_quaternion() +#if EIGEN_HAS_RVALUE_REFERENCES + +// Regression for bug 1573 +struct MovableClass { + // The following line is a workaround for gcc 4.7 and 4.8 (see bug 1573 comments). 
+ static_assert(std::is_nothrow_move_constructible::value,""); + MovableClass() = default; + MovableClass(const MovableClass&) = default; + MovableClass(MovableClass&&) noexcept = default; + MovableClass& operator=(const MovableClass&) = default; + MovableClass& operator=(MovableClass&&) = default; + Quaternionf m_quat; +}; + +#endif + +EIGEN_DECLARE_TEST(geo_quaternion) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1(( quaternion() )); CALL_SUBTEST_1( check_const_correctness(Quaternionf()) ); + CALL_SUBTEST_1(( quaternion() )); + CALL_SUBTEST_1(( quaternionAlignment() )); + CALL_SUBTEST_1( mapQuaternion() ); + CALL_SUBTEST_2(( quaternion() )); CALL_SUBTEST_2( check_const_correctness(Quaterniond()) ); - CALL_SUBTEST_3(( quaternion() )); - CALL_SUBTEST_4(( quaternion() )); - CALL_SUBTEST_5(( quaternionAlignment() )); - CALL_SUBTEST_6(( quaternionAlignment() )); - CALL_SUBTEST_1( mapQuaternion() ); + CALL_SUBTEST_2(( quaternion() )); + CALL_SUBTEST_2(( quaternionAlignment() )); CALL_SUBTEST_2( mapQuaternion() ); + + AnnoyingScalar::dont_throw = true; + CALL_SUBTEST_3(( quaternion() )); } } diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp index 278e527c2..bf920696b 100755 --- a/test/geo_transformations.cpp +++ b/test/geo_transformations.cpp @@ -612,7 +612,7 @@ template void transform_products() VERIFY_IS_APPROX((ac*p).matrix(), a_m*p_m); } -void test_geo_transformations() +EIGEN_DECLARE_TEST(geo_transformations) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1(( transformations() )); diff --git a/test/cuda_basic.cu b/test/gpu_basic.cu similarity index 60% rename from test/cuda_basic.cu rename to test/gpu_basic.cu index cb2e4167a..e8069f185 100644 --- a/test/cuda_basic.cu +++ b/test/gpu_basic.cu @@ -15,16 +15,10 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cuda_basic #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int -#include -#include -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include -#endif #include "main.h" -#include "cuda_common.h" +#include "gpu_common.h" // Check that dense modules can be properly parsed by nvcc #include @@ -123,6 +117,22 @@ struct diagonal { } }; +template +struct eigenvalues_direct { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + typedef Matrix Vec; + T M(in+i); + Map res(out+i*Vec::MaxSizeAtCompileTime); + T A = M*M.adjoint(); + SelfAdjointEigenSolver eig; + eig.computeDirect(A); + res = eig.eigenvalues(); + } +}; + template struct eigenvalues { EIGEN_DEVICE_FUNC @@ -134,40 +144,71 @@ struct eigenvalues { Map res(out+i*Vec::MaxSizeAtCompileTime); T A = M*M.adjoint(); SelfAdjointEigenSolver eig; - eig.computeDirect(M); + eig.compute(A); res = eig.eigenvalues(); } }; -void test_cuda_basic() +template +struct matrix_inverse { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + T M(in+i); + Map res(out+i*T::MaxSizeAtCompileTime); + res = M.inverse(); + } +}; + +EIGEN_DECLARE_TEST(gpu_basic) { - ei_test_init_cuda(); + ei_test_init_gpu(); int nthreads = 100; Eigen::VectorXf in, out; - #ifndef __CUDA_ARCH__ + #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int data_size = nthreads * 512; in.setRandom(data_size); out.setRandom(data_size); #endif - CALL_SUBTEST( run_and_compare_to_cuda(coeff_wise(), nthreads, in, out) ); - CALL_SUBTEST( run_and_compare_to_cuda(coeff_wise(), nthreads, in, out) ); - - 
CALL_SUBTEST( run_and_compare_to_cuda(replicate(), nthreads, in, out) ); - CALL_SUBTEST( run_and_compare_to_cuda(replicate(), nthreads, in, out) ); - - CALL_SUBTEST( run_and_compare_to_cuda(redux(), nthreads, in, out) ); - CALL_SUBTEST( run_and_compare_to_cuda(redux(), nthreads, in, out) ); - - CALL_SUBTEST( run_and_compare_to_cuda(prod_test(), nthreads, in, out) ); - CALL_SUBTEST( run_and_compare_to_cuda(prod_test(), nthreads, in, out) ); - - CALL_SUBTEST( run_and_compare_to_cuda(diagonal(), nthreads, in, out) ); - CALL_SUBTEST( run_and_compare_to_cuda(diagonal(), nthreads, in, out) ); - - CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues(), nthreads, in, out) ); - CALL_SUBTEST( run_and_compare_to_cuda(eigenvalues(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise(), nthreads, in, out) ); +#if !defined(EIGEN_USE_HIP) + // FIXME + // These subtests result in a compile failure on the HIP platform + // + // eigen-upstream/Eigen/src/Core/Replicate.h:61:65: error: + // base class 'internal::dense_xpr_base, -1, -1> >::type' + // (aka 'ArrayBase, -1, -1> >') has protected default constructor + CALL_SUBTEST( run_and_compare_to_gpu(replicate(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_gpu(replicate(), nthreads, in, out) ); +#endif + + CALL_SUBTEST( run_and_compare_to_gpu(redux(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_gpu(redux(), nthreads, in, out) ); + + CALL_SUBTEST( run_and_compare_to_gpu(prod_test(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_gpu(prod_test(), nthreads, in, out) ); + + CALL_SUBTEST( run_and_compare_to_gpu(diagonal(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_gpu(diagonal(), nthreads, in, out) ); + + CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse(), nthreads, in, out) ); + + CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct(), nthreads, in, out) ); + CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct(), nthreads, in, out) ); + +#if defined(__NVCC__) + // FIXME + // These subtests compiles only with nvcc and fail with HIPCC and clang-cuda + CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues(), nthreads, in, out) ); + typedef Matrix Matrix6f; + CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues(), nthreads, in, out) ); +#endif } diff --git a/test/gpu_common.h b/test/gpu_common.h new file mode 100644 index 000000000..79d4ea694 --- /dev/null +++ b/test/gpu_common.h @@ -0,0 +1,157 @@ + +#ifndef EIGEN_TEST_GPU_COMMON_H +#define EIGEN_TEST_GPU_COMMON_H + +#ifdef EIGEN_USE_HIP + #include + #include +#else + #include + #include + #include +#endif + +#include + +#define EIGEN_USE_GPU +#include + +#if !defined(__CUDACC__) && !defined(__HIPCC__) +dim3 threadIdx, blockDim, blockIdx; +#endif + +template +void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out) +{ + for(int i=0; i +__global__ +void run_on_gpu_meta_kernel(const Kernel ker, int n, const Input* in, Output* out) +{ + int i = threadIdx.x + blockIdx.x*blockDim.x; + if(i +void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out) +{ + typename Input::Scalar* d_in; + typename Output::Scalar* d_out; + std::ptrdiff_t in_bytes = in.size() * sizeof(typename Input::Scalar); + std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar); + + gpuMalloc((void**)(&d_in), 
in_bytes); + gpuMalloc((void**)(&d_out), out_bytes); + + gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_out, out.data(), out_bytes, gpuMemcpyHostToDevice); + + // Simple and non-optimal 1D mapping assuming n is not too large + // That's only for unit testing! + dim3 Blocks(128); + dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) ); + + gpuDeviceSynchronize(); + +#ifdef EIGEN_USE_HIP + hipLaunchKernelGGL(HIP_KERNEL_NAME(run_on_gpu_meta_kernel::type, + typename std::decay::type>), + dim3(Grids), dim3(Blocks), 0, 0, ker, n, d_in, d_out); +#else + run_on_gpu_meta_kernel<<>>(ker, n, d_in, d_out); +#endif + + gpuDeviceSynchronize(); + + // check inputs have not been modified + gpuMemcpy(const_cast(in.data()), d_in, in_bytes, gpuMemcpyDeviceToHost); + gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost); + + gpuFree(d_in); + gpuFree(d_out); +} + + +template +void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& out) +{ + Input in_ref, in_gpu; + Output out_ref, out_gpu; + #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) + in_ref = in_gpu = in; + out_ref = out_gpu = out; + #else + EIGEN_UNUSED_VARIABLE(in); + EIGEN_UNUSED_VARIABLE(out); + #endif + run_on_cpu (ker, n, in_ref, out_ref); + run_on_gpu(ker, n, in_gpu, out_gpu); + #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) + VERIFY_IS_APPROX(in_ref, in_gpu); + VERIFY_IS_APPROX(out_ref, out_gpu); + #endif +} + +struct compile_time_device_info { + EIGEN_DEVICE_FUNC + void operator()(int /*i*/, const int* /*in*/, int* info) const + { + #if defined(__CUDA_ARCH__) + info[0] = int(__CUDA_ARCH__ +0); + #endif + #if defined(EIGEN_HIP_DEVICE_COMPILE) + info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0); + #endif + } +}; + +void ei_test_init_gpu() +{ + int device = 0; + gpuDeviceProp_t deviceProp; + gpuGetDeviceProperties(&deviceProp, device); + + ArrayXi dummy(1), info(10); + info = -1; + run_on_gpu(compile_time_device_info(),10,dummy,info); + + + std::cout << "GPU compile-time info:\n"; + + #ifdef EIGEN_CUDACC + std::cout << " EIGEN_CUDACC: " << int(EIGEN_CUDACC) << "\n"; + #endif + + #ifdef EIGEN_CUDACC_VER + std::cout << " EIGEN_CUDACC_VER: " << int(EIGEN_CUDACC_VER) << "\n"; + #endif + + #ifdef EIGEN_HIPCC + std::cout << " EIGEN_HIPCC: " << int(EIGEN_HIPCC) << "\n"; + #endif + + std::cout << " EIGEN_CUDA_ARCH: " << info[0] << "\n"; + std::cout << " EIGEN_HIP_DEVICE_COMPILE: " << info[1] << "\n"; + + std::cout << "GPU device info:\n"; + std::cout << " name: " << deviceProp.name << "\n"; + std::cout << " capability: " << deviceProp.major << "." 
<< deviceProp.minor << "\n"; + std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << "\n"; + std::cout << " maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << "\n"; + std::cout << " warpSize: " << deviceProp.warpSize << "\n"; + std::cout << " regsPerBlock: " << deviceProp.regsPerBlock << "\n"; + std::cout << " concurrentKernels: " << deviceProp.concurrentKernels << "\n"; + std::cout << " clockRate: " << deviceProp.clockRate << "\n"; + std::cout << " canMapHostMemory: " << deviceProp.canMapHostMemory << "\n"; + std::cout << " computeMode: " << deviceProp.computeMode << "\n"; +} + +#endif // EIGEN_TEST_GPU_COMMON_H diff --git a/test/half_float.cpp b/test/half_float.cpp index 6f3196852..2a7f9b497 100644 --- a/test/half_float.cpp +++ b/test/half_float.cpp @@ -9,7 +9,7 @@ #include "main.h" -#include +#include // Make sure it's possible to forward declare Eigen::half namespace Eigen { @@ -20,7 +20,7 @@ using Eigen::half; void test_conversion() { - using Eigen::half_impl::__half; + using Eigen::half_impl::__half_raw; // Conversion from float. VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00); @@ -37,9 +37,9 @@ void test_conversion() VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002); // Verify round-to-nearest-even behavior. - float val1 = float(half(__half(0x3c00))); - float val2 = float(half(__half(0x3c01))); - float val3 = float(half(__half(0x3c02))); + float val1 = float(half(__half_raw(0x3c00))); + float val2 = float(half(__half_raw(0x3c01))); + float val3 = float(half(__half_raw(0x3c02))); VERIFY_IS_EQUAL(half(0.5f * (val1 + val2)).x, 0x3c00); VERIFY_IS_EQUAL(half(0.5f * (val2 + val3)).x, 0x3c02); @@ -55,21 +55,21 @@ void test_conversion() VERIFY_IS_EQUAL(half(true).x, 0x3c00); // Conversion to float. - VERIFY_IS_EQUAL(float(half(__half(0x0000))), 0.0f); - VERIFY_IS_EQUAL(float(half(__half(0x3c00))), 1.0f); + VERIFY_IS_EQUAL(float(half(__half_raw(0x0000))), 0.0f); + VERIFY_IS_EQUAL(float(half(__half_raw(0x3c00))), 1.0f); // Denormals. - VERIFY_IS_APPROX(float(half(__half(0x8001))), -5.96046e-08f); - VERIFY_IS_APPROX(float(half(__half(0x0001))), 5.96046e-08f); - VERIFY_IS_APPROX(float(half(__half(0x0002))), 1.19209e-07f); + VERIFY_IS_APPROX(float(half(__half_raw(0x8001))), -5.96046e-08f); + VERIFY_IS_APPROX(float(half(__half_raw(0x0001))), 5.96046e-08f); + VERIFY_IS_APPROX(float(half(__half_raw(0x0002))), 1.19209e-07f); // NaNs and infinities. VERIFY(!(numext::isinf)(float(half(65504.0f)))); // Largest finite number. VERIFY(!(numext::isnan)(float(half(0.0f)))); - VERIFY((numext::isinf)(float(half(__half(0xfc00))))); - VERIFY((numext::isnan)(float(half(__half(0xfc01))))); - VERIFY((numext::isinf)(float(half(__half(0x7c00))))); - VERIFY((numext::isnan)(float(half(__half(0x7c01))))); + VERIFY((numext::isinf)(float(half(__half_raw(0xfc00))))); + VERIFY((numext::isnan)(float(half(__half_raw(0xfc01))))); + VERIFY((numext::isinf)(float(half(__half_raw(0x7c00))))); + VERIFY((numext::isnan)(float(half(__half_raw(0x7c01))))); #if !EIGEN_COMP_MSVC // Visual Studio errors out on divisions by 0 @@ -79,12 +79,12 @@ void test_conversion() #endif // Exactly same checks as above, just directly on the half representation. 
- VERIFY(!(numext::isinf)(half(__half(0x7bff)))); - VERIFY(!(numext::isnan)(half(__half(0x0000)))); - VERIFY((numext::isinf)(half(__half(0xfc00)))); - VERIFY((numext::isnan)(half(__half(0xfc01)))); - VERIFY((numext::isinf)(half(__half(0x7c00)))); - VERIFY((numext::isnan)(half(__half(0x7c01)))); + VERIFY(!(numext::isinf)(half(__half_raw(0x7bff)))); + VERIFY(!(numext::isnan)(half(__half_raw(0x0000)))); + VERIFY((numext::isinf)(half(__half_raw(0xfc00)))); + VERIFY((numext::isnan)(half(__half_raw(0xfc01)))); + VERIFY((numext::isinf)(half(__half_raw(0x7c00)))); + VERIFY((numext::isnan)(half(__half_raw(0x7c01)))); #if !EIGEN_COMP_MSVC // Visual Studio errors out on divisions by 0 @@ -96,12 +96,24 @@ void test_conversion() void test_numtraits() { - std::cout << "epsilon = " << NumTraits::epsilon() << std::endl; - std::cout << "highest = " << NumTraits::highest() << std::endl; - std::cout << "lowest = " << NumTraits::lowest() << std::endl; - std::cout << "inifinty = " << NumTraits::infinity() << std::endl; - std::cout << "nan = " << NumTraits::quiet_NaN() << std::endl; + std::cout << "epsilon = " << NumTraits::epsilon() << " (0x" << std::hex << NumTraits::epsilon().x << ")" << std::endl; + std::cout << "highest = " << NumTraits::highest() << " (0x" << std::hex << NumTraits::highest().x << ")" << std::endl; + std::cout << "lowest = " << NumTraits::lowest() << " (0x" << std::hex << NumTraits::lowest().x << ")" << std::endl; + std::cout << "min = " << (std::numeric_limits::min)() << " (0x" << std::hex << half((std::numeric_limits::min)()).x << ")" << std::endl; + std::cout << "denorm min = " << (std::numeric_limits::denorm_min)() << " (0x" << std::hex << half((std::numeric_limits::denorm_min)()).x << ")" << std::endl; + std::cout << "infinity = " << NumTraits::infinity() << " (0x" << std::hex << NumTraits::infinity().x << ")" << std::endl; + std::cout << "quiet nan = " << NumTraits::quiet_NaN() << " (0x" << std::hex << NumTraits::quiet_NaN().x << ")" << std::endl; + std::cout << "signaling nan = " << std::numeric_limits::signaling_NaN() << " (0x" << std::hex << std::numeric_limits::signaling_NaN().x << ")" << std::endl; + VERIFY(NumTraits::IsSigned); + + VERIFY_IS_EQUAL( std::numeric_limits::infinity().x, half(std::numeric_limits::infinity()).x ); + VERIFY_IS_EQUAL( std::numeric_limits::quiet_NaN().x, half(std::numeric_limits::quiet_NaN()).x ); + VERIFY_IS_EQUAL( std::numeric_limits::signaling_NaN().x, half(std::numeric_limits::signaling_NaN()).x ); + VERIFY( (std::numeric_limits::min)() > half(0.f) ); + VERIFY( (std::numeric_limits::denorm_min)() > half(0.f) ); + VERIFY( (std::numeric_limits::min)()/half(2) > half(0.f) ); + VERIFY_IS_EQUAL( (std::numeric_limits::denorm_min)()/half(2), half(0.f) ); } void test_arithmetic() @@ -245,13 +257,31 @@ void test_array() ss << a1; } -void test_half_float() +void test_product() { - CALL_SUBTEST(test_conversion()); - CALL_SUBTEST(test_numtraits()); - CALL_SUBTEST(test_arithmetic()); - CALL_SUBTEST(test_comparison()); - CALL_SUBTEST(test_basic_functions()); - CALL_SUBTEST(test_trigonometric_functions()); - CALL_SUBTEST(test_array()); + typedef Matrix MatrixXh; + Index rows = internal::random(1,EIGEN_TEST_MAX_SIZE); + Index cols = internal::random(1,EIGEN_TEST_MAX_SIZE); + Index depth = internal::random(1,EIGEN_TEST_MAX_SIZE); + MatrixXh Ah = MatrixXh::Random(rows,depth); + MatrixXh Bh = MatrixXh::Random(depth,cols); + MatrixXh Ch = MatrixXh::Random(rows,cols); + MatrixXf Af = Ah.cast(); + MatrixXf Bf = Bh.cast(); + MatrixXf Cf = Ch.cast(); + 
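The comparison that follows checks the half-precision matrix product against a float reference. Since the angle brackets in this hunk were eaten by the renderer, a self-contained version of the check (assuming the dropped casts are cast<float>() and cast<half>(), which is what a mixed-precision comparison requires, and that Eigen::half is available from Eigen/Dense as in recent Eigen) looks like:

  #include <Eigen/Dense>
  using Eigen::half;
  typedef Eigen::Matrix<half, Eigen::Dynamic, Eigen::Dynamic> MatrixXh;

  // Hypothetical helper mirroring test_product: a product accumulated in half
  // must match the same product accumulated in float, rounded back to half.
  bool check_half_product(const MatrixXh& Ah, const MatrixXh& Bh, MatrixXh Ch)
  {
    Eigen::MatrixXf Af = Ah.cast<float>();  // promote operands to float
    Eigen::MatrixXf Bf = Bh.cast<float>();
    Eigen::MatrixXf Cf = Ch.cast<float>();
    Ch.noalias() += Ah * Bh;                // product in half precision
    Cf.noalias() += Af * Bf;                // reference product in float
    return Ch.isApprox(Cf.cast<half>());    // equal up to half's precision
  }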
VERIFY_IS_APPROX(Ch.noalias()+=Ah*Bh, (Cf.noalias()+=Af*Bf).cast()); +} + +EIGEN_DECLARE_TEST(half_float) +{ + CALL_SUBTEST(test_numtraits()); + for(int i = 0; i < g_repeat; i++) { + CALL_SUBTEST(test_conversion()); + CALL_SUBTEST(test_arithmetic()); + CALL_SUBTEST(test_comparison()); + CALL_SUBTEST(test_basic_functions()); + CALL_SUBTEST(test_trigonometric_functions()); + CALL_SUBTEST(test_array()); + CALL_SUBTEST(test_product()); + } } diff --git a/test/hessenberg.cpp b/test/hessenberg.cpp index 96bc19e2e..0e1b0098d 100644 --- a/test/hessenberg.cpp +++ b/test/hessenberg.cpp @@ -49,7 +49,7 @@ template void hessenberg(int size = Size) // TODO: Add tests for packedMatrix() and householderCoefficients() } -void test_hessenberg() +EIGEN_DECLARE_TEST(hessenberg) { CALL_SUBTEST_1(( hessenberg,1>() )); CALL_SUBTEST_2(( hessenberg,2>() )); diff --git a/test/householder.cpp b/test/householder.cpp index c5f6b5e4f..cad8138a2 100644 --- a/test/householder.cpp +++ b/test/householder.cpp @@ -12,7 +12,6 @@ template void householder(const MatrixType& m) { - typedef typename MatrixType::Index Index; static bool even = true; even = !even; /* this test covers the following files: @@ -49,6 +48,17 @@ template void householder(const MatrixType& m) v1.applyHouseholderOnTheLeft(essential,beta,tmp); VERIFY_IS_APPROX(v1.norm(), v2.norm()); + // reconstruct householder matrix: + SquareMatrixType id, H1, H2; + id.setIdentity(rows, rows); + H1 = H2 = id; + VectorType vv(rows); + vv << Scalar(1), essential; + H1.applyHouseholderOnTheLeft(essential, beta, tmp); + H2.applyHouseholderOnTheRight(essential, beta, tmp); + VERIFY_IS_APPROX(H1, H2); + VERIFY_IS_APPROX(H1, id - beta * vv*vv.adjoint()); + MatrixType m1(rows, cols), m2(rows, cols); @@ -69,7 +79,7 @@ template void householder(const MatrixType& m) m3.rowwise() = v1.transpose(); m4 = m3; m3.row(0).makeHouseholder(essential, beta, alpha); - m3.applyHouseholderOnTheRight(essential,beta,tmp); + m3.applyHouseholderOnTheRight(essential.conjugate(),beta,tmp); VERIFY_IS_APPROX(m3.norm(), m4.norm()); if(rows>=2) VERIFY_IS_MUCH_SMALLER_THAN(m3.block(0,1,rows,rows-1).norm(), m3.norm()); VERIFY_IS_MUCH_SMALLER_THAN(numext::imag(m3(0,0)), numext::real(m3(0,0))); @@ -104,14 +114,14 @@ template void householder(const MatrixType& m) VERIFY_IS_APPROX(hseq_mat.adjoint(), hseq_mat_adj); VERIFY_IS_APPROX(hseq_mat.conjugate(), hseq_mat_conj); VERIFY_IS_APPROX(hseq_mat.transpose(), hseq_mat_trans); - VERIFY_IS_APPROX(hseq_mat * m6, hseq_mat * m6); - VERIFY_IS_APPROX(hseq_mat.adjoint() * m6, hseq_mat_adj * m6); - VERIFY_IS_APPROX(hseq_mat.conjugate() * m6, hseq_mat_conj * m6); - VERIFY_IS_APPROX(hseq_mat.transpose() * m6, hseq_mat_trans * m6); - VERIFY_IS_APPROX(m6 * hseq_mat, m6 * hseq_mat); - VERIFY_IS_APPROX(m6 * hseq_mat.adjoint(), m6 * hseq_mat_adj); - VERIFY_IS_APPROX(m6 * hseq_mat.conjugate(), m6 * hseq_mat_conj); - VERIFY_IS_APPROX(m6 * hseq_mat.transpose(), m6 * hseq_mat_trans); + VERIFY_IS_APPROX(hseq * m6, hseq_mat * m6); + VERIFY_IS_APPROX(hseq.adjoint() * m6, hseq_mat_adj * m6); + VERIFY_IS_APPROX(hseq.conjugate() * m6, hseq_mat_conj * m6); + VERIFY_IS_APPROX(hseq.transpose() * m6, hseq_mat_trans * m6); + VERIFY_IS_APPROX(m6 * hseq, m6 * hseq_mat); + VERIFY_IS_APPROX(m6 * hseq.adjoint(), m6 * hseq_mat_adj); + VERIFY_IS_APPROX(m6 * hseq.conjugate(), m6 * hseq_mat_conj); + VERIFY_IS_APPROX(m6 * hseq.transpose(), m6 * hseq_mat_trans); // test householder sequence on the right with a shift @@ -123,7 +133,7 @@ template void householder(const MatrixType& m) VERIFY_IS_APPROX(m3 * 
m5, m1); // test evaluating rhseq to a dense matrix, then applying } -void test_householder() +EIGEN_DECLARE_TEST(householder) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( householder(Matrix()) ); diff --git a/test/incomplete_cholesky.cpp b/test/incomplete_cholesky.cpp index 59ffe9259..68fe7d507 100644 --- a/test/incomplete_cholesky.cpp +++ b/test/incomplete_cholesky.cpp @@ -29,14 +29,10 @@ template void test_incomplete_cholesky_T() CALL_SUBTEST( check_sparse_spd_solving(cg_illt_uplo_amd) ); } -void test_incomplete_cholesky() +template +void bug1150() { - CALL_SUBTEST_1(( test_incomplete_cholesky_T() )); - CALL_SUBTEST_2(( test_incomplete_cholesky_T, int>() )); - CALL_SUBTEST_3(( test_incomplete_cholesky_T() )); - -#ifdef EIGEN_TEST_PART_1 - // regression for bug 1150 + // regression for bug 1150 for(int N = 1; N<20; ++N) { Eigen::MatrixXd b( N, N ); @@ -61,5 +57,13 @@ void test_incomplete_cholesky() VERIFY(solver.preconditioner().info() == Eigen::Success); VERIFY(solver.info() == Eigen::Success); } -#endif +} + +EIGEN_DECLARE_TEST(incomplete_cholesky) +{ + CALL_SUBTEST_1(( test_incomplete_cholesky_T() )); + CALL_SUBTEST_2(( test_incomplete_cholesky_T, int>() )); + CALL_SUBTEST_3(( test_incomplete_cholesky_T() )); + + CALL_SUBTEST_1(( bug1150<0>() )); } diff --git a/test/indexed_view.cpp b/test/indexed_view.cpp index 7245cf378..9a284cf0a 100644 --- a/test/indexed_view.cpp +++ b/test/indexed_view.cpp @@ -77,12 +77,11 @@ is_same_seq_type(const T1& a, const T2& b) #define VERIFY_EQ_INT(A,B) VERIFY_IS_APPROX(int(A),int(B)) +// C++03 does not allow local or unnamed enums as index +enum DummyEnum { XX=0, YY=1 }; + void check_indexed_view() { - using Eigen::placeholders::all; - using Eigen::placeholders::last; - using Eigen::placeholders::end; - Index n = 10; ArrayXd a = ArrayXd::LinSpaced(n,0,n-1); @@ -140,7 +139,7 @@ void check_indexed_view() "500 501 502 503 504 505 506 507 508 509") ); - // takes the row numer 3, and repeat it 5 times + // take row number 3, and repeat it 5 times VERIFY( MATCH( A(seqN(3,5,0), all), "300 301 302 303 304 305 306 307 308 309\n" "300 301 302 303 304 305 306 307 308 309\n" @@ -236,7 +235,7 @@ void check_indexed_view() VERIFY_IS_APPROX( A(seq(n-1,2,-2), seqN(n-1-6,4)), A(seq(last,2,-2), seqN(last-6,4)) ); VERIFY_IS_APPROX( A(seq(n-1-6,n-1-2), seqN(n-1-6,4)), A(seq(last-6,last-2), seqN(6+last-6-6,4)) ); VERIFY_IS_APPROX( A(seq((n-1)/2,(n)/2+3), seqN(2,4)), A(seq(last/2,(last+1)/2+3), seqN(last+2-last,4)) ); - VERIFY_IS_APPROX( A(seq(n-2,2,-2), seqN(n-8,4)), A(seq(end-2,2,-2), seqN(end-8,4)) ); + VERIFY_IS_APPROX( A(seq(n-2,2,-2), seqN(n-8,4)), A(seq(lastp1-2,2,-2), seqN(lastp1-8,4)) ); // Check all combinations of seq: VERIFY_IS_APPROX( A(seq(1,n-1-2,2), seq(1,n-1-2,2)), A(seq(1,last-2,2), seq(1,last-2,fix<2>)) ); @@ -246,7 +245,7 @@ void check_indexed_view() VERIFY_IS_APPROX( A(seq(n-1-5,n-1-2), seq(n-1-5,n-1-2)), A(seq(last-5,last-2), seq(last-5,last-2)) ); VERIFY_IS_APPROX( A.col(A.cols()-1), A(all,last) ); - VERIFY_IS_APPROX( A(A.rows()-2, A.cols()/2), A(last-1, end/2) ); + VERIFY_IS_APPROX( A(A.rows()-2, A.cols()/2), A(last-1, lastp1/2) ); VERIFY_IS_APPROX( a(a.size()-2), a(last-1) ); VERIFY_IS_APPROX( a(a.size()/2), a((last+1)/2) ); @@ -269,7 +268,7 @@ void check_indexed_view() VERIFY( is_same_eq(a.head(4), a(seq(0,3))) ); VERIFY( is_same_eq(a.tail(4), a(seqN(last-3,4))) ); - VERIFY( is_same_eq(a.tail(4), a(seq(end-4,last))) ); + VERIFY( is_same_eq(a.tail(4), a(seq(lastp1-4,last))) ); VERIFY( is_same_eq(a.segment<4>(3), a(seqN(3,fix<4>))) ); } @@ 
-293,6 +292,14 @@ void check_indexed_view() } #if EIGEN_HAS_CXX11 + // check lastN + VERIFY_IS_APPROX( a(lastN(3)), a.tail(3) ); + VERIFY( MATCH( a(lastN(3)), "7\n8\n9" ) ); + VERIFY_IS_APPROX( a(lastN(fix<3>())), a.tail<3>() ); + VERIFY( MATCH( a(lastN(3,2)), "5\n7\n9" ) ); + VERIFY( MATCH( a(lastN(3,fix<2>())), "5\n7\n9" ) ); + VERIFY( a(lastN(fix<3>())).SizeAtCompileTime == 3 ); + VERIFY( (A(all, std::array{{1,3,2,4}})).ColsAtCompileTime == 4); VERIFY_IS_APPROX( (A(std::array{{1,3,5}}, std::array{{9,6,3,0}})), A(seqN(1,3,2), seqN(9,4,-3)) ); @@ -366,13 +373,51 @@ void check_indexed_view() VERIFY( is_same_eq( cA.middleRows<3>(1), cA.middleRows(1,fix<3>)) ); } + // Check compilation of enums as index type: + a(XX) = 1; + A(XX,YY) = 1; + // Anonymous enums only work with C++11 +#if EIGEN_HAS_CXX11 + enum { X=0, Y=1 }; + a(X) = 1; + A(X,Y) = 1; + A(XX,Y) = 1; + A(X,YY) = 1; +#endif + + // Check compilation of varying integer types as index types: + Index i = n/2; + short i_short(i); + std::size_t i_sizet(i); + VERIFY_IS_EQUAL( a(i), a.coeff(i_short) ); + VERIFY_IS_EQUAL( a(i), a.coeff(i_sizet) ); + + VERIFY_IS_EQUAL( A(i,i), A.coeff(i_short, i_short) ); + VERIFY_IS_EQUAL( A(i,i), A.coeff(i_short, i) ); + VERIFY_IS_EQUAL( A(i,i), A.coeff(i, i_short) ); + VERIFY_IS_EQUAL( A(i,i), A.coeff(i, i_sizet) ); + VERIFY_IS_EQUAL( A(i,i), A.coeff(i_sizet, i) ); + VERIFY_IS_EQUAL( A(i,i), A.coeff(i_sizet, i_short) ); + VERIFY_IS_EQUAL( A(i,i), A.coeff(5, i_sizet) ); + } -void test_indexed_view() +EIGEN_DECLARE_TEST(indexed_view) { // for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( check_indexed_view() ); CALL_SUBTEST_2( check_indexed_view() ); CALL_SUBTEST_3( check_indexed_view() ); // } + + // static checks of some internals: + STATIC_CHECK(( internal::is_valid_index_type::value )); + STATIC_CHECK(( internal::is_valid_index_type::value )); + STATIC_CHECK(( internal::is_valid_index_type::value )); + STATIC_CHECK(( internal::is_valid_index_type::value )); + STATIC_CHECK(( internal::is_valid_index_type::value )); + STATIC_CHECK(( !internal::valid_indexed_view_overload::value )); + STATIC_CHECK(( !internal::valid_indexed_view_overload::value )); + STATIC_CHECK(( !internal::valid_indexed_view_overload::value )); + STATIC_CHECK(( !internal::valid_indexed_view_overload::value )); } diff --git a/test/inplace_decomposition.cpp b/test/inplace_decomposition.cpp index 92d0d91b6..e3aa9957d 100644 --- a/test/inplace_decomposition.cpp +++ b/test/inplace_decomposition.cpp @@ -79,7 +79,7 @@ template void inplace(bool square = false, } -void test_inplace_decomposition() +EIGEN_DECLARE_TEST(inplace_decomposition) { EIGEN_UNUSED typedef Matrix Matrix43d; for(int i = 0; i < g_repeat; i++) { diff --git a/test/integer_types.cpp b/test/integer_types.cpp index a21f73a81..3f9030d77 100644 --- a/test/integer_types.cpp +++ b/test/integer_types.cpp @@ -18,7 +18,6 @@ template void signed_integer_type_tests(const MatrixType& m) { - typedef typename MatrixType::Index Index; typedef typename MatrixType::Scalar Scalar; enum { is_signed = (Scalar(-1) > Scalar(0)) ? 
0 : 1 }; @@ -49,7 +48,6 @@ template void signed_integer_type_tests(const MatrixType& m template void integer_type_tests(const MatrixType& m) { - typedef typename MatrixType::Index Index; typedef typename MatrixType::Scalar Scalar; VERIFY(NumTraits::IsInteger); @@ -133,7 +131,18 @@ template void integer_type_tests(const MatrixType& m) VERIFY_IS_APPROX((m1 * m2.transpose()) * m1, m1 * (m2.transpose() * m1)); } -void test_integer_types() +template +void integer_types_extra() +{ + VERIFY_IS_EQUAL(int(internal::scalar_div_cost::value), 8); + VERIFY_IS_EQUAL(int(internal::scalar_div_cost::value), 8); + if(sizeof(long)>sizeof(int)) { + VERIFY(int(internal::scalar_div_cost::value) > int(internal::scalar_div_cost::value)); + VERIFY(int(internal::scalar_div_cost::value) > int(internal::scalar_div_cost::value)); + } +} + +EIGEN_DECLARE_TEST(integer_types) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( integer_type_tests(Matrix()) ); @@ -158,12 +167,5 @@ void test_integer_types() CALL_SUBTEST_8( integer_type_tests(Matrix(1, 5)) ); } -#ifdef EIGEN_TEST_PART_9 - VERIFY_IS_EQUAL(internal::scalar_div_cost::value, 8); - VERIFY_IS_EQUAL(internal::scalar_div_cost::value, 8); - if(sizeof(long)>sizeof(int)) { - VERIFY(internal::scalar_div_cost::value > internal::scalar_div_cost::value); - VERIFY(internal::scalar_div_cost::value > internal::scalar_div_cost::value); - } -#endif + CALL_SUBTEST_9( integer_types_extra<0>() ); } diff --git a/test/inverse.cpp b/test/inverse.cpp index 5c6777a18..8754cb7e5 100644 --- a/test/inverse.cpp +++ b/test/inverse.cpp @@ -11,10 +11,59 @@ #include "main.h" #include -template void inverse(const MatrixType& m) +template +void inverse_for_fixed_size(const MatrixType&, typename internal::enable_if::type* = 0) +{ +} + +template +void inverse_for_fixed_size(const MatrixType& m1, typename internal::enable_if::type* = 0) { using std::abs; - typedef typename MatrixType::Index Index; + + MatrixType m2, identity = MatrixType::Identity(); + + typedef typename MatrixType::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef Matrix VectorType; + + //computeInverseAndDetWithCheck tests + //First: an invertible matrix + bool invertible; + Scalar det; + + m2.setZero(); + m1.computeInverseAndDetWithCheck(m2, det, invertible); + VERIFY(invertible); + VERIFY_IS_APPROX(identity, m1*m2); + VERIFY_IS_APPROX(det, m1.determinant()); + + m2.setZero(); + m1.computeInverseWithCheck(m2, invertible); + VERIFY(invertible); + VERIFY_IS_APPROX(identity, m1*m2); + + //Second: a rank one matrix (not invertible, except for 1x1 matrices) + VectorType v3 = VectorType::Random(); + MatrixType m3 = v3*v3.transpose(), m4; + m3.computeInverseAndDetWithCheck(m4, det, invertible); + VERIFY( m1.rows()==1 ? invertible : !invertible ); + VERIFY_IS_MUCH_SMALLER_THAN(abs(det-m3.determinant()), RealScalar(1)); + m3.computeInverseWithCheck(m4, invertible); + VERIFY( m1.rows()==1 ? 
invertible : !invertible ); + + // check with submatrices + { + Matrix m5; + m5.setRandom(); + m5.topLeftCorner(m1.rows(),m1.rows()) = m1; + m2 = m5.template topLeftCorner().inverse(); + VERIFY_IS_APPROX( (m5.template topLeftCorner()), m2.inverse() ); + } +} + +template void inverse(const MatrixType& m) +{ /* this test covers the following files: Inverse.h */ @@ -40,44 +89,7 @@ template void inverse(const MatrixType& m) // since for the general case we implement separately row-major and col-major, test that VERIFY_IS_APPROX(MatrixType(m1.transpose().inverse()), MatrixType(m1.inverse().transpose())); -#if !defined(EIGEN_TEST_PART_5) && !defined(EIGEN_TEST_PART_6) - typedef typename NumTraits::Real RealScalar; - typedef Matrix VectorType; - - //computeInverseAndDetWithCheck tests - //First: an invertible matrix - bool invertible; - RealScalar det; - - m2.setZero(); - m1.computeInverseAndDetWithCheck(m2, det, invertible); - VERIFY(invertible); - VERIFY_IS_APPROX(identity, m1*m2); - VERIFY_IS_APPROX(det, m1.determinant()); - - m2.setZero(); - m1.computeInverseWithCheck(m2, invertible); - VERIFY(invertible); - VERIFY_IS_APPROX(identity, m1*m2); - - //Second: a rank one matrix (not invertible, except for 1x1 matrices) - VectorType v3 = VectorType::Random(rows); - MatrixType m3 = v3*v3.transpose(), m4(rows,cols); - m3.computeInverseAndDetWithCheck(m4, det, invertible); - VERIFY( rows==1 ? invertible : !invertible ); - VERIFY_IS_MUCH_SMALLER_THAN(abs(det-m3.determinant()), RealScalar(1)); - m3.computeInverseWithCheck(m4, invertible); - VERIFY( rows==1 ? invertible : !invertible ); - - // check with submatrices - { - Matrix m5; - m5.setRandom(); - m5.topLeftCorner(rows,rows) = m1; - m2 = m5.template topLeftCorner().inverse(); - VERIFY_IS_APPROX( (m5.template topLeftCorner()), m2.inverse() ); - } -#endif + inverse_for_fixed_size(m1); // check in-place inversion if(MatrixType::RowsAtCompileTime>=2 && MatrixType::RowsAtCompileTime<=4) @@ -93,7 +105,7 @@ template void inverse(const MatrixType& m) } } -void test_inverse() +EIGEN_DECLARE_TEST(inverse) { int s = 0; for(int i = 0; i < g_repeat; i++) { @@ -113,5 +125,7 @@ void test_inverse() CALL_SUBTEST_7( inverse(Matrix4d()) ); CALL_SUBTEST_7( inverse(Matrix()) ); + + CALL_SUBTEST_8( inverse(Matrix4cd()) ); } } diff --git a/test/is_same_dense.cpp b/test/is_same_dense.cpp index 2c7838ce9..23dd806eb 100644 --- a/test/is_same_dense.cpp +++ b/test/is_same_dense.cpp @@ -11,12 +11,16 @@ using internal::is_same_dense; -void test_is_same_dense() +EIGEN_DECLARE_TEST(is_same_dense) { typedef Matrix ColMatrixXd; + typedef Matrix,Dynamic,Dynamic,ColMajor> ColMatrixXcd; ColMatrixXd m1(10,10); + ColMatrixXcd m2(10,10); Ref ref_m1(m1); + Ref > ref_m2_real(m2.real()); Ref const_ref_m1(m1); + VERIFY(is_same_dense(m1,m1)); VERIFY(is_same_dense(m1,ref_m1)); VERIFY(is_same_dense(const_ref_m1,m1)); @@ -30,4 +34,8 @@ void test_is_same_dense() Ref const_ref_m1_col(m1.col(1)); VERIFY(is_same_dense(m1.col(1),const_ref_m1_col)); + + + VERIFY(!is_same_dense(m1, ref_m2_real)); + VERIFY(!is_same_dense(m2, ref_m2_real)); } diff --git a/test/jacobi.cpp b/test/jacobi.cpp index 7ccd4124b..5604797f5 100644 --- a/test/jacobi.cpp +++ b/test/jacobi.cpp @@ -14,7 +14,6 @@ template void jacobi(const MatrixType& m = MatrixType()) { - typedef typename MatrixType::Index Index; Index rows = m.rows(); Index cols = m.cols(); @@ -58,7 +57,7 @@ void jacobi(const MatrixType& m = MatrixType()) } } -void test_jacobi() +EIGEN_DECLARE_TEST(jacobi) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1(( 
jacobi() ));
diff --git a/test/jacobisvd.cpp b/test/jacobisvd.cpp
index 7f5f71562..f9a59e0e7 100644
--- a/test/jacobisvd.cpp
+++ b/test/jacobisvd.cpp
@@ -36,7 +36,6 @@ void jacobisvd(const MatrixType& a = MatrixType(), bool pickrandom = true)
 
 template<typename MatrixType> void jacobisvd_verify_assert(const MatrixType& m)
 {
   svd_verify_assert<JacobiSVD<MatrixType> >(m);
-  typedef typename MatrixType::Index Index;
   Index rows = m.rows();
   Index cols = m.cols();
@@ -70,7 +69,7 @@ void jacobisvd_method()
   VERIFY_IS_APPROX(m.jacobiSvd(ComputeFullU|ComputeFullV).solve(m), m);
 }
 
-void test_jacobisvd()
+EIGEN_DECLARE_TEST(jacobisvd)
 {
   CALL_SUBTEST_3(( jacobisvd_verify_assert(Matrix3f()) ));
   CALL_SUBTEST_4(( jacobisvd_verify_assert(Matrix4d()) ));
diff --git a/test/klu_support.cpp b/test/klu_support.cpp
new file mode 100644
index 000000000..f806ad50e
--- /dev/null
+++ b/test/klu_support.cpp
@@ -0,0 +1,32 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
+#include "sparse_solver.h"
+
+#include <Eigen/KLUSupport>
+
+template<typename T> void test_klu_support_T()
+{
+  KLU<SparseMatrix<T, ColMajor> > klu_colmajor;
+  KLU<SparseMatrix<T, RowMajor> > klu_rowmajor;
+
+  check_sparse_square_solving(klu_colmajor);
+  check_sparse_square_solving(klu_rowmajor);
+
+  //check_sparse_square_determinant(umfpack_colmajor);
+  //check_sparse_square_determinant(umfpack_rowmajor);
+}
+
+EIGEN_DECLARE_TEST(klu_support)
+{
+  CALL_SUBTEST_1(test_klu_support_T<double>());
+  CALL_SUBTEST_2(test_klu_support_T<std::complex<double> >());
+}
+
diff --git a/test/linearstructure.cpp b/test/linearstructure.cpp
index 17474af10..46ee5162b 100644
--- a/test/linearstructure.cpp
+++ b/test/linearstructure.cpp
@@ -19,7 +19,6 @@ template<typename MatrixType> void linearStructure(const MatrixType& m)
   /* this test covers the following files:
      CwiseUnaryOp.h, CwiseBinaryOp.h, SelfCwiseBinaryOp.h
   */
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
@@ -111,7 +110,20 @@ template<typename MatrixType> void real_complex(DenseIndex rows = MatrixType::Ro
   VERIFY(g_called && "matrix - real not properly optimized");
 }
 
-void test_linearstructure()
+template<int>
+void linearstructure_overflow()
+{
+  // make sure that /=scalar and /scalar do not overflow
+  // rationale: 1.0/4.94e-320 overflows, but m/4.94e-320 should not
+  Matrix4d m2, m3;
+  m3 = m2 = Matrix4d::Random()*1e-20;
+  m2 = m2 / 4.9e-320;
+  VERIFY_IS_APPROX(m2.cwiseQuotient(m2), Matrix4d::Ones());
+  m3 /= 4.9e-320;
+  VERIFY_IS_APPROX(m3.cwiseQuotient(m3), Matrix4d::Ones());
+}
+
+EIGEN_DECLARE_TEST(linearstructure)
 {
   g_called = true;
   VERIFY(g_called); // avoid `unneeded-internal-declaration` warning.
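A pattern worth noting, since it recurs throughout this patch (eigensolver_generic_extra<0>, bug1150<0>, integer_types_extra<0> above, and linearstructure_overflow<0> in the next hunk): code that used to sit in an #ifdef EIGEN_TEST_PART_N block becomes a function template with a dummy int parameter. A sketch of why that preserves the old compile-time behavior, assuming the usual semantics of the generated CALL_SUBTEST_N macros (they expand to nothing unless EIGEN_TEST_PART_N is defined):

  template<int>                  // dummy parameter, always instantiated as <0>
  void some_extra_checks()       // hypothetical helper following the pattern
  {
    // expensive or part-specific verifications...
  }

  EIGEN_DECLARE_TEST(example)
  {
    // Compiled out unless EIGEN_TEST_PART_4 is defined; and because the
    // helper is a template, its body is then never instantiated either,
    // giving the same per-part build savings as the removed #ifdef.
    CALL_SUBTEST_4( some_extra_checks<0>() );
  }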
@@ -131,19 +143,5 @@ void test_linearstructure() CALL_SUBTEST_11( real_complex(10,10) ); CALL_SUBTEST_11( real_complex(10,10) ); } - -#ifdef EIGEN_TEST_PART_4 - { - // make sure that /=scalar and /scalar do not overflow - // rational: 1.0/4.94e-320 overflow, but m/4.94e-320 should not - Matrix4d m2, m3; - m3 = m2 = Matrix4d::Random()*1e-20; - m2 = m2 / 4.9e-320; - VERIFY_IS_APPROX(m2.cwiseQuotient(m2), Matrix4d::Ones()); - m3 /= 4.9e-320; - VERIFY_IS_APPROX(m3.cwiseQuotient(m3), Matrix4d::Ones()); - - - } -#endif + CALL_SUBTEST_4( linearstructure_overflow<0>() ); } diff --git a/test/lscg.cpp b/test/lscg.cpp index daa62a954..feb2347a8 100644 --- a/test/lscg.cpp +++ b/test/lscg.cpp @@ -14,15 +14,23 @@ template void test_lscg_T() { LeastSquaresConjugateGradient > lscg_colmajor_diag; LeastSquaresConjugateGradient, IdentityPreconditioner> lscg_colmajor_I; + LeastSquaresConjugateGradient > lscg_rowmajor_diag; + LeastSquaresConjugateGradient, IdentityPreconditioner> lscg_rowmajor_I; CALL_SUBTEST( check_sparse_square_solving(lscg_colmajor_diag) ); CALL_SUBTEST( check_sparse_square_solving(lscg_colmajor_I) ); CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_colmajor_diag) ); CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_colmajor_I) ); + + CALL_SUBTEST( check_sparse_square_solving(lscg_rowmajor_diag) ); + CALL_SUBTEST( check_sparse_square_solving(lscg_rowmajor_I) ); + + CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_rowmajor_diag) ); + CALL_SUBTEST( check_sparse_leastsquare_solving(lscg_rowmajor_I) ); } -void test_lscg() +EIGEN_DECLARE_TEST(lscg) { CALL_SUBTEST_1(test_lscg_T()); CALL_SUBTEST_2(test_lscg_T >()); diff --git a/test/lu.cpp b/test/lu.cpp index 9787f4d86..144496e91 100644 --- a/test/lu.cpp +++ b/test/lu.cpp @@ -18,7 +18,6 @@ typename MatrixType::RealScalar matrix_l1_norm(const MatrixType& m) { template void lu_non_invertible() { - typedef typename MatrixType::Index Index; typedef typename MatrixType::RealScalar RealScalar; /* this test covers the following files: LU.h @@ -181,7 +180,6 @@ template void lu_partial_piv() /* this test covers the following files: PartialPivLU.h */ - typedef typename MatrixType::Index Index; typedef typename NumTraits::Real RealScalar; Index size = internal::random(1,4); @@ -244,7 +242,7 @@ template void lu_verify_assert() VERIFY_RAISES_ASSERT(plu.inverse()) } -void test_lu() +EIGEN_DECLARE_TEST(lu) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( lu_non_invertible() ); diff --git a/test/main.h b/test/main.h index 25d2dcf43..36784b1f4 100644 --- a/test/main.h +++ b/test/main.h @@ -50,15 +50,44 @@ #endif #endif +// Same for cuda_fp16.h +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) +#define EIGEN_TEST_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) +#elif defined(__CUDACC_VER__) +#define EIGEN_TEST_CUDACC_VER __CUDACC_VER__ +#else +#define EIGEN_TEST_CUDACC_VER 0 +#endif + +#if EIGEN_TEST_CUDACC_VER >= 70500 +#include +#endif + // To test that all calls from Eigen code to std::min() and std::max() are // protected by parenthesis against macro expansion, the min()/max() macros // are defined here and any not-parenthesized min/max call will cause a // compiler error. 
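The hunk continues with the macro definitions themselves; first, a quick illustration of the poisoning trick described in the comment above (plain preprocessor behavior, nothing Eigen-specific):

  #include <algorithm>
  #define min(A,B) please_protect_your_min_with_parentheses

  int smallest(int a, int b)
  {
    return (std::min)(a, b);  // the parentheses suppress function-like macro
                              // expansion, so this line compiles
    // return std::min(a, b); // would expand to the undeclared identifier
                              // please_protect_your_min_with_parentheses
  }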
-#define min(A,B) please_protect_your_min_with_parentheses -#define max(A,B) please_protect_your_max_with_parentheses -#define isnan(X) please_protect_your_isnan_with_parentheses -#define isinf(X) please_protect_your_isinf_with_parentheses -#define isfinite(X) please_protect_your_isfinite_with_parentheses +#if !defined(__HIPCC__) + // + // HIP header files include the following files + // + // + // + // which seem to contain not-parenthesized calls to "max"/"min", triggering the following check and causing the compile to fail + // + // Including those header files before the following macro definition for "min" / "max", only partially resolves the issue + // This is because other HIP header files also define "isnan" / "isinf" / "isfinite" functions, which are needed in other + // headers. + // + // So instead choosing to simply disable this check for HIP + // + #define min(A,B) please_protect_your_min_with_parentheses + #define max(A,B) please_protect_your_max_with_parentheses + #define isnan(X) please_protect_your_isnan_with_parentheses + #define isinf(X) please_protect_your_isinf_with_parentheses + #define isfinite(X) please_protect_your_isfinite_with_parentheses +#endif + #ifdef M_PI #undef M_PI #endif @@ -93,13 +122,12 @@ inline void on_temporary_creation(long int size) { #define VERIFY_EVALUATION_COUNT(XPR,N) {\ nb_temporaries = 0; \ XPR; \ - if(nb_temporaries!=N) { std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; }\ - VERIFY( (#XPR) && nb_temporaries==N ); \ + if(nb_temporaries!=(N)) { std::cerr << "nb_temporaries == " << nb_temporaries << "\n"; }\ + VERIFY( (#XPR) && nb_temporaries==(N) ); \ } - + #endif -// the following file is automatically generated by cmake #include "split_test_helper.h" #ifdef NDEBUG @@ -116,10 +144,6 @@ inline void on_temporary_creation(long int size) { #define EIGEN_MAKING_DOCS #endif -#ifndef EIGEN_TEST_FUNC -#error EIGEN_TEST_FUNC must be defined -#endif - #define DEFAULT_REPEAT 10 namespace Eigen @@ -128,20 +152,48 @@ namespace Eigen // level == 0 <=> abort if test fail // level >= 1 <=> warning message to std::cerr if test fail static int g_test_level = 0; - static int g_repeat; - static unsigned int g_seed; - static bool g_has_set_repeat, g_has_set_seed; + static int g_repeat = 1; + static unsigned int g_seed = 0; + static bool g_has_set_repeat = false, g_has_set_seed = false; + + class EigenTest + { + public: + EigenTest() : m_func(0) {} + EigenTest(const char* a_name, void (*func)(void)) + : m_name(a_name), m_func(func) + { + ms_registered_tests.push_back(this); + } + const std::string& name() const { return m_name; } + void operator()() const { m_func(); } + + static const std::vector& all() { return ms_registered_tests; } + protected: + std::string m_name; + void (*m_func)(void); + static std::vector ms_registered_tests; + }; + + std::vector EigenTest::ms_registered_tests; + + // Declare and register a test, e.g.: + // EIGEN_DECLARE_TEST(mytest) { ... } + // will create a function: + // void test_mytest() { ... } + // that will be automatically called. 
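Concretely, with the #define on the following lines, EIGEN_DECLARE_TEST(mytest) expands to roughly the code below (EIGEN_CAT pastes tokens, EIGEN_MAKESTRING stringizes):

  void test_mytest();                                  // forward declaration
  static EigenTest test_handler_mytest("mytest",       // registration object; its
                                       &test_mytest);  // constructor runs during
                                                       // static initialization and
                                                       // appends to EigenTest::all()
  void test_mytest()                                   // the braces written after
  { /* ... test body ... */ }                          // the macro become this body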
+ #define EIGEN_DECLARE_TEST(X) \ + void EIGEN_CAT(test_,X) (); \ + static EigenTest EIGEN_CAT(test_handler_,X) (EIGEN_MAKESTRING(X), & EIGEN_CAT(test_,X)); \ + void EIGEN_CAT(test_,X) () } #define TRACK std::cerr << __FILE__ << " " << __LINE__ << std::endl // #define TRACK while() -#define EI_PP_MAKE_STRING2(S) #S -#define EI_PP_MAKE_STRING(S) EI_PP_MAKE_STRING2(S) - #define EIGEN_DEFAULT_IO_FORMAT IOFormat(4, 0, " ", "\n", "", "", "", "") -#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) +#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__) #define EIGEN_EXCEPTIONS #endif @@ -162,9 +214,15 @@ namespace Eigen eigen_assert_exception(void) {} ~eigen_assert_exception() { Eigen::no_more_assert = false; } }; + + struct eigen_static_assert_exception + { + eigen_static_assert_exception(void) {} + ~eigen_static_assert_exception() { Eigen::no_more_assert = false; } + }; } // If EIGEN_DEBUG_ASSERTS is defined and if no assertion is triggered while - // one should have been, then the list of excecuted assertions is printed out. + // one should have been, then the list of executed assertions is printed out. // // EIGEN_DEBUG_ASSERTS is not enabled by default as it // significantly increases the compilation time @@ -190,7 +248,7 @@ namespace Eigen } \ else if (Eigen::internal::push_assert) \ { \ - eigen_assert_list.push_back(std::string(EI_PP_MAKE_STRING(__FILE__) " (" EI_PP_MAKE_STRING(__LINE__) ") : " #a) ); \ + eigen_assert_list.push_back(std::string(EIGEN_MAKESTRING(__FILE__) " (" EIGEN_MAKESTRING(__LINE__) ") : " #a) ); \ } #ifdef EIGEN_EXCEPTIONS @@ -214,7 +272,7 @@ namespace Eigen } #endif //EIGEN_EXCEPTIONS - #elif !defined(__CUDACC__) // EIGEN_DEBUG_ASSERTS + #elif !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(__SYCL_DEVICE_ONLY__) // EIGEN_DEBUG_ASSERTS // see bug 89. 
The copy_bool here is working around a bug in gcc <= 4.3 #define eigen_assert(a) \ if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\ @@ -225,6 +283,7 @@ namespace Eigen else \ EIGEN_THROW_X(Eigen::eigen_assert_exception()); \ } + #ifdef EIGEN_EXCEPTIONS #define VERIFY_RAISES_ASSERT(a) { \ Eigen::no_more_assert = false; \ @@ -236,25 +295,51 @@ namespace Eigen catch (Eigen::eigen_assert_exception&) { VERIFY(true); } \ Eigen::report_on_cerr_on_assert_failure = true; \ } - #endif //EIGEN_EXCEPTIONS + #endif // EIGEN_EXCEPTIONS #endif // EIGEN_DEBUG_ASSERTS + #if defined(TEST_CHECK_STATIC_ASSERTIONS) && defined(EIGEN_EXCEPTIONS) + #define EIGEN_STATIC_ASSERT(a,MSG) \ + if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\ + { \ + Eigen::no_more_assert = true; \ + if(report_on_cerr_on_assert_failure) \ + eigen_plain_assert((a) && #MSG); \ + else \ + EIGEN_THROW_X(Eigen::eigen_static_assert_exception()); \ + } + #define VERIFY_RAISES_STATIC_ASSERT(a) { \ + Eigen::no_more_assert = false; \ + Eigen::report_on_cerr_on_assert_failure = false; \ + try { \ + a; \ + VERIFY(Eigen::should_raise_an_assert && # a); \ + } \ + catch (Eigen::eigen_static_assert_exception&) { VERIFY(true); } \ + Eigen::report_on_cerr_on_assert_failure = true; \ + } + #endif // TEST_CHECK_STATIC_ASSERTIONS + #ifndef VERIFY_RAISES_ASSERT #define VERIFY_RAISES_ASSERT(a) \ std::cout << "Can't VERIFY_RAISES_ASSERT( " #a " ) with exceptions disabled\n"; #endif - - #if !defined(__CUDACC__) +#ifndef VERIFY_RAISES_STATIC_ASSERT + #define VERIFY_RAISES_STATIC_ASSERT(a) \ + std::cout << "Can't VERIFY_RAISES_STATIC_ASSERT( " #a " ) with exceptions disabled\n"; +#endif + + #if !defined(__CUDACC__) && !defined(__HIPCC__) && !defined(__SYCL_DEVICE_ONLY__) #define EIGEN_USE_CUSTOM_ASSERT #endif #else // EIGEN_NO_ASSERTION_CHECKING #define VERIFY_RAISES_ASSERT(a) {} + #define VERIFY_RAISES_STATIC_ASSERT(a) {} #endif // EIGEN_NO_ASSERTION_CHECKING - #define EIGEN_INTERNAL_DEBUGGING #include // required for createRandomPIMatrixOfRank @@ -276,10 +361,10 @@ inline void verify_impl(bool condition, const char *testname, const char *file, } } -#define VERIFY(a) ::verify_impl(a, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a)) +#define VERIFY(a) ::verify_impl(a, g_test_stack.back().c_str(), __FILE__, __LINE__, EIGEN_MAKESTRING(a)) -#define VERIFY_GE(a, b) ::verify_impl(a >= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a >= b)) -#define VERIFY_LE(a, b) ::verify_impl(a <= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EI_PP_MAKE_STRING(a <= b)) +#define VERIFY_GE(a, b) ::verify_impl(a >= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EIGEN_MAKESTRING(a >= b)) +#define VERIFY_LE(a, b) ::verify_impl(a <= b, g_test_stack.back().c_str(), __FILE__, __LINE__, EIGEN_MAKESTRING(a <= b)) #define VERIFY_IS_EQUAL(a, b) VERIFY(test_is_equal(a, b, true)) @@ -293,8 +378,10 @@ inline void verify_impl(bool condition, const char *testname, const char *file, #define VERIFY_IS_UNITARY(a) VERIFY(test_isUnitary(a)) +#define STATIC_CHECK(COND) EIGEN_STATIC_ASSERT( (COND) , EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT ) + #define CALL_SUBTEST(FUNC) do { \ - g_test_stack.push_back(EI_PP_MAKE_STRING(FUNC)); \ + g_test_stack.push_back(EIGEN_MAKESTRING(FUNC)); \ FUNC; \ g_test_stack.pop_back(); \ } while (0) @@ -310,6 +397,17 @@ template<> inline float test_precision >() { return test_pre template<> inline double test_precision >() { return test_precision(); } template<> inline long double test_precision 
>() { return test_precision(); } +inline bool test_isApprox(const short& a, const short& b) +{ return internal::isApprox(a, b, test_precision()); } +inline bool test_isApprox(const unsigned short& a, const unsigned short& b) +{ return internal::isApprox(a, b, test_precision()); } +inline bool test_isApprox(const unsigned int& a, const unsigned int& b) +{ return internal::isApprox(a, b, test_precision()); } +inline bool test_isApprox(const long& a, const long& b) +{ return internal::isApprox(a, b, test_precision()); } +inline bool test_isApprox(const unsigned long& a, const unsigned long& b) +{ return internal::isApprox(a, b, test_precision()); } + inline bool test_isApprox(const int& a, const int& b) { return internal::isApprox(a, b, test_precision()); } inline bool test_isMuchSmallerThan(const int& a, const int& b) @@ -634,9 +732,6 @@ template<> std::string type_name >() { return "comple template<> std::string type_name >() { return "complex"; } template<> std::string type_name >() { return "complex"; } -// forward declaration of the main test function -void EIGEN_CAT(test_,EIGEN_TEST_FUNC)(); - using namespace Eigen; inline void set_repeat_from_string(const char *str) @@ -723,9 +818,16 @@ int main(int argc, char *argv[]) srand(g_seed); std::cout << "Repeating each test " << g_repeat << " times" << std::endl; - Eigen::g_test_stack.push_back(std::string(EI_PP_MAKE_STRING(EIGEN_TEST_FUNC))); + VERIFY(EigenTest::all().size()>0); + + for(std::size_t i=0; i void map_class_vector(const VectorType& m) { - typedef typename VectorType::Index Index; typedef typename VectorType::Scalar Scalar; Index size = m.size(); @@ -51,7 +50,6 @@ template void map_class_vector(const VectorType& m) template void map_class_matrix(const MatrixType& m) { - typedef typename MatrixType::Index Index; typedef typename MatrixType::Scalar Scalar; Index rows = m.rows(), cols = m.cols(), size = rows*cols; @@ -64,8 +62,9 @@ template void map_class_matrix(const MatrixType& m) for(int i = 0; i < size; i++) array2[i] = Scalar(1); // array3unaligned -> unaligned pointer to heap Scalar* array3 = new Scalar[size+1]; - for(int i = 0; i < size+1; i++) array3[i] = Scalar(1); - Scalar* array3unaligned = internal::UIntPtr(array3)%EIGEN_MAX_ALIGN_BYTES == 0 ? array3+1 : array3; + Index sizep1 = size + 1; // <- without this temporary MSVC 2103 generates bad code + for(Index i = 0; i < sizep1; i++) array3[i] = Scalar(1); + Scalar* array3unaligned = (internal::UIntPtr(array3)%EIGEN_MAX_ALIGN_BYTES) == 0 ? 
array3+1 : array3; Scalar array4[256]; if(size<=256) for(int i = 0; i < size; i++) array4[i] = Scalar(1); @@ -121,7 +120,6 @@ template void map_class_matrix(const MatrixType& m) template void map_static_methods(const VectorType& m) { - typedef typename VectorType::Index Index; typedef typename VectorType::Scalar Scalar; Index size = m.size(); @@ -163,7 +161,6 @@ template void map_not_aligned_on_scalar() { typedef Matrix MatrixType; - typedef typename MatrixType::Index Index; Index size = 11; Scalar* array1 = internal::aligned_new((size+1)*(size+1)+1); Scalar* array2 = reinterpret_cast(sizeof(Scalar)/2+std::size_t(array1)); @@ -181,7 +178,7 @@ void map_not_aligned_on_scalar() internal::aligned_delete(array1, (size+1)*(size+1)+1); } -void test_mapped_matrix() +EIGEN_DECLARE_TEST(mapped_matrix) { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( map_class_vector(Matrix()) ); @@ -205,7 +202,6 @@ void test_mapped_matrix() CALL_SUBTEST_8( map_static_methods(RowVector3d()) ); CALL_SUBTEST_9( map_static_methods(VectorXcd(8)) ); CALL_SUBTEST_10( map_static_methods(VectorXf(12)) ); - CALL_SUBTEST_11( map_not_aligned_on_scalar() ); } } diff --git a/test/mapstaticmethods.cpp b/test/mapstaticmethods.cpp index 06272d106..d0128ba94 100644 --- a/test/mapstaticmethods.cpp +++ b/test/mapstaticmethods.cpp @@ -9,8 +9,12 @@ #include "main.h" +// GCC<=4.8 has spurious shadow warnings, because `ptr` re-appears inside template instantiations +// workaround: put these in an anonymous namespace +namespace { float *ptr; const float *const_ptr; +} template { static void run(const PlainObjectType& m) { - typedef typename PlainObjectType::Index Index; Index rows = m.rows(), cols = m.cols(); int i = internal::random(2,5), j = internal::random(2,5); @@ -116,7 +119,6 @@ struct mapstaticmethods_impl { static void run(const PlainObjectType& v) { - typedef typename PlainObjectType::Index Index; Index size = v.size(); int i = internal::random(2,5); @@ -145,7 +147,7 @@ void mapstaticmethods(const PlainObjectType& m) VERIFY(true); // just to avoid 'unused function' warning } -void test_mapstaticmethods() +EIGEN_DECLARE_TEST(mapstaticmethods) { ptr = internal::aligned_new(1000); for(int i = 0; i < 1000; i++) ptr[i] = float(i); diff --git a/test/mapstride.cpp b/test/mapstride.cpp index 4858f8fea..09196600b 100644 --- a/test/mapstride.cpp +++ b/test/mapstride.cpp @@ -11,7 +11,6 @@ template void map_class_vector(const VectorType& m) { - typedef typename VectorType::Index Index; typedef typename VectorType::Scalar Scalar; Index size = m.size(); @@ -50,7 +49,6 @@ template void map_class_vector(const VectorTy template void map_class_matrix(const MatrixType& _m) { - typedef typename MatrixType::Index Index; typedef typename MatrixType::Scalar Scalar; Index rows = _m.rows(), cols = _m.cols(); @@ -58,7 +56,7 @@ template void map_class_matrix(const MatrixTy MatrixType m = MatrixType::Random(rows,cols); Scalar s1 = internal::random(); - Index arraysize = 2*(rows+4)*(cols+4); + Index arraysize = 4*(rows+4)*(cols+4); Scalar* a_array1 = internal::aligned_new(arraysize+1); Scalar* array1 = a_array1; @@ -143,10 +141,63 @@ template void map_class_matrix(const MatrixTy VERIFY_IS_APPROX(map,s1*m); } + // test inner stride and no outer stride + for(int k=0; k<2; ++k) + { + if(k==1 && (m.innerSize()*2)*m.outerSize() > maxsize2) + break; + Scalar* array = (k==0 ? 
diff --git a/test/mapstride.cpp b/test/mapstride.cpp
index 4858f8fea..09196600b 100644
--- a/test/mapstride.cpp
+++ b/test/mapstride.cpp
@@ -11,7 +11,6 @@ template<int Alignment,typename VectorType> void map_class_vector(const VectorType& m)
 {
-  typedef typename VectorType::Index Index;
   typedef typename VectorType::Scalar Scalar;
 
   Index size = m.size();
@@ -50,7 +49,6 @@ template<int Alignment,typename VectorType> void map_class_vector(const VectorType& m)
 
 template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixType& _m)
 {
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
 
   Index rows = _m.rows(), cols = _m.cols();
@@ -58,7 +56,7 @@ template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixType& _m)
   MatrixType m = MatrixType::Random(rows,cols);
   Scalar s1 = internal::random<Scalar>();
 
-  Index arraysize = 2*(rows+4)*(cols+4);
+  Index arraysize = 4*(rows+4)*(cols+4);
 
   Scalar* a_array1 = internal::aligned_new<Scalar>(arraysize+1);
   Scalar* array1 = a_array1;
@@ -143,10 +141,63 @@ template<int Alignment,typename MatrixType> void map_class_matrix(const MatrixType& _m)
     VERIFY_IS_APPROX(map,s1*m);
   }
 
+  // test inner stride and no outer stride
+  for(int k=0; k<2; ++k)
+  {
+    if(k==1 && (m.innerSize()*2)*m.outerSize() > maxsize2)
+      break;
+    Scalar* array = (k==0 ? array1 : array2);
+
+    Map<MatrixType, Alignment, InnerStride<> > map(array, rows, cols, InnerStride<>(2));
+    map = m;
+    VERIFY(map.outerStride() == map.innerSize()*2);
+    for(int i = 0; i < m.outerSize(); ++i)
+      for(int j = 0; j < m.innerSize(); ++j)
+      {
+        VERIFY(array[map.innerSize()*i*2+j*2] == m.coeffByOuterInner(i,j));
+        VERIFY(map.coeffByOuterInner(i,j) == m.coeffByOuterInner(i,j));
+      }
+    VERIFY_IS_APPROX(s1*map,s1*m);
+    map *= s1;
+    VERIFY_IS_APPROX(map,s1*m);
+  }
+
   internal::aligned_delete(a_array1, arraysize+1);
 }
 
-void test_mapstride()
+// Additional tests for inner-stride but no outer-stride
+template<int>
+void bug1453()
+{
+  const int data[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  typedef Matrix<int,Dynamic,Dynamic,RowMajor> RowMatrixXi;
+  typedef Matrix<int,2,3,ColMajor> ColMatrix23i;
+  typedef Matrix<int,3,2,ColMajor> ColMatrix32i;
+  typedef Matrix<int,2,3,RowMajor> RowMatrix23i;
+  typedef Matrix<int,3,2,RowMajor> RowMatrix32i;
+
+  VERIFY_IS_APPROX(MatrixXi::Map(data, 2, 3, InnerStride<2>()), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
+  VERIFY_IS_APPROX(MatrixXi::Map(data, 2, 3, InnerStride<>(2)), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
+  VERIFY_IS_APPROX(MatrixXi::Map(data, 3, 2, InnerStride<2>()), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
+  VERIFY_IS_APPROX(MatrixXi::Map(data, 3, 2, InnerStride<>(2)), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
+
+  VERIFY_IS_APPROX(RowMatrixXi::Map(data, 2, 3, InnerStride<2>()), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
+  VERIFY_IS_APPROX(RowMatrixXi::Map(data, 2, 3, InnerStride<>(2)), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
+  VERIFY_IS_APPROX(RowMatrixXi::Map(data, 3, 2, InnerStride<2>()), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
+  VERIFY_IS_APPROX(RowMatrixXi::Map(data, 3, 2, InnerStride<>(2)), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
+
+  VERIFY_IS_APPROX(ColMatrix23i::Map(data, InnerStride<2>()), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
+  VERIFY_IS_APPROX(ColMatrix23i::Map(data, InnerStride<>(2)), MatrixXi::Map(data, 2, 3, Stride<4,2>()));
+  VERIFY_IS_APPROX(ColMatrix32i::Map(data, InnerStride<2>()), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
+  VERIFY_IS_APPROX(ColMatrix32i::Map(data, InnerStride<>(2)), MatrixXi::Map(data, 3, 2, Stride<6,2>()));
+
+  VERIFY_IS_APPROX(RowMatrix23i::Map(data, InnerStride<2>()), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
+  VERIFY_IS_APPROX(RowMatrix23i::Map(data, InnerStride<>(2)), RowMatrixXi::Map(data, 2, 3, Stride<6,2>()));
+  VERIFY_IS_APPROX(RowMatrix32i::Map(data, InnerStride<2>()), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
+  VERIFY_IS_APPROX(RowMatrix32i::Map(data, InnerStride<>(2)), RowMatrixXi::Map(data, 3, 2, Stride<4,2>()));
+}
+
+EIGEN_DECLARE_TEST(mapstride)
 {
   for(int i = 0; i < g_repeat; i++) {
     int maxn = 30;
@@ -175,6 +226,8 @@ void test_mapstride()
     CALL_SUBTEST_5( map_class_matrix<Unaligned>(MatrixXi(internal::random<int>(1,maxn),internal::random<int>(1,maxn))) );
     CALL_SUBTEST_6( map_class_matrix<Aligned>(MatrixXcd(internal::random<int>(1,maxn),internal::random<int>(1,maxn))) );
     CALL_SUBTEST_6( map_class_matrix<Unaligned>(MatrixXcd(internal::random<int>(1,maxn),internal::random<int>(1,maxn))) );
+
+    CALL_SUBTEST_5( bug1453<0>() );
 
     TEST_SET_BUT_UNUSED_VARIABLE(maxn);
   }
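Note: the bug1453 checks above encode the rule that a Map given only an inner stride gets an implicit outer stride of innerSize()*innerStride. A minimal standalone illustration (not part of the patch):

    #include <Eigen/Core>
    #include <iostream>
    using namespace Eigen;

    int main()
    {
      const int data[12] = {0,1,2,3,4,5,6,7,8,9,10,11};
      // Column-major 2x3 view of every other int: columns start 2*2=4 ints apart.
      Map<const MatrixXi, Unaligned, InnerStride<2> > m1(data, 2, 3);
      // The fully explicit equivalent, as asserted by the test:
      Map<const MatrixXi, Unaligned, Stride<4,2> > m2(data, 2, 3);
      std::cout << m1 << "\n";                         // 0 4 8 / 2 6 10
      std::cout << (m1 - m2).cwiseAbs().sum() << "\n"; // 0
      return 0;
    }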
diff --git a/test/meta.cpp b/test/meta.cpp
index b8dea68e8..ea9607fe7 100644
--- a/test/meta.cpp
+++ b/test/meta.cpp
@@ -15,7 +15,19 @@ bool check_is_convertible(const From&, const To&)
   return internal::is_convertible<From,To>::value;
 }
 
-void test_meta()
+struct FooReturnType {
+  typedef int ReturnType;
+};
+
+struct MyInterface {
+  virtual void func() = 0;
+  virtual ~MyInterface() {}
+};
+struct MyImpl : public MyInterface {
+  void func() {}
+};
+
+EIGEN_DECLARE_TEST(meta)
 {
   VERIFY((internal::conditional<(3<4),internal::true_type, internal::false_type>::type::value));
   VERIFY(( internal::is_same<float,float>::value));
@@ -58,14 +70,27 @@ void test_meta()
   VERIFY(( internal::is_same<float,internal::remove_pointer<float*>::type >::value));
   VERIFY(( internal::is_same<float,internal::remove_pointer<float* const>::type >::value));
 
-  VERIFY(( internal::is_convertible<float,double>::value ));
-  VERIFY(( internal::is_convertible<int,double>::value ));
-  VERIFY(( internal::is_convertible<double,int>::value ));
-  VERIFY((!internal::is_convertible<std::complex<double>,double>::value ));
-  VERIFY(( internal::is_convertible<Array33f,Matrix3f>::value ));
-//  VERIFY((!internal::is_convertible<Matrix3f,Matrix3d>::value )); //does not work because the conversion is prevented by a static assertion
-  VERIFY((!internal::is_convertible<Array33f,int>::value ));
-  VERIFY((!internal::is_convertible<MatrixXf,float>::value ));
+
+  // is_convertible
+  STATIC_CHECK(( internal::is_convertible<float,double>::value ));
+  STATIC_CHECK(( internal::is_convertible<int,double>::value ));
+  STATIC_CHECK(( internal::is_convertible<int,short>::value ));
+  STATIC_CHECK(( internal::is_convertible<short,int>::value ));
+  STATIC_CHECK(( internal::is_convertible<double,int>::value ));
+  STATIC_CHECK(( internal::is_convertible<double,std::complex<double> >::value ));
+  STATIC_CHECK((!internal::is_convertible<std::complex<double>,double>::value ));
+  STATIC_CHECK(( internal::is_convertible<Array33f,Matrix3f>::value ));
+  STATIC_CHECK(( internal::is_convertible<Matrix3f&,Matrix3f>::value ));
+  STATIC_CHECK(( internal::is_convertible<Matrix3f&,Matrix3f&>::value ));
+  STATIC_CHECK(( internal::is_convertible<Matrix3f&,const Matrix3f&>::value ));
+  STATIC_CHECK(( internal::is_convertible<const Matrix3f&,Matrix3f>::value ));
+  STATIC_CHECK(( internal::is_convertible<const Matrix3f&,const Matrix3f&>::value ));
+  STATIC_CHECK((!internal::is_convertible<const Matrix3f&,Matrix3f&>::value ));
+  STATIC_CHECK((!internal::is_convertible<const Matrix3f,Matrix3f&>::value ));
+  STATIC_CHECK(( internal::is_convertible<Matrix3f,Matrix3f&>::value )); // std::is_convertible returns false here though Matrix3f from; Matrix3f& to = from; is valid.
+  //STATIC_CHECK((!internal::is_convertible<Matrix3f,Matrix3d>::value )); //does not work because the conversion is prevented by a static assertion
+  STATIC_CHECK((!internal::is_convertible<Array33f,int>::value ));
+  STATIC_CHECK((!internal::is_convertible<MatrixXf,float>::value ));
 
   {
     float f;
     MatrixXf A, B;
@@ -75,6 +100,26 @@ void test_meta()
     VERIFY((!check_is_convertible(A*B, f) ));
     VERIFY(( check_is_convertible(A*B, A) ));
   }
+
+  STATIC_CHECK(( !internal::is_convertible<MyInterface, MyImpl>::value ));
+  #if (!EIGEN_COMP_GNUC_STRICT) || (EIGEN_GNUC_AT_LEAST(4,8))
+  // GCC prior to 4.8 fails to compile this test:
+  //   error: cannot allocate an object of abstract type 'MyInterface'
+  // In other words, it does not obey SFINAE.
+  // Nevertheless, we don't really care about supporting abstract types as scalar types!
+  STATIC_CHECK(( !internal::is_convertible<MyImpl, MyInterface>::value ));
+  #endif
+  STATIC_CHECK((  internal::is_convertible<MyImpl, const MyInterface&>::value ));
+
+  {
+    int i;
+    VERIFY(( check_is_convertible(fix<3>(), i) ));
+    VERIFY((!check_is_convertible(i, fix<DynamicIndex>()) ));
+  }
+
+  VERIFY((  internal::has_ReturnType<FooReturnType>::value ));
+  VERIFY((  internal::has_ReturnType<ScalarBinaryOpTraits<int,int> >::value ));
+  VERIFY(( !internal::has_ReturnType<MatrixXf>::value ));
+  VERIFY(( !internal::has_ReturnType<int>::value ));
 
   VERIFY(internal::meta_sqrt<1>::ret == 1);
   #define VERIFY_META_SQRT(X) VERIFY(internal::meta_sqrt<X>::ret == int(std::sqrt(double(X))))
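Note: internal::has_ReturnType, exercised above, is a classic member-typedef detector. A self-contained sketch of the technique (Eigen's actual implementation differs in details):

    #include <iostream>

    template<typename T> struct make_void { typedef void type; };

    // Primary template: no nested ReturnType.
    template<typename T, typename Enable = void>
    struct has_ReturnType_sketch { enum { value = false }; };

    // Chosen by SFINAE whenever T::ReturnType names a valid type.
    template<typename T>
    struct has_ReturnType_sketch<T, typename make_void<typename T::ReturnType>::type>
    { enum { value = true }; };

    struct Foo { typedef int ReturnType; };

    int main()
    {
      std::cout << has_ReturnType_sketch<Foo>::value   // 1
                << has_ReturnType_sketch<int>::value   // 0
                << "\n";
      return 0;
    }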
diff --git a/test/metis_support.cpp b/test/metis_support.cpp
index d87c56a13..b490dacde 100644
--- a/test/metis_support.cpp
+++ b/test/metis_support.cpp
@@ -19,7 +19,7 @@ template<typename T> void test_metis_T()
   check_sparse_square_solving(sparselu_metis);
 }
 
-void test_metis_support()
+EIGEN_DECLARE_TEST(metis_support)
 {
   CALL_SUBTEST_1(test_metis_T<double>());
 }
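Note: the sparselu_metis solver named above is a SparseLU instance whose fill-reducing ordering is delegated to METIS. A typical standalone use looks like this sketch (requires a METIS installation to compile and link):

    #include <Eigen/SparseLU>
    #include <Eigen/MetisSupport>

    void solve_with_metis(const Eigen::SparseMatrix<double>& A,
                          const Eigen::VectorXd& b,
                          Eigen::VectorXd& x)
    {
      Eigen::SparseLU<Eigen::SparseMatrix<double, Eigen::ColMajor>,
                      Eigen::MetisOrdering<int> > solver;
      solver.analyzePattern(A); // METIS computes the fill-reducing permutation
      solver.factorize(A);
      x = solver.solve(b);
    }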
diff --git a/test/miscmatrices.cpp b/test/miscmatrices.cpp
index ef20dc749..e71712f33 100644
--- a/test/miscmatrices.cpp
+++ b/test/miscmatrices.cpp
@@ -14,7 +14,6 @@ template<typename MatrixType> void miscMatrices(const MatrixType& m)
 /* this test covers the following files:
    DiagonalMatrix.h Ones.h
 */
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
   Index rows = m.rows();
@@ -35,7 +34,7 @@ template<typename MatrixType> void miscMatrices(const MatrixType& m)
   VERIFY_IS_APPROX(square, MatrixType::Identity(rows, rows));
 }
 
-void test_miscmatrices()
+EIGEN_DECLARE_TEST(miscmatrices)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( miscMatrices(Matrix<float, 1, 1>()) );
diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp
index b796082cd..aad63ec2b 100644
--- a/test/mixingtypes.cpp
+++ b/test/mixingtypes.cpp
@@ -8,13 +8,27 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-// work around "uninitialized" warnings and give that option some testing
-#define EIGEN_INITIALIZE_MATRICES_BY_ZERO
+#if defined(EIGEN_TEST_PART_7)
 
 #ifndef EIGEN_NO_STATIC_ASSERT
 #define EIGEN_NO_STATIC_ASSERT // turn static asserts into runtime asserts in order to check them
 #endif
 
+// ignore double-promotion diagnostic for clang and gcc, if we check for static assertion anyway:
+// TODO do the same for MSVC?
+#if defined(__clang__)
+#  if (__clang_major__ * 100 + __clang_minor__) >= 308
+#    pragma clang diagnostic ignored "-Wdouble-promotion"
+#  endif
+#elif defined(__GNUC__)
+  // TODO is there a minimal GCC version for this? At least g++-4.7 seems to be fine with this.
+#  pragma GCC diagnostic ignored "-Wdouble-promotion"
+#endif
+
+#endif
+
+
 #if defined(EIGEN_TEST_PART_1) || defined(EIGEN_TEST_PART_2) || defined(EIGEN_TEST_PART_3)
 
 #ifndef EIGEN_DONT_VECTORIZE
@@ -35,6 +49,28 @@ using namespace std;
   VERIFY_IS_APPROX(XPR,REF); \
   VERIFY( g_called && #XPR" not properly optimized");
 
+template<int SizeAtCompileType>
+void raise_assertion(Index size = SizeAtCompileType)
+{
+  // VERIFY_RAISES_ASSERT(mf+md); // does not even compile
+  Matrix<float, SizeAtCompileType, 1> vf; vf.setRandom(size);
+  Matrix<double, SizeAtCompileType, 1> vd; vd.setRandom(size);
+  VERIFY_RAISES_ASSERT(vf=vd);
+  VERIFY_RAISES_ASSERT(vf+=vd);
+  VERIFY_RAISES_ASSERT(vf-=vd);
+  VERIFY_RAISES_ASSERT(vd=vf);
+  VERIFY_RAISES_ASSERT(vd+=vf);
+  VERIFY_RAISES_ASSERT(vd-=vf);
+
+  //   vd.asDiagonal() * mf;   // does not even compile
+  //   vcd.asDiagonal() * mf;  // does not even compile
+
+#if 0 // we get other compilation errors here than just static asserts
+  VERIFY_RAISES_ASSERT(vd.dot(vf));
+#endif
+}
+
+
 template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
 {
   typedef std::complex<float> CF;
@@ -73,13 +109,6 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
   while(std::abs(scf)<epsf) scf = internal::random<CF>();
   while(std::abs(scd)<epsd) scd = internal::random<CD>();
 
-//   VERIFY_RAISES_ASSERT(mf+md); // does not even compile
-
-#ifdef EIGEN_DONT_VECTORIZE
-  VERIFY_RAISES_ASSERT(vf=vd);
-  VERIFY_RAISES_ASSERT(vf+=vd);
-#endif
-
   // check scalar products
   VERIFY_MIX_SCALAR(vcf * sf , vcf * complex<float>(sf));
   VERIFY_MIX_SCALAR(sd * vcd , complex<double>(sd) * vcd);
@@ -119,9 +148,6 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
 
   // check dot product
   vf.dot(vf);
-#if 0 // we get other compilation errors here than just static asserts
-  VERIFY_RAISES_ASSERT(vd.dot(vf));
-#endif
   VERIFY_IS_APPROX(vcf.dot(vf), vcf.dot(vf.template cast<complex<float> >()));
 
   // check diagonal product
@@ -130,9 +156,6 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
   VERIFY_IS_APPROX(mcf * vf.asDiagonal(), mcf * vf.template cast<complex<float> >().asDiagonal());
   VERIFY_IS_APPROX(md * vcd.asDiagonal(), md.template cast<complex<double> >() * vcd.asDiagonal());
 
-//   vd.asDiagonal() * mf;    // does not even compile
-//   vcd.asDiagonal() * mf;   // does not even compile
-
   // check inner product
   VERIFY_IS_APPROX((vf.transpose() * vcf).value(), (vf.template cast<complex<float> >().transpose() * vcf).value());
@@ -286,7 +309,7 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
   VERIFY_IS_APPROX( rcd.noalias() -= mcd + md*md, - ((md*md).eval().template cast<CD>()) );
 }
 
-void test_mixingtypes()
+EIGEN_DECLARE_TEST(mixingtypes)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1(mixingtypes<3>());
@@ -296,5 +319,10 @@ void test_mixingtypes()
     CALL_SUBTEST_4(mixingtypes<3>());
     CALL_SUBTEST_5(mixingtypes<4>());
     CALL_SUBTEST_6(mixingtypes<Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)));
+    CALL_SUBTEST_7(raise_assertion<Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE)));
   }
+  CALL_SUBTEST_7(raise_assertion<0>());
+  CALL_SUBTEST_7(raise_assertion<3>());
+  CALL_SUBTEST_7(raise_assertion<4>());
+  CALL_SUBTEST_7(raise_assertion<Dynamic>(0));
 }
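Note: raise_assertion() only works because test part 7 is built with EIGEN_NO_STATIC_ASSERT, which demotes Eigen's compile-time "you mixed different numeric types" check to a runtime assert that VERIFY_RAISES_ASSERT can catch. A minimal repro of the mechanism (this snippet does not compile at all without the define):

    #define EIGEN_NO_STATIC_ASSERT // demote static asserts to runtime asserts
    #include <Eigen/Core>

    int main()
    {
      Eigen::Vector3f vf = Eigen::Vector3f::Ones();
      Eigen::Vector3d vd = Eigen::Vector3d::Ones();
      vf = vd; // normally a static assertion; now it aborts at runtime instead
      return 0;
    }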
diff --git a/test/nesting_ops.cpp b/test/nesting_ops.cpp
index a419b0e44..4b5fc21f2 100644
--- a/test/nesting_ops.cpp
+++ b/test/nesting_ops.cpp
@@ -91,7 +91,7 @@ template<typename MatrixType> void run_nesting_ops_2(const MatrixType& _m)
 }
 
 
-void test_nesting_ops()
+EIGEN_DECLARE_TEST(nesting_ops)
 {
   CALL_SUBTEST_1(run_nesting_ops_1(MatrixXf::Random(25,25)));
   CALL_SUBTEST_2(run_nesting_ops_1(MatrixXcd::Random(25,25)));
diff --git a/test/nomalloc.cpp b/test/nomalloc.cpp
index 50756c2fb..cb4c073e9 100644
--- a/test/nomalloc.cpp
+++ b/test/nomalloc.cpp
@@ -24,7 +24,6 @@ template<typename MatrixType> void nomalloc(const MatrixType& m)
 {
   /* this test check no dynamic memory allocation are issued with fixed-size matrices
   */
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
 
   Index rows = m.rows();
@@ -173,7 +172,7 @@ template<typename MatrixType> void test_reference(const MatrixType& m) {
   typedef typename MatrixType::Scalar Scalar;
   enum { Flag          =  MatrixType::IsRowMajor ? Eigen::RowMajor : Eigen::ColMajor};
   enum { TransposeFlag = !MatrixType::IsRowMajor ? Eigen::RowMajor : Eigen::ColMajor};
-  typename MatrixType::Index rows = m.rows(), cols=m.cols();
+  Index rows = m.rows(), cols=m.cols();
   typedef Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Flag         > MatrixX;
   typedef Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, TransposeFlag> MatrixXT;
   // Dynamic reference:
@@ -203,7 +202,7 @@ template<typename MatrixType> void test_reference(const MatrixType& m) {
 }
 
-void test_nomalloc()
+EIGEN_DECLARE_TEST(nomalloc)
 {
   // create some dynamic objects
   Eigen::MatrixXd M1 = MatrixXd::Random(3,3);
diff --git a/test/nullary.cpp b/test/nullary.cpp
index acd55506e..12b9e122f 100644
--- a/test/nullary.cpp
+++ b/test/nullary.cpp
@@ -191,6 +191,24 @@ void testVectorType(const VectorType& base)
       }
     }
   }
+
+  // test setUnit()
+  if(m.size()>0)
+  {
+    for(Index k=0; k<10; ++k)
+    {
+      Index i = internal::random<Index>(0,m.size()-1);
+      m.setUnit(i);
+      VERIFY_IS_APPROX( m, VectorType::Unit(m.size(), i) );
+    }
+    if(VectorType::SizeAtCompileTime==Dynamic)
+    {
+      Index i = internal::random<Index>(0,2*m.size()-1);
+      m.setUnit(2*m.size(),i);
+      VERIFY_IS_APPROX( m, VectorType::Unit(m.size(),i) );
+    }
+  }
+
 }
 
 template<typename MatrixType>
@@ -221,45 +239,28 @@ void testMatrixType(const MatrixType& m)
   VERIFY_IS_APPROX( A(i,j), s1 );
 }
 
-void test_nullary()
+template<int>
+void bug79()
 {
-  CALL_SUBTEST_1( testMatrixType(Matrix2d()) );
-  CALL_SUBTEST_2( testMatrixType(MatrixXcf(internal::random<int>(1,300),internal::random<int>(1,300))) );
-  CALL_SUBTEST_3( testMatrixType(MatrixXf(internal::random<int>(1,300),internal::random<int>(1,300))) );
-
-  for(int i = 0; i < g_repeat*10; i++) {
-    CALL_SUBTEST_4( testVectorType(VectorXd(internal::random<int>(1,30000))) );
-    CALL_SUBTEST_5( testVectorType(Vector4d()) );  // regression test for bug 232
-    CALL_SUBTEST_6( testVectorType(Vector3d()) );
-    CALL_SUBTEST_7( testVectorType(VectorXf(internal::random<int>(1,30000))) );
-    CALL_SUBTEST_8( testVectorType(Vector3f()) );
-    CALL_SUBTEST_8( testVectorType(Vector4f()) );
-    CALL_SUBTEST_8( testVectorType(Matrix<float,8,1>()) );
-    CALL_SUBTEST_8( testVectorType(Matrix<float,1,1>()) );
-
-    CALL_SUBTEST_9( testVectorType(VectorXi(internal::random<int>(1,10))) );
-    CALL_SUBTEST_9( testVectorType(VectorXi(internal::random<int>(9,300))) );
-    CALL_SUBTEST_9( testVectorType(Matrix<int,1,1>()) );
-  }
-
-#ifdef EIGEN_TEST_PART_6
   // Assignment of a RowVectorXd to a MatrixXd (regression test for bug #79).
   VERIFY( (MatrixXd(RowVectorXd::LinSpaced(3, 0, 1)) - RowVector3d(0, 0.5, 1)).norm() < std::numeric_limits<double>::epsilon() );
-#endif
+}
 
-#ifdef EIGEN_TEST_PART_9
+template<int>
+void nullary_overflow()
+{
   // Check possible overflow issue
-  {
-    int n = 60000;
-    ArrayXi a1(n), a2(n);
-    a1.setLinSpaced(n, 0, n-1);
-    for(int i=0; i<n; ++i)
-      a2(i) = i;
-    VERIFY_IS_APPROX(a1,a2);
-  }
-#endif
+  int n = 60000;
+  ArrayXi a1(n), a2(n);
+  a1.setLinSpaced(n, 0, n-1);
+  for(int i=0; i<n; ++i)
+    a2(i) = i;
+  VERIFY_IS_APPROX(a1,a2);
+}
 
-#ifdef EIGEN_TEST_PART_10
+template<int>
+void nullary_internal_logic()
+{
   // check some internal logic
   VERIFY((  internal::has_nullary_operator<internal::scalar_constant_op<double> >::value ));
   VERIFY(( !internal::has_unary_operator<internal::scalar_constant_op<double> >::value ));
@@ -300,5 +301,30 @@ void test_nullary()
   VERIFY(( !internal::has_binary_operator<internal::linspaced_op<float,float> >::value ));
   VERIFY((  internal::functor_has_linear_access<internal::linspaced_op<float,float> >::ret ));
   }
-#endif
+}
+
+EIGEN_DECLARE_TEST(nullary)
+{
+  CALL_SUBTEST_1( testMatrixType(Matrix2d()) );
+  CALL_SUBTEST_2( testMatrixType(MatrixXcf(internal::random<int>(1,300),internal::random<int>(1,300))) );
+  CALL_SUBTEST_3( testMatrixType(MatrixXf(internal::random<int>(1,300),internal::random<int>(1,300))) );
+
+  for(int i = 0; i < g_repeat*10; i++) {
+    CALL_SUBTEST_4( testVectorType(VectorXd(internal::random<int>(1,30000))) );
+    CALL_SUBTEST_5( testVectorType(Vector4d()) );  // regression test for bug 232
+    CALL_SUBTEST_6( testVectorType(Vector3d()) );
+    CALL_SUBTEST_7( testVectorType(VectorXf(internal::random<int>(1,30000))) );
+    CALL_SUBTEST_8( testVectorType(Vector3f()) );
+    CALL_SUBTEST_8( testVectorType(Vector4f()) );
+    CALL_SUBTEST_8( testVectorType(Matrix<float,8,1>()) );
+    CALL_SUBTEST_8( testVectorType(Matrix<float,1,1>()) );
+
+    CALL_SUBTEST_9( testVectorType(VectorXi(internal::random<int>(1,10))) );
+    CALL_SUBTEST_9( testVectorType(VectorXi(internal::random<int>(9,300))) );
+    CALL_SUBTEST_9( testVectorType(Matrix<int,1,1>()) );
+  }
+
+  CALL_SUBTEST_6( bug79<0>() );
+  CALL_SUBTEST_9( nullary_overflow<0>() );
+  CALL_SUBTEST_10( nullary_internal_logic<0>() );
 }
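Note: the new setUnit() checks in nullary.cpp exercise the in-place counterpart of VectorType::Unit(size, i) introduced in this series; the two-argument overload resizes first. For example:

    #include <Eigen/Core>
    #include <iostream>

    int main()
    {
      Eigen::VectorXd v(4);
      v.setUnit(2);                        // v = (0, 0, 1, 0)
      std::cout << v.transpose() << "\n";
      v.setUnit(6, 5);                     // resize to 6, then v(5) = 1
      std::cout << v.transpose() << "\n";  // 0 0 0 0 0 1
      return 0;
    }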
diff --git a/test/num_dimensions.cpp b/test/num_dimensions.cpp
new file mode 100644
index 000000000..7ad7ef697
--- /dev/null
+++ b/test/num_dimensions.cpp
@@ -0,0 +1,90 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/SparseCore>
+
+template<int ExpectedDim,typename Xpr>
+void check_dim(const Xpr& ) {
+  STATIC_CHECK( Xpr::NumDimensions == ExpectedDim );
+}
+
+#if EIGEN_HAS_CXX11
+template