merge with default Eigen

This commit is contained in:
Gael Guennebaud 2018-09-21 11:51:49 +02:00
commit a488d59787
717 changed files with 39462 additions and 10488 deletions

View File

@ -13,7 +13,7 @@ core
core.* core.*
*.bak *.bak
*~ *~
build* *build*
*.moc.* *.moc.*
*.moc *.moc
ui_* ui_*

View File

@ -8,6 +8,7 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there. You may need to remove CMakeCache.txt. ") message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there. You may need to remove CMakeCache.txt. ")
endif() endif()
# Alias Eigen_*_DIR to Eigen3_*_DIR: # Alias Eigen_*_DIR to Eigen3_*_DIR:
set(Eigen_SOURCE_DIR ${Eigen3_SOURCE_DIR}) set(Eigen_SOURCE_DIR ${Eigen3_SOURCE_DIR})
@ -41,10 +42,13 @@ string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen_minor_
set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}") set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}")
set(EIGEN_VERSION_NUMBER ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION}) set(EIGEN_VERSION_NUMBER ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})
# if the mercurial program is absent, this will leave the EIGEN_HG_CHANGESET string empty, # if we are not in a mercurial clone
# but won't stop CMake. if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.hg)
execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT) # if the mercurial program is absent or this will leave the EIGEN_HG_CHANGESET string empty,
execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT) # but won't stop CMake.
execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT)
execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT)
endif()
# if this is the default (aka development) branch, extract the mercurial changeset number from the hg tip output... # if this is the default (aka development) branch, extract the mercurial changeset number from the hg tip output...
if(EIGEN_BRANCH_OUTPUT MATCHES "default") if(EIGEN_BRANCH_OUTPUT MATCHES "default")
@ -104,7 +108,7 @@ if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
option(EIGEN_BUILD_PKGCONFIG "Build pkg-config .pc file for Eigen" ON) option(EIGEN_BUILD_PKGCONFIG "Build pkg-config .pc file for Eigen" ON)
endif() endif()
set(CMAKE_INCLUDE_CURRENT_DIR ON) set(CMAKE_INCLUDE_CURRENT_DIR OFF)
option(EIGEN_SPLIT_LARGE_TESTS "Split large tests into smaller executables" ON) option(EIGEN_SPLIT_LARGE_TESTS "Split large tests into smaller executables" ON)
@ -153,11 +157,7 @@ if(NOT MSVC)
ei_add_cxx_compiler_flag("-Wdouble-promotion") ei_add_cxx_compiler_flag("-Wdouble-promotion")
# ei_add_cxx_compiler_flag("-Wconversion") # ei_add_cxx_compiler_flag("-Wconversion")
# -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6 ei_add_cxx_compiler_flag("-Wshadow")
# if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0"))
if(NOT CMAKE_COMPILER_IS_GNUCXX)
ei_add_cxx_compiler_flag("-Wshadow")
endif()
ei_add_cxx_compiler_flag("-Wno-psabi") ei_add_cxx_compiler_flag("-Wno-psabi")
ei_add_cxx_compiler_flag("-Wno-variadic-macros") ei_add_cxx_compiler_flag("-Wno-variadic-macros")
@ -232,7 +232,10 @@ if(NOT MSVC)
option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF) option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF)
if(EIGEN_TEST_AVX512) if(EIGEN_TEST_AVX512)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -fabi-version=6 -DEIGEN_ENABLE_AVX512") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -DEIGEN_ENABLE_AVX512")
if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6")
endif()
message(STATUS "Enabling AVX512 in tests/examples") message(STATUS "Enabling AVX512 in tests/examples")
endif() endif()
@ -254,6 +257,12 @@ if(NOT MSVC)
message(STATUS "Enabling VSX in tests/examples") message(STATUS "Enabling VSX in tests/examples")
endif() endif()
option(EIGEN_TEST_MSA "Enable/Disable MSA in tests/examples" OFF)
if(EIGEN_TEST_MSA)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmsa")
message(STATUS "Enabling MSA in tests/examples")
endif()
option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF) option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
if(EIGEN_TEST_NEON) if(EIGEN_TEST_NEON)
if(EIGEN_TEST_FMA) if(EIGEN_TEST_FMA)
@ -271,12 +280,18 @@ if(NOT MSVC)
message(STATUS "Enabling NEON in tests/examples") message(STATUS "Enabling NEON in tests/examples")
endif() endif()
option(EIGEN_TEST_ZVECTOR "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF) option(EIGEN_TEST_Z13 "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF)
if(EIGEN_TEST_ZVECTOR) if(EIGEN_TEST_Z13)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z13 -mzvector") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z13 -mzvector")
message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples") message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples")
endif() endif()
option(EIGEN_TEST_Z14 "Enable/Disable S390X(zEC14) ZVECTOR in tests/examples" OFF)
if(EIGEN_TEST_Z14)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z14 -mzvector")
message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples")
endif()
check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP) check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP)
if(COMPILER_SUPPORT_OPENMP) if(COMPILER_SUPPORT_OPENMP)
option(EIGEN_TEST_OPENMP "Enable/Disable OpenMP in tests/examples" OFF) option(EIGEN_TEST_OPENMP "Enable/Disable OpenMP in tests/examples" OFF)
@ -363,7 +378,7 @@ option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tens
set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code") set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code")
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) include_directories(${CMAKE_CURRENT_SOURCE_DIR})
# Backward compatibility support for EIGEN_INCLUDE_INSTALL_DIR # Backward compatibility support for EIGEN_INCLUDE_INSTALL_DIR
if(EIGEN_INCLUDE_INSTALL_DIR) if(EIGEN_INCLUDE_INSTALL_DIR)
@ -437,10 +452,17 @@ endif()
# add SYCL # add SYCL
option(EIGEN_TEST_SYCL "Add Sycl support." OFF) option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation (ComputeCPP by default)." OFF)
if(EIGEN_TEST_SYCL) if(EIGEN_TEST_SYCL)
set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}") set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}")
include(FindComputeCpp) if(EIGEN_SYCL_TRISYCL)
endif() message(STATUS "Using triSYCL")
include(FindTriSYCL)
else(EIGEN_SYCL_TRISYCL)
message(STATUS "Using ComputeCPP SYCL")
include(FindComputeCpp)
endif(EIGEN_SYCL_TRISYCL)
endif(EIGEN_TEST_SYCL)
add_subdirectory(unsupported) add_subdirectory(unsupported)
@ -516,6 +538,7 @@ if (NOT CMAKE_VERSION VERSION_LESS 3.0)
# Imported target support # Imported target support
add_library (eigen INTERFACE) add_library (eigen INTERFACE)
add_library (Eigen3::Eigen ALIAS eigen)
target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS}) target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS})
target_include_directories (eigen INTERFACE target_include_directories (eigen INTERFACE

View File

@ -11,7 +11,7 @@ set(CTEST_DROP_METHOD "http")
set(CTEST_DROP_SITE "manao.inria.fr") set(CTEST_DROP_SITE "manao.inria.fr")
set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen") set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen")
set(CTEST_DROP_SITE_CDASH TRUE) set(CTEST_DROP_SITE_CDASH TRUE)
set(CTEST_PROJECT_SUBPROJECTS #set(CTEST_PROJECT_SUBPROJECTS
Official #Official
Unsupported #Unsupported
) #)

View File

@ -1,3 +1,4 @@
set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "2000") set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "2000")
set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "2000") set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "2000")
list(APPEND CTEST_CUSTOM_ERROR_EXCEPTION @EIGEN_CTEST_ERROR_EXCEPTION@)

View File

@ -9,6 +9,7 @@
#define EIGEN_CHOLESKY_MODULE_H #define EIGEN_CHOLESKY_MODULE_H
#include "Core" #include "Core"
#include "Jacobi"
#include "src/Core/util/DisableStupidWarnings.h" #include "src/Core/util/DisableStupidWarnings.h"
@ -31,7 +32,11 @@
#include "src/Cholesky/LLT.h" #include "src/Cholesky/LLT.h"
#include "src/Cholesky/LDLT.h" #include "src/Cholesky/LDLT.h"
#ifdef EIGEN_USE_LAPACKE #ifdef EIGEN_USE_LAPACKE
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h" #include "src/misc/lapacke.h"
#endif
#include "src/Cholesky/LLT_LAPACKE.h" #include "src/Cholesky/LLT_LAPACKE.h"
#endif #endif

View File

@ -14,61 +14,26 @@
// first thing Eigen does: stop the compiler from committing suicide // first thing Eigen does: stop the compiler from committing suicide
#include "src/Core/util/DisableStupidWarnings.h" #include "src/Core/util/DisableStupidWarnings.h"
// Handle NVCC/CUDA/SYCL // then include this file where all our macros are defined. It's really important to do it first because
#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__) // it's where we do all the compiler/OS/arch detections and define most defaults.
// Do not try asserts on CUDA and SYCL! #include "src/Core/util/Macros.h"
#ifndef EIGEN_NO_DEBUG
#define EIGEN_NO_DEBUG
#endif
#ifdef EIGEN_INTERNAL_DEBUGGING // This detects SSE/AVX/NEON/etc. and configure alignment settings
#undef EIGEN_INTERNAL_DEBUGGING #include "src/Core/util/ConfigureVectorization.h"
#endif
#ifdef EIGEN_EXCEPTIONS // We need cuda_runtime.h/hip_runtime.h to ensure that
#undef EIGEN_EXCEPTIONS // the EIGEN_USING_STD_MATH macro works properly on the device side
#endif #if defined(EIGEN_CUDACC)
#include <cuda_runtime.h>
// All functions callable from CUDA code must be qualified with __device__ #elif defined(EIGEN_HIPCC)
#ifdef __CUDACC__ #include <hip/hip_runtime.h>
// Do not try to vectorize on CUDA and SYCL!
#ifndef EIGEN_DONT_VECTORIZE
#define EIGEN_DONT_VECTORIZE
#endif
#define EIGEN_DEVICE_FUNC __host__ __device__
// We need math_functions.hpp to ensure that that EIGEN_USING_STD_MATH macro
// works properly on the device side
#include <math_functions.hpp>
#else
#define EIGEN_DEVICE_FUNC
#endif
#else
#define EIGEN_DEVICE_FUNC
#endif #endif
// When compiling CUDA device code with NVCC, pull in math functions from the
// global namespace. In host mode, and when device doee with clang, use the
// std versions.
#if defined(__CUDA_ARCH__) && defined(__NVCC__)
#define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
#else
#define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
#endif
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
#define EIGEN_EXCEPTIONS
#endif
#ifdef EIGEN_EXCEPTIONS #ifdef EIGEN_EXCEPTIONS
#include <new> #include <new>
#endif #endif
// then include this file where all our macros are defined. It's really important to do it first because
// it's where we do all the alignment settings (platform detection and honoring the user's will if he
// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization.
#include "src/Core/util/Macros.h"
// Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3) // Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
// See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details. // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6) #if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6)
@ -81,169 +46,9 @@
// and inclusion of their respective header files // and inclusion of their respective header files
#include "src/Core/util/MKL_support.h" #include "src/Core/util/MKL_support.h"
// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
#if EIGEN_MAX_ALIGN_BYTES==0
#ifndef EIGEN_DONT_VECTORIZE
#define EIGEN_DONT_VECTORIZE
#endif
#endif
#if EIGEN_COMP_MSVC #if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
#include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled #define EIGEN_HAS_GPU_FP16
#if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
// Remember that usage of defined() in a #define is undefined by the standard.
// a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
#if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
#define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
#endif
#endif
#else
// Remember that usage of defined() in a #define is undefined by the standard
#if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
#define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
#endif
#endif
#ifndef EIGEN_DONT_VECTORIZE
#if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
// Defines symbols for compile-time detection of which instructions are
// used.
// EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_SSE
#define EIGEN_VECTORIZE_SSE2
// Detect sse3/ssse3/sse4:
// gcc and icc defines __SSE3__, ...
// there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
// want to force the use of those instructions with msvc.
#ifdef __SSE3__
#define EIGEN_VECTORIZE_SSE3
#endif
#ifdef __SSSE3__
#define EIGEN_VECTORIZE_SSSE3
#endif
#ifdef __SSE4_1__
#define EIGEN_VECTORIZE_SSE4_1
#endif
#ifdef __SSE4_2__
#define EIGEN_VECTORIZE_SSE4_2
#endif
#ifdef __AVX__
#define EIGEN_VECTORIZE_AVX
#define EIGEN_VECTORIZE_SSE3
#define EIGEN_VECTORIZE_SSSE3
#define EIGEN_VECTORIZE_SSE4_1
#define EIGEN_VECTORIZE_SSE4_2
#endif
#ifdef __AVX2__
#define EIGEN_VECTORIZE_AVX2
#define EIGEN_VECTORIZE_AVX
#define EIGEN_VECTORIZE_SSE3
#define EIGEN_VECTORIZE_SSSE3
#define EIGEN_VECTORIZE_SSE4_1
#define EIGEN_VECTORIZE_SSE4_2
#endif
#ifdef __FMA__
#define EIGEN_VECTORIZE_FMA
#endif
#if defined(__AVX512F__)
#define EIGEN_VECTORIZE_AVX512
#define EIGEN_VECTORIZE_AVX2
#define EIGEN_VECTORIZE_AVX
#define EIGEN_VECTORIZE_FMA
#define EIGEN_VECTORIZE_SSE3
#define EIGEN_VECTORIZE_SSSE3
#define EIGEN_VECTORIZE_SSE4_1
#define EIGEN_VECTORIZE_SSE4_2
#ifdef __AVX512DQ__
#define EIGEN_VECTORIZE_AVX512DQ
#endif
#endif
// include files
// This extern "C" works around a MINGW-w64 compilation issue
// https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
// In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
// However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
// with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
// so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
// notice that since these are C headers, the extern "C" is theoretically needed anyways.
extern "C" {
// In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
// Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
#if EIGEN_COMP_ICC >= 1110
#include <immintrin.h>
#else
#include <mmintrin.h>
#include <emmintrin.h>
#include <xmmintrin.h>
#ifdef EIGEN_VECTORIZE_SSE3
#include <pmmintrin.h>
#endif
#ifdef EIGEN_VECTORIZE_SSSE3
#include <tmmintrin.h>
#endif
#ifdef EIGEN_VECTORIZE_SSE4_1
#include <smmintrin.h>
#endif
#ifdef EIGEN_VECTORIZE_SSE4_2
#include <nmmintrin.h>
#endif
#if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
#include <immintrin.h>
#endif
#endif
} // end extern "C"
#elif defined __VSX__
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_VSX
#include <altivec.h>
// We need to #undef all these ugly tokens defined in <altivec.h>
// => use __vector instead of vector
#undef bool
#undef vector
#undef pixel
#elif defined __ALTIVEC__
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_ALTIVEC
#include <altivec.h>
// We need to #undef all these ugly tokens defined in <altivec.h>
// => use __vector instead of vector
#undef bool
#undef vector
#undef pixel
#elif (defined __ARM_NEON) || (defined __ARM_NEON__)
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_NEON
#include <arm_neon.h>
#elif (defined __s390x__ && defined __VEC__)
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_ZVECTOR
#include <vecintrin.h>
#endif
#endif
#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG)
// We can use the optimized fp16 to float and float to fp16 conversion routines
#define EIGEN_HAS_FP16_C
#endif
#if defined __CUDACC__
#define EIGEN_VECTORIZE_CUDA
#include <vector_types.h>
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#define EIGEN_HAS_CUDA_FP16
#endif
#endif
#if defined EIGEN_HAS_CUDA_FP16
#include <host_defines.h>
#include <cuda_fp16.h>
#endif #endif
#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
@ -275,6 +80,10 @@
// for min/max: // for min/max:
#include <algorithm> #include <algorithm>
#if EIGEN_HAS_CXX11
#include <array>
#endif
// for std::is_nothrow_move_assignable // for std::is_nothrow_move_assignable
#ifdef EIGEN_INCLUDE_TYPE_TRAITS #ifdef EIGEN_INCLUDE_TYPE_TRAITS
#include <type_traits> #include <type_traits>
@ -299,38 +108,6 @@
#include <SYCL/sycl.hpp> #include <SYCL/sycl.hpp>
#endif #endif
/** \brief Namespace containing all symbols from the %Eigen library. */
namespace Eigen {
inline static const char *SimdInstructionSetsInUse(void) {
#if defined(EIGEN_VECTORIZE_AVX512)
return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_AVX)
return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_SSE4_2)
return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_SSE4_1)
return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
#elif defined(EIGEN_VECTORIZE_SSSE3)
return "SSE, SSE2, SSE3, SSSE3";
#elif defined(EIGEN_VECTORIZE_SSE3)
return "SSE, SSE2, SSE3";
#elif defined(EIGEN_VECTORIZE_SSE2)
return "SSE, SSE2";
#elif defined(EIGEN_VECTORIZE_ALTIVEC)
return "AltiVec";
#elif defined(EIGEN_VECTORIZE_VSX)
return "VSX";
#elif defined(EIGEN_VECTORIZE_NEON)
return "ARM NEON";
#elif defined(EIGEN_VECTORIZE_ZVECTOR)
return "S390X ZVECTOR";
#else
return "None";
#endif
}
} // end namespace Eigen
#if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT #if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT
// This will generate an error message: // This will generate an error message:
@ -339,7 +116,7 @@ inline static const char *SimdInstructionSetsInUse(void) {
namespace Eigen { namespace Eigen {
// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to // we use size_t frequently and we'll never remember to prepend it with std:: every time just to
// ensure QNX/QCC support // ensure QNX/QCC support
using std::size_t; using std::size_t;
// gcc 4.6.0 wants std:: for ptrdiff_t // gcc 4.6.0 wants std:: for ptrdiff_t
@ -366,11 +143,11 @@ using std::ptrdiff_t;
#include "src/Core/util/IntegralConstant.h" #include "src/Core/util/IntegralConstant.h"
#include "src/Core/util/SymbolicIndex.h" #include "src/Core/util/SymbolicIndex.h"
#include "src/Core/NumTraits.h" #include "src/Core/NumTraits.h"
#include "src/Core/MathFunctions.h" #include "src/Core/MathFunctions.h"
#include "src/Core/GenericPacketMath.h" #include "src/Core/GenericPacketMath.h"
#include "src/Core/MathFunctionsImpl.h" #include "src/Core/MathFunctionsImpl.h"
#include "src/Core/arch/Default/ConjHelper.h"
#if defined EIGEN_VECTORIZE_AVX512 #if defined EIGEN_VECTORIZE_AVX512
#include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/PacketMath.h"
@ -388,6 +165,7 @@ using std::ptrdiff_t;
#include "src/Core/arch/AVX/MathFunctions.h" #include "src/Core/arch/AVX/MathFunctions.h"
#include "src/Core/arch/AVX/Complex.h" #include "src/Core/arch/AVX/Complex.h"
#include "src/Core/arch/AVX/TypeCasting.h" #include "src/Core/arch/AVX/TypeCasting.h"
#include "src/Core/arch/SSE/TypeCasting.h"
#elif defined EIGEN_VECTORIZE_SSE #elif defined EIGEN_VECTORIZE_SSE
#include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/SSE/MathFunctions.h"
@ -401,22 +179,33 @@ using std::ptrdiff_t;
#include "src/Core/arch/NEON/PacketMath.h" #include "src/Core/arch/NEON/PacketMath.h"
#include "src/Core/arch/NEON/MathFunctions.h" #include "src/Core/arch/NEON/MathFunctions.h"
#include "src/Core/arch/NEON/Complex.h" #include "src/Core/arch/NEON/Complex.h"
#include "src/Core/arch/NEON/TypeCasting.h"
#elif defined EIGEN_VECTORIZE_ZVECTOR #elif defined EIGEN_VECTORIZE_ZVECTOR
#include "src/Core/arch/ZVector/PacketMath.h" #include "src/Core/arch/ZVector/PacketMath.h"
#include "src/Core/arch/ZVector/MathFunctions.h" #include "src/Core/arch/ZVector/MathFunctions.h"
#include "src/Core/arch/ZVector/Complex.h" #include "src/Core/arch/ZVector/Complex.h"
#elif defined EIGEN_VECTORIZE_MSA
#include "src/Core/arch/MSA/PacketMath.h"
#include "src/Core/arch/MSA/MathFunctions.h"
#include "src/Core/arch/MSA/Complex.h"
#endif #endif
// Half float support // Half float support
#include "src/Core/arch/CUDA/Half.h" #include "src/Core/arch/GPU/Half.h"
#include "src/Core/arch/CUDA/PacketMathHalf.h" #include "src/Core/arch/GPU/PacketMathHalf.h"
#include "src/Core/arch/CUDA/TypeCasting.h" #include "src/Core/arch/GPU/TypeCasting.h"
#if defined EIGEN_VECTORIZE_CUDA #if defined EIGEN_VECTORIZE_GPU
#include "src/Core/arch/CUDA/PacketMath.h" #include "src/Core/arch/GPU/PacketMath.h"
#include "src/Core/arch/CUDA/MathFunctions.h" #include "src/Core/arch/GPU/MathFunctions.h"
#endif #endif
#if defined EIGEN_VECTORIZE_SYCL
#include "src/Core/arch/SYCL/InteropHeaders.h"
#include "src/Core/arch/SYCL/PacketMath.h"
#include "src/Core/arch/SYCL/MathFunctions.h"
#include "src/Core/arch/SYCL/TypeCasting.h"
#endif
#include "src/Core/arch/Default/Settings.h" #include "src/Core/arch/Default/Settings.h"
#include "src/Core/functors/TernaryFunctors.h" #include "src/Core/functors/TernaryFunctors.h"
@ -428,7 +217,9 @@ using std::ptrdiff_t;
// Specialized functors to enable the processing of complex numbers // Specialized functors to enable the processing of complex numbers
// on CUDA devices // on CUDA devices
#ifdef EIGEN_CUDACC
#include "src/Core/arch/CUDA/Complex.h" #include "src/Core/arch/CUDA/Complex.h"
#endif
#include "src/Core/util/IndexedViewHelper.h" #include "src/Core/util/IndexedViewHelper.h"
#include "src/Core/util/ReshapedHelper.h" #include "src/Core/util/ReshapedHelper.h"

View File

@ -10,14 +10,14 @@
#include "Core" #include "Core"
#include "src/Core/util/DisableStupidWarnings.h"
#include "Cholesky" #include "Cholesky"
#include "Jacobi" #include "Jacobi"
#include "Householder" #include "Householder"
#include "LU" #include "LU"
#include "Geometry" #include "Geometry"
#include "src/Core/util/DisableStupidWarnings.h"
/** \defgroup Eigenvalues_Module Eigenvalues module /** \defgroup Eigenvalues_Module Eigenvalues module
* *
* *
@ -45,7 +45,11 @@
#include "src/Eigenvalues/GeneralizedEigenSolver.h" #include "src/Eigenvalues/GeneralizedEigenSolver.h"
#include "src/Eigenvalues/MatrixBaseEigenvalues.h" #include "src/Eigenvalues/MatrixBaseEigenvalues.h"
#ifdef EIGEN_USE_LAPACKE #ifdef EIGEN_USE_LAPACKE
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h" #include "src/misc/lapacke.h"
#endif
#include "src/Eigenvalues/RealSchur_LAPACKE.h" #include "src/Eigenvalues/RealSchur_LAPACKE.h"
#include "src/Eigenvalues/ComplexSchur_LAPACKE.h" #include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
#include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h" #include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"

View File

@ -10,12 +10,12 @@
#include "Core" #include "Core"
#include "src/Core/util/DisableStupidWarnings.h"
#include "SVD" #include "SVD"
#include "LU" #include "LU"
#include <limits> #include <limits>
#include "src/Core/util/DisableStupidWarnings.h"
/** \defgroup Geometry_Module Geometry module /** \defgroup Geometry_Module Geometry module
* *
* This module provides support for: * This module provides support for:

41
Eigen/KLUSupport Normal file
View File

@ -0,0 +1,41 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_KLUSUPPORT_MODULE_H
#define EIGEN_KLUSUPPORT_MODULE_H
#include <Eigen/SparseCore>
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
extern "C" {
#include <btf.h>
#include <klu.h>
}
/** \ingroup Support_modules
* \defgroup KLUSupport_Module KLUSupport module
*
* This module provides an interface to the KLU library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
* It provides the following factorization class:
* - class KLU: a sparse LU factorization, well-suited for circuit simulation.
*
* \code
* #include <Eigen/KLUSupport>
* \endcode
*
* In order to use this module, the klu and btf headers must be accessible from the include paths, and your binary must be linked to the klu library and its dependencies.
* The dependencies depend on how umfpack has been compiled.
* For a cmake based project, you can use our FindKLU.cmake module to help you in this task.
*
*/
#include "src/KLUSupport/KLUSupport.h"
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
#endif // EIGEN_KLUSUPPORT_MODULE_H

View File

@ -28,7 +28,11 @@
#include "src/LU/FullPivLU.h" #include "src/LU/FullPivLU.h"
#include "src/LU/PartialPivLU.h" #include "src/LU/PartialPivLU.h"
#ifdef EIGEN_USE_LAPACKE #ifdef EIGEN_USE_LAPACKE
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h" #include "src/misc/lapacke.h"
#endif
#include "src/LU/PartialPivLU_LAPACKE.h" #include "src/LU/PartialPivLU_LAPACKE.h"
#endif #endif
#include "src/LU/Determinant.h" #include "src/LU/Determinant.h"

View File

@ -36,6 +36,7 @@ extern "C" {
* \endcode * \endcode
* *
* In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be linked to the PaSTiX library and its dependencies. * In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be linked to the PaSTiX library and its dependencies.
* This wrapper resuires PaStiX version 5.x compiled without MPI support.
* The dependencies depend on how PaSTiX has been compiled. * The dependencies depend on how PaSTiX has been compiled.
* For a cmake based project, you can use our FindPaSTiX.cmake module to help you in this task. * For a cmake based project, you can use our FindPaSTiX.cmake module to help you in this task.
* *

0
Eigen/PardisoSupport Executable file → Normal file
View File

View File

@ -10,12 +10,12 @@
#include "Core" #include "Core"
#include "src/Core/util/DisableStupidWarnings.h"
#include "Cholesky" #include "Cholesky"
#include "Jacobi" #include "Jacobi"
#include "Householder" #include "Householder"
#include "src/Core/util/DisableStupidWarnings.h"
/** \defgroup QR_Module QR module /** \defgroup QR_Module QR module
* *
* *
@ -36,7 +36,11 @@
#include "src/QR/ColPivHouseholderQR.h" #include "src/QR/ColPivHouseholderQR.h"
#include "src/QR/CompleteOrthogonalDecomposition.h" #include "src/QR/CompleteOrthogonalDecomposition.h"
#ifdef EIGEN_USE_LAPACKE #ifdef EIGEN_USE_LAPACKE
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h" #include "src/misc/lapacke.h"
#endif
#include "src/QR/HouseholderQR_LAPACKE.h" #include "src/QR/HouseholderQR_LAPACKE.h"
#include "src/QR/ColPivHouseholderQR_LAPACKE.h" #include "src/QR/ColPivHouseholderQR_LAPACKE.h"
#endif #endif

View File

@ -27,7 +27,7 @@ void qFree(void *ptr)
void *qRealloc(void *ptr, std::size_t size) void *qRealloc(void *ptr, std::size_t size)
{ {
void* newPtr = Eigen::internal::aligned_malloc(size); void* newPtr = Eigen::internal::aligned_malloc(size);
memcpy(newPtr, ptr, size); std::memcpy(newPtr, ptr, size);
Eigen::internal::aligned_free(ptr); Eigen::internal::aligned_free(ptr);
return newPtr; return newPtr;
} }

View File

@ -37,7 +37,11 @@
#include "src/SVD/JacobiSVD.h" #include "src/SVD/JacobiSVD.h"
#include "src/SVD/BDCSVD.h" #include "src/SVD/BDCSVD.h"
#if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT) #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h" #include "src/misc/lapacke.h"
#endif
#include "src/SVD/JacobiSVD_LAPACKE.h" #include "src/SVD/JacobiSVD_LAPACKE.h"
#endif #endif

View File

@ -23,6 +23,8 @@
// Ordering interface // Ordering interface
#include "OrderingMethods" #include "OrderingMethods"
#include "src/Core/util/DisableStupidWarnings.h"
#include "src/SparseLU/SparseLU_gemm_kernel.h" #include "src/SparseLU/SparseLU_gemm_kernel.h"
#include "src/SparseLU/SparseLU_Structs.h" #include "src/SparseLU/SparseLU_Structs.h"
@ -43,4 +45,6 @@
#include "src/SparseLU/SparseLU_Utils.h" #include "src/SparseLU/SparseLU_Utils.h"
#include "src/SparseLU/SparseLU.h" #include "src/SparseLU/SparseLU.h"
#include "src/Core/util/ReenableStupidWarnings.h"
#endif // EIGEN_SPARSELU_MODULE_H #endif // EIGEN_SPARSELU_MODULE_H

View File

@ -28,7 +28,6 @@
* *
*/ */
#include "OrderingMethods"
#include "src/SparseCore/SparseColEtree.h" #include "src/SparseCore/SparseColEtree.h"
#include "src/SparseQR/SparseQR.h" #include "src/SparseQR/SparseQR.h"

View File

@ -247,8 +247,8 @@ template<typename _MatrixType, int _UpLo> class LDLT
/** \brief Reports whether previous computation was successful. /** \brief Reports whether previous computation was successful.
* *
* \returns \c Success if computation was succesful, * \returns \c Success if computation was successful,
* \c NumericalIssue if the matrix.appears to be negative. * \c NumericalIssue if the factorization failed because of a zero pivot.
*/ */
ComputationInfo info() const ComputationInfo info() const
{ {
@ -258,7 +258,6 @@ template<typename _MatrixType, int _UpLo> class LDLT
#ifndef EIGEN_PARSED_BY_DOXYGEN #ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType> template<typename RhsType, typename DstType>
EIGEN_DEVICE_FUNC
void _solve_impl(const RhsType &rhs, DstType &dst) const; void _solve_impl(const RhsType &rhs, DstType &dst) const;
#endif #endif
@ -376,6 +375,8 @@ template<> struct ldlt_inplace<Lower>
if((rs>0) && pivot_is_valid) if((rs>0) && pivot_is_valid)
A21 /= realAkk; A21 /= realAkk;
else if(rs>0)
ret = ret && (A21.array()==Scalar(0)).all();
if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed
else if(!pivot_is_valid) found_zero_pivot = true; else if(!pivot_is_valid) found_zero_pivot = true;
@ -568,13 +569,14 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
// more precisely, use pseudo-inverse of D (see bug 241) // more precisely, use pseudo-inverse of D (see bug 241)
using std::abs; using std::abs;
const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD()); const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
// In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon // In some previous versions, tolerance was set to the max of 1/highest (or rather numeric_limits::min())
// as motivated by LAPACK's xGELSS: // and the maximal diagonal entry * epsilon as motivated by LAPACK's xGELSS:
// RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest()); // RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
// However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest // However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest
// diagonal element is not well justified and leads to numerical issues in some cases. // diagonal element is not well justified and leads to numerical issues in some cases.
// Moreover, Lapack's xSYTRS routines use 0 for the tolerance. // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest(); // Using numeric_limits::min() gives us more robustness to denormals.
RealScalar tolerance = (std::numeric_limits<RealScalar>::min)();
for (Index i = 0; i < vecD.size(); ++i) for (Index i = 0; i < vecD.size(); ++i)
{ {

View File

@ -24,7 +24,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
* *
* \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition * \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
* \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
* The other triangular part won't be read. * The other triangular part won't be read.
* *
* This class performs a LL^T Cholesky decomposition of a symmetric, positive definite * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite
* matrix A such that A = LL^* = U^*U, where L is lower triangular. * matrix A such that A = LL^* = U^*U, where L is lower triangular.
@ -41,14 +41,18 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
* Example: \include LLT_example.cpp * Example: \include LLT_example.cpp
* Output: \verbinclude LLT_example.out * Output: \verbinclude LLT_example.out
* *
* \b Performance: for best performance, it is recommended to use a column-major storage format
* with the Lower triangular part (the default), or, equivalently, a row-major storage format
* with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization
* step, and rank-updates can be up to 3 times slower.
*
* This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
* *
* Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered.
* Therefore, the strict lower part does not have to store correct values.
*
* \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
*/ */
/* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
* Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
* the strict lower part does not have to store correct values.
*/
template<typename _MatrixType, int _UpLo> class LLT template<typename _MatrixType, int _UpLo> class LLT
{ {
public: public:
@ -96,7 +100,7 @@ template<typename _MatrixType, int _UpLo> class LLT
compute(matrix.derived()); compute(matrix.derived());
} }
/** \brief Constructs a LDLT factorization from a given matrix /** \brief Constructs a LLT factorization from a given matrix
* *
* This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
* \c MatrixType is a Eigen::Ref. * \c MatrixType is a Eigen::Ref.
@ -146,7 +150,7 @@ template<typename _MatrixType, int _UpLo> class LLT
} }
template<typename Derived> template<typename Derived>
void solveInPlace(MatrixBase<Derived> &bAndX) const; void solveInPlace(const MatrixBase<Derived> &bAndX) const;
template<typename InputType> template<typename InputType>
LLT& compute(const EigenBase<InputType>& matrix); LLT& compute(const EigenBase<InputType>& matrix);
@ -176,8 +180,8 @@ template<typename _MatrixType, int _UpLo> class LLT
/** \brief Reports whether previous computation was successful. /** \brief Reports whether previous computation was successful.
* *
* \returns \c Success if computation was succesful, * \returns \c Success if computation was successful,
* \c NumericalIssue if the matrix.appears to be negative. * \c NumericalIssue if the matrix.appears not to be positive definite.
*/ */
ComputationInfo info() const ComputationInfo info() const
{ {
@ -196,11 +200,10 @@ template<typename _MatrixType, int _UpLo> class LLT
inline Index cols() const { return m_matrix.cols(); } inline Index cols() const { return m_matrix.cols(); }
template<typename VectorType> template<typename VectorType>
LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1); LLT & rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
#ifndef EIGEN_PARSED_BY_DOXYGEN #ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType> template<typename RhsType, typename DstType>
EIGEN_DEVICE_FUNC
void _solve_impl(const RhsType &rhs, DstType &dst) const; void _solve_impl(const RhsType &rhs, DstType &dst) const;
#endif #endif
@ -425,7 +428,8 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
eigen_assert(a.rows()==a.cols()); eigen_assert(a.rows()==a.cols());
const Index size = a.rows(); const Index size = a.rows();
m_matrix.resize(size, size); m_matrix.resize(size, size);
m_matrix = a.derived(); if (!internal::is_same_dense(m_matrix, a.derived()))
m_matrix = a.derived();
// Compute matrix L1 norm = max abs column sum. // Compute matrix L1 norm = max abs column sum.
m_l1_norm = RealScalar(0); m_l1_norm = RealScalar(0);
@ -454,7 +458,7 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
*/ */
template<typename _MatrixType, int _UpLo> template<typename _MatrixType, int _UpLo>
template<typename VectorType> template<typename VectorType>
LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma) LLT<_MatrixType,_UpLo> & LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
{ {
EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType); EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType);
eigen_assert(v.size()==m_matrix.cols()); eigen_assert(v.size()==m_matrix.cols());
@ -485,11 +489,14 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
* *
* This version avoids a copy when the right hand side matrix b is not needed anymore. * This version avoids a copy when the right hand side matrix b is not needed anymore.
* *
* \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
* This function will const_cast it, so constness isn't honored here.
*
* \sa LLT::solve(), MatrixBase::llt() * \sa LLT::solve(), MatrixBase::llt()
*/ */
template<typename MatrixType, int _UpLo> template<typename MatrixType, int _UpLo>
template<typename Derived> template<typename Derived>
void LLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const void LLT<MatrixType,_UpLo>::solveInPlace(const MatrixBase<Derived> &bAndX) const
{ {
eigen_assert(m_isInitialized && "LLT is not initialized."); eigen_assert(m_isInitialized && "LLT is not initialized.");
eigen_assert(m_matrix.rows()==bAndX.rows()); eigen_assert(m_matrix.rows()==bAndX.rows());

View File

@ -10,7 +10,7 @@
#ifndef EIGEN_CHOLMODSUPPORT_H #ifndef EIGEN_CHOLMODSUPPORT_H
#define EIGEN_CHOLMODSUPPORT_H #define EIGEN_CHOLMODSUPPORT_H
namespace Eigen { namespace Eigen {
namespace internal { namespace internal {
@ -79,12 +79,12 @@ cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> >
res.dtype = 0; res.dtype = 0;
res.stype = -1; res.stype = -1;
if (internal::is_same<_StorageIndex,int>::value) if (internal::is_same<_StorageIndex,int>::value)
{ {
res.itype = CHOLMOD_INT; res.itype = CHOLMOD_INT;
} }
else if (internal::is_same<_StorageIndex,long>::value) else if (internal::is_same<_StorageIndex,SuiteSparse_long>::value)
{ {
res.itype = CHOLMOD_LONG; res.itype = CHOLMOD_LONG;
} }
@ -95,9 +95,9 @@ cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> >
// setup res.xtype // setup res.xtype
internal::cholmod_configure_matrix<_Scalar>::run(res); internal::cholmod_configure_matrix<_Scalar>::run(res);
res.stype = 0; res.stype = 0;
return res; return res;
} }
@ -121,7 +121,7 @@ template<typename _Scalar, int _Options, typename _Index, unsigned int UpLo>
cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<const SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat) cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<const SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
{ {
cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.matrix().const_cast_derived())); cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.matrix().const_cast_derived()));
if(UpLo==Upper) res.stype = 1; if(UpLo==Upper) res.stype = 1;
if(UpLo==Lower) res.stype = -1; if(UpLo==Lower) res.stype = -1;
// swap stype for rowmajor matrices (only works for real matrices) // swap stype for rowmajor matrices (only works for real matrices)
@ -167,12 +167,12 @@ namespace internal {
// template specializations for int and long that call the correct cholmod method // template specializations for int and long that call the correct cholmod method
#define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \ #define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \
template<typename _StorageIndex> ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \ template<typename _StorageIndex> inline ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \
template<> ret cm_ ## name<long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); } template<> inline ret cm_ ## name<SuiteSparse_long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); }
#define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \ #define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \
template<typename _StorageIndex> ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \ template<typename _StorageIndex> inline ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \
template<> ret cm_ ## name<long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); } template<> inline ret cm_ ## name<SuiteSparse_long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); }
EIGEN_CHOLMOD_SPECIALIZE0(int, start) EIGEN_CHOLMOD_SPECIALIZE0(int, start)
EIGEN_CHOLMOD_SPECIALIZE0(int, finish) EIGEN_CHOLMOD_SPECIALIZE0(int, finish)
@ -183,16 +183,16 @@ EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A)
EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A) EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A)
template<typename _StorageIndex> cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); } template<typename _StorageIndex> inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); }
template<> cholmod_dense* cm_solve<long> (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); } template<> inline cholmod_dense* cm_solve<SuiteSparse_long> (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); }
template<typename _StorageIndex> cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); } template<typename _StorageIndex> inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); }
template<> cholmod_sparse* cm_spsolve<long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); } template<> inline cholmod_sparse* cm_spsolve<SuiteSparse_long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
template<typename _StorageIndex> template<typename _StorageIndex>
int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, &Common); } inline int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, &Common); }
template<> template<>
int cm_factorize_p<long> (cholmod_sparse* A, double beta[2], long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); } inline int cm_factorize_p<SuiteSparse_long> (cholmod_sparse* A, double beta[2], SuiteSparse_long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
#undef EIGEN_CHOLMOD_SPECIALIZE0 #undef EIGEN_CHOLMOD_SPECIALIZE0
#undef EIGEN_CHOLMOD_SPECIALIZE1 #undef EIGEN_CHOLMOD_SPECIALIZE1
@ -254,10 +254,10 @@ class CholmodBase : public SparseSolverBase<Derived>
internal::cm_free_factor<StorageIndex>(m_cholmodFactor, m_cholmod); internal::cm_free_factor<StorageIndex>(m_cholmodFactor, m_cholmod);
internal::cm_finish<StorageIndex>(m_cholmod); internal::cm_finish<StorageIndex>(m_cholmod);
} }
inline StorageIndex cols() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); } inline StorageIndex cols() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
inline StorageIndex rows() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); } inline StorageIndex rows() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
/** \brief Reports whether previous computation was successful. /** \brief Reports whether previous computation was successful.
* *
* \returns \c Success if computation was successful, * \returns \c Success if computation was successful,
@ -276,11 +276,11 @@ class CholmodBase : public SparseSolverBase<Derived>
factorize(matrix); factorize(matrix);
return derived(); return derived();
} }
/** Performs a symbolic decomposition on the sparsity pattern of \a matrix. /** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
* *
* This function is particularly useful when solving for several problems having the same structure. * This function is particularly useful when solving for several problems having the same structure.
* *
* \sa factorize() * \sa factorize()
*/ */
void analyzePattern(const MatrixType& matrix) void analyzePattern(const MatrixType& matrix)
@ -292,13 +292,13 @@ class CholmodBase : public SparseSolverBase<Derived>
} }
cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>()); cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
m_cholmodFactor = internal::cm_analyze<StorageIndex>(A, m_cholmod); m_cholmodFactor = internal::cm_analyze<StorageIndex>(A, m_cholmod);
this->m_isInitialized = true; this->m_isInitialized = true;
this->m_info = Success; this->m_info = Success;
m_analysisIsOk = true; m_analysisIsOk = true;
m_factorizationIsOk = false; m_factorizationIsOk = false;
} }
/** Performs a numeric decomposition of \a matrix /** Performs a numeric decomposition of \a matrix
* *
* The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed. * The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed.
@ -315,11 +315,11 @@ class CholmodBase : public SparseSolverBase<Derived>
this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue); this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue);
m_factorizationIsOk = true; m_factorizationIsOk = true;
} }
/** Returns a reference to the Cholmod's configuration structure to get a full control over the performed operations. /** Returns a reference to the Cholmod's configuration structure to get a full control over the performed operations.
* See the Cholmod user guide for details. */ * See the Cholmod user guide for details. */
cholmod_common& cholmod() { return m_cholmod; } cholmod_common& cholmod() { return m_cholmod; }
#ifndef EIGEN_PARSED_BY_DOXYGEN #ifndef EIGEN_PARSED_BY_DOXYGEN
/** \internal */ /** \internal */
template<typename Rhs,typename Dest> template<typename Rhs,typename Dest>
@ -329,7 +329,7 @@ class CholmodBase : public SparseSolverBase<Derived>
const Index size = m_cholmodFactor->n; const Index size = m_cholmodFactor->n;
EIGEN_UNUSED_VARIABLE(size); EIGEN_UNUSED_VARIABLE(size);
eigen_assert(size==b.rows()); eigen_assert(size==b.rows());
// Cholmod needs column-major storage without inner-stride, which corresponds to the default behavior of Ref. // Cholmod needs column-major storage without inner-stride, which corresponds to the default behavior of Ref.
Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b.derived()); Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b.derived());
@ -345,7 +345,7 @@ class CholmodBase : public SparseSolverBase<Derived>
dest = Matrix<Scalar,Dest::RowsAtCompileTime,Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),b.rows(),b.cols()); dest = Matrix<Scalar,Dest::RowsAtCompileTime,Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),b.rows(),b.cols());
internal::cm_free_dense<StorageIndex>(x_cd, m_cholmod); internal::cm_free_dense<StorageIndex>(x_cd, m_cholmod);
} }
/** \internal */ /** \internal */
template<typename RhsDerived, typename DestDerived> template<typename RhsDerived, typename DestDerived>
void _solve_impl(const SparseMatrixBase<RhsDerived> &b, SparseMatrixBase<DestDerived> &dest) const void _solve_impl(const SparseMatrixBase<RhsDerived> &b, SparseMatrixBase<DestDerived> &dest) const
@ -370,8 +370,8 @@ class CholmodBase : public SparseSolverBase<Derived>
internal::cm_free_sparse<StorageIndex>(x_cs, m_cholmod); internal::cm_free_sparse<StorageIndex>(x_cs, m_cholmod);
} }
#endif // EIGEN_PARSED_BY_DOXYGEN #endif // EIGEN_PARSED_BY_DOXYGEN
/** Sets the shift parameter that will be used to adjust the diagonal coefficients during the numerical factorization. /** Sets the shift parameter that will be used to adjust the diagonal coefficients during the numerical factorization.
* *
* During the numerical factorization, an offset term is added to the diagonal coefficients:\n * During the numerical factorization, an offset term is added to the diagonal coefficients:\n
@ -386,7 +386,7 @@ class CholmodBase : public SparseSolverBase<Derived>
m_shiftOffset[0] = double(offset); m_shiftOffset[0] = double(offset);
return derived(); return derived();
} }
/** \returns the determinant of the underlying matrix from the current factorization */ /** \returns the determinant of the underlying matrix from the current factorization */
Scalar determinant() const Scalar determinant() const
{ {
@ -441,7 +441,7 @@ class CholmodBase : public SparseSolverBase<Derived>
template<typename Stream> template<typename Stream>
void dumpMemory(Stream& /*s*/) void dumpMemory(Stream& /*s*/)
{} {}
protected: protected:
mutable cholmod_common m_cholmod; mutable cholmod_common m_cholmod;
cholmod_factor* m_cholmodFactor; cholmod_factor* m_cholmodFactor;
@ -478,11 +478,11 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
{ {
typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT> Base; typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT> Base;
using Base::m_cholmod; using Base::m_cholmod;
public: public:
typedef _MatrixType MatrixType; typedef _MatrixType MatrixType;
CholmodSimplicialLLT() : Base() { init(); } CholmodSimplicialLLT() : Base() { init(); }
CholmodSimplicialLLT(const MatrixType& matrix) : Base() CholmodSimplicialLLT(const MatrixType& matrix) : Base()
@ -529,11 +529,11 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
{ {
typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT> Base; typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT> Base;
using Base::m_cholmod; using Base::m_cholmod;
public: public:
typedef _MatrixType MatrixType; typedef _MatrixType MatrixType;
CholmodSimplicialLDLT() : Base() { init(); } CholmodSimplicialLDLT() : Base() { init(); }
CholmodSimplicialLDLT(const MatrixType& matrix) : Base() CholmodSimplicialLDLT(const MatrixType& matrix) : Base()
@ -578,11 +578,11 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
{ {
typedef CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT> Base; typedef CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT> Base;
using Base::m_cholmod; using Base::m_cholmod;
public: public:
typedef _MatrixType MatrixType; typedef _MatrixType MatrixType;
CholmodSupernodalLLT() : Base() { init(); } CholmodSupernodalLLT() : Base() { init(); }
CholmodSupernodalLLT(const MatrixType& matrix) : Base() CholmodSupernodalLLT(const MatrixType& matrix) : Base()
@ -629,11 +629,11 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
{ {
typedef CholmodBase<_MatrixType, _UpLo, CholmodDecomposition> Base; typedef CholmodBase<_MatrixType, _UpLo, CholmodDecomposition> Base;
using Base::m_cholmod; using Base::m_cholmod;
public: public:
typedef _MatrixType MatrixType; typedef _MatrixType MatrixType;
CholmodDecomposition() : Base() { init(); } CholmodDecomposition() : Base() { init(); }
CholmodDecomposition(const MatrixType& matrix) : Base() CholmodDecomposition(const MatrixType& matrix) : Base()
@ -643,7 +643,7 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
} }
~CholmodDecomposition() {} ~CholmodDecomposition() {}
void setMode(CholmodMode mode) void setMode(CholmodMode mode)
{ {
switch(mode) switch(mode)

View File

@ -29,17 +29,17 @@ template<int N> struct aseq_negate<FixedInt<N> > {
template<> struct aseq_negate<FixedInt<DynamicIndex> > {}; template<> struct aseq_negate<FixedInt<DynamicIndex> > {};
template<typename FirstType,typename SizeType,typename IncrType, template<typename FirstType,typename SizeType,typename IncrType,
bool FirstIsSymbolic=Symbolic::is_symbolic<FirstType>::value, bool FirstIsSymbolic=symbolic::is_symbolic<FirstType>::value,
bool SizeIsSymbolic =Symbolic::is_symbolic<SizeType>::value> bool SizeIsSymbolic =symbolic::is_symbolic<SizeType>::value>
struct aseq_reverse_first_type { struct aseq_reverse_first_type {
typedef Index type; typedef Index type;
}; };
template<typename FirstType,typename SizeType,typename IncrType> template<typename FirstType,typename SizeType,typename IncrType>
struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,true> { struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,true> {
typedef Symbolic::AddExpr<FirstType, typedef symbolic::AddExpr<FirstType,
Symbolic::ProductExpr<Symbolic::AddExpr<SizeType,Symbolic::ValueExpr<FixedInt<-1> > >, symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
Symbolic::ValueExpr<IncrType> > symbolic::ValueExpr<IncrType> >
> type; > type;
}; };
@ -56,14 +56,14 @@ struct aseq_reverse_first_type_aux<SizeType,IncrType,typename internal::enable_i
template<typename FirstType,typename SizeType,typename IncrType> template<typename FirstType,typename SizeType,typename IncrType>
struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,false> { struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,false> {
typedef typename aseq_reverse_first_type_aux<SizeType,IncrType>::type Aux; typedef typename aseq_reverse_first_type_aux<SizeType,IncrType>::type Aux;
typedef Symbolic::AddExpr<FirstType,Symbolic::ValueExpr<Aux> > type; typedef symbolic::AddExpr<FirstType,symbolic::ValueExpr<Aux> > type;
}; };
template<typename FirstType,typename SizeType,typename IncrType> template<typename FirstType,typename SizeType,typename IncrType>
struct aseq_reverse_first_type<FirstType,SizeType,IncrType,false,true> { struct aseq_reverse_first_type<FirstType,SizeType,IncrType,false,true> {
typedef Symbolic::AddExpr<Symbolic::ProductExpr<Symbolic::AddExpr<SizeType,Symbolic::ValueExpr<FixedInt<-1> > >, typedef symbolic::AddExpr<symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
Symbolic::ValueExpr<IncrType> >, symbolic::ValueExpr<IncrType> >,
Symbolic::ValueExpr<> > type; symbolic::ValueExpr<> > type;
}; };
#endif #endif
@ -225,10 +225,11 @@ auto seq(FirstType f, LastType l, IncrType incr)
-typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr)) / CleanedIncrType(incr), -typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr)) / CleanedIncrType(incr),
CleanedIncrType(incr)); CleanedIncrType(incr));
} }
#else
#else // EIGEN_HAS_CXX11
template<typename FirstType,typename LastType> template<typename FirstType,typename LastType>
typename internal::enable_if<!(Symbolic::is_symbolic<FirstType>::value || Symbolic::is_symbolic<LastType>::value), typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index> >::type ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index> >::type
seq(FirstType f, LastType l) seq(FirstType f, LastType l)
{ {
@ -237,35 +238,35 @@ seq(FirstType f, LastType l)
} }
template<typename FirstTypeDerived,typename LastType> template<typename FirstTypeDerived,typename LastType>
typename internal::enable_if<!Symbolic::is_symbolic<LastType>::value, typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
ArithmeticSequence<FirstTypeDerived, Symbolic::AddExpr<Symbolic::AddExpr<Symbolic::NegateExpr<FirstTypeDerived>,Symbolic::ValueExpr<> >, ArithmeticSequence<FirstTypeDerived, symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,symbolic::ValueExpr<> >,
Symbolic::ValueExpr<internal::FixedInt<1> > > > >::type symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, LastType l) seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l)
{ {
return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+fix<1>())); return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+fix<1>()));
} }
template<typename FirstType,typename LastTypeDerived> template<typename FirstType,typename LastTypeDerived>
typename internal::enable_if<!Symbolic::is_symbolic<FirstType>::value, typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type, ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived,Symbolic::ValueExpr<> >, symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
Symbolic::ValueExpr<internal::FixedInt<1> > > > >::type symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
seq(FirstType f, const Symbolic::BaseExpr<LastTypeDerived> &l) seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l)
{ {
return seqN(typename internal::cleanup_index_type<FirstType>::type(f),(l.derived()-typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>())); return seqN(typename internal::cleanup_index_type<FirstType>::type(f),(l.derived()-typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>()));
} }
template<typename FirstTypeDerived,typename LastTypeDerived> template<typename FirstTypeDerived,typename LastTypeDerived>
ArithmeticSequence<FirstTypeDerived, ArithmeticSequence<FirstTypeDerived,
Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived,Symbolic::NegateExpr<FirstTypeDerived> >,Symbolic::ValueExpr<internal::FixedInt<1> > > > symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::NegateExpr<FirstTypeDerived> >,symbolic::ValueExpr<internal::FixedInt<1> > > >
seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, const Symbolic::BaseExpr<LastTypeDerived> &l) seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l)
{ {
return seqN(f.derived(),(l.derived()-f.derived()+fix<1>())); return seqN(f.derived(),(l.derived()-f.derived()+fix<1>()));
} }
template<typename FirstType,typename LastType, typename IncrType> template<typename FirstType,typename LastType, typename IncrType>
typename internal::enable_if<!(Symbolic::is_symbolic<FirstType>::value || Symbolic::is_symbolic<LastType>::value), typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index,typename internal::cleanup_seq_incr<IncrType>::type> >::type ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index,typename internal::cleanup_seq_incr<IncrType>::type> >::type
seq(FirstType f, LastType l, IncrType incr) seq(FirstType f, LastType l, IncrType incr)
{ {
@ -275,27 +276,27 @@ seq(FirstType f, LastType l, IncrType incr)
} }
template<typename FirstTypeDerived,typename LastType, typename IncrType> template<typename FirstTypeDerived,typename LastType, typename IncrType>
typename internal::enable_if<!Symbolic::is_symbolic<LastType>::value, typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
ArithmeticSequence<FirstTypeDerived, ArithmeticSequence<FirstTypeDerived,
Symbolic::QuotientExpr<Symbolic::AddExpr<Symbolic::AddExpr<Symbolic::NegateExpr<FirstTypeDerived>, symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,
Symbolic::ValueExpr<> >, symbolic::ValueExpr<> >,
Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >, symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >, symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
typename internal::cleanup_seq_incr<IncrType>::type> >::type typename internal::cleanup_seq_incr<IncrType>::type> >::type
seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, LastType l, IncrType incr) seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l, IncrType incr)
{ {
typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType; typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr); return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
} }
template<typename FirstType,typename LastTypeDerived, typename IncrType> template<typename FirstType,typename LastTypeDerived, typename IncrType>
typename internal::enable_if<!Symbolic::is_symbolic<FirstType>::value, typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type, ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
Symbolic::QuotientExpr<Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived,Symbolic::ValueExpr<> >, symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >, symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >, symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
typename internal::cleanup_seq_incr<IncrType>::type> >::type typename internal::cleanup_seq_incr<IncrType>::type> >::type
seq(FirstType f, const Symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr) seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
{ {
typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType; typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
return seqN(typename internal::cleanup_index_type<FirstType>::type(f), return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
@ -304,26 +305,55 @@ seq(FirstType f, const Symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
template<typename FirstTypeDerived,typename LastTypeDerived, typename IncrType> template<typename FirstTypeDerived,typename LastTypeDerived, typename IncrType>
ArithmeticSequence<FirstTypeDerived, ArithmeticSequence<FirstTypeDerived,
Symbolic::QuotientExpr<Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived, symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,
Symbolic::NegateExpr<FirstTypeDerived> >, symbolic::NegateExpr<FirstTypeDerived> >,
Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >, symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >, symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
typename internal::cleanup_seq_incr<IncrType>::type> typename internal::cleanup_seq_incr<IncrType>::type>
seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, const Symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr) seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
{ {
typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType; typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
return seqN(f.derived(),(l.derived()-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr); return seqN(f.derived(),(l.derived()-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
} }
#endif #endif // EIGEN_HAS_CXX11
#endif // EIGEN_PARSED_BY_DOXYGEN #endif // EIGEN_PARSED_BY_DOXYGEN
#if EIGEN_HAS_CXX11
/** \cpp11
* \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr.
*
* It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode
*
* \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
template<typename SizeType,typename IncrType>
auto lastN(SizeType size, IncrType incr)
-> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr))
{
return seqN(Eigen::last-(size-fix<1>())*incr, size, incr);
}
/** \cpp11
* \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment.
*
* It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode
*
* \sa lastN(SizeType,IncrType, seqN(FirstType,SizeType), seq(FirstType,LastType) */
template<typename SizeType>
auto lastN(SizeType size)
-> decltype(seqN(Eigen::last+fix<1>()-size, size))
{
return seqN(Eigen::last+fix<1>()-size, size);
}
#endif
namespace internal { namespace internal {
// Convert a symbolic span into a usable one (i.e., remove last/end "keywords") // Convert a symbolic span into a usable one (i.e., remove last/end "keywords")
template<typename T> template<typename T>
struct make_size_type { struct make_size_type {
typedef typename internal::conditional<Symbolic::is_symbolic<T>::value, Index, T>::type type; typedef typename internal::conditional<symbolic::is_symbolic<T>::value, Index, T>::type type;
}; };
template<typename FirstType,typename SizeType,typename IncrType,int XprSize> template<typename FirstType,typename SizeType,typename IncrType,int XprSize>
@ -345,6 +375,39 @@ struct get_compile_time_incr<ArithmeticSequence<FirstType,SizeType,IncrType> > {
} // end namespace internal } // end namespace internal
/** \namespace Eigen::indexing
* \ingroup Core_Module
*
* The sole purpose of this namespace is to be able to import all functions
* and symbols that are expected to be used within operator() for indexing
* and slicing. If you already imported the whole Eigen namespace:
* \code using namespace Eigen; \endcode
* then you are already all set. Otherwise, if you don't want/cannot import
* the whole Eigen namespace, the following line:
* \code using namespace Eigen::indexing; \endcode
* is equivalent to:
* \code
using Eigen::all;
using Eigen::seq;
using Eigen::seqN;
using Eigen::lastN; // c++11 only
using Eigen::last;
using Eigen::lastp1;
using Eigen::fix;
\endcode
*/
namespace indexing {
using Eigen::all;
using Eigen::seq;
using Eigen::seqN;
#if EIGEN_HAS_CXX11
using Eigen::lastN;
#endif
using Eigen::last;
using Eigen::lastp1;
using Eigen::fix;
}
} // end namespace Eigen } // end namespace Eigen
#endif // EIGEN_ARITHMETIC_SEQUENCE_H #endif // EIGEN_ARITHMETIC_SEQUENCE_H

View File

@ -231,10 +231,16 @@ class Array
: Base(other) : Base(other)
{ } { }
private:
struct PrivateType {};
public:
/** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */ /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other) EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other,
typename internal::enable_if<internal::is_convertible<typename OtherDerived::Scalar,Scalar>::value,
PrivateType>::type = PrivateType())
: Base(other.derived()) : Base(other.derived())
{ } { }

View File

@ -175,7 +175,7 @@ template<typename Derived> class ArrayBase
*/ */
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived & EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other) ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
{ {
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>()); call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
@ -188,7 +188,7 @@ ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
*/ */
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived & EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other) ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
{ {
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>()); call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@ -201,7 +201,7 @@ ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
*/ */
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived & EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other) ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
{ {
call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar,typename OtherDerived::Scalar>()); call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar,typename OtherDerived::Scalar>());
@ -214,7 +214,7 @@ ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
*/ */
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived & EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other) ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
{ {
call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar,typename OtherDerived::Scalar>()); call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar,typename OtherDerived::Scalar>());

View File

@ -32,7 +32,8 @@ struct traits<ArrayWrapper<ExpressionType> >
// Let's remove NestByRefBit // Let's remove NestByRefBit
enum { enum {
Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags, Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
Flags = Flags0 & ~NestByRefBit LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
}; };
}; };
} }
@ -89,8 +90,8 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
inline void evalTo(Dest& dst) const { dst = m_expression; } inline void evalTo(Dest& dst) const { dst = m_expression; }
const typename internal::remove_all<NestedExpressionType>::type&
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
const typename internal::remove_all<NestedExpressionType>::type&
nestedExpression() const nestedExpression() const
{ {
return m_expression; return m_expression;
@ -129,7 +130,8 @@ struct traits<MatrixWrapper<ExpressionType> >
// Let's remove NestByRefBit // Let's remove NestByRefBit
enum { enum {
Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags, Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
Flags = Flags0 & ~NestByRefBit LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
}; };
}; };
} }

View File

@ -16,7 +16,7 @@ namespace Eigen {
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
::lazyAssign(const DenseBase<OtherDerived>& other) ::lazyAssign(const DenseBase<OtherDerived>& other)
{ {
enum{ enum{

View File

@ -39,7 +39,7 @@ public:
enum { enum {
DstAlignment = DstEvaluator::Alignment, DstAlignment = DstEvaluator::Alignment,
SrcAlignment = SrcEvaluator::Alignment, SrcAlignment = SrcEvaluator::Alignment,
DstHasDirectAccess = DstFlags & DirectAccessBit, DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit,
JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment) JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
}; };
@ -83,7 +83,7 @@ private:
&& int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0 && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
&& (EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment)>=int(InnerRequiredAlignment)), && (EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment)>=int(InnerRequiredAlignment)),
MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit), MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess)
&& (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic), && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
/* If the destination isn't aligned, we have to do runtime checks and we don't unroll, /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
so it's only good for large enough sizes. */ so it's only good for large enough sizes. */
@ -97,7 +97,7 @@ private:
public: public:
enum { enum {
Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal) Traversal = (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal)
: int(MayInnerVectorize) ? int(InnerVectorizedTraversal) : int(MayInnerVectorize) ? int(InnerVectorizedTraversal)
: int(MayLinearVectorize) ? int(LinearVectorizedTraversal) : int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
: int(MaySliceVectorize) ? int(SliceVectorizedTraversal) : int(MaySliceVectorize) ? int(SliceVectorizedTraversal)
@ -756,7 +756,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType
// AssignmentKind must define a Kind typedef. // AssignmentKind must define a Kind typedef.
template<typename DstShape, typename SrcShape> struct AssignmentKind; template<typename DstShape, typename SrcShape> struct AssignmentKind;
// Assignement kind defined in this file: // Assignment kind defined in this file:
struct Dense2Dense {}; struct Dense2Dense {};
struct EigenBase2EigenBase {}; struct EigenBase2EigenBase {};
@ -899,7 +899,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
src.evalTo(dst); src.evalTo(dst);
} }
// NOTE The following two functions are templated to avoid their instanciation if not needed // NOTE The following two functions are templated to avoid their instantiation if not needed
// This is needed because some expressions supports evalTo only and/or have 'void' as scalar type. // This is needed because some expressions supports evalTo only and/or have 'void' as scalar type.
template<typename SrcScalarType> template<typename SrcScalarType>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC

View File

@ -84,7 +84,8 @@ class vml_assign_traits
struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE,EIGENTYPE>, \ struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE,EIGENTYPE>, \
Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> { \ Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> { \
typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType; \ typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType; \
static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) { \ static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) { \
resize_if_allowed(dst, src, func); \
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) { \ if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) { \
VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \ VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \
@ -144,7 +145,8 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _)
Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> { \ Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> { \
typedef CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested, \ typedef CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested, \
const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> > SrcXprType; \ const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> > SrcXprType; \
static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) { \ static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) { \
resize_if_allowed(dst, src, func); \
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.rhs().functor().m_other); \ VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.rhs().functor().m_other); \
if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) \ if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) \

View File

@ -76,7 +76,7 @@ struct any_unroller<Derived, Dynamic, Rows>
* \sa any(), Cwise::operator<() * \sa any(), Cwise::operator<()
*/ */
template<typename Derived> template<typename Derived>
inline bool DenseBase<Derived>::all() const EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
{ {
typedef internal::evaluator<Derived> Evaluator; typedef internal::evaluator<Derived> Evaluator;
enum { enum {
@ -100,7 +100,7 @@ inline bool DenseBase<Derived>::all() const
* \sa all() * \sa all()
*/ */
template<typename Derived> template<typename Derived>
inline bool DenseBase<Derived>::any() const EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
{ {
typedef internal::evaluator<Derived> Evaluator; typedef internal::evaluator<Derived> Evaluator;
enum { enum {
@ -124,7 +124,7 @@ inline bool DenseBase<Derived>::any() const
* \sa all(), any() * \sa all(), any()
*/ */
template<typename Derived> template<typename Derived>
inline Eigen::Index DenseBase<Derived>::count() const EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase<Derived>::count() const
{ {
return derived().template cast<bool>().template cast<Index>().sum(); return derived().template cast<bool>().template cast<Index>().sum();
} }

View File

@ -141,7 +141,7 @@ struct CommaInitializer
* \sa CommaInitializer::finished(), class CommaInitializer * \sa CommaInitializer::finished(), class CommaInitializer
*/ */
template<typename Derived> template<typename Derived>
inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s) EIGEN_DEVICE_FUNC inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
{ {
return CommaInitializer<Derived>(*static_cast<Derived*>(this), s); return CommaInitializer<Derived>(*static_cast<Derived*>(this), s);
} }
@ -149,7 +149,7 @@ inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s
/** \sa operator<<(const Scalar&) */ /** \sa operator<<(const Scalar&) */
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
inline CommaInitializer<Derived> EIGEN_DEVICE_FUNC inline CommaInitializer<Derived>
DenseBase<Derived>::operator<<(const DenseBase<OtherDerived>& other) DenseBase<Derived>::operator<<(const DenseBase<OtherDerived>& other)
{ {
return CommaInitializer<Derived>(*static_cast<Derived *>(this), other); return CommaInitializer<Derived>(*static_cast<Derived *>(this), other);

View File

@ -134,19 +134,21 @@ private:
// this helper permits to completely eliminate m_outerStride if it is known at compiletime. // this helper permits to completely eliminate m_outerStride if it is known at compiletime.
template<typename Scalar,int OuterStride> class plainobjectbase_evaluator_data { template<typename Scalar,int OuterStride> class plainobjectbase_evaluator_data {
public: public:
plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr) EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr)
{ {
EIGEN_ONLY_USED_FOR_DEBUG(outerStride); #ifndef EIGEN_INTERNAL_DEBUGGING
EIGEN_UNUSED_VARIABLE(outerStride);
#endif
eigen_internal_assert(outerStride==OuterStride); eigen_internal_assert(outerStride==OuterStride);
} }
Index outerStride() const { return OuterStride; } EIGEN_DEVICE_FUNC Index outerStride() const { return OuterStride; }
const Scalar *data; const Scalar *data;
}; };
template<typename Scalar> class plainobjectbase_evaluator_data<Scalar,Dynamic> { template<typename Scalar> class plainobjectbase_evaluator_data<Scalar,Dynamic> {
public: public:
plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {} EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {}
Index outerStride() const { return m_outerStride; } EIGEN_DEVICE_FUNC Index outerStride() const { return m_outerStride; }
const Scalar *data; const Scalar *data;
protected: protected:
Index m_outerStride; Index m_outerStride;
@ -1034,7 +1036,7 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
OuterStrideAtCompileTime = HasSameStorageOrderAsArgType OuterStrideAtCompileTime = HasSameStorageOrderAsArgType
? int(outer_stride_at_compile_time<ArgType>::ret) ? int(outer_stride_at_compile_time<ArgType>::ret)
: int(inner_stride_at_compile_time<ArgType>::ret), : int(inner_stride_at_compile_time<ArgType>::ret),
MaskPacketAccessBit = (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0, MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0,
FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0, FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
FlagsRowMajorBit = XprType::Flags&RowMajorBit, FlagsRowMajorBit = XprType::Flags&RowMajorBit,
@ -1044,7 +1046,9 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit, Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit,
PacketAlignment = unpacket_traits<PacketScalar>::alignment, PacketAlignment = unpacket_traits<PacketScalar>::alignment,
Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0, Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic)
&& (OuterStrideAtCompileTime!=0)
&& (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0) Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0)
}; };
typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type; typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
@ -1075,14 +1079,16 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block) EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block)
: m_argImpl(block.nestedExpression()), : m_argImpl(block.nestedExpression()),
m_startRow(block.startRow()), m_startRow(block.startRow()),
m_startCol(block.startCol()) m_startCol(block.startCol()),
m_linear_offset(ForwardLinearAccess?(ArgType::IsRowMajor ? block.startRow()*block.nestedExpression().cols() + block.startCol() : block.startCol()*block.nestedExpression().rows() + block.startRow()):0)
{ } { }
typedef typename XprType::Scalar Scalar; typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::CoeffReturnType CoeffReturnType;
enum { enum {
RowsAtCompileTime = XprType::RowsAtCompileTime RowsAtCompileTime = XprType::RowsAtCompileTime,
ForwardLinearAccess = (InnerPanel || int(XprType::IsRowMajor)==int(ArgType::IsRowMajor)) && bool(evaluator<ArgType>::Flags&LinearAccessBit)
}; };
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@ -1094,7 +1100,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
CoeffReturnType coeff(Index index) const CoeffReturnType coeff(Index index) const
{ {
return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); if (ForwardLinearAccess)
return m_argImpl.coeff(m_linear_offset.value() + index);
else
return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
} }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@ -1106,7 +1115,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Scalar& coeffRef(Index index) Scalar& coeffRef(Index index)
{ {
return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); if (ForwardLinearAccess)
return m_argImpl.coeffRef(m_linear_offset.value() + index);
else
return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
} }
template<int LoadMode, typename PacketType> template<int LoadMode, typename PacketType>
@ -1120,8 +1132,11 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_STRONG_INLINE EIGEN_STRONG_INLINE
PacketType packet(Index index) const PacketType packet(Index index) const
{ {
return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index, if (ForwardLinearAccess)
RowsAtCompileTime == 1 ? index : 0); return m_argImpl.template packet<LoadMode,PacketType>(m_linear_offset.value() + index);
else
return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
RowsAtCompileTime == 1 ? index : 0);
} }
template<int StoreMode, typename PacketType> template<int StoreMode, typename PacketType>
@ -1135,15 +1150,19 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_STRONG_INLINE EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketType& x) void writePacket(Index index, const PacketType& x)
{ {
return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index, if (ForwardLinearAccess)
RowsAtCompileTime == 1 ? index : 0, return m_argImpl.template writePacket<StoreMode,PacketType>(m_linear_offset.value() + index, x);
x); else
return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
RowsAtCompileTime == 1 ? index : 0,
x);
} }
protected: protected:
evaluator<ArgType> m_argImpl; evaluator<ArgType> m_argImpl;
const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow; const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol; const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
const variable_if_dynamic<Index, ForwardLinearAccess ? Dynamic : 0> m_linear_offset;
}; };
// TODO: This evaluator does not actually use the child evaluator; // TODO: This evaluator does not actually use the child evaluator;

View File

@ -158,7 +158,7 @@ public:
*/ */
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived & EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other) MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
{ {
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>()); call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
@ -171,7 +171,7 @@ MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
*/ */
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived & EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other) MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
{ {
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>()); call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@ -181,4 +181,3 @@ MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
} // end namespace Eigen } // end namespace Eigen
#endif // EIGEN_CWISE_BINARY_OP_H #endif // EIGEN_CWISE_BINARY_OP_H

View File

@ -105,7 +105,7 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
*/ */
template<typename Derived> template<typename Derived>
template<typename CustomNullaryOp> template<typename CustomNullaryOp>
EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func) DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
{ {
return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func); return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
@ -131,7 +131,7 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
*/ */
template<typename Derived> template<typename Derived>
template<typename CustomNullaryOp> template<typename CustomNullaryOp>
EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func) DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
{ {
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@ -150,7 +150,7 @@ DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
*/ */
template<typename Derived> template<typename Derived>
template<typename CustomNullaryOp> template<typename CustomNullaryOp>
EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func) DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
{ {
return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func); return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
@ -170,7 +170,7 @@ DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
* \sa class CwiseNullaryOp * \sa class CwiseNullaryOp
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value) DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
{ {
return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value)); return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
@ -192,7 +192,7 @@ DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
* \sa class CwiseNullaryOp * \sa class CwiseNullaryOp
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Constant(Index size, const Scalar& value) DenseBase<Derived>::Constant(Index size, const Scalar& value)
{ {
return DenseBase<Derived>::NullaryExpr(size, internal::scalar_constant_op<Scalar>(value)); return DenseBase<Derived>::NullaryExpr(size, internal::scalar_constant_op<Scalar>(value));
@ -208,7 +208,7 @@ DenseBase<Derived>::Constant(Index size, const Scalar& value)
* \sa class CwiseNullaryOp * \sa class CwiseNullaryOp
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Constant(const Scalar& value) DenseBase<Derived>::Constant(const Scalar& value)
{ {
EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@ -220,7 +220,7 @@ DenseBase<Derived>::Constant(const Scalar& value)
* \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&) * \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&)
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high) DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
{ {
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@ -232,7 +232,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const
* \sa LinSpaced(Scalar,Scalar) * \sa LinSpaced(Scalar,Scalar)
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high) DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high)
{ {
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@ -264,7 +264,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig
* \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp * \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high) DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
{ {
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@ -276,7 +276,7 @@ DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
* Special version for fixed size types which does not require the size parameter. * Special version for fixed size types which does not require the size parameter.
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high) DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
{ {
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@ -286,7 +286,7 @@ DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
/** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */ /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
template<typename Derived> template<typename Derived>
bool DenseBase<Derived>::isApproxToConstant EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApproxToConstant
(const Scalar& val, const RealScalar& prec) const (const Scalar& val, const RealScalar& prec) const
{ {
typename internal::nested_eval<Derived,1>::type self(derived()); typename internal::nested_eval<Derived,1>::type self(derived());
@ -301,7 +301,7 @@ bool DenseBase<Derived>::isApproxToConstant
* *
* \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */ * \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */
template<typename Derived> template<typename Derived>
bool DenseBase<Derived>::isConstant EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isConstant
(const Scalar& val, const RealScalar& prec) const (const Scalar& val, const RealScalar& prec) const
{ {
return isApproxToConstant(val, prec); return isApproxToConstant(val, prec);
@ -312,7 +312,7 @@ bool DenseBase<Derived>::isConstant
* \sa setConstant(), Constant(), class CwiseNullaryOp * \sa setConstant(), Constant(), class CwiseNullaryOp
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
{ {
setConstant(val); setConstant(val);
} }
@ -322,7 +322,7 @@ EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
* \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes() * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
{ {
return derived() = Constant(rows(), cols(), val); return derived() = Constant(rows(), cols(), val);
} }
@ -337,7 +337,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
* \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&) * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val) PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
{ {
resize(size); resize(size);
@ -356,7 +356,7 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
* \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&) * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val) PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
{ {
resize(rows, cols); resize(rows, cols);
@ -380,7 +380,7 @@ PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
* \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp * \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
{ {
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar>(low,high,newSize)); return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar>(low,high,newSize));
@ -400,7 +400,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, con
* \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp * \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
{ {
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return setLinSpaced(size(), low, high); return setLinSpaced(size(), low, high);
@ -423,7 +423,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low,
* \sa Zero(), Zero(Index) * \sa Zero(), Zero(Index)
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Zero(Index rows, Index cols) DenseBase<Derived>::Zero(Index rows, Index cols)
{ {
return Constant(rows, cols, Scalar(0)); return Constant(rows, cols, Scalar(0));
@ -446,7 +446,7 @@ DenseBase<Derived>::Zero(Index rows, Index cols)
* \sa Zero(), Zero(Index,Index) * \sa Zero(), Zero(Index,Index)
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Zero(Index size) DenseBase<Derived>::Zero(Index size)
{ {
return Constant(size, Scalar(0)); return Constant(size, Scalar(0));
@ -463,7 +463,7 @@ DenseBase<Derived>::Zero(Index size)
* \sa Zero(Index), Zero(Index,Index) * \sa Zero(Index), Zero(Index,Index)
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Zero() DenseBase<Derived>::Zero()
{ {
return Constant(Scalar(0)); return Constant(Scalar(0));
@ -478,7 +478,7 @@ DenseBase<Derived>::Zero()
* \sa class CwiseNullaryOp, Zero() * \sa class CwiseNullaryOp, Zero()
*/ */
template<typename Derived> template<typename Derived>
bool DenseBase<Derived>::isZero(const RealScalar& prec) const EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isZero(const RealScalar& prec) const
{ {
typename internal::nested_eval<Derived,1>::type self(derived()); typename internal::nested_eval<Derived,1>::type self(derived());
for(Index j = 0; j < cols(); ++j) for(Index j = 0; j < cols(); ++j)
@ -496,7 +496,7 @@ bool DenseBase<Derived>::isZero(const RealScalar& prec) const
* \sa class CwiseNullaryOp, Zero() * \sa class CwiseNullaryOp, Zero()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero() EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
{ {
return setConstant(Scalar(0)); return setConstant(Scalar(0));
} }
@ -511,7 +511,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
* \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero() * \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setZero(Index newSize) PlainObjectBase<Derived>::setZero(Index newSize)
{ {
resize(newSize); resize(newSize);
@ -529,7 +529,7 @@ PlainObjectBase<Derived>::setZero(Index newSize)
* \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero() * \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setZero(Index rows, Index cols) PlainObjectBase<Derived>::setZero(Index rows, Index cols)
{ {
resize(rows, cols); resize(rows, cols);
@ -553,7 +553,7 @@ PlainObjectBase<Derived>::setZero(Index rows, Index cols)
* \sa Ones(), Ones(Index), isOnes(), class Ones * \sa Ones(), Ones(Index), isOnes(), class Ones
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Ones(Index rows, Index cols) DenseBase<Derived>::Ones(Index rows, Index cols)
{ {
return Constant(rows, cols, Scalar(1)); return Constant(rows, cols, Scalar(1));
@ -576,7 +576,7 @@ DenseBase<Derived>::Ones(Index rows, Index cols)
* \sa Ones(), Ones(Index,Index), isOnes(), class Ones * \sa Ones(), Ones(Index,Index), isOnes(), class Ones
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Ones(Index newSize) DenseBase<Derived>::Ones(Index newSize)
{ {
return Constant(newSize, Scalar(1)); return Constant(newSize, Scalar(1));
@ -593,7 +593,7 @@ DenseBase<Derived>::Ones(Index newSize)
* \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones * \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Ones() DenseBase<Derived>::Ones()
{ {
return Constant(Scalar(1)); return Constant(Scalar(1));
@ -608,7 +608,7 @@ DenseBase<Derived>::Ones()
* \sa class CwiseNullaryOp, Ones() * \sa class CwiseNullaryOp, Ones()
*/ */
template<typename Derived> template<typename Derived>
bool DenseBase<Derived>::isOnes EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isOnes
(const RealScalar& prec) const (const RealScalar& prec) const
{ {
return isApproxToConstant(Scalar(1), prec); return isApproxToConstant(Scalar(1), prec);
@ -622,7 +622,7 @@ bool DenseBase<Derived>::isOnes
* \sa class CwiseNullaryOp, Ones() * \sa class CwiseNullaryOp, Ones()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes() EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
{ {
return setConstant(Scalar(1)); return setConstant(Scalar(1));
} }
@ -637,7 +637,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
* \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones() * \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setOnes(Index newSize) PlainObjectBase<Derived>::setOnes(Index newSize)
{ {
resize(newSize); resize(newSize);
@ -655,7 +655,7 @@ PlainObjectBase<Derived>::setOnes(Index newSize)
* \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones() * \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setOnes(Index rows, Index cols) PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
{ {
resize(rows, cols); resize(rows, cols);
@ -679,7 +679,7 @@ PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
* \sa Identity(), setIdentity(), isIdentity() * \sa Identity(), setIdentity(), isIdentity()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
MatrixBase<Derived>::Identity(Index rows, Index cols) MatrixBase<Derived>::Identity(Index rows, Index cols)
{ {
return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_identity_op<Scalar>()); return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_identity_op<Scalar>());
@ -696,7 +696,7 @@ MatrixBase<Derived>::Identity(Index rows, Index cols)
* \sa Identity(Index,Index), setIdentity(), isIdentity() * \sa Identity(Index,Index), setIdentity(), isIdentity()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
MatrixBase<Derived>::Identity() MatrixBase<Derived>::Identity()
{ {
EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@ -771,7 +771,7 @@ struct setIdentity_impl<Derived, true>
* \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity() * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity() EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
{ {
return internal::setIdentity_impl<Derived>::run(derived()); return internal::setIdentity_impl<Derived>::run(derived());
} }
@ -787,7 +787,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity() * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
{ {
derived().resize(rows, cols); derived().resize(rows, cols);
return setIdentity(); return setIdentity();
@ -800,7 +800,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index
* \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() * \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
{ {
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i); return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i);
@ -815,7 +815,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() * \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
{ {
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return BasisReturnType(SquareMatrixType::Identity(),i); return BasisReturnType(SquareMatrixType::Identity(),i);
@ -828,7 +828,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX() EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
{ return Derived::Unit(0); } { return Derived::Unit(0); }
/** \returns an expression of the Y axis unit vector (0,1{,0}^*) /** \returns an expression of the Y axis unit vector (0,1{,0}^*)
@ -838,7 +838,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY() EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
{ return Derived::Unit(1); } { return Derived::Unit(1); }
/** \returns an expression of the Z axis unit vector (0,0,1{,0}^*) /** \returns an expression of the Z axis unit vector (0,0,1{,0}^*)
@ -848,7 +848,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ() EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
{ return Derived::Unit(2); } { return Derived::Unit(2); }
/** \returns an expression of the W axis unit vector (0,0,0,1) /** \returns an expression of the W axis unit vector (0,0,0,1)
@ -858,9 +858,45 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW() EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
{ return Derived::Unit(3); } { return Derived::Unit(3); }
/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector
*
* \param i index of the unique coefficient to be set to 1
*
* \only_for_vectors
*
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
*/
template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index i)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
eigen_assert(i<size());
derived().setZero();
derived().coeffRef(i) = Scalar(1);
return derived();
}
/** \brief Resizes to the given \a newSize, and writes the i-th unit (basis) vector into *this.
*
* \param newSize the new size of the vector
* \param i index of the unique coefficient to be set to 1
*
* \only_for_vectors
*
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
*/
template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index newSize, Index i)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
eigen_assert(i<newSize);
derived().resize(newSize);
return setUnit(i);
}
} // end namespace Eigen } // end namespace Eigen
#endif // EIGEN_CWISE_NULLARY_OP_H #endif // EIGEN_CWISE_NULLARY_OP_H

View File

@ -157,6 +157,11 @@ template<typename Derived> class DenseBase
* we are dealing with a column-vector (if there is only one column) or with * we are dealing with a column-vector (if there is only one column) or with
* a row-vector (if there is only one row). */ * a row-vector (if there is only one row). */
NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2,
/**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors,
* and 2 for matrices.
*/
Flags = internal::traits<Derived>::Flags, Flags = internal::traits<Derived>::Flags,
/**< This stores expression \ref flags flags which may or may not be inherited by new expressions /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
* constructed from this one. See the \ref flags "list of flags". * constructed from this one. See the \ref flags "list of flags".
@ -296,7 +301,7 @@ template<typename Derived> class DenseBase
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
Derived& operator=(const ReturnByValue<OtherDerived>& func); Derived& operator=(const ReturnByValue<OtherDerived>& func);
/** \ínternal /** \internal
* Copies \a other into *this without evaluating other. \returns a reference to *this. * Copies \a other into *this without evaluating other. \returns a reference to *this.
* \deprecated */ * \deprecated */
template<typename OtherDerived> template<typename OtherDerived>
@ -395,7 +400,7 @@ template<typename Derived> class DenseBase
* Notice that in the case of a plain matrix or vector (not an expression) this function just returns * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
* a const reference, in order to avoid a useless copy. * a const reference, in order to avoid a useless copy.
* *
* \warning Be carefull with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink. * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
*/ */
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE EvalReturnType eval() const EIGEN_STRONG_INLINE EvalReturnType eval() const
@ -484,9 +489,9 @@ template<typename Derived> class DenseBase
return derived().coeff(0,0); return derived().coeff(0,0);
} }
bool all() const; EIGEN_DEVICE_FUNC bool all() const;
bool any() const; EIGEN_DEVICE_FUNC bool any() const;
Index count() const; EIGEN_DEVICE_FUNC Index count() const;
typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType; typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
typedef const VectorwiseOp<const Derived, Horizontal> ConstRowwiseReturnType; typedef const VectorwiseOp<const Derived, Horizontal> ConstRowwiseReturnType;

View File

@ -61,7 +61,7 @@ struct plain_array
#if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT) #if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
#define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask)
#elif EIGEN_GNUC_AT_LEAST(4,7) #elif EIGEN_GNUC_AT_LEAST(4,7)
// GCC 4.7 is too aggressive in its optimizations and remove the alignement test based on the fact the array is declared to be aligned. // GCC 4.7 is too aggressive in its optimizations and remove the alignment test based on the fact the array is declared to be aligned.
// See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900 // See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900
// Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined: // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined:
template<typename PtrType> template<typename PtrType>
@ -207,7 +207,9 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
EIGEN_UNUSED_VARIABLE(rows); EIGEN_UNUSED_VARIABLE(rows);
EIGEN_UNUSED_VARIABLE(cols); EIGEN_UNUSED_VARIABLE(cols);
} }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); } EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
numext::swap(m_data, other.m_data);
}
EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;} EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;} EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {} EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
@ -267,7 +269,11 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
} }
EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {} EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
{ std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); } {
numext::swap(m_data,other.m_data);
numext::swap(m_rows,other.m_rows);
numext::swap(m_cols,other.m_cols);
}
EIGEN_DEVICE_FUNC Index rows() const {return m_rows;} EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
EIGEN_DEVICE_FUNC Index cols() const {return m_cols;} EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; } EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
@ -296,7 +302,11 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
return *this; return *this;
} }
EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {} EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); } EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
{
numext::swap(m_data,other.m_data);
numext::swap(m_rows,other.m_rows);
}
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;} EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;} EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; } EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
@ -325,11 +335,14 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
return *this; return *this;
} }
EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {} EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); } EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
numext::swap(m_data,other.m_data);
numext::swap(m_cols,other.m_cols);
}
EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;} EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;} EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
void conservativeResize(Index, Index, Index cols) { m_cols = cols; } EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
void resize(Index, Index, Index cols) { m_cols = cols; } EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; }
EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; } EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
EIGEN_DEVICE_FUNC T *data() { return m_data.array; } EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
}; };
@ -381,16 +394,19 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
{ {
using std::swap; numext::swap(m_data, other.m_data);
swap(m_data, other.m_data); numext::swap(m_rows, other.m_rows);
swap(m_rows, other.m_rows); numext::swap(m_cols, other.m_cols);
swap(m_cols, other.m_cols);
return *this; return *this;
} }
#endif #endif
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); } EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
{ std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); } {
numext::swap(m_data,other.m_data);
numext::swap(m_rows,other.m_rows);
numext::swap(m_cols,other.m_cols);
}
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;} EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;} EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
void conservativeResize(Index size, Index rows, Index cols) void conservativeResize(Index size, Index rows, Index cols)
@ -459,14 +475,16 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
{ {
using std::swap; numext::swap(m_data, other.m_data);
swap(m_data, other.m_data); numext::swap(m_cols, other.m_cols);
swap(m_cols, other.m_cols);
return *this; return *this;
} }
#endif #endif
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); } EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); } EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
numext::swap(m_data,other.m_data);
numext::swap(m_cols,other.m_cols);
}
EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;} EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;} EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols) EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
@ -533,14 +551,16 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
{ {
using std::swap; numext::swap(m_data, other.m_data);
swap(m_data, other.m_data); numext::swap(m_rows, other.m_rows);
swap(m_rows, other.m_rows);
return *this; return *this;
} }
#endif #endif
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); } EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); } EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
numext::swap(m_data,other.m_data);
numext::swap(m_rows,other.m_rows);
}
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;} EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;} EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
void conservativeResize(Index size, Index rows, Index) void conservativeResize(Index size, Index rows, Index)

View File

@ -70,7 +70,10 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal) EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {} explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index)
{
eigen_assert( a_index <= m_matrix.cols() && -a_index <= m_matrix.rows() );
}
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal) EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)
@ -184,7 +187,7 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
* *
* \sa class Diagonal */ * \sa class Diagonal */
template<typename Derived> template<typename Derived>
inline typename MatrixBase<Derived>::DiagonalReturnType EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalReturnType
MatrixBase<Derived>::diagonal() MatrixBase<Derived>::diagonal()
{ {
return DiagonalReturnType(derived()); return DiagonalReturnType(derived());
@ -192,7 +195,7 @@ MatrixBase<Derived>::diagonal()
/** This is the const version of diagonal(). */ /** This is the const version of diagonal(). */
template<typename Derived> template<typename Derived>
inline typename MatrixBase<Derived>::ConstDiagonalReturnType EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalReturnType
MatrixBase<Derived>::diagonal() const MatrixBase<Derived>::diagonal() const
{ {
return ConstDiagonalReturnType(derived()); return ConstDiagonalReturnType(derived());
@ -210,7 +213,7 @@ MatrixBase<Derived>::diagonal() const
* *
* \sa MatrixBase::diagonal(), class Diagonal */ * \sa MatrixBase::diagonal(), class Diagonal */
template<typename Derived> template<typename Derived>
inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
MatrixBase<Derived>::diagonal(Index index) MatrixBase<Derived>::diagonal(Index index)
{ {
return DiagonalDynamicIndexReturnType(derived(), index); return DiagonalDynamicIndexReturnType(derived(), index);
@ -218,7 +221,7 @@ MatrixBase<Derived>::diagonal(Index index)
/** This is the const version of diagonal(Index). */ /** This is the const version of diagonal(Index). */
template<typename Derived> template<typename Derived>
inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
MatrixBase<Derived>::diagonal(Index index) const MatrixBase<Derived>::diagonal(Index index) const
{ {
return ConstDiagonalDynamicIndexReturnType(derived(), index); return ConstDiagonalDynamicIndexReturnType(derived(), index);
@ -237,6 +240,7 @@ MatrixBase<Derived>::diagonal(Index index) const
* \sa MatrixBase::diagonal(), class Diagonal */ * \sa MatrixBase::diagonal(), class Diagonal */
template<typename Derived> template<typename Derived>
template<int Index_> template<int Index_>
EIGEN_DEVICE_FUNC
inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
MatrixBase<Derived>::diagonal() MatrixBase<Derived>::diagonal()
{ {
@ -246,6 +250,7 @@ MatrixBase<Derived>::diagonal()
/** This is the const version of diagonal<int>(). */ /** This is the const version of diagonal<int>(). */
template<typename Derived> template<typename Derived>
template<int Index_> template<int Index_>
EIGEN_DEVICE_FUNC
inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
MatrixBase<Derived>::diagonal() const MatrixBase<Derived>::diagonal() const
{ {

View File

@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase<Derived>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
DenseMatrixType toDenseMatrix() const { return derived(); } DenseMatrixType toDenseMatrix() const { return derived(); }
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); } inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
@ -273,7 +273,7 @@ class DiagonalWrapper
* \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal() * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal()
**/ **/
template<typename Derived> template<typename Derived>
inline const DiagonalWrapper<const Derived> EIGEN_DEVICE_FUNC inline const DiagonalWrapper<const Derived>
MatrixBase<Derived>::asDiagonal() const MatrixBase<Derived>::asDiagonal() const
{ {
return DiagonalWrapper<const Derived>(derived()); return DiagonalWrapper<const Derived>(derived());

View File

@ -17,7 +17,7 @@ namespace Eigen {
*/ */
template<typename Derived> template<typename Derived>
template<typename DiagonalDerived> template<typename DiagonalDerived>
inline const Product<Derived, DiagonalDerived, LazyProduct> EIGEN_DEVICE_FUNC inline const Product<Derived, DiagonalDerived, LazyProduct>
MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
{ {
return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived()); return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived());

View File

@ -31,7 +31,8 @@ struct dot_nocheck
typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod; typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
typedef typename conj_prod::result_type ResScalar; typedef typename conj_prod::result_type ResScalar;
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b) EIGEN_STRONG_INLINE
static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
{ {
return a.template binaryExpr<conj_prod>(b).sum(); return a.template binaryExpr<conj_prod>(b).sum();
} }
@ -43,7 +44,8 @@ struct dot_nocheck<T, U, true>
typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod; typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
typedef typename conj_prod::result_type ResScalar; typedef typename conj_prod::result_type ResScalar;
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b) EIGEN_STRONG_INLINE
static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
{ {
return a.transpose().template binaryExpr<conj_prod>(b).sum(); return a.transpose().template binaryExpr<conj_prod>(b).sum();
} }
@ -65,6 +67,7 @@ struct dot_nocheck<T, U, true>
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE
typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
{ {
@ -90,7 +93,7 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
* \sa dot(), norm(), lpNorm() * \sa dot(), norm(), lpNorm()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
{ {
return numext::real((*this).cwiseAbs2().sum()); return numext::real((*this).cwiseAbs2().sum());
} }
@ -102,7 +105,7 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
* \sa lpNorm(), dot(), squaredNorm() * \sa lpNorm(), dot(), squaredNorm()
*/ */
template<typename Derived> template<typename Derived>
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
{ {
return numext::sqrt(squaredNorm()); return numext::sqrt(squaredNorm());
} }
@ -117,7 +120,7 @@ inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real Matr
* \sa norm(), normalize() * \sa norm(), normalize()
*/ */
template<typename Derived> template<typename Derived>
inline const typename MatrixBase<Derived>::PlainObject EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
MatrixBase<Derived>::normalized() const MatrixBase<Derived>::normalized() const
{ {
typedef typename internal::nested_eval<Derived,2>::type _Nested; typedef typename internal::nested_eval<Derived,2>::type _Nested;
@ -139,7 +142,7 @@ MatrixBase<Derived>::normalized() const
* \sa norm(), normalized() * \sa norm(), normalized()
*/ */
template<typename Derived> template<typename Derived>
inline void MatrixBase<Derived>::normalize() EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
{ {
RealScalar z = squaredNorm(); RealScalar z = squaredNorm();
// NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
@ -160,7 +163,7 @@ inline void MatrixBase<Derived>::normalize()
* \sa stableNorm(), stableNormalize(), normalized() * \sa stableNorm(), stableNormalize(), normalized()
*/ */
template<typename Derived> template<typename Derived>
inline const typename MatrixBase<Derived>::PlainObject EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
MatrixBase<Derived>::stableNormalized() const MatrixBase<Derived>::stableNormalized() const
{ {
typedef typename internal::nested_eval<Derived,3>::type _Nested; typedef typename internal::nested_eval<Derived,3>::type _Nested;
@ -185,7 +188,7 @@ MatrixBase<Derived>::stableNormalized() const
* \sa stableNorm(), stableNormalized(), normalize() * \sa stableNorm(), stableNormalized(), normalize()
*/ */
template<typename Derived> template<typename Derived>
inline void MatrixBase<Derived>::stableNormalize() EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
{ {
RealScalar w = cwiseAbs().maxCoeff(); RealScalar w = cwiseAbs().maxCoeff();
RealScalar z = (derived()/w).squaredNorm(); RealScalar z = (derived()/w).squaredNorm();
@ -257,9 +260,9 @@ struct lpNorm_selector<Derived, Infinity>
template<typename Derived> template<typename Derived>
template<int p> template<int p>
#ifndef EIGEN_PARSED_BY_DOXYGEN #ifndef EIGEN_PARSED_BY_DOXYGEN
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real EIGEN_DEVICE_FUNC inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
#else #else
MatrixBase<Derived>::RealScalar EIGEN_DEVICE_FUNC MatrixBase<Derived>::RealScalar
#endif #endif
MatrixBase<Derived>::lpNorm() const MatrixBase<Derived>::lpNorm() const
{ {

View File

@ -14,6 +14,7 @@
namespace Eigen { namespace Eigen {
/** \class EigenBase /** \class EigenBase
* \ingroup Core_Module
* *
* Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T). * Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
* *
@ -128,6 +129,7 @@ template<typename Derived> struct EigenBase
*/ */
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other) Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
{ {
call_assignment(derived(), other.derived()); call_assignment(derived(), other.derived());
@ -136,6 +138,7 @@ Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other) Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
{ {
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>()); call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@ -144,6 +147,7 @@ Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other) Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
{ {
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>()); call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());

View File

@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector<Derived, true>
*/ */
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
bool DenseBase<Derived>::isApprox( EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(
const DenseBase<OtherDerived>& other, const DenseBase<OtherDerived>& other,
const RealScalar& prec const RealScalar& prec
) const ) const
@ -122,7 +122,7 @@ bool DenseBase<Derived>::isApprox(
* \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const * \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const
*/ */
template<typename Derived> template<typename Derived>
bool DenseBase<Derived>::isMuchSmallerThan( EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
const typename NumTraits<Scalar>::Real& other, const typename NumTraits<Scalar>::Real& other,
const RealScalar& prec const RealScalar& prec
) const ) const
@ -142,7 +142,7 @@ bool DenseBase<Derived>::isMuchSmallerThan(
*/ */
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
bool DenseBase<Derived>::isMuchSmallerThan( EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
const DenseBase<OtherDerived>& other, const DenseBase<OtherDerived>& other,
const RealScalar& prec const RealScalar& prec
) const ) const

View File

@ -18,18 +18,33 @@ enum {
Small = 3 Small = 3
}; };
// Define the threshold value to fallback from the generic matrix-matrix product
// implementation (heavy) to the lightweight coeff-based product one.
// See generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
// in products/GeneralMatrixMatrix.h for more details.
// TODO This threshold should also be used in the compile-time selector below.
#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD
// This default value has been obtained on a Haswell architecture.
#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20
#endif
namespace internal { namespace internal {
template<int Rows, int Cols, int Depth> struct product_type_selector; template<int Rows, int Cols, int Depth> struct product_type_selector;
template<int Size, int MaxSize> struct product_size_category template<int Size, int MaxSize> struct product_size_category
{ {
enum { is_large = MaxSize == Dynamic || enum {
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || #ifndef EIGEN_GPU_COMPILE_PHASE
(Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), is_large = MaxSize == Dynamic ||
value = is_large ? Large Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
: Size == 1 ? 1 (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
: Small #else
is_large = 0,
#endif
value = is_large ? Large
: Size == 1 ? 1
: Small
}; };
}; };
@ -148,13 +163,13 @@ template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vect
template<typename Scalar,int Size,int MaxSize> template<typename Scalar,int Size,int MaxSize>
struct gemv_static_vector_if<Scalar,Size,MaxSize,false> struct gemv_static_vector_if<Scalar,Size,MaxSize,false>
{ {
EIGEN_STRONG_INLINE Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
}; };
template<typename Scalar,int Size> template<typename Scalar,int Size>
struct gemv_static_vector_if<Scalar,Size,Dynamic,true> struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
{ {
EIGEN_STRONG_INLINE Scalar* data() { return 0; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; }
}; };
template<typename Scalar,int Size,int MaxSize> template<typename Scalar,int Size,int MaxSize>
@ -379,10 +394,9 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
* *
* \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*() * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
*/ */
#ifndef __CUDACC__
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_DEVICE_FUNC
inline const Product<Derived, OtherDerived> inline const Product<Derived, OtherDerived>
MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
{ {
@ -412,8 +426,6 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
return Product<Derived, OtherDerived>(derived(), other.derived()); return Product<Derived, OtherDerived>(derived(), other.derived());
} }
#endif // __CUDACC__
/** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation. /** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
* *
* The returned product will behave like any other expressions: the coefficients of the product will be * The returned product will behave like any other expressions: the coefficients of the product will be
@ -428,7 +440,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
const Product<Derived,OtherDerived,LazyProduct> const Product<Derived,OtherDerived,LazyProduct>
MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const EIGEN_DEVICE_FUNC MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
{ {
enum { enum {
ProductIsValid = Derived::ColsAtCompileTime==Dynamic ProductIsValid = Derived::ColsAtCompileTime==Dynamic

View File

@ -82,7 +82,11 @@ struct default_packet_traits
HasPolygamma = 0, HasPolygamma = 0,
HasErf = 0, HasErf = 0,
HasErfc = 0, HasErfc = 0,
HasI0e = 0,
HasI1e = 0,
HasIGamma = 0, HasIGamma = 0,
HasIGammaDerA = 0,
HasGammaSampleDerAlpha = 0,
HasIGammac = 0, HasIGammac = 0,
HasBetaInc = 0, HasBetaInc = 0,
@ -231,7 +235,7 @@ pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(
* duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]} * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
* Currently, this function is only used for scalar * complex products. * Currently, this function is only used for scalar * complex products.
*/ */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; } ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
/** \internal \returns a packet with elements of \a *from quadrupled. /** \internal \returns a packet with elements of \a *from quadrupled.
@ -279,7 +283,7 @@ inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
} }
/** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */ /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
template<typename Packet> inline Packet template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
plset(const typename unpacket_traits<Packet>::type& a) { return a; } plset(const typename unpacket_traits<Packet>::type& a) { return a; }
/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */ /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
@ -299,7 +303,9 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
/** \internal tries to do cache prefetching of \a addr */ /** \internal tries to do cache prefetching of \a addr */
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
{ {
#ifdef __CUDA_ARCH__ #if defined(EIGEN_HIP_DEVICE_COMPILE)
// do nothing
#elif defined(EIGEN_CUDA_ARCH)
#if defined(__LP64__) #if defined(__LP64__)
// 64-bit pointer operand constraint for inlined asm // 64-bit pointer operand constraint for inlined asm
asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr)); asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
@ -324,13 +330,13 @@ preduxp(const Packet* vecs) { return vecs[0]; }
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a) template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
{ return a; } { return a; }
/** \internal \returns the sum of the elements of \a a by block of 4 elements. /** \internal \returns the sum of the elements of upper and lower half of \a a if \a a is larger than 4.
* For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7} * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
* For packet-size smaller or equal to 4, this boils down to a noop. * For packet-size smaller or equal to 4, this boils down to a noop.
*/ */
template<typename Packet> EIGEN_DEVICE_FUNC inline template<typename Packet> EIGEN_DEVICE_FUNC inline
typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
predux_downto4(const Packet& a) predux_half_dowto4(const Packet& a)
{ return a; } { return a; }
/** \internal \returns the product of the elements of \a a*/ /** \internal \returns the product of the elements of \a a*/
@ -487,7 +493,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro
* by the current computation. * by the current computation.
*/ */
template<typename Packet, int LoadMode> template<typename Packet, int LoadMode>
inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from) EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
{ {
return ploadt<Packet, LoadMode>(from); return ploadt<Packet, LoadMode>(from);
} }
@ -526,7 +532,7 @@ inline void palign(PacketType& first, const PacketType& second)
***************************************************************************/ ***************************************************************************/
// Eigen+CUDA does not support complexes. // Eigen+CUDA does not support complexes.
#ifndef __CUDACC__ #if !defined(EIGEN_GPUCC)
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b) template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } { return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }

View File

@ -66,6 +66,7 @@ namespace Eigen
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf)
@ -89,7 +90,7 @@ namespace Eigen
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign)
/** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent. /** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent.
* *
* \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar). * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar).
@ -103,17 +104,18 @@ namespace Eigen
inline const CwiseBinaryOp<internal::scalar_pow_op<Derived::Scalar,ScalarExponent>,Derived,Constant<ScalarExponent> > inline const CwiseBinaryOp<internal::scalar_pow_op<Derived::Scalar,ScalarExponent>,Derived,Constant<ScalarExponent> >
pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent); pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent);
#else #else
template<typename Derived,typename ScalarExponent> template <typename Derived,typename ScalarExponent>
inline typename internal::enable_if< !(internal::is_same<typename Derived::Scalar,ScalarExponent>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent), EIGEN_DEVICE_FUNC inline
const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,ScalarExponent,pow) >::type EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent) { const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<typename Derived::Scalar
return x.derived().pow(exponent); EIGEN_COMMA ScalarExponent EIGEN_COMMA
} EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type,pow))
pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent)
template<typename Derived> {
inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename Derived::Scalar,pow) typedef typename internal::promote_scalar_arg<typename Derived::Scalar,ScalarExponent,
pow(const Eigen::ArrayBase<Derived>& x, const typename Derived::Scalar& exponent) { EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type PromotedExponent;
return x.derived().pow(exponent); return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedExponent,pow)(x.derived(),
typename internal::plain_constant_type<Derived,PromotedExponent>::type(x.derived().rows(), x.derived().cols(), internal::scalar_constant_op<PromotedExponent>(exponent)));
} }
#endif #endif
@ -123,21 +125,21 @@ namespace Eigen
* *
* Example: \include Cwise_array_power_array.cpp * Example: \include Cwise_array_power_array.cpp
* Output: \verbinclude Cwise_array_power_array.out * Output: \verbinclude Cwise_array_power_array.out
* *
* \sa ArrayBase::pow() * \sa ArrayBase::pow()
* *
* \relates ArrayBase * \relates ArrayBase
*/ */
template<typename Derived,typename ExponentDerived> template<typename Derived,typename ExponentDerived>
inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived> inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents) pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents)
{ {
return Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>( return Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
x.derived(), x.derived(),
exponents.derived() exponents.derived()
); );
} }
/** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents. /** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents.
* *
* This function computes the coefficient-wise power between a scalar and an array of exponents. * This function computes the coefficient-wise power between a scalar and an array of exponents.
@ -146,7 +148,7 @@ namespace Eigen
* *
* Example: \include Cwise_scalar_power_array.cpp * Example: \include Cwise_scalar_power_array.cpp
* Output: \verbinclude Cwise_scalar_power_array.out * Output: \verbinclude Cwise_scalar_power_array.out
* *
* \sa ArrayBase::pow() * \sa ArrayBase::pow()
* *
* \relates ArrayBase * \relates ArrayBase
@ -156,21 +158,17 @@ namespace Eigen
inline const CwiseBinaryOp<internal::scalar_pow_op<Scalar,Derived::Scalar>,Constant<Scalar>,Derived> inline const CwiseBinaryOp<internal::scalar_pow_op<Scalar,Derived::Scalar>,Constant<Scalar>,Derived>
pow(const Scalar& x,const Eigen::ArrayBase<Derived>& x); pow(const Scalar& x,const Eigen::ArrayBase<Derived>& x);
#else #else
template<typename Scalar, typename Derived> template <typename Scalar, typename Derived>
inline typename internal::enable_if< !(internal::is_same<typename Derived::Scalar,Scalar>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar), EIGEN_DEVICE_FUNC inline
const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow) >::type EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents) const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<typename Derived::Scalar
{ EIGEN_COMMA Scalar EIGEN_COMMA
return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow)( EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type,Derived,pow))
typename internal::plain_constant_type<Derived,Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() ); pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents) {
} typedef typename internal::promote_scalar_arg<typename Derived::Scalar,Scalar,
EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type PromotedScalar;
template<typename Derived> return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedScalar,Derived,pow)(
inline const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow) typename internal::plain_constant_type<Derived,PromotedScalar>::type(exponents.derived().rows(), exponents.derived().cols(), internal::scalar_constant_op<PromotedScalar>(x)), exponents.derived());
pow(const typename Derived::Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
{
return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)(
typename internal::plain_constant_type<Derived,typename Derived::Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
} }
#endif #endif

View File

@ -20,11 +20,17 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
{ {
typedef traits<PlainObjectType> TraitsBase; typedef traits<PlainObjectType> TraitsBase;
enum { enum {
PlainObjectTypeInnerSize = ((traits<PlainObjectType>::Flags&RowMajorBit)==RowMajorBit)
? PlainObjectType::ColsAtCompileTime
: PlainObjectType::RowsAtCompileTime,
InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
? int(PlainObjectType::InnerStrideAtCompileTime) ? int(PlainObjectType::InnerStrideAtCompileTime)
: int(StrideType::InnerStrideAtCompileTime), : int(StrideType::InnerStrideAtCompileTime),
OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0 OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
? int(PlainObjectType::OuterStrideAtCompileTime) ? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic
? Dynamic
: int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize))
: int(StrideType::OuterStrideAtCompileTime), : int(StrideType::OuterStrideAtCompileTime),
Alignment = int(MapOptions)&int(AlignedMask), Alignment = int(MapOptions)&int(AlignedMask),
Flags0 = TraitsBase::Flags & (~NestByRefBit), Flags0 = TraitsBase::Flags & (~NestByRefBit),
@ -108,9 +114,10 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
inline Index outerStride() const inline Index outerStride() const
{ {
return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
: IsVectorAtCompileTime ? this->size() : internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
: int(Flags)&RowMajorBit ? this->cols() : IsVectorAtCompileTime ? (this->size() * innerStride())
: this->rows(); : int(Flags)&RowMajorBit ? (this->cols() * innerStride())
: (this->rows() * innerStride());
} }
/** Constructor in the fixed-size case. /** Constructor in the fixed-size case.

View File

@ -43,6 +43,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
enum { enum {
RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime, RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime, ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
InnerStrideAtCompileTime = internal::traits<Derived>::InnerStrideAtCompileTime,
SizeAtCompileTime = Base::SizeAtCompileTime SizeAtCompileTime = Base::SizeAtCompileTime
}; };
@ -187,8 +188,11 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
void checkSanity(typename internal::enable_if<(internal::traits<T>::Alignment>0),void*>::type = 0) const void checkSanity(typename internal::enable_if<(internal::traits<T>::Alignment>0),void*>::type = 0) const
{ {
#if EIGEN_MAX_ALIGN_BYTES>0 #if EIGEN_MAX_ALIGN_BYTES>0
// innerStride() is not set yet when this function is called, so we optimistically assume the lowest plausible value:
const Index minInnerStride = InnerStrideAtCompileTime == Dynamic ? 1 : Index(InnerStrideAtCompileTime);
EIGEN_ONLY_USED_FOR_DEBUG(minInnerStride);
eigen_assert(( ((internal::UIntPtr(m_data) % internal::traits<Derived>::Alignment) == 0) eigen_assert(( ((internal::UIntPtr(m_data) % internal::traits<Derived>::Alignment) == 0)
|| (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned"); || (cols() * rows() * minInnerStride * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
#endif #endif
} }

View File

@ -96,7 +96,7 @@ struct real_default_impl<Scalar,true>
template<typename Scalar> struct real_impl : real_default_impl<Scalar> {}; template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
#ifdef __CUDA_ARCH__ #if defined(EIGEN_GPU_COMPILE_PHASE)
template<typename T> template<typename T>
struct real_impl<std::complex<T> > struct real_impl<std::complex<T> >
{ {
@ -144,7 +144,7 @@ struct imag_default_impl<Scalar,true>
template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {}; template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
#ifdef __CUDA_ARCH__ #if defined(EIGEN_GPU_COMPILE_PHASE)
template<typename T> template<typename T>
struct imag_impl<std::complex<T> > struct imag_impl<std::complex<T> >
{ {
@ -238,7 +238,7 @@ struct imag_ref_retval
****************************************************************************/ ****************************************************************************/
template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex> template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
struct conj_impl struct conj_default_impl
{ {
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x) static inline Scalar run(const Scalar& x)
@ -248,7 +248,7 @@ struct conj_impl
}; };
template<typename Scalar> template<typename Scalar>
struct conj_impl<Scalar,true> struct conj_default_impl<Scalar,true>
{ {
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x) static inline Scalar run(const Scalar& x)
@ -258,6 +258,20 @@ struct conj_impl<Scalar,true>
} }
}; };
template<typename Scalar> struct conj_impl : conj_default_impl<Scalar> {};
#if defined(EIGEN_GPU_COMPILE_PHASE)
template<typename T>
struct conj_impl<std::complex<T> >
{
EIGEN_DEVICE_FUNC
static inline std::complex<T> run(const std::complex<T>& x)
{
return std::complex<T>(x.real(), -x.imag());
}
};
#endif
template<typename Scalar> template<typename Scalar>
struct conj_retval struct conj_retval
{ {
@ -347,31 +361,7 @@ struct norm1_retval
* Implementation of hypot * * Implementation of hypot *
****************************************************************************/ ****************************************************************************/
template<typename Scalar> template<typename Scalar> struct hypot_impl;
struct hypot_impl
{
typedef typename NumTraits<Scalar>::Real RealScalar;
static inline RealScalar run(const Scalar& x, const Scalar& y)
{
EIGEN_USING_STD_MATH(abs);
EIGEN_USING_STD_MATH(sqrt);
RealScalar _x = abs(x);
RealScalar _y = abs(y);
Scalar p, qp;
if(_x>_y)
{
p = _x;
qp = _y / p;
}
else
{
p = _y;
qp = _x / p;
}
if(p==RealScalar(0)) return RealScalar(0);
return p * sqrt(RealScalar(1) + qp*qp);
}
};
template<typename Scalar> template<typename Scalar>
struct hypot_retval struct hypot_retval
@ -445,7 +435,12 @@ struct round_retval
struct arg_impl { struct arg_impl {
static inline Scalar run(const Scalar& x) static inline Scalar run(const Scalar& x)
{ {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
// HIP does not seem to have a native device side implementation for the math routine "arg"
using std::arg;
#else
EIGEN_USING_STD_MATH(arg); EIGEN_USING_STD_MATH(arg);
#endif
return arg(x); return arg(x);
} }
}; };
@ -497,11 +492,11 @@ namespace std_fallback {
EIGEN_USING_STD_MATH(exp); EIGEN_USING_STD_MATH(exp);
Scalar u = exp(x); Scalar u = exp(x);
if (u == Scalar(1)) { if (numext::equal_strict(u, Scalar(1))) {
return x; return x;
} }
Scalar um1 = u - RealScalar(1); Scalar um1 = u - RealScalar(1);
if (um1 == Scalar(-1)) { if (numext::equal_strict(um1, Scalar(-1))) {
return RealScalar(-1); return RealScalar(-1);
} }
@ -512,7 +507,7 @@ namespace std_fallback {
template<typename Scalar> template<typename Scalar>
struct expm1_impl { struct expm1_impl {
static inline Scalar run(const Scalar& x) EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
{ {
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
#if EIGEN_HAS_CXX11_MATH #if EIGEN_HAS_CXX11_MATH
@ -543,13 +538,13 @@ namespace std_fallback {
typedef typename NumTraits<Scalar>::Real RealScalar; typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_USING_STD_MATH(log); EIGEN_USING_STD_MATH(log);
Scalar x1p = RealScalar(1) + x; Scalar x1p = RealScalar(1) + x;
return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) ); return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
} }
} }
template<typename Scalar> template<typename Scalar>
struct log1p_impl { struct log1p_impl {
static inline Scalar run(const Scalar& x) EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
{ {
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
#if EIGEN_HAS_CXX11_MATH #if EIGEN_HAS_CXX11_MATH
@ -689,20 +684,27 @@ struct random_default_impl<Scalar, false, true>
{ {
static inline Scalar run(const Scalar& x, const Scalar& y) static inline Scalar run(const Scalar& x, const Scalar& y)
{ {
typedef typename conditional<NumTraits<Scalar>::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX; if (y <= x)
if(y<x)
return x; return x;
// the following difference might overflow on a 32 bits system, // ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself.
// but since y>=x the result converted to an unsigned long is still correct. typedef typename make_unsigned<Scalar>::type ScalarU;
std::size_t range = ScalarX(y)-ScalarX(x); // ScalarX is the widest of ScalarU and unsigned int.
std::size_t offset = 0; // We'll deal only with ScalarX and unsigned int below thus avoiding signed
// rejection sampling // types and arithmetic and signed overflows (which are undefined behavior).
std::size_t divisor = 1; typedef typename conditional<(ScalarU(-1) > unsigned(-1)), ScalarU, unsigned>::type ScalarX;
std::size_t multiplier = 1; // The following difference doesn't overflow, provided our integer types are two's
if(range<RAND_MAX) divisor = (std::size_t(RAND_MAX)+1)/(range+1); // complement and have the same number of padding bits in signed and unsigned variants.
else multiplier = 1 + range/(std::size_t(RAND_MAX)+1); // This is the case in most modern implementations of C++.
ScalarX range = ScalarX(y) - ScalarX(x);
ScalarX offset = 0;
ScalarX divisor = 1;
ScalarX multiplier = 1;
const unsigned rand_max = RAND_MAX;
if (range <= rand_max) divisor = (rand_max + 1) / (range + 1);
else multiplier = 1 + range / (rand_max + 1);
// Rejection sampling.
do { do {
offset = (std::size_t(std::rand()) * multiplier) / divisor; offset = (unsigned(std::rand()) * multiplier) / divisor;
} while (offset > range); } while (offset > range);
return Scalar(ScalarX(x) + offset); return Scalar(ScalarX(x) + offset);
} }
@ -749,7 +751,7 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random()
return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(); return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
} }
// Implementatin of is* functions // Implementation of is* functions
// std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang. // std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang.
#if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG) #if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG)
@ -778,7 +780,7 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isfinite_impl(const T& x) isfinite_impl(const T& x)
{ {
#ifdef __CUDA_ARCH__ #if defined(EIGEN_GPU_COMPILE_PHASE)
return (::isfinite)(x); return (::isfinite)(x);
#elif EIGEN_USE_STD_FPCLASSIFY #elif EIGEN_USE_STD_FPCLASSIFY
using std::isfinite; using std::isfinite;
@ -793,7 +795,7 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isinf_impl(const T& x) isinf_impl(const T& x)
{ {
#ifdef __CUDA_ARCH__ #if defined(EIGEN_GPU_COMPILE_PHASE)
return (::isinf)(x); return (::isinf)(x);
#elif EIGEN_USE_STD_FPCLASSIFY #elif EIGEN_USE_STD_FPCLASSIFY
using std::isinf; using std::isinf;
@ -808,7 +810,7 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isnan_impl(const T& x) isnan_impl(const T& x)
{ {
#ifdef __CUDA_ARCH__ #if defined(EIGEN_GPU_COMPILE_PHASE)
return (::isnan)(x); return (::isnan)(x);
#elif EIGEN_USE_STD_FPCLASSIFY #elif EIGEN_USE_STD_FPCLASSIFY
using std::isnan; using std::isnan;
@ -874,7 +876,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
namespace numext { namespace numext {
#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__) #if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
template<typename T> template<typename T>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
@ -890,84 +892,6 @@ EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
EIGEN_USING_STD_MATH(max); EIGEN_USING_STD_MATH(max);
return max EIGEN_NOT_A_MACRO (x,y); return max EIGEN_NOT_A_MACRO (x,y);
} }
#elif defined(__SYCL_DEVICE_ONLY__)
template<typename T>
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
{
return y < x ? y : x;
}
template<typename T>
EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
{
return x < y ? y : x;
}
EIGEN_ALWAYS_INLINE int mini(const int& x, const int& y)
{
return cl::sycl::min(x,y);
}
EIGEN_ALWAYS_INLINE int maxi(const int& x, const int& y)
{
return cl::sycl::max(x,y);
}
EIGEN_ALWAYS_INLINE unsigned int mini(const unsigned int& x, const unsigned int& y)
{
return cl::sycl::min(x,y);
}
EIGEN_ALWAYS_INLINE unsigned int maxi(const unsigned int& x, const unsigned int& y)
{
return cl::sycl::max(x,y);
}
EIGEN_ALWAYS_INLINE long mini(const long & x, const long & y)
{
return cl::sycl::min(x,y);
}
EIGEN_ALWAYS_INLINE long maxi(const long & x, const long & y)
{
return cl::sycl::max(x,y);
}
EIGEN_ALWAYS_INLINE unsigned long mini(const unsigned long& x, const unsigned long& y)
{
return cl::sycl::min(x,y);
}
EIGEN_ALWAYS_INLINE unsigned long maxi(const unsigned long& x, const unsigned long& y)
{
return cl::sycl::max(x,y);
}
EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
{
return cl::sycl::fmin(x,y);
}
EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
{
return cl::sycl::fmax(x,y);
}
EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
{
return cl::sycl::fmin(x,y);
}
EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
{
return cl::sycl::fmax(x,y);
}
#else #else
template<typename T> template<typename T>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
@ -981,6 +905,24 @@ EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
{ {
return fminf(x, y); return fminf(x, y);
} }
template<>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
{
return fmin(x, y);
}
template<>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
{
#if defined(EIGEN_HIPCC)
// no "fminl" on HIP yet
return (x < y) ? x : y;
#else
return fminl(x, y);
#endif
}
template<typename T> template<typename T>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
@ -993,7 +935,93 @@ EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
{ {
return fmaxf(x, y); return fmaxf(x, y);
} }
template<>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
{
return fmax(x, y);
}
template<>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
{
#if defined(EIGEN_HIPCC)
// no "fmaxl" on HIP yet
return (x > y) ? x : y;
#else
return fmaxl(x, y);
#endif #endif
}
#endif
#if defined(__SYCL_DEVICE_ONLY__)
#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
#define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC)
#define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC)
#define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC,cl::sycl::cl_double)
#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC,cl::sycl::cl_double)
#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(NAME, FUNC, RET_TYPE) \
SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float) \
SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double)
#define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
template<> \
EIGEN_DEVICE_FUNC \
EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \
return cl::sycl::FUNC(x); \
}
#define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) \
SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE)
#define SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2) \
template<> \
EIGEN_DEVICE_FUNC \
EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x, const ARG_TYPE2& y) { \
return cl::sycl::FUNC(x, y); \
}
#define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE)
#define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) \
SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE)
SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min)
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin)
SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max)
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax)
#endif // defined(__SYCL_DEVICE_ONLY__)
template<typename Scalar> template<typename Scalar>
@ -1059,6 +1087,9 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x); return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
} }
EIGEN_DEVICE_FUNC
inline bool abs2(bool x) { return x; }
template<typename Scalar> template<typename Scalar>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x) inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
@ -1073,6 +1104,10 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar&
return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y); return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
} }
#if defined(__SYCL_DEVICE_ONLY__)
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot)
#endif // defined(__SYCL_DEVICE_ONLY__)
template<typename Scalar> template<typename Scalar>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x) inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
@ -1081,11 +1116,10 @@ inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
} }
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p)
EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); } #endif //defined(__SYCL_DEVICE_ONLY__)
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__ #if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float log1p(const float &x) { return ::log1pf(x); } float log1p(const float &x) { return ::log1pf(x); }
@ -1101,8 +1135,7 @@ inline typename internal::pow_impl<ScalarX,ScalarY>::result_type pow(const Scala
} }
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float pow(float x, float y) { return cl::sycl::pow(x, y); } SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow)
EIGEN_ALWAYS_INLINE double pow(double x, double y) { return cl::sycl::pow(x, y); }
#endif // defined(__SYCL_DEVICE_ONLY__) #endif // defined(__SYCL_DEVICE_ONLY__)
template<typename T> EIGEN_DEVICE_FUNC bool (isnan) (const T &x) { return internal::isnan_impl(x); } template<typename T> EIGEN_DEVICE_FUNC bool (isnan) (const T &x) { return internal::isnan_impl(x); }
@ -1110,12 +1143,9 @@ template<typename T> EIGEN_DEVICE_FUNC bool (isinf) (const T &x) { return inte
template<typename T> EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); } template<typename T> EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); }
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float isnan(float x) { return cl::sycl::isnan(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool)
EIGEN_ALWAYS_INLINE double isnan(double x) { return cl::sycl::isnan(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool)
EIGEN_ALWAYS_INLINE float isinf(float x) { return cl::sycl::isinf(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool)
EIGEN_ALWAYS_INLINE double isinf(double x) { return cl::sycl::isinf(x); }
EIGEN_ALWAYS_INLINE float isfinite(float x) { return cl::sycl::isfinite(x); }
EIGEN_ALWAYS_INLINE double isfinite(double x) { return cl::sycl::isfinite(x); }
#endif // defined(__SYCL_DEVICE_ONLY__) #endif // defined(__SYCL_DEVICE_ONLY__)
template<typename Scalar> template<typename Scalar>
@ -1126,8 +1156,7 @@ inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x)
} }
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float round(float x) { return cl::sycl::round(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round)
EIGEN_ALWAYS_INLINE double round(double x) { return cl::sycl::round(x); }
#endif // defined(__SYCL_DEVICE_ONLY__) #endif // defined(__SYCL_DEVICE_ONLY__)
template<typename T> template<typename T>
@ -1139,11 +1168,10 @@ T (floor)(const T& x)
} }
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor)
EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); }
#endif // defined(__SYCL_DEVICE_ONLY__) #endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__ #if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float floor(const float &x) { return ::floorf(x); } float floor(const float &x) { return ::floorf(x); }
@ -1160,11 +1188,10 @@ T (ceil)(const T& x)
} }
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil)
EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); }
#endif // defined(__SYCL_DEVICE_ONLY__) #endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__ #if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float ceil(const float &x) { return ::ceilf(x); } float ceil(const float &x) { return ::ceilf(x); }
@ -1205,8 +1232,7 @@ T sqrt(const T &x)
} }
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float sqrt(float x) { return cl::sycl::sqrt(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
EIGEN_ALWAYS_INLINE double sqrt(double x) { return cl::sycl::sqrt(x); }
#endif // defined(__SYCL_DEVICE_ONLY__) #endif // defined(__SYCL_DEVICE_ONLY__)
template<typename T> template<typename T>
@ -1217,12 +1243,11 @@ T log(const T &x) {
} }
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float log(float x) { return cl::sycl::log(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log)
EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); }
#endif // defined(__SYCL_DEVICE_ONLY__) #endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__ #if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float log(const float &x) { return ::logf(x); } float log(const float &x) { return ::logf(x); }
@ -1232,17 +1257,25 @@ double log(const double &x) { return ::log(x); }
template<typename T> template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename NumTraits<T>::Real abs(const T &x) { typename internal::enable_if<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex,typename NumTraits<T>::Real>::type
abs(const T &x) {
EIGEN_USING_STD_MATH(abs); EIGEN_USING_STD_MATH(abs);
return abs(x); return abs(x);
} }
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex),typename NumTraits<T>::Real>::type
abs(const T &x) {
return x;
}
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); } SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs)
EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs)
#endif // defined(__SYCL_DEVICE_ONLY__) #endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__ #if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float abs(const float &x) { return ::fabsf(x); } float abs(const float &x) { return ::fabsf(x); }
@ -1268,16 +1301,31 @@ T exp(const T &x) {
} }
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp)
EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); }
#endif // defined(__SYCL_DEVICE_ONLY__) #endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__ #if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float exp(const float &x) { return ::expf(x); } float exp(const float &x) { return ::expf(x); }
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
double exp(const double &x) { return ::exp(x); } double exp(const double &x) { return ::exp(x); }
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
std::complex<float> exp(const std::complex<float>& x) {
float com = ::expf(x.real());
float res_real = com * ::cosf(x.imag());
float res_imag = com * ::sinf(x.imag());
return std::complex<float>(res_real, res_imag);
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
std::complex<double> exp(const std::complex<double>& x) {
double com = ::exp(x.real());
double res_real = com * ::cos(x.imag());
double res_imag = com * ::sin(x.imag());
return std::complex<double>(res_real, res_imag);
}
#endif #endif
template<typename Scalar> template<typename Scalar>
@ -1288,11 +1336,10 @@ inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar) expm1(const Scalar& x)
} }
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1)
EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); }
#endif // defined(__SYCL_DEVICE_ONLY__) #endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__ #if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float expm1(const float &x) { return ::expm1f(x); } float expm1(const float &x) { return ::expm1f(x); }
@ -1308,11 +1355,10 @@ T cos(const T &x) {
} }
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos,cos)
EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); }
#endif // defined(__SYCL_DEVICE_ONLY__) #endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__ #if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float cos(const float &x) { return ::cosf(x); } float cos(const float &x) { return ::cosf(x); }
@ -1328,11 +1374,10 @@ T sin(const T &x) {
} }
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin)
EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); }
#endif // defined(__SYCL_DEVICE_ONLY__) #endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__ #if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float sin(const float &x) { return ::sinf(x); } float sin(const float &x) { return ::sinf(x); }
@ -1348,11 +1393,10 @@ T tan(const T &x) {
} }
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan)
EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); }
#endif // defined(__SYCL_DEVICE_ONLY__) #endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__ #if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tan(const float &x) { return ::tanf(x); } float tan(const float &x) { return ::tanf(x); }
@ -1367,12 +1411,21 @@ T acos(const T &x) {
return acos(x); return acos(x);
} }
#if EIGEN_HAS_CXX11_MATH
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T acosh(const T &x) {
EIGEN_USING_STD_MATH(acosh);
return acosh(x);
}
#endif
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float acos(float x) { return cl::sycl::acos(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos)
EIGEN_ALWAYS_INLINE double acos(double x) { return cl::sycl::acos(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh)
#endif // defined(__SYCL_DEVICE_ONLY__) #endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__ #if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float acos(const float &x) { return ::acosf(x); } float acos(const float &x) { return ::acosf(x); }
@ -1387,12 +1440,21 @@ T asin(const T &x) {
return asin(x); return asin(x);
} }
#if EIGEN_HAS_CXX11_MATH
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T asinh(const T &x) {
EIGEN_USING_STD_MATH(asinh);
return asinh(x);
}
#endif
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float asin(float x) { return cl::sycl::asin(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin)
EIGEN_ALWAYS_INLINE double asin(double x) { return cl::sycl::asin(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh)
#endif // defined(__SYCL_DEVICE_ONLY__) #endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__ #if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float asin(const float &x) { return ::asinf(x); } float asin(const float &x) { return ::asinf(x); }
@ -1407,12 +1469,21 @@ T atan(const T &x) {
return atan(x); return atan(x);
} }
#if EIGEN_HAS_CXX11_MATH
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T atanh(const T &x) {
EIGEN_USING_STD_MATH(atanh);
return atanh(x);
}
#endif
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float atan(float x) { return cl::sycl::atan(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan)
EIGEN_ALWAYS_INLINE double atan(double x) { return cl::sycl::atan(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh)
#endif // defined(__SYCL_DEVICE_ONLY__) #endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__ #if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float atan(const float &x) { return ::atanf(x); } float atan(const float &x) { return ::atanf(x); }
@ -1429,11 +1500,10 @@ T cosh(const T &x) {
} }
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh)
EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__) #endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__ #if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float cosh(const float &x) { return ::coshf(x); } float cosh(const float &x) { return ::coshf(x); }
@ -1449,11 +1519,10 @@ T sinh(const T &x) {
} }
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); } SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, sinh)
EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__) #endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__ #if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float sinh(const float &x) { return ::sinhf(x); } float sinh(const float &x) { return ::sinhf(x); }
@ -1468,15 +1537,16 @@ T tanh(const T &x) {
return tanh(x); return tanh(x);
} }
#if defined(__SYCL_DEVICE_ONLY__) #if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && (!defined(__SYCL_DEVICE_ONLY__))
EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); }
EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); }
#elif (!defined(__CUDACC__)) && EIGEN_FAST_MATH
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tanh(float x) { return internal::generic_fast_tanh_float(x); } float tanh(float x) { return internal::generic_fast_tanh_float(x); }
#endif #endif
#ifdef __CUDACC__ #if defined(__SYCL_DEVICE_ONLY__)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh)
#endif // defined(__SYCL_DEVICE_ONLY__)
#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tanh(const float &x) { return ::tanhf(x); } float tanh(const float &x) { return ::tanhf(x); }
@ -1492,11 +1562,10 @@ T fmod(const T& a, const T& b) {
} }
#if defined(__SYCL_DEVICE_ONLY__) #if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y); } SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod)
EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); }
#endif // defined(__SYCL_DEVICE_ONLY__) #endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__ #if defined(EIGEN_GPUCC)
template <> template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float fmod(const float& a, const float& b) { float fmod(const float& a, const float& b) {
@ -1510,6 +1579,23 @@ double fmod(const double& a, const double& b) {
} }
#endif #endif
#if defined(__SYCL_DEVICE_ONLY__)
#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY
#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY
#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY
#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
#undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY
#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
#undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY
#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY
#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE
#undef SYCL_SPECIALIZE_GEN_UNARY_FUNC
#undef SYCL_SPECIALIZE_UNARY_FUNC
#undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC
#undef SYCL_SPECIALIZE_GEN2_BINARY_FUNC
#undef SYCL_SPECIALIZE_BINARY_FUNC
#endif // defined(__SYCL_DEVICE_ONLY__)
} // end namespace numext } // end namespace numext
namespace internal { namespace internal {

View File

@ -66,6 +66,30 @@ T generic_fast_tanh_float(const T& a_x)
return pdiv(p, q); return pdiv(p, q);
} }
template<typename RealScalar>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
{
EIGEN_USING_STD_MATH(sqrt);
RealScalar p, qp;
p = numext::maxi(x,y);
if(p==RealScalar(0)) return RealScalar(0);
qp = numext::mini(y,x) / p;
return p * sqrt(RealScalar(1) + qp*qp);
}
template<typename Scalar>
struct hypot_impl
{
typedef typename NumTraits<Scalar>::Real RealScalar;
static EIGEN_DEVICE_FUNC
inline RealScalar run(const Scalar& x, const Scalar& y)
{
EIGEN_USING_STD_MATH(abs);
return positive_real_hypot<RealScalar>(abs(x), abs(y));
}
};
} // end namespace internal } // end namespace internal
} // end namespace Eigen } // end namespace Eigen

View File

@ -160,20 +160,11 @@ template<typename Derived> class MatrixBase
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator-=(const MatrixBase<OtherDerived>& other); Derived& operator-=(const MatrixBase<OtherDerived>& other);
#ifdef __CUDACC__
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
const Product<Derived,OtherDerived,LazyProduct>
operator*(const MatrixBase<OtherDerived> &other) const
{ return this->lazyProduct(other); }
#else
template<typename OtherDerived>
const Product<Derived,OtherDerived> const Product<Derived,OtherDerived>
operator*(const MatrixBase<OtherDerived> &other) const; operator*(const MatrixBase<OtherDerived> &other) const;
#endif
template<typename OtherDerived> template<typename OtherDerived>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
const Product<Derived,OtherDerived,LazyProduct> const Product<Derived,OtherDerived,LazyProduct>
@ -277,6 +268,8 @@ template<typename Derived> class MatrixBase
Derived& setIdentity(); Derived& setIdentity();
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
Derived& setIdentity(Index rows, Index cols); Derived& setIdentity(Index rows, Index cols);
EIGEN_DEVICE_FUNC Derived& setUnit(Index i);
EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i);
bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const; bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const; bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
@ -294,7 +287,7 @@ template<typename Derived> class MatrixBase
* fuzzy comparison such as isApprox() * fuzzy comparison such as isApprox()
* \sa isApprox(), operator!= */ * \sa isApprox(), operator!= */
template<typename OtherDerived> template<typename OtherDerived>
inline bool operator==(const MatrixBase<OtherDerived>& other) const EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase<OtherDerived>& other) const
{ return cwiseEqual(other).all(); } { return cwiseEqual(other).all(); }
/** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other. /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.
@ -302,10 +295,10 @@ template<typename Derived> class MatrixBase
* fuzzy comparison such as isApprox() * fuzzy comparison such as isApprox()
* \sa isApprox(), operator== */ * \sa isApprox(), operator== */
template<typename OtherDerived> template<typename OtherDerived>
inline bool operator!=(const MatrixBase<OtherDerived>& other) const EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
{ return cwiseNotEqual(other).any(); } { return cwiseNotEqual(other).any(); }
NoAlias<Derived,Eigen::MatrixBase > noalias(); NoAlias<Derived,Eigen::MatrixBase > EIGEN_DEVICE_FUNC noalias();
// TODO forceAlignedAccess is temporarily disabled // TODO forceAlignedAccess is temporarily disabled
// Need to find a nicer workaround. // Need to find a nicer workaround.
@ -335,6 +328,7 @@ template<typename Derived> class MatrixBase
inline const PartialPivLU<PlainObject> lu() const; inline const PartialPivLU<PlainObject> lu() const;
EIGEN_DEVICE_FUNC
inline const Inverse<Derived> inverse() const; inline const Inverse<Derived> inverse() const;
template<typename ResultType> template<typename ResultType>
@ -344,12 +338,15 @@ template<typename Derived> class MatrixBase
bool& invertible, bool& invertible,
const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision() const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
) const; ) const;
template<typename ResultType> template<typename ResultType>
inline void computeInverseWithCheck( inline void computeInverseWithCheck(
ResultType& inverse, ResultType& inverse,
bool& invertible, bool& invertible,
const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision() const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
) const; ) const;
EIGEN_DEVICE_FUNC
Scalar determinant() const; Scalar determinant() const;
/////////// Cholesky module /////////// /////////// Cholesky module ///////////
@ -421,15 +418,19 @@ template<typename Derived> class MatrixBase
////////// Householder module /////////// ////////// Householder module ///////////
EIGEN_DEVICE_FUNC
void makeHouseholderInPlace(Scalar& tau, RealScalar& beta); void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
template<typename EssentialPart> template<typename EssentialPart>
EIGEN_DEVICE_FUNC
void makeHouseholder(EssentialPart& essential, void makeHouseholder(EssentialPart& essential,
Scalar& tau, RealScalar& beta) const; Scalar& tau, RealScalar& beta) const;
template<typename EssentialPart> template<typename EssentialPart>
EIGEN_DEVICE_FUNC
void applyHouseholderOnTheLeft(const EssentialPart& essential, void applyHouseholderOnTheLeft(const EssentialPart& essential,
const Scalar& tau, const Scalar& tau,
Scalar* workspace); Scalar* workspace);
template<typename EssentialPart> template<typename EssentialPart>
EIGEN_DEVICE_FUNC
void applyHouseholderOnTheRight(const EssentialPart& essential, void applyHouseholderOnTheRight(const EssentialPart& essential,
const Scalar& tau, const Scalar& tau,
Scalar* workspace); Scalar* workspace);
@ -437,8 +438,10 @@ template<typename Derived> class MatrixBase
///////// Jacobi module ///////// ///////// Jacobi module /////////
template<typename OtherScalar> template<typename OtherScalar>
EIGEN_DEVICE_FUNC
void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j); void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j);
template<typename OtherScalar> template<typename OtherScalar>
EIGEN_DEVICE_FUNC
void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j); void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
///////// SparseCore module ///////// ///////// SparseCore module /////////

View File

@ -67,25 +67,25 @@ template<typename ExpressionType> class NestByValue
} }
template<int LoadMode> template<int LoadMode>
inline const PacketScalar packet(Index row, Index col) const EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index row, Index col) const
{ {
return m_expression.template packet<LoadMode>(row, col); return m_expression.template packet<LoadMode>(row, col);
} }
template<int LoadMode> template<int LoadMode>
inline void writePacket(Index row, Index col, const PacketScalar& x) EIGEN_DEVICE_FUNC inline void writePacket(Index row, Index col, const PacketScalar& x)
{ {
m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x); m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
} }
template<int LoadMode> template<int LoadMode>
inline const PacketScalar packet(Index index) const EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index index) const
{ {
return m_expression.template packet<LoadMode>(index); return m_expression.template packet<LoadMode>(index);
} }
template<int LoadMode> template<int LoadMode>
inline void writePacket(Index index, const PacketScalar& x) EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& x)
{ {
m_expression.const_cast_derived().template writePacket<LoadMode>(index, x); m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
} }
@ -99,7 +99,7 @@ template<typename ExpressionType> class NestByValue
/** \returns an expression of the temporary version of *this. /** \returns an expression of the temporary version of *this.
*/ */
template<typename Derived> template<typename Derived>
inline const NestByValue<Derived> EIGEN_DEVICE_FUNC inline const NestByValue<Derived>
DenseBase<Derived>::nestByValue() const DenseBase<Derived>::nestByValue() const
{ {
return NestByValue<Derived>(derived()); return NestByValue<Derived>(derived());

View File

@ -33,6 +33,7 @@ class NoAlias
public: public:
typedef typename ExpressionType::Scalar Scalar; typedef typename ExpressionType::Scalar Scalar;
EIGEN_DEVICE_FUNC
explicit NoAlias(ExpressionType& expression) : m_expression(expression) {} explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
template<typename OtherDerived> template<typename OtherDerived>
@ -74,10 +75,10 @@ class NoAlias
* *
* More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag. * More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag.
* Currently, even though several expressions may alias, only product * Currently, even though several expressions may alias, only product
* expressions have this flag. Therefore, noalias() is only usefull when * expressions have this flag. Therefore, noalias() is only useful when
* the source expression contains a matrix product. * the source expression contains a matrix product.
* *
* Here are some examples where noalias is usefull: * Here are some examples where noalias is useful:
* \code * \code
* D.noalias() = A * B; * D.noalias() = A * B;
* D.noalias() += A.transpose() * B; * D.noalias() += A.transpose() * B;
@ -98,7 +99,7 @@ class NoAlias
* \sa class NoAlias * \sa class NoAlias
*/ */
template<typename Derived> template<typename Derived>
NoAlias<Derived,MatrixBase> MatrixBase<Derived>::noalias() NoAlias<Derived,MatrixBase> EIGEN_DEVICE_FUNC MatrixBase<Derived>::noalias()
{ {
return NoAlias<Derived, Eigen::MatrixBase >(derived()); return NoAlias<Derived, Eigen::MatrixBase >(derived());
} }

View File

@ -21,12 +21,14 @@ template< typename T,
bool is_integer = NumTraits<T>::IsInteger> bool is_integer = NumTraits<T>::IsInteger>
struct default_digits10_impl struct default_digits10_impl
{ {
EIGEN_DEVICE_FUNC
static int run() { return std::numeric_limits<T>::digits10; } static int run() { return std::numeric_limits<T>::digits10; }
}; };
template<typename T> template<typename T>
struct default_digits10_impl<T,false,false> // Floating point struct default_digits10_impl<T,false,false> // Floating point
{ {
EIGEN_DEVICE_FUNC
static int run() { static int run() {
using std::log10; using std::log10;
using std::ceil; using std::ceil;
@ -38,6 +40,38 @@ struct default_digits10_impl<T,false,false> // Floating point
template<typename T> template<typename T>
struct default_digits10_impl<T,false,true> // Integer struct default_digits10_impl<T,false,true> // Integer
{ {
EIGEN_DEVICE_FUNC
static int run() { return 0; }
};
// default implementation of digits(), based on numeric_limits if specialized,
// 0 for integer types, and log2(epsilon()) otherwise.
template< typename T,
bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
bool is_integer = NumTraits<T>::IsInteger>
struct default_digits_impl
{
EIGEN_DEVICE_FUNC
static int run() { return std::numeric_limits<T>::digits; }
};
template<typename T>
struct default_digits_impl<T,false,false> // Floating point
{
EIGEN_DEVICE_FUNC
static int run() {
using std::log;
using std::ceil;
typedef typename NumTraits<T>::Real Real;
return int(ceil(-log(NumTraits<Real>::epsilon())/log(static_cast<Real>(2))));
}
};
template<typename T>
struct default_digits_impl<T,false,true> // Integer
{
EIGEN_DEVICE_FUNC
static int run() { return 0; } static int run() { return 0; }
}; };
@ -118,6 +152,12 @@ template<typename T> struct GenericNumTraits
return internal::default_digits10_impl<T>::run(); return internal::default_digits10_impl<T>::run();
} }
EIGEN_DEVICE_FUNC
static inline int digits()
{
return internal::default_digits_impl<T>::run();
}
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static inline Real dummy_precision() static inline Real dummy_precision()
{ {
@ -215,6 +255,8 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); } static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); } static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
static inline int digits10() { return NumTraits<Scalar>::digits10(); }
}; };
template<> struct NumTraits<std::string> template<> struct NumTraits<std::string>

View File

@ -99,13 +99,13 @@ class PermutationBase : public EigenBase<Derived>
#endif #endif
/** \returns the number of rows */ /** \returns the number of rows */
inline Index rows() const { return Index(indices().size()); } inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); }
/** \returns the number of columns */ /** \returns the number of columns */
inline Index cols() const { return Index(indices().size()); } inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); }
/** \returns the size of a side of the respective square matrix, i.e., the number of indices */ /** \returns the size of a side of the respective square matrix, i.e., the number of indices */
inline Index size() const { return Index(indices().size()); } inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); }
#ifndef EIGEN_PARSED_BY_DOXYGEN #ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename DenseDerived> template<typename DenseDerived>

View File

@ -577,6 +577,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned * while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
* \a data pointers. * \a data pointers.
* *
* Here is an example using strides:
* \include Matrix_Map_stride.cpp
* Output: \verbinclude Matrix_Map_stride.out
*
* \see class Map * \see class Map
*/ */
//@{ //@{
@ -776,7 +780,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
resize(size); resize(size);
} }
// We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitely converted) // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted)
template<typename T> template<typename T>
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1 && internal::is_convertible<T, Scalar>::value,T>::type* = 0) EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1 && internal::is_convertible<T, Scalar>::value,T>::type* = 0)
@ -917,13 +921,19 @@ namespace internal {
template <typename Derived, typename OtherDerived, bool IsVector> template <typename Derived, typename OtherDerived, bool IsVector>
struct conservative_resize_like_impl struct conservative_resize_like_impl
{ {
#if EIGEN_HAS_TYPE_TRAITS
static const bool IsRelocatable = std::is_trivially_copyable<typename Derived::Scalar>::value;
#else
static const bool IsRelocatable = !NumTraits<typename Derived::Scalar>::RequireInitialization;
#endif
static void run(DenseBase<Derived>& _this, Index rows, Index cols) static void run(DenseBase<Derived>& _this, Index rows, Index cols)
{ {
if (_this.rows() == rows && _this.cols() == cols) return; if (_this.rows() == rows && _this.cols() == cols) return;
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived) EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
if ( ( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows if ( IsRelocatable
(!Derived::IsRowMajor && _this.rows() == rows) ) // column-major and we change only the number of columns && (( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows
(!Derived::IsRowMajor && _this.rows() == rows) )) // column-major and we change only the number of columns
{ {
internal::check_rows_cols_for_overflow<Derived::MaxSizeAtCompileTime>::run(rows, cols); internal::check_rows_cols_for_overflow<Derived::MaxSizeAtCompileTime>::run(rows, cols);
_this.derived().m_storage.conservativeResize(rows*cols,rows,cols); _this.derived().m_storage.conservativeResize(rows*cols,rows,cols);
@ -951,8 +961,9 @@ struct conservative_resize_like_impl
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived) EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived) EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived)
if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows if ( IsRelocatable &&
(!Derived::IsRowMajor && _this.rows() == other.rows()) ) // column-major and we change only the number of columns (( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows
(!Derived::IsRowMajor && _this.rows() == other.rows()) )) // column-major and we change only the number of columns
{ {
const Index new_rows = other.rows() - _this.rows(); const Index new_rows = other.rows() - _this.rows();
const Index new_cols = other.cols() - _this.cols(); const Index new_cols = other.cols() - _this.cols();
@ -980,13 +991,18 @@ template <typename Derived, typename OtherDerived>
struct conservative_resize_like_impl<Derived,OtherDerived,true> struct conservative_resize_like_impl<Derived,OtherDerived,true>
: conservative_resize_like_impl<Derived,OtherDerived,false> : conservative_resize_like_impl<Derived,OtherDerived,false>
{ {
using conservative_resize_like_impl<Derived,OtherDerived,false>::run; typedef conservative_resize_like_impl<Derived,OtherDerived,false> Base;
using Base::run;
using Base::IsRelocatable;
static void run(DenseBase<Derived>& _this, Index size) static void run(DenseBase<Derived>& _this, Index size)
{ {
const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size; const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size;
const Index new_cols = Derived::RowsAtCompileTime==1 ? size : 1; const Index new_cols = Derived::RowsAtCompileTime==1 ? size : 1;
_this.derived().m_storage.conservativeResize(size,new_rows,new_cols); if(IsRelocatable)
_this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
else
Base::run(_this.derived(), new_rows, new_cols);
} }
static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other) static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other)
@ -997,7 +1013,10 @@ struct conservative_resize_like_impl<Derived,OtherDerived,true>
const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows(); const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows();
const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1; const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1;
_this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols); if(IsRelocatable)
_this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
else
Base::run(_this.derived(), new_rows, new_cols);
if (num_new_elements > 0) if (num_new_elements > 0)
_this.tail(num_new_elements) = other.tail(num_new_elements); _this.tail(num_new_elements) = other.tail(num_new_elements);

View File

@ -97,8 +97,8 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
&& "if you wanted a coeff-wise or a dot product use the respective explicit functions"); && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
} }
EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; } EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; } EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
@ -116,7 +116,7 @@ class dense_product_base
: public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
{}; {};
/** Convertion to scalar for inner-products */ /** Conversion to scalar for inner-products */
template<typename Lhs, typename Rhs, int Option> template<typename Lhs, typename Rhs, int Option>
class dense_product_base<Lhs, Rhs, Option, InnerProduct> class dense_product_base<Lhs, Rhs, Option, InnerProduct>
: public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
@ -127,7 +127,7 @@ public:
using Base::derived; using Base::derived;
typedef typename Base::Scalar Scalar; typedef typename Base::Scalar Scalar;
operator const Scalar() const EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const
{ {
return internal::evaluator<ProductXpr>(derived()).coeff(0,0); return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
} }
@ -162,7 +162,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
public: public:
EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const
{ {
EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS); EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) ); eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
@ -170,7 +170,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
return internal::evaluator<Derived>(derived()).coeff(row,col); return internal::evaluator<Derived>(derived()).coeff(row,col);
} }
EIGEN_DEVICE_FUNC Scalar coeff(Index i) const EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index i) const
{ {
EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS); EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) ); eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );

View File

@ -20,7 +20,7 @@ namespace internal {
/** \internal /** \internal
* Evaluator of a product expression. * Evaluator of a product expression.
* Since products require special treatments to handle all possible cases, * Since products require special treatments to handle all possible cases,
* we simply deffer the evaluation logic to a product_evaluator class * we simply defer the evaluation logic to a product_evaluator class
* which offers more partial specialization possibilities. * which offers more partial specialization possibilities.
* *
* \sa class product_evaluator * \sa class product_evaluator
@ -32,7 +32,7 @@ struct evaluator<Product<Lhs, Rhs, Options> >
typedef Product<Lhs, Rhs, Options> XprType; typedef Product<Lhs, Rhs, Options> XprType;
typedef product_evaluator<XprType> Base; typedef product_evaluator<XprType> Base;
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
}; };
// Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B" // Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
@ -55,7 +55,7 @@ struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
const Product<Lhs, Rhs, DefaultProduct> > XprType; const Product<Lhs, Rhs, DefaultProduct> > XprType;
typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> > Base; typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> > Base;
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
: Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs()) : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs())
{} {}
}; };
@ -68,7 +68,7 @@ struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType; typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base; typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
: Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>( : Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()), Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
xpr.index() )) xpr.index() ))
@ -128,7 +128,7 @@ protected:
PlainObject m_result; PlainObject m_result;
}; };
// The following three shortcuts are enabled only if the scalar types match excatly. // The following three shortcuts are enabled only if the scalar types match exactly.
// TODO: we could enable them for different scalar types when the product is not vectorized. // TODO: we could enable them for different scalar types when the product is not vectorized.
// Dense = Product // Dense = Product
@ -137,7 +137,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
{ {
typedef Product<Lhs,Rhs,Options> SrcXprType; typedef Product<Lhs,Rhs,Options> SrcXprType;
static EIGEN_STRONG_INLINE static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &) void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
{ {
Index dstRows = src.rows(); Index dstRows = src.rows();
@ -155,7 +155,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
{ {
typedef Product<Lhs,Rhs,Options> SrcXprType; typedef Product<Lhs,Rhs,Options> SrcXprType;
static EIGEN_STRONG_INLINE static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &) void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
{ {
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@ -170,7 +170,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
{ {
typedef Product<Lhs,Rhs,Options> SrcXprType; typedef Product<Lhs,Rhs,Options> SrcXprType;
static EIGEN_STRONG_INLINE static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &) void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
{ {
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@ -190,7 +190,7 @@ struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBi
typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>, typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>, const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
const Product<Lhs,Rhs,DefaultProduct> > SrcXprType; const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
static EIGEN_STRONG_INLINE static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func) void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
{ {
call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func); call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func);
@ -207,11 +207,17 @@ struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename
static const bool value = true; static const bool value = true;
}; };
template<typename OtherXpr, typename Lhs, typename Rhs>
struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_difference_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
static const bool value = true;
};
template<typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2> template<typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
struct assignment_from_xpr_op_product struct assignment_from_xpr_op_product
{ {
template<typename SrcXprType, typename InitialFunc> template<typename SrcXprType, typename InitialFunc>
static EIGEN_STRONG_INLINE static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/) void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/)
{ {
call_assignment_no_alias(dst, src.lhs(), Func1()); call_assignment_no_alias(dst, src.lhs(), Func1());
@ -240,19 +246,19 @@ template<typename Lhs, typename Rhs>
struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct> struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
{ {
template<typename Dst> template<typename Dst>
static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ {
dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum(); dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
} }
template<typename Dst> template<typename Dst>
static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ {
dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum(); dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
} }
template<typename Dst> template<typename Dst>
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); } { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
}; };
@ -263,10 +269,10 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
// Column major result // Column major result
template<typename Dst, typename Lhs, typename Rhs, typename Func> template<typename Dst, typename Lhs, typename Rhs, typename Func>
void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
{ {
evaluator<Rhs> rhsEval(rhs); evaluator<Rhs> rhsEval(rhs);
typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs); ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs);
// FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
// FIXME not very good if rhs is real and lhs complex while alpha is real too // FIXME not very good if rhs is real and lhs complex while alpha is real too
const Index cols = dst.cols(); const Index cols = dst.cols();
@ -276,10 +282,10 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const
// Row major result // Row major result
template<typename Dst, typename Lhs, typename Rhs, typename Func> template<typename Dst, typename Lhs, typename Rhs, typename Func>
void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
{ {
evaluator<Lhs> lhsEval(lhs); evaluator<Lhs> lhsEval(lhs);
typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs); ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs);
// FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
// FIXME not very good if lhs is real and rhs complex while alpha is real too // FIXME not very good if lhs is real and rhs complex while alpha is real too
const Index rows = dst.rows(); const Index rows = dst.rows();
@ -294,37 +300,37 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
typedef typename Product<Lhs,Rhs>::Scalar Scalar; typedef typename Product<Lhs,Rhs>::Scalar Scalar;
// TODO it would be nice to be able to exploit our *_assign_op functors for that purpose // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
struct set { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } }; struct set { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } };
struct add { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } }; struct add { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
struct sub { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } }; struct sub { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
struct adds { struct adds {
Scalar m_scale; Scalar m_scale;
explicit adds(const Scalar& s) : m_scale(s) {} explicit adds(const Scalar& s) : m_scale(s) {}
template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { template<typename Dst, typename Src> void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const {
dst.const_cast_derived() += m_scale * src; dst.const_cast_derived() += m_scale * src;
} }
}; };
template<typename Dst> template<typename Dst>
static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ {
internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>()); internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
} }
template<typename Dst> template<typename Dst>
static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ {
internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>()); internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
} }
template<typename Dst> template<typename Dst>
static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ {
internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>()); internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
} }
template<typename Dst> template<typename Dst>
static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{ {
internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>()); internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
} }
@ -339,19 +345,19 @@ struct generic_product_impl_base
typedef typename Product<Lhs,Rhs>::Scalar Scalar; typedef typename Product<Lhs,Rhs>::Scalar Scalar;
template<typename Dst> template<typename Dst>
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); } { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
template<typename Dst> template<typename Dst>
static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ scaleAndAddTo(dst,lhs, rhs, Scalar(1)); } { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); }
template<typename Dst> template<typename Dst>
static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); } { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
template<typename Dst> template<typename Dst>
static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{ Derived::scaleAndAddTo(dst,lhs,rhs,alpha); } { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
}; };
@ -367,7 +373,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
typedef typename internal::remove_all<typename internal::conditional<int(Side)==OnTheRight,LhsNested,RhsNested>::type>::type MatrixType; typedef typename internal::remove_all<typename internal::conditional<int(Side)==OnTheRight,LhsNested,RhsNested>::type>::type MatrixType;
template<typename Dest> template<typename Dest>
static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{ {
LhsNested actual_lhs(lhs); LhsNested actual_lhs(lhs);
RhsNested actual_rhs(rhs); RhsNested actual_rhs(rhs);
@ -384,26 +390,52 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
typedef typename Product<Lhs,Rhs>::Scalar Scalar; typedef typename Product<Lhs,Rhs>::Scalar Scalar;
template<typename Dst> template<typename Dst>
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ {
// Same as: dst.noalias() = lhs.lazyProduct(rhs); // Same as: dst.noalias() = lhs.lazyProduct(rhs);
// but easier on the compiler side // but easier on the compiler side
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>()); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
} }
template<typename Dst> template<typename Dst>
static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ {
// dst.noalias() += lhs.lazyProduct(rhs); // dst.noalias() += lhs.lazyProduct(rhs);
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>()); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
} }
template<typename Dst> template<typename Dst>
static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ {
// dst.noalias() -= lhs.lazyProduct(rhs); // dst.noalias() -= lhs.lazyProduct(rhs);
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>()); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
} }
// Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
// dst {,+,-}= s * (A.lazyProduct(B))
// This is a huge benefit for heap-allocated matrix types as it save one costly allocation.
// For them, this strategy is also faster than simply by-passing the heap allocation through
// stack allocation.
// For fixed sizes matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower,
// and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only,
// that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
{
call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
}
// Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above
// overload more specialized.
template<typename Dst, typename LhsT, typename Func>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
{
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
}
// template<typename Dst> // template<typename Dst>
// static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) // static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
@ -735,7 +767,8 @@ struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
typedef typename Product<Lhs,Rhs>::Scalar Scalar; typedef typename Product<Lhs,Rhs>::Scalar Scalar;
template<typename Dest> template<typename Dest>
static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) static EIGEN_DEVICE_FUNC
void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{ {
selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha); selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
} }
@ -779,7 +812,11 @@ public:
_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
_LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0, _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0), Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
Alignment = evaluator<MatrixType>::Alignment Alignment = evaluator<MatrixType>::Alignment,
AsScalarProduct = (DiagonalType::SizeAtCompileTime==1)
|| (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
|| (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
}; };
diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag) diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
@ -791,7 +828,10 @@ public:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
{ {
return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx); if(AsScalarProduct)
return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
else
return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
} }
protected: protected:
@ -845,7 +885,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col); return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
} }
#ifndef __CUDACC__ #ifndef EIGEN_GPUCC
template<int LoadMode,typename PacketType> template<int LoadMode,typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
{ {
@ -889,7 +929,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col); return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
} }
#ifndef __CUDACC__ #ifndef EIGEN_GPUCC
template<int LoadMode,typename PacketType> template<int LoadMode,typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
{ {

View File

@ -128,7 +128,7 @@ DenseBase<Derived>::Random()
* \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index) * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index)
*/ */
template<typename Derived> template<typename Derived>
inline Derived& DenseBase<Derived>::setRandom() EIGEN_DEVICE_FUNC inline Derived& DenseBase<Derived>::setRandom()
{ {
return *this = Random(rows(), cols()); return *this = Random(rows(), cols());
} }

View File

@ -23,22 +23,22 @@ namespace internal {
* Part 1 : the logic deciding a strategy for vectorization and unrolling * Part 1 : the logic deciding a strategy for vectorization and unrolling
***************************************************************************/ ***************************************************************************/
template<typename Func, typename Derived> template<typename Func, typename Evaluator>
struct redux_traits struct redux_traits
{ {
public: public:
typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType; typedef typename find_best_packet<typename Evaluator::Scalar,Evaluator::SizeAtCompileTime>::type PacketType;
enum { enum {
PacketSize = unpacket_traits<PacketType>::size, PacketSize = unpacket_traits<PacketType>::size,
InnerMaxSize = int(Derived::IsRowMajor) InnerMaxSize = int(Evaluator::IsRowMajor)
? Derived::MaxColsAtCompileTime ? Evaluator::MaxColsAtCompileTime
: Derived::MaxRowsAtCompileTime : Evaluator::MaxRowsAtCompileTime
}; };
enum { enum {
MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit) MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit)
&& (functor_traits<Func>::PacketAccess), && (functor_traits<Func>::PacketAccess),
MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit), MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit),
MaySliceVectorize = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize MaySliceVectorize = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize
}; };
@ -51,8 +51,8 @@ public:
public: public:
enum { enum {
Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
: Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost, : Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize)) UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
}; };
@ -64,9 +64,9 @@ public:
#ifdef EIGEN_DEBUG_ASSIGN #ifdef EIGEN_DEBUG_ASSIGN
static void debug() static void debug()
{ {
std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl; std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl;
std::cerr.setf(std::ios::hex, std::ios::basefield); std::cerr.setf(std::ios::hex, std::ios::basefield);
EIGEN_DEBUG_VAR(Derived::Flags) EIGEN_DEBUG_VAR(Evaluator::Flags)
std::cerr.unsetf(std::ios::hex); std::cerr.unsetf(std::ios::hex);
EIGEN_DEBUG_VAR(InnerMaxSize) EIGEN_DEBUG_VAR(InnerMaxSize)
EIGEN_DEBUG_VAR(PacketSize) EIGEN_DEBUG_VAR(PacketSize)
@ -87,88 +87,88 @@ public:
/*** no vectorization ***/ /*** no vectorization ***/
template<typename Func, typename Derived, int Start, int Length> template<typename Func, typename Evaluator, int Start, int Length>
struct redux_novec_unroller struct redux_novec_unroller
{ {
enum { enum {
HalfLength = Length/2 HalfLength = Length/2
}; };
typedef typename Derived::Scalar Scalar; typedef typename Evaluator::Scalar Scalar;
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func)
{ {
return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func), return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func)); redux_novec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func));
} }
}; };
template<typename Func, typename Derived, int Start> template<typename Func, typename Evaluator, int Start>
struct redux_novec_unroller<Func, Derived, Start, 1> struct redux_novec_unroller<Func, Evaluator, Start, 1>
{ {
enum { enum {
outer = Start / Derived::InnerSizeAtCompileTime, outer = Start / Evaluator::InnerSizeAtCompileTime,
inner = Start % Derived::InnerSizeAtCompileTime inner = Start % Evaluator::InnerSizeAtCompileTime
}; };
typedef typename Derived::Scalar Scalar; typedef typename Evaluator::Scalar Scalar;
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&) static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&)
{ {
return mat.coeffByOuterInner(outer, inner); return eval.coeffByOuterInner(outer, inner);
} }
}; };
// This is actually dead code and will never be called. It is required // This is actually dead code and will never be called. It is required
// to prevent false warnings regarding failed inlining though // to prevent false warnings regarding failed inlining though
// for 0 length run() will never be called at all. // for 0 length run() will never be called at all.
template<typename Func, typename Derived, int Start> template<typename Func, typename Evaluator, int Start>
struct redux_novec_unroller<Func, Derived, Start, 0> struct redux_novec_unroller<Func, Evaluator, Start, 0>
{ {
typedef typename Derived::Scalar Scalar; typedef typename Evaluator::Scalar Scalar;
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); } static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
}; };
/*** vectorization ***/ /*** vectorization ***/
template<typename Func, typename Derived, int Start, int Length> template<typename Func, typename Evaluator, int Start, int Length>
struct redux_vec_unroller struct redux_vec_unroller
{ {
enum { enum {
PacketSize = redux_traits<Func, Derived>::PacketSize, PacketSize = redux_traits<Func, Evaluator>::PacketSize,
HalfLength = Length/2 HalfLength = Length/2
}; };
typedef typename Derived::Scalar Scalar; typedef typename Evaluator::Scalar Scalar;
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar; typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func) static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func& func)
{ {
return func.packetOp( return func.packetOp(
redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func), redux_vec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) ); redux_vec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func) );
} }
}; };
template<typename Func, typename Derived, int Start> template<typename Func, typename Evaluator, int Start>
struct redux_vec_unroller<Func, Derived, Start, 1> struct redux_vec_unroller<Func, Evaluator, Start, 1>
{ {
enum { enum {
index = Start * redux_traits<Func, Derived>::PacketSize, index = Start * redux_traits<Func, Evaluator>::PacketSize,
outer = index / int(Derived::InnerSizeAtCompileTime), outer = index / int(Evaluator::InnerSizeAtCompileTime),
inner = index % int(Derived::InnerSizeAtCompileTime), inner = index % int(Evaluator::InnerSizeAtCompileTime),
alignment = Derived::Alignment alignment = Evaluator::Alignment
}; };
typedef typename Derived::Scalar Scalar; typedef typename Evaluator::Scalar Scalar;
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar; typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&) static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func&)
{ {
return mat.template packetByOuterInner<alignment,PacketScalar>(outer, inner); return eval.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
} }
}; };
@ -176,53 +176,65 @@ struct redux_vec_unroller<Func, Derived, Start, 1>
* Part 3 : implementation of all cases * Part 3 : implementation of all cases
***************************************************************************/ ***************************************************************************/
template<typename Func, typename Derived, template<typename Func, typename Evaluator,
int Traversal = redux_traits<Func, Derived>::Traversal, int Traversal = redux_traits<Func, Evaluator>::Traversal,
int Unrolling = redux_traits<Func, Derived>::Unrolling int Unrolling = redux_traits<Func, Evaluator>::Unrolling
> >
struct redux_impl; struct redux_impl;
template<typename Func, typename Derived> template<typename Func, typename Evaluator>
struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling> struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>
{ {
typedef typename Derived::Scalar Scalar; typedef typename Evaluator::Scalar Scalar;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) template<typename XprType>
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
{ {
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
Scalar res; Scalar res;
res = mat.coeffByOuterInner(0, 0); res = eval.coeffByOuterInner(0, 0);
for(Index i = 1; i < mat.innerSize(); ++i) for(Index i = 1; i < xpr.innerSize(); ++i)
res = func(res, mat.coeffByOuterInner(0, i)); res = func(res, eval.coeffByOuterInner(0, i));
for(Index i = 1; i < mat.outerSize(); ++i) for(Index i = 1; i < xpr.outerSize(); ++i)
for(Index j = 0; j < mat.innerSize(); ++j) for(Index j = 0; j < xpr.innerSize(); ++j)
res = func(res, mat.coeffByOuterInner(i, j)); res = func(res, eval.coeffByOuterInner(i, j));
return res; return res;
} }
}; };
template<typename Func, typename Derived> template<typename Func, typename Evaluator>
struct redux_impl<Func,Derived, DefaultTraversal, CompleteUnrolling> struct redux_impl<Func,Evaluator, DefaultTraversal, CompleteUnrolling>
: public redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime> : redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime>
{};
template<typename Func, typename Derived>
struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
{ {
typedef typename Derived::Scalar Scalar; typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar; typedef typename Evaluator::Scalar Scalar;
template<typename XprType>
static Scalar run(const Derived &mat, const Func& func) EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/)
{ {
const Index size = mat.size(); return Base::run(eval,func);
}
};
template<typename Func, typename Evaluator>
struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
{
typedef typename Evaluator::Scalar Scalar;
typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
template<typename XprType>
static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
{
const Index size = xpr.size();
const Index packetSize = redux_traits<Func, Derived>::PacketSize; const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
const int packetAlignment = unpacket_traits<PacketScalar>::alignment; const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
enum { enum {
alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned), alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment) alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
}; };
const Index alignedStart = internal::first_default_aligned(mat.nestedExpression()); const Index alignedStart = internal::first_default_aligned(xpr);
const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize); const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize); const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
const Index alignedEnd2 = alignedStart + alignedSize2; const Index alignedEnd2 = alignedStart + alignedSize2;
@ -230,34 +242,34 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
Scalar res; Scalar res;
if(alignedSize) if(alignedSize)
{ {
PacketScalar packet_res0 = mat.template packet<alignment,PacketScalar>(alignedStart); PacketScalar packet_res0 = eval.template packet<alignment,PacketScalar>(alignedStart);
if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
{ {
PacketScalar packet_res1 = mat.template packet<alignment,PacketScalar>(alignedStart+packetSize); PacketScalar packet_res1 = eval.template packet<alignment,PacketScalar>(alignedStart+packetSize);
for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize) for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
{ {
packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(index)); packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(index));
packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment,PacketScalar>(index+packetSize)); packet_res1 = func.packetOp(packet_res1, eval.template packet<alignment,PacketScalar>(index+packetSize));
} }
packet_res0 = func.packetOp(packet_res0,packet_res1); packet_res0 = func.packetOp(packet_res0,packet_res1);
if(alignedEnd>alignedEnd2) if(alignedEnd>alignedEnd2)
packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(alignedEnd2)); packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(alignedEnd2));
} }
res = func.predux(packet_res0); res = func.predux(packet_res0);
for(Index index = 0; index < alignedStart; ++index) for(Index index = 0; index < alignedStart; ++index)
res = func(res,mat.coeff(index)); res = func(res,eval.coeff(index));
for(Index index = alignedEnd; index < size; ++index) for(Index index = alignedEnd; index < size; ++index)
res = func(res,mat.coeff(index)); res = func(res,eval.coeff(index));
} }
else // too small to vectorize anything. else // too small to vectorize anything.
// since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize. // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
{ {
res = mat.coeff(0); res = eval.coeff(0);
for(Index index = 1; index < size; ++index) for(Index index = 1; index < size; ++index)
res = func(res,mat.coeff(index)); res = func(res,eval.coeff(index));
} }
return res; return res;
@ -265,130 +277,106 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
}; };
// NOTE: for SliceVectorizedTraversal we simply bypass unrolling // NOTE: for SliceVectorizedTraversal we simply bypass unrolling
template<typename Func, typename Derived, int Unrolling> template<typename Func, typename Evaluator, int Unrolling>
struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling> struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
{ {
typedef typename Derived::Scalar Scalar; typedef typename Evaluator::Scalar Scalar;
typedef typename redux_traits<Func, Derived>::PacketType PacketType; typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func) template<typename XprType>
EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
{ {
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
const Index innerSize = mat.innerSize(); const Index innerSize = xpr.innerSize();
const Index outerSize = mat.outerSize(); const Index outerSize = xpr.outerSize();
enum { enum {
packetSize = redux_traits<Func, Derived>::PacketSize packetSize = redux_traits<Func, Evaluator>::PacketSize
}; };
const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize; const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
Scalar res; Scalar res;
if(packetedInnerSize) if(packetedInnerSize)
{ {
PacketType packet_res = mat.template packet<Unaligned,PacketType>(0,0); PacketType packet_res = eval.template packet<Unaligned,PacketType>(0,0);
for(Index j=0; j<outerSize; ++j) for(Index j=0; j<outerSize; ++j)
for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize)) for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned,PacketType>(j,i)); packet_res = func.packetOp(packet_res, eval.template packetByOuterInner<Unaligned,PacketType>(j,i));
res = func.predux(packet_res); res = func.predux(packet_res);
for(Index j=0; j<outerSize; ++j) for(Index j=0; j<outerSize; ++j)
for(Index i=packetedInnerSize; i<innerSize; ++i) for(Index i=packetedInnerSize; i<innerSize; ++i)
res = func(res, mat.coeffByOuterInner(j,i)); res = func(res, eval.coeffByOuterInner(j,i));
} }
else // too small to vectorize anything. else // too small to vectorize anything.
// since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize. // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
{ {
res = redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>::run(mat, func); res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func, xpr);
} }
return res; return res;
} }
}; };
template<typename Func, typename Derived> template<typename Func, typename Evaluator>
struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling> struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
{ {
typedef typename Derived::Scalar Scalar; typedef typename Evaluator::Scalar Scalar;
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar; typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
enum { enum {
PacketSize = redux_traits<Func, Derived>::PacketSize, PacketSize = redux_traits<Func, Evaluator>::PacketSize,
Size = Derived::SizeAtCompileTime, Size = Evaluator::SizeAtCompileTime,
VectorizedSize = (Size / PacketSize) * PacketSize VectorizedSize = (Size / PacketSize) * PacketSize
}; };
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
template<typename XprType>
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr)
{ {
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); EIGEN_ONLY_USED_FOR_DEBUG(xpr)
eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
if (VectorizedSize > 0) { if (VectorizedSize > 0) {
Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func)); Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::run(eval,func));
if (VectorizedSize != Size) if (VectorizedSize != Size)
res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func)); res = func(res,redux_novec_unroller<Func, Evaluator, VectorizedSize, Size-VectorizedSize>::run(eval,func));
return res; return res;
} }
else { else {
return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func); return redux_novec_unroller<Func, Evaluator, 0, Size>::run(eval,func);
} }
} }
}; };
// evaluator adaptor // evaluator adaptor
template<typename _XprType> template<typename _XprType>
class redux_evaluator class redux_evaluator : public internal::evaluator<_XprType>
{ {
typedef internal::evaluator<_XprType> Base;
public: public:
typedef _XprType XprType; typedef _XprType XprType;
EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {} EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}
typedef typename XprType::Scalar Scalar; typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar; typedef typename XprType::PacketScalar PacketScalar;
typedef typename XprType::PacketReturnType PacketReturnType;
enum { enum {
MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime, MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = XprType::MaxColsAtCompileTime, MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
// TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
Flags = evaluator<XprType>::Flags & ~DirectAccessBit, Flags = Base::Flags & ~DirectAccessBit,
IsRowMajor = XprType::IsRowMajor, IsRowMajor = XprType::IsRowMajor,
SizeAtCompileTime = XprType::SizeAtCompileTime, SizeAtCompileTime = XprType::SizeAtCompileTime,
InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime, InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
CoeffReadCost = evaluator<XprType>::CoeffReadCost,
Alignment = evaluator<XprType>::Alignment
}; };
EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
EIGEN_DEVICE_FUNC
CoeffReturnType coeff(Index row, Index col) const
{ return m_evaluator.coeff(row, col); }
EIGEN_DEVICE_FUNC
CoeffReturnType coeff(Index index) const
{ return m_evaluator.coeff(index); }
template<int LoadMode, typename PacketType>
PacketType packet(Index row, Index col) const
{ return m_evaluator.template packet<LoadMode,PacketType>(row, col); }
template<int LoadMode, typename PacketType>
PacketType packet(Index index) const
{ return m_evaluator.template packet<LoadMode,PacketType>(index); }
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
CoeffReturnType coeffByOuterInner(Index outer, Index inner) const CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
{ return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
template<int LoadMode, typename PacketType> template<int LoadMode, typename PacketType>
PacketType packetByOuterInner(Index outer, Index inner) const PacketType packetByOuterInner(Index outer, Index inner) const
{ return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } { return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
const XprType & nestedExpression() const { return m_xpr; }
protected:
internal::evaluator<XprType> m_evaluator;
const XprType &m_xpr;
}; };
} // end namespace internal } // end namespace internal
@ -407,7 +395,7 @@ protected:
*/ */
template<typename Derived> template<typename Derived>
template<typename Func> template<typename Func>
typename internal::traits<Derived>::Scalar EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::redux(const Func& func) const DenseBase<Derived>::redux(const Func& func) const
{ {
eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
@ -415,14 +403,16 @@ DenseBase<Derived>::redux(const Func& func) const
typedef typename internal::redux_evaluator<Derived> ThisEvaluator; typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
ThisEvaluator thisEval(derived()); ThisEvaluator thisEval(derived());
return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func); // The initial expression is passed to the reducer as an additional argument instead of
// passing it as a member of redux_evaluator to help
return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
} }
/** \returns the minimum of all coefficients of \c *this. /** \returns the minimum of all coefficients of \c *this.
* \warning the result is undefined if \c *this contains NaN. * \warning the result is undefined if \c *this contains NaN.
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::minCoeff() const DenseBase<Derived>::minCoeff() const
{ {
return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar>()); return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar>());
@ -432,7 +422,7 @@ DenseBase<Derived>::minCoeff() const
* \warning the result is undefined if \c *this contains NaN. * \warning the result is undefined if \c *this contains NaN.
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::maxCoeff() const DenseBase<Derived>::maxCoeff() const
{ {
return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar>()); return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar>());
@ -445,7 +435,7 @@ DenseBase<Derived>::maxCoeff() const
* \sa trace(), prod(), mean() * \sa trace(), prod(), mean()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::sum() const DenseBase<Derived>::sum() const
{ {
if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
@ -458,7 +448,7 @@ DenseBase<Derived>::sum() const
* \sa trace(), prod(), sum() * \sa trace(), prod(), sum()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::mean() const DenseBase<Derived>::mean() const
{ {
#ifdef __INTEL_COMPILER #ifdef __INTEL_COMPILER
@ -479,7 +469,7 @@ DenseBase<Derived>::mean() const
* \sa sum(), mean(), trace() * \sa sum(), mean(), trace()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::prod() const DenseBase<Derived>::prod() const
{ {
if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
@ -494,7 +484,7 @@ DenseBase<Derived>::prod() const
* \sa diagonal(), sum() * \sa diagonal(), sum()
*/ */
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
MatrixBase<Derived>::trace() const MatrixBase<Derived>::trace() const
{ {
return derived().diagonal().sum(); return derived().diagonal().sum();

View File

@ -95,6 +95,8 @@ protected:
template<typename Expression> template<typename Expression>
EIGEN_DEVICE_FUNC void construct(Expression& expr) EIGEN_DEVICE_FUNC void construct(Expression& expr)
{ {
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression);
if(PlainObjectType::RowsAtCompileTime==1) if(PlainObjectType::RowsAtCompileTime==1)
{ {
eigen_assert(expr.rows()==1 || expr.cols()==1); eigen_assert(expr.rows()==1 || expr.cols()==1);

View File

@ -115,7 +115,7 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
*/ */
template<typename Derived> template<typename Derived>
template<int RowFactor, int ColFactor> template<int RowFactor, int ColFactor>
const Replicate<Derived,RowFactor,ColFactor> EIGEN_DEVICE_FUNC const Replicate<Derived,RowFactor,ColFactor>
DenseBase<Derived>::replicate() const DenseBase<Derived>::replicate() const
{ {
return Replicate<Derived,RowFactor,ColFactor>(derived()); return Replicate<Derived,RowFactor,ColFactor>(derived());
@ -130,7 +130,7 @@ DenseBase<Derived>::replicate() const
* \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate
*/ */
template<typename ExpressionType, int Direction> template<typename ExpressionType, int Direction>
const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType EIGEN_DEVICE_FUNC const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
VectorwiseOp<ExpressionType,Direction>::replicate(Index factor) const VectorwiseOp<ExpressionType,Direction>::replicate(Index factor) const
{ {
return typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType return typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType

View File

@ -79,7 +79,7 @@ template<typename Derived> class ReturnByValue
template<typename Derived> template<typename Derived>
template<typename OtherDerived> template<typename OtherDerived>
Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other) EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
{ {
other.evalTo(derived()); other.evalTo(derived());
return derived(); return derived();

View File

@ -114,7 +114,7 @@ template<typename MatrixType, int Direction> class Reverse
* *
*/ */
template<typename Derived> template<typename Derived>
inline typename DenseBase<Derived>::ReverseReturnType EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ReverseReturnType
DenseBase<Derived>::reverse() DenseBase<Derived>::reverse()
{ {
return ReverseReturnType(derived()); return ReverseReturnType(derived());
@ -136,7 +136,7 @@ DenseBase<Derived>::reverse()
* *
* \sa VectorwiseOp::reverseInPlace(), reverse() */ * \sa VectorwiseOp::reverseInPlace(), reverse() */
template<typename Derived> template<typename Derived>
inline void DenseBase<Derived>::reverseInPlace() EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::reverseInPlace()
{ {
if(cols()>rows()) if(cols()>rows())
{ {
@ -201,7 +201,7 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
* *
* \sa DenseBase::reverseInPlace(), reverse() */ * \sa DenseBase::reverseInPlace(), reverse() */
template<typename ExpressionType, int Direction> template<typename ExpressionType, int Direction>
void VectorwiseOp<ExpressionType,Direction>::reverseInPlace() EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
{ {
internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived()); internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
} }

View File

@ -71,7 +71,9 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix) explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
{} {
EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY);
}
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
inline Index rows() const { return m_matrix.rows(); } inline Index rows() const { return m_matrix.rows(); }
@ -189,7 +191,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2); TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2);
} }
typedef SelfAdjointView<const MatrixConjugateReturnType,Mode> ConjugateReturnType; typedef SelfAdjointView<const MatrixConjugateReturnType,UpLo> ConjugateReturnType;
/** \sa MatrixBase::conjugate() const */ /** \sa MatrixBase::conjugate() const */
EIGEN_DEVICE_FUNC EIGEN_DEVICE_FUNC
inline const ConjugateReturnType conjugate() const inline const ConjugateReturnType conjugate() const
@ -322,7 +324,7 @@ public:
/** This is the const version of MatrixBase::selfadjointView() */ /** This is the const version of MatrixBase::selfadjointView() */
template<typename Derived> template<typename Derived>
template<unsigned int UpLo> template<unsigned int UpLo>
typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
MatrixBase<Derived>::selfadjointView() const MatrixBase<Derived>::selfadjointView() const
{ {
return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived()); return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
@ -339,7 +341,7 @@ MatrixBase<Derived>::selfadjointView() const
*/ */
template<typename Derived> template<typename Derived>
template<unsigned int UpLo> template<unsigned int UpLo>
typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
MatrixBase<Derived>::selfadjointView() MatrixBase<Derived>::selfadjointView()
{ {
return typename SelfAdjointViewReturnType<UpLo>::Type(derived()); return typename SelfAdjointViewReturnType<UpLo>::Type(derived());

View File

@ -15,33 +15,29 @@ namespace Eigen {
// TODO generalize the scalar type of 'other' // TODO generalize the scalar type of 'other'
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
{ {
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>()); internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>());
return derived(); return derived();
} }
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
{ {
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>()); internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>());
return derived(); return derived();
} }
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
{ {
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>()); internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>());
return derived(); return derived();
} }
template<typename Derived> template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
{ {
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>()); internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>());
return derived(); return derived();
} }

View File

@ -34,12 +34,12 @@ template<typename Decomposition, typename RhsType,typename StorageKind> struct s
template<typename Decomposition, typename RhsType> template<typename Decomposition, typename RhsType>
struct solve_traits<Decomposition,RhsType,Dense> struct solve_traits<Decomposition,RhsType,Dense>
{ {
typedef Matrix<typename RhsType::Scalar, typedef typename make_proper_matrix_type<typename RhsType::Scalar,
Decomposition::ColsAtCompileTime, Decomposition::ColsAtCompileTime,
RhsType::ColsAtCompileTime, RhsType::ColsAtCompileTime,
RhsType::PlainObject::Options, RhsType::PlainObject::Options,
Decomposition::MaxColsAtCompileTime, Decomposition::MaxColsAtCompileTime,
RhsType::MaxColsAtCompileTime> PlainObject; RhsType::MaxColsAtCompileTime>::type PlainObject;
}; };
template<typename Decomposition, typename RhsType> template<typename Decomposition, typename RhsType>
@ -181,7 +181,7 @@ struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<t
} }
}; };
} // end namepsace internal } // end namespace internal
} // end namespace Eigen } // end namespace Eigen

View File

@ -164,7 +164,7 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
#ifndef EIGEN_PARSED_BY_DOXYGEN #ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename MatrixType, unsigned int Mode> template<typename MatrixType, unsigned int Mode>
template<int Side, typename OtherDerived> template<int Side, typename OtherDerived>
void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
{ {
OtherDerived& other = _other.const_cast_derived(); OtherDerived& other = _other.const_cast_derived();
eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) ); eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );

View File

@ -56,7 +56,8 @@ class SolverBase : public EigenBase<Derived>
MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime, MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
internal::traits<Derived>::MaxColsAtCompileTime>::ret), internal::traits<Derived>::MaxColsAtCompileTime>::ret),
IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1 IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
|| internal::traits<Derived>::MaxColsAtCompileTime == 1 || internal::traits<Derived>::MaxColsAtCompileTime == 1,
NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2
}; };
/** Default constructor */ /** Default constructor */

View File

@ -50,6 +50,71 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc
ssq += (bl*invScale).squaredNorm(); ssq += (bl*invScale).squaredNorm();
} }
template<typename VectorType, typename RealScalar>
void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale)
{
typedef typename VectorType::Scalar Scalar;
const Index blockSize = 4096;
typedef typename internal::nested_eval<VectorType,2>::type VectorTypeCopy;
typedef typename internal::remove_all<VectorTypeCopy>::type VectorTypeCopyClean;
const VectorTypeCopy copy(vec);
enum {
CanAlign = ( (int(VectorTypeCopyClean::Flags)&DirectAccessBit)
|| (int(internal::evaluator<VectorTypeCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
&& (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
};
typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<VectorTypeCopyClean>::Alignment>,
typename VectorTypeCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
Index n = vec.size();
Index bi = internal::first_default_aligned(copy);
if (bi>0)
internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
for (; bi<n; bi+=blockSize)
internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
}
template<typename VectorType>
typename VectorType::RealScalar
stable_norm_impl(const VectorType &vec, typename enable_if<VectorType::IsVectorAtCompileTime>::type* = 0 )
{
using std::sqrt;
using std::abs;
Index n = vec.size();
if(n==1)
return abs(vec.coeff(0));
typedef typename VectorType::RealScalar RealScalar;
RealScalar scale(0);
RealScalar invScale(1);
RealScalar ssq(0); // sum of squares
stable_norm_impl_inner_step(vec, ssq, scale, invScale);
return scale * sqrt(ssq);
}
template<typename MatrixType>
typename MatrixType::RealScalar
stable_norm_impl(const MatrixType &mat, typename enable_if<!MatrixType::IsVectorAtCompileTime>::type* = 0 )
{
using std::sqrt;
typedef typename MatrixType::RealScalar RealScalar;
RealScalar scale(0);
RealScalar invScale(1);
RealScalar ssq(0); // sum of squares
for(Index j=0; j<mat.outerSize(); ++j)
stable_norm_impl_inner_step(mat.innerVector(j), ssq, scale, invScale);
return scale * sqrt(ssq);
}
template<typename Derived> template<typename Derived>
inline typename NumTraits<typename traits<Derived>::Scalar>::Real inline typename NumTraits<typename traits<Derived>::Scalar>::Real
blueNorm_impl(const EigenBase<Derived>& _vec) blueNorm_impl(const EigenBase<Derived>& _vec)
@ -74,7 +139,7 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
// are used. For any specific computer, each of the assignment // are used. For any specific computer, each of the assignment
// statements can be replaced // statements can be replaced
ibeta = std::numeric_limits<RealScalar>::radix; // base for floating-point numbers ibeta = std::numeric_limits<RealScalar>::radix; // base for floating-point numbers
it = std::numeric_limits<RealScalar>::digits; // number of base-beta digits in mantissa it = NumTraits<RealScalar>::digits(); // number of base-beta digits in mantissa
iemin = std::numeric_limits<RealScalar>::min_exponent; // minimum exponent iemin = std::numeric_limits<RealScalar>::min_exponent; // minimum exponent
iemax = std::numeric_limits<RealScalar>::max_exponent; // maximum exponent iemax = std::numeric_limits<RealScalar>::max_exponent; // maximum exponent
rbig = (std::numeric_limits<RealScalar>::max)(); // largest floating-point number rbig = (std::numeric_limits<RealScalar>::max)(); // largest floating-point number
@ -98,12 +163,16 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
RealScalar asml = RealScalar(0); RealScalar asml = RealScalar(0);
RealScalar amed = RealScalar(0); RealScalar amed = RealScalar(0);
RealScalar abig = RealScalar(0); RealScalar abig = RealScalar(0);
for(typename Derived::InnerIterator it(vec, 0); it; ++it)
for(Index j=0; j<vec.outerSize(); ++j)
{ {
RealScalar ax = abs(it.value()); for(typename Derived::InnerIterator it(vec, j); it; ++it)
if(ax > ab2) abig += numext::abs2(ax*s2m); {
else if(ax < b1) asml += numext::abs2(ax*s1m); RealScalar ax = abs(it.value());
else amed += numext::abs2(ax); if(ax > ab2) abig += numext::abs2(ax*s2m);
else if(ax < b1) asml += numext::abs2(ax*s1m);
else amed += numext::abs2(ax);
}
} }
if(amed!=amed) if(amed!=amed)
return amed; // we got a NaN return amed; // we got a NaN
@ -156,35 +225,7 @@ template<typename Derived>
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
MatrixBase<Derived>::stableNorm() const MatrixBase<Derived>::stableNorm() const
{ {
using std::sqrt; return internal::stable_norm_impl(derived());
using std::abs;
const Index blockSize = 4096;
RealScalar scale(0);
RealScalar invScale(1);
RealScalar ssq(0); // sum of square
typedef typename internal::nested_eval<Derived,2>::type DerivedCopy;
typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean;
DerivedCopy copy(derived());
enum {
CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit)
|| (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT) // ifwe cannot allocate on the stack, then let's not bother about this optimization
};
typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
Index n = size();
if(n==1)
return abs(this->coeff(0));
Index bi = internal::first_default_aligned(copy);
if (bi>0)
internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
for (; bi<n; bi+=blockSize)
internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
return scale * sqrt(ssq);
} }
/** \returns the \em l2 norm of \c *this using the Blue's algorithm. /** \returns the \em l2 norm of \c *this using the Blue's algorithm.
@ -212,7 +253,10 @@ template<typename Derived>
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
MatrixBase<Derived>::hypotNorm() const MatrixBase<Derived>::hypotNorm() const
{ {
return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>()); if(size()==1)
return numext::abs(coeff(0,0));
else
return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
} }
} // end namespace Eigen } // end namespace Eigen

View File

@ -79,6 +79,7 @@ template<typename MatrixType> class Transpose
nestedExpression() { return m_matrix; } nestedExpression() { return m_matrix; }
/** \internal */ /** \internal */
EIGEN_DEVICE_FUNC
void resize(Index nrows, Index ncols) { void resize(Index nrows, Index ncols) {
m_matrix.resize(ncols,nrows); m_matrix.resize(ncols,nrows);
} }
@ -168,7 +169,7 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
* *
* \sa transposeInPlace(), adjoint() */ * \sa transposeInPlace(), adjoint() */
template<typename Derived> template<typename Derived>
inline Transpose<Derived> EIGEN_DEVICE_FUNC inline Transpose<Derived>
DenseBase<Derived>::transpose() DenseBase<Derived>::transpose()
{ {
return TransposeReturnType(derived()); return TransposeReturnType(derived());
@ -180,7 +181,7 @@ DenseBase<Derived>::transpose()
* *
* \sa transposeInPlace(), adjoint() */ * \sa transposeInPlace(), adjoint() */
template<typename Derived> template<typename Derived>
inline typename DenseBase<Derived>::ConstTransposeReturnType EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ConstTransposeReturnType
DenseBase<Derived>::transpose() const DenseBase<Derived>::transpose() const
{ {
return ConstTransposeReturnType(derived()); return ConstTransposeReturnType(derived());
@ -206,7 +207,7 @@ DenseBase<Derived>::transpose() const
* *
* \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */ * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */
template<typename Derived> template<typename Derived>
inline const typename MatrixBase<Derived>::AdjointReturnType EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::AdjointReturnType
MatrixBase<Derived>::adjoint() const MatrixBase<Derived>::adjoint() const
{ {
return AdjointReturnType(this->transpose()); return AdjointReturnType(this->transpose());
@ -281,7 +282,7 @@ struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non squ
* *
* \sa transpose(), adjoint(), adjointInPlace() */ * \sa transpose(), adjoint(), adjointInPlace() */
template<typename Derived> template<typename Derived>
inline void DenseBase<Derived>::transposeInPlace() EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::transposeInPlace()
{ {
eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic)) eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic))
&& "transposeInPlace() called on a non-square non-resizable matrix"); && "transposeInPlace() called on a non-square non-resizable matrix");
@ -312,7 +313,7 @@ inline void DenseBase<Derived>::transposeInPlace()
* *
* \sa transpose(), adjoint(), transposeInPlace() */ * \sa transpose(), adjoint(), transposeInPlace() */
template<typename Derived> template<typename Derived>
inline void MatrixBase<Derived>::adjointInPlace() EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::adjointInPlace()
{ {
derived() = adjoint().eval(); derived() = adjoint().eval();
} }

View File

@ -84,7 +84,7 @@ class TranspositionsBase
} }
// FIXME: do we want such methods ? // FIXME: do we want such methods ?
// might be usefull when the target matrix expression is complex, e.g.: // might be useful when the target matrix expression is complex, e.g.:
// object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..); // object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..);
/* /*
template<typename MatrixType> template<typename MatrixType>
@ -384,7 +384,7 @@ class Transpose<TranspositionsBase<TranspositionsDerived> >
const Product<OtherDerived, Transpose, AliasFreeProduct> const Product<OtherDerived, Transpose, AliasFreeProduct>
operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trt) operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trt)
{ {
return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt.derived()); return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt);
} }
/** \returns the \a matrix with the inverse transpositions applied to the rows. /** \returns the \a matrix with the inverse transpositions applied to the rows.

View File

@ -65,6 +65,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
inline Index innerStride() const { return derived().innerStride(); } inline Index innerStride() const { return derived().innerStride(); }
// dummy resize function // dummy resize function
EIGEN_DEVICE_FUNC
void resize(Index rows, Index cols) void resize(Index rows, Index cols)
{ {
EIGEN_UNUSED_VARIABLE(rows); EIGEN_UNUSED_VARIABLE(rows);
@ -470,7 +471,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
* \a Side==OnTheLeft (the default), or the right-inverse-multiply \a other * inverse(\c *this) if * \a Side==OnTheLeft (the default), or the right-inverse-multiply \a other * inverse(\c *this) if
* \a Side==OnTheRight. * \a Side==OnTheRight.
* *
* Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
* *
* The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
* diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
@ -488,7 +489,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
* \sa TriangularView::solveInPlace() * \sa TriangularView::solveInPlace()
*/ */
template<int Side, typename Other> template<int Side, typename Other>
EIGEN_DEVICE_FUNC
inline const internal::triangular_solve_retval<Side,TriangularViewType, Other> inline const internal::triangular_solve_retval<Side,TriangularViewType, Other>
solve(const MatrixBase<Other>& other) const; solve(const MatrixBase<Other>& other) const;
@ -497,7 +497,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
* \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here. * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
* This function will const_cast it, so constness isn't honored here. * This function will const_cast it, so constness isn't honored here.
* *
* Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
* *
* See TriangularView:solve() for the details. * See TriangularView:solve() for the details.
*/ */
@ -554,7 +554,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
// FIXME should we keep that possibility // FIXME should we keep that possibility
template<typename MatrixType, unsigned int Mode> template<typename MatrixType, unsigned int Mode>
template<typename OtherDerived> template<typename OtherDerived>
inline TriangularView<MatrixType, Mode>& EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other) TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other)
{ {
internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>()); internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
@ -564,7 +564,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDer
// FIXME should we keep that possibility // FIXME should we keep that possibility
template<typename MatrixType, unsigned int Mode> template<typename MatrixType, unsigned int Mode>
template<typename OtherDerived> template<typename OtherDerived>
void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other) EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
{ {
internal::call_assignment_no_alias(derived(), other.template triangularView<Mode>()); internal::call_assignment_no_alias(derived(), other.template triangularView<Mode>());
} }
@ -573,7 +573,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<Ot
template<typename MatrixType, unsigned int Mode> template<typename MatrixType, unsigned int Mode>
template<typename OtherDerived> template<typename OtherDerived>
inline TriangularView<MatrixType, Mode>& EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<OtherDerived>& other) TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<OtherDerived>& other)
{ {
eigen_assert(Mode == int(OtherDerived::Mode)); eigen_assert(Mode == int(OtherDerived::Mode));
@ -583,7 +583,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<Othe
template<typename MatrixType, unsigned int Mode> template<typename MatrixType, unsigned int Mode>
template<typename OtherDerived> template<typename OtherDerived>
void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other) EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
{ {
eigen_assert(Mode == int(OtherDerived::Mode)); eigen_assert(Mode == int(OtherDerived::Mode));
internal::call_assignment_no_alias(derived(), other.derived()); internal::call_assignment_no_alias(derived(), other.derived());
@ -598,7 +598,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBas
* If the matrix is triangular, the opposite part is set to zero. */ * If the matrix is triangular, the opposite part is set to zero. */
template<typename Derived> template<typename Derived>
template<typename DenseDerived> template<typename DenseDerived>
void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
{ {
evalToLazy(other.derived()); evalToLazy(other.derived());
} }
@ -624,6 +624,7 @@ void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
*/ */
template<typename Derived> template<typename Derived>
template<unsigned int Mode> template<unsigned int Mode>
EIGEN_DEVICE_FUNC
typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
MatrixBase<Derived>::triangularView() MatrixBase<Derived>::triangularView()
{ {
@ -633,6 +634,7 @@ MatrixBase<Derived>::triangularView()
/** This is the const version of MatrixBase::triangularView() */ /** This is the const version of MatrixBase::triangularView() */
template<typename Derived> template<typename Derived>
template<unsigned int Mode> template<unsigned int Mode>
EIGEN_DEVICE_FUNC
typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
MatrixBase<Derived>::triangularView() const MatrixBase<Derived>::triangularView() const
{ {
@ -715,6 +717,7 @@ struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
{ {
typedef TriangularView<MatrixType,Mode> XprType; typedef TriangularView<MatrixType,Mode> XprType;
typedef evaluator<typename internal::remove_all<MatrixType>::type> Base; typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
EIGEN_DEVICE_FUNC
unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {} unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
}; };
@ -930,7 +933,7 @@ struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
* If the matrix is triangular, the opposite part is set to zero. */ * If the matrix is triangular, the opposite part is set to zero. */
template<typename Derived> template<typename Derived>
template<typename DenseDerived> template<typename DenseDerived>
void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
{ {
other.derived().resize(this->rows(), this->cols()); other.derived().resize(this->rows(), this->cols());
internal::call_triangular_assignment_loop<Derived::Mode,(Derived::Mode&SelfAdjoint)==0 /* SetOpposite */>(other.derived(), derived().nestedExpression()); internal::call_triangular_assignment_loop<Derived::Mode,(Derived::Mode&SelfAdjoint)==0 /* SetOpposite */>(other.derived(), derived().nestedExpression());

View File

@ -35,7 +35,7 @@ struct traits<VectorBlock<VectorType, Size> >
* It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment<int>(Index) and * It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment<int>(Index) and
* most of the time this is the only way it is used. * most of the time this is the only way it is used.
* *
* However, if you want to directly maniputate sub-vector expressions, * However, if you want to directly manipulate sub-vector expressions,
* for instance if you want to write a function returning such an expression, you * for instance if you want to write a function returning such an expression, you
* will need to use this class. * will need to use this class.
* *

View File

@ -670,7 +670,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
*/ */
template<typename Derived> template<typename Derived>
inline typename DenseBase<Derived>::ColwiseReturnType EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ColwiseReturnType
DenseBase<Derived>::colwise() DenseBase<Derived>::colwise()
{ {
return ColwiseReturnType(derived()); return ColwiseReturnType(derived());
@ -684,7 +684,7 @@ DenseBase<Derived>::colwise()
* \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
*/ */
template<typename Derived> template<typename Derived>
inline typename DenseBase<Derived>::RowwiseReturnType EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::RowwiseReturnType
DenseBase<Derived>::rowwise() DenseBase<Derived>::rowwise()
{ {
return RowwiseReturnType(derived()); return RowwiseReturnType(derived());

View File

@ -204,23 +204,7 @@ template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
} }
}; };
template<> struct conj_helper<Packet8f, Packet4cf, false,false> EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f)
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
{ return Packet4cf(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet4cf, Packet8f, false,false>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
{ return Packet4cf(Eigen::internal::pmul(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b) template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{ {
@ -400,23 +384,7 @@ template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
} }
}; };
template<> struct conj_helper<Packet4d, Packet2cd, false,false> EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d)
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
{ return Packet2cd(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet2cd, Packet4d, false,false>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
{ return Packet2cd(Eigen::internal::pmul(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b) template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{ {

View File

@ -318,9 +318,9 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
} }
#ifndef EIGEN_VECTORIZE_AVX512 #ifndef EIGEN_VECTORIZE_AVX512
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif #endif
template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) { template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
@ -343,9 +343,12 @@ template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
{ {
__m256d tmp = _mm256_shuffle_pd(a,a,5); __m256d tmp = _mm256_shuffle_pd(a,a,5);
return _mm256_permute2f128_pd(tmp, tmp, 1); return _mm256_permute2f128_pd(tmp, tmp, 1);
#if 0
// This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
// exhibit the same latency/throughput, but it is here for future reference/benchmarking...
__m256d swap_halves = _mm256_permute2f128_pd(a,a,1); __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
return _mm256_permute_pd(swap_halves,5); return _mm256_permute_pd(swap_halves,5);
#endif
} }
// pabs should be ok // pabs should be ok
@ -412,7 +415,7 @@ template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1)))); return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
} }
template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a) template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a)
{ {
return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)); return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
} }

View File

@ -88,9 +88,9 @@ plog<Packet16f>(const Packet16f& _x) {
// x = x + x - 1.0; // x = x + x - 1.0;
// } else { x = x - 1.0; } // } else { x = x - 1.0; }
__mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ); __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ);
Packet16f tmp = _mm512_mask_blend_ps(mask, x, _mm512_setzero_ps()); Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x);
x = psub(x, p16f_1); x = psub(x, p16f_1);
e = psub(e, _mm512_mask_blend_ps(mask, p16f_1, _mm512_setzero_ps())); e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1));
x = padd(x, tmp); x = padd(x, tmp);
Packet16f x2 = pmul(x, x); Packet16f x2 = pmul(x, x);
@ -119,8 +119,9 @@ plog<Packet16f>(const Packet16f& _x) {
x = padd(x, y2); x = padd(x, y2);
// Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
return _mm512_mask_blend_ps(iszero_mask, p16f_minus_inf, return _mm512_mask_blend_ps(iszero_mask,
_mm512_mask_blend_ps(invalid_mask, p16f_nan, x)); _mm512_mask_blend_ps(invalid_mask, x, p16f_nan),
p16f_minus_inf);
} }
#endif #endif
@ -257,50 +258,39 @@ pexp<Packet8d>(const Packet8d& _x) {
template <> template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
psqrt<Packet16f>(const Packet16f& _x) { psqrt<Packet16f>(const Packet16f& _x) {
_EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f); Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
_EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f); __mmask16 denormal_mask = _mm512_kand(
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000); _mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
_CMP_LT_OQ),
_mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
Packet16f neg_half = pmul(_x, p16f_minus_half); Packet16f x = _mm512_rsqrt14_ps(_x);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x),
_mm512_setzero_ps());
// Do a single step of Newton's iteration. // Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
// Multiply the original _x by it's reciprocal square root to extract the // Flush results for denormals to zero.
// square root. return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
return pmul(_x, x);
} }
template <> template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
psqrt<Packet8d>(const Packet8d& _x) { psqrt<Packet8d>(const Packet8d& _x) {
_EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5); Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5f));
_EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5); __mmask16 denormal_mask = _mm512_kand(
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL); _mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
_CMP_LT_OQ),
_mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
Packet8d neg_half = pmul(_x, p8d_minus_half); Packet8d x = _mm512_rsqrt14_pd(_x);
// select only the inverse sqrt of positive normal inputs (denormals are // Do a single step of Newton's iteration.
// flushed to zero and cause infs as well). x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5f)));
__mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x),
_mm512_setzero_pd());
// Do a first step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Do a second step of Newton's iteration. // Do a second step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5f)));
// Multiply the original _x by it's reciprocal square root to extract the return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
// square root.
return pmul(_x, x);
} }
#else #else
template <> template <>
@ -333,20 +323,18 @@ prsqrt<Packet16f>(const Packet16f& _x) {
// select only the inverse sqrt of positive normal inputs (denormals are // select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well). // flushed to zero and cause infs as well).
__mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ); __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
_mm512_rsqrt14_ps(_x));
// Fill in NaNs and Infs for the negative/zero entries. // Fill in NaNs and Infs for the negative/zero entries.
__mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ); __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
Packet16f infs_and_nans = _mm512_mask_blend_ps( Packet16f infs_and_nans = _mm512_mask_blend_ps(
neg_mask, p16f_nan, neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
_mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps()));
// Do a single step of Newton's iteration. // Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
// Insert NaNs and Infs in all the right places. // Insert NaNs and Infs in all the right places.
return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x); return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
} }
template <> template <>
@ -363,14 +351,12 @@ prsqrt<Packet8d>(const Packet8d& _x) {
// select only the inverse sqrt of positive normal inputs (denormals are // select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well). // flushed to zero and cause infs as well).
__mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ); __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
_mm512_rsqrt14_pd(_x));
// Fill in NaNs and Infs for the negative/zero entries. // Fill in NaNs and Infs for the negative/zero entries.
__mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ); __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
Packet8d infs_and_nans = _mm512_mask_blend_pd( Packet8d infs_and_nans = _mm512_mask_blend_pd(
neg_mask, p8d_nan, neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
_mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd()));
// Do a first step of Newton's iteration. // Do a first step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
@ -379,9 +365,9 @@ prsqrt<Packet8d>(const Packet8d& _x) {
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Insert NaNs and Infs in all the right places. // Insert NaNs and Infs in all the right places.
return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x); return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
} }
#else #elif defined(EIGEN_VECTORIZE_AVX512ER)
template <> template <>
EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) { EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
return _mm512_rsqrt28_ps(x); return _mm512_rsqrt28_ps(x);

View File

@ -54,6 +54,7 @@ template<> struct packet_traits<float> : default_packet_traits
AlignedOnScalar = 1, AlignedOnScalar = 1,
size = 16, size = 16,
HasHalfPacket = 1, HasHalfPacket = 1,
HasBlend = 0,
#if EIGEN_GNUC_AT_LEAST(5, 3) #if EIGEN_GNUC_AT_LEAST(5, 3)
#ifdef EIGEN_VECTORIZE_AVX512DQ #ifdef EIGEN_VECTORIZE_AVX512DQ
HasLog = 1, HasLog = 1,
@ -470,6 +471,8 @@ EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
__m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0)); __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
return pairs; return pairs;
} }
#ifdef EIGEN_VECTORIZE_AVX512DQ
// Loads 4 doubles from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, // Loads 4 doubles from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3,
// a3} // a3}
template <> template <>
@ -481,6 +484,17 @@ EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3); x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
return x; return x;
} }
#else
template <>
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
__m512d x = _mm512_setzero_pd();
x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0));
x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1));
x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2));
x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3));
return x;
}
#endif
// Loads 4 floats from memory a returns the packet // Loads 4 floats from memory a returns the packet
// {a0, a0 a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3} // {a0, a0 a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
@ -537,7 +551,7 @@ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
template <> template <>
EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from, EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
Index stride) { Index stride) {
Packet16i stride_vector = _mm512_set1_epi32(stride); Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
Packet16i stride_multiplier = Packet16i stride_multiplier =
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@ -547,7 +561,7 @@ EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
template <> template <>
EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from, EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
Index stride) { Index stride) {
Packet8i stride_vector = _mm256_set1_epi32(stride); Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
@ -558,7 +572,7 @@ template <>
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to, EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
const Packet16f& from, const Packet16f& from,
Index stride) { Index stride) {
Packet16i stride_vector = _mm512_set1_epi32(stride); Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
Packet16i stride_multiplier = Packet16i stride_multiplier =
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@ -568,7 +582,7 @@ template <>
EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to, EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
const Packet8d& from, const Packet8d& from,
Index stride) { Index stride) {
Packet8i stride_vector = _mm256_set1_epi32(stride); Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
_mm512_i32scatter_pd(to, indices, from, 8); _mm512_i32scatter_pd(to, indices, from, 8);
@ -590,9 +604,9 @@ EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
pstore(to, pa); pstore(to, pa);
} }
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template <> template <>
EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) { EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
@ -620,13 +634,13 @@ template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
{ {
// _mm512_abs_ps intrinsic not found, so hack around it // _mm512_abs_ps intrinsic not found, so hack around it
return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff)); return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
} }
template <> template <>
EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) { EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
// _mm512_abs_ps intrinsic not found, so hack around it // _mm512_abs_ps intrinsic not found, so hack around it
return (__m512d)_mm512_and_si512((__m512i)a, return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
_mm512_set1_epi64(0x7fffffffffffffff)); _mm512_set1_epi64(0x7fffffffffffffff)));
} }
#ifdef EIGEN_VECTORIZE_AVX512DQ #ifdef EIGEN_VECTORIZE_AVX512DQ
@ -646,8 +660,7 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ #ifdef EIGEN_VECTORIZE_AVX512DQ
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0); \ OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1);
#else #else
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
@ -841,7 +854,7 @@ template<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs)
final_1 = _mm256_add_pd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC)); final_1 = _mm256_add_pd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC));
__m512d final_output = _mm512_insertf64x4(final_output, final_0, 0); __m512d final_output = _mm512_castpd256_pd512(final_0);
return _mm512_insertf64x4(final_output, final_1, 1); return _mm512_insertf64x4(final_output, final_1, 1);
} }
@ -874,7 +887,7 @@ EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
} }
template <> template <>
EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) { EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ #ifdef EIGEN_VECTORIZE_AVX512DQ
__m256 lane0 = _mm512_extractf32x8_ps(a, 0); __m256 lane0 = _mm512_extractf32x8_ps(a, 0);
__m256 lane1 = _mm512_extractf32x8_ps(a, 1); __m256 lane1 = _mm512_extractf32x8_ps(a, 1);
@ -890,7 +903,7 @@ EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
#endif #endif
} }
template <> template <>
EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) { EIGEN_STRONG_INLINE Packet4d predux_half_dowto4<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0); __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1); __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d res = _mm256_add_pd(lane0, lane1); __m256d res = _mm256_add_pd(lane0, lane1);
@ -1272,11 +1285,38 @@ EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
return Packet16f(); return Packet16f();
} }
template <> template <>
EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/, EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket,
const Packet8d& /*thenPacket*/, const Packet8d& thenPacket,
const Packet8d& /*elsePacket*/) { const Packet8d& elsePacket) {
assert(false && "To be implemented"); __mmask8 m = (ifPacket.select[0] )
return Packet8d(); | (ifPacket.select[1]<<1)
| (ifPacket.select[2]<<2)
| (ifPacket.select[3]<<3)
| (ifPacket.select[4]<<4)
| (ifPacket.select[5]<<5)
| (ifPacket.select[6]<<6)
| (ifPacket.select[7]<<7);
return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
}
template<> EIGEN_STRONG_INLINE Packet16f pinsertfirst(const Packet16f& a, float b)
{
return _mm512_mask_broadcastss_ps(a, (1), _mm_load_ss(&b));
}
template<> EIGEN_STRONG_INLINE Packet8d pinsertfirst(const Packet8d& a, double b)
{
return _mm512_mask_broadcastsd_pd(a, (1), _mm_load_sd(&b));
}
template<> EIGEN_STRONG_INLINE Packet16f pinsertlast(const Packet16f& a, float b)
{
return _mm512_mask_broadcastss_ps(a, (1<<15), _mm_load_ss(&b));
}
template<> EIGEN_STRONG_INLINE Packet8d pinsertlast(const Packet8d& a, double b)
{
return _mm512_mask_broadcastsd_pd(a, (1<<7), _mm_load_sd(&b));
} }
} // end namespace internal } // end namespace internal

View File

@ -224,23 +224,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
} }
}; };
template<> struct conj_helper<Packet4f, Packet2cf, false,false> EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
{ return Packet2cf(internal::pmul<Packet4f>(x, y.v)); }
};
template<> struct conj_helper<Packet2cf, Packet4f, false,false>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
{ return Packet2cf(internal::pmul<Packet4f>(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{ {
@ -416,23 +400,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
return pconj(internal::pmul(a, b)); return pconj(internal::pmul(a, b));
} }
}; };
template<> struct conj_helper<Packet2d, Packet1cd, false,false>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
{ return Packet1cd(internal::pmul<Packet2d>(x, y.v)); }
};
template<> struct conj_helper<Packet1cd, Packet2d, false,false>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
{ return Packet1cd(internal::pmul<Packet2d>(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{ {

View File

@ -103,7 +103,7 @@ static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4u
static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
#else #else
static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
@ -388,10 +388,30 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); } template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; } template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
{
#ifdef __VSX__
// NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
Packet4f ret;
__asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
return ret;
#else
return vec_min(a, b);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
{
#ifdef __VSX__
// NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
Packet4f ret;
__asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
return ret;
#else
return vec_max(a, b);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
@ -434,7 +454,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data
} }
#else #else
// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX // We also need to redefine little endian loading of Packet4i/Packet4f using VSX
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{ {
EIGEN_DEBUG_UNALIGNED_LOAD EIGEN_DEBUG_UNALIGNED_LOAD
@ -500,7 +520,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& f
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
} }
#else #else
// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX // We also need to redefine little endian loading of Packet4i/Packet4f using VSX
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
{ {
EIGEN_DEBUG_ALIGNED_STORE EIGEN_DEBUG_ALIGNED_STORE
@ -764,7 +784,7 @@ typedef __vector __bool long Packet2bl;
static Packet2l p2l_ONE = { 1, 1 }; static Packet2l p2l_ONE = { 1, 1 };
static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO); static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
static Packet2d p2d_ONE = { 1.0, 1.0 }; static Packet2d p2d_ONE = { 1.0, 1.0 };
static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO); static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
static Packet2d p2d_MZERO = { -0.0, -0.0 }; static Packet2d p2d_MZERO = { -0.0, -0.0 };
@ -910,9 +930,21 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const
// for some weird raisons, it has to be overloaded for packet of integers // for some weird raisons, it has to be overloaded for packet of integers
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
{
// NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
Packet2d ret;
__asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
return ret;
}
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
{
// NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
Packet2d ret;
__asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
return ret;
}
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
@ -969,7 +1001,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
Packet2d v[2], sum; Packet2d v[2], sum;
v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8)); v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8)); v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
#ifdef _BIG_ENDIAN #ifdef _BIG_ENDIAN
sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8)); sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
#else #else
@ -1022,7 +1054,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
Packet2l select = { ifPacket.select[0], ifPacket.select[1] }; Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)); Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
return vec_sel(elsePacket, thenPacket, mask); return vec_sel(elsePacket, thenPacket, mask);
} }
#endif // __VSX__ #endif // __VSX__

View File

@ -16,7 +16,7 @@ namespace Eigen {
namespace internal { namespace internal {
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) #if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
// Many std::complex methods such as operator+, operator-, operator* and // Many std::complex methods such as operator+, operator-, operator* and
// operator/ are not constexpr. Due to this, clang does not treat them as device // operator/ are not constexpr. Due to this, clang does not treat them as device
@ -55,7 +55,7 @@ template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T
// Product // Product
template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > { template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
enum { enum {
Vectorizable = packet_traits<std::complex<T>>::HasMul Vectorizable = packet_traits<std::complex<T> >::HasMul
}; };
typedef typename std::complex<T> result_type; typedef typename std::complex<T> result_type;
@ -76,7 +76,7 @@ template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> >
// Quotient // Quotient
template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > { template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
enum { enum {
Vectorizable = packet_traits<std::complex<T>>::HasDiv Vectorizable = packet_traits<std::complex<T> >::HasDiv
}; };
typedef typename std::complex<T> result_type; typedef typename std::complex<T> result_type;

View File

@ -0,0 +1,29 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_ARCH_CONJ_HELPER_H
#define EIGEN_ARCH_CONJ_HELPER_H
#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \
template<> struct conj_helper<PACKET_REAL, PACKET_CPLX, false,false> { \
EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \
{ return padd(c, pmul(x,y)); } \
EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const \
{ return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v)); } \
}; \
\
template<> struct conj_helper<PACKET_CPLX, PACKET_REAL, false,false> { \
EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \
{ return padd(c, pmul(x,y)); } \
EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const \
{ return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y)); } \
};
#endif // EIGEN_ARCH_CONJ_HELPER_H

View File

@ -13,7 +13,7 @@
// Redistribution and use in source and binary forms, with or without // Redistribution and use in source and binary forms, with or without
// modification, are permitted. // modification, are permitted.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
@ -26,15 +26,15 @@
// Standard 16-bit float type, mostly useful for GPUs. Defines a new // Standard 16-bit float type, mostly useful for GPUs. Defines a new
// type Eigen::half (inheriting from CUDA's __half struct) with // type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with
// operator overloads such that it behaves basically as an arithmetic // operator overloads such that it behaves basically as an arithmetic
// type. It will be quite slow on CPUs (so it is recommended to stay // type. It will be quite slow on CPUs (so it is recommended to stay
// in fp32 for CPUs, except for simple parameter conversions, I/O // in fp32 for CPUs, except for simple parameter conversions, I/O
// to disk and the likes), but fast on GPUs. // to disk and the likes), but fast on GPUs.
#ifndef EIGEN_HALF_CUDA_H #ifndef EIGEN_HALF_GPU_H
#define EIGEN_HALF_CUDA_H #define EIGEN_HALF_GPU_H
#if __cplusplus > 199711L #if __cplusplus > 199711L
#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type() #define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
@ -49,39 +49,107 @@ struct half;
namespace half_impl { namespace half_impl {
#if !defined(EIGEN_HAS_CUDA_FP16) #if !defined(EIGEN_HAS_GPU_FP16)
// Make our own __half_raw definition that is similar to CUDA's.
// Make our own __half definition that is similar to CUDA's. struct __half_raw {
struct __half { EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
EIGEN_DEVICE_FUNC __half() : x(0) {} explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
unsigned short x; unsigned short x;
}; };
#elif defined(EIGEN_HAS_HIP_FP16)
#if defined(EIGEN_HAS_OLD_HIP_FP16)
// Make a __half_raw definition that is
// ++ compatible with that of Eigen and
// ++ add an implicit conversion to the native __half of the old HIP implementation.
//
// Keeping ".x" as "unsigned short" keeps the interface the same between the Eigen and HIP implementation.
//
// In the old HIP implementation,
// ++ __half is a typedef of __fp16
// ++ the "__h*" routines take "__half" arguments
// so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make
// that conversiion in each call to a "__h*" routine...that is why we have "operator __half" routine
struct __half_raw {
EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
union {
unsigned short x;
__half data;
};
operator __half(void) const { return data; }
};
#endif
#elif defined(EIGEN_HAS_CUDA_FP16)
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
typedef __half __half_raw;
#endif // defined(EIGEN_HAS_CUDA_FP16)
#elif defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
typedef cl::sycl::half __half_raw;
#endif #endif
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x); EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff); EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h); EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
struct half_base : public __half { struct half_base : public __half_raw {
EIGEN_DEVICE_FUNC half_base() {} EIGEN_DEVICE_FUNC half_base() {}
EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {} EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {} EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
#if defined(EIGEN_HAS_GPU_FP16)
#if defined(EIGEN_HAS_HIP_FP16)
#if defined(EIGEN_HAS_OLD_HIP_FP16)
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(__half_as_ushort(h)) {}
#else
EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); }
#endif
#elif defined(EIGEN_HAS_CUDA_FP16)
#if (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000)
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
#endif
#endif
#endif
}; };
} // namespace half_impl } // namespace half_impl
// Class definition. // Class definition.
struct half : public half_impl::half_base { struct half : public half_impl::half_base {
#if !defined(EIGEN_HAS_CUDA_FP16)
typedef half_impl::__half __half; // Writing this out as separate #if-else blocks to make the code easier to follow
#endif // The same applies to most #if-else blocks in this file
#if !defined(EIGEN_HAS_GPU_FP16)
typedef half_impl::__half_raw __half_raw;
#elif defined(EIGEN_HAS_HIP_FP16)
#if defined(EIGEN_HAS_OLD_HIP_FP16)
typedef half_impl::__half_raw __half_raw;
#endif
#elif defined(EIGEN_HAS_CUDA_FP16)
// Note that EIGEN_CUDACC_VER is set to 0 even when compiling with HIP, so (EIGEN_CUDACC_VER < 90000) is true even for HIP!
// So keeping this within #if defined(EIGEN_HAS_CUDA_FP16) is needed
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
typedef half_impl::__half_raw __half_raw;
#endif
#endif
EIGEN_DEVICE_FUNC half() {} EIGEN_DEVICE_FUNC half() {}
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
#if defined(EIGEN_HAS_GPU_FP16)
#if defined(EIGEN_HAS_HIP_FP16)
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
#elif defined(EIGEN_HAS_CUDA_FP16)
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
#endif
#endif
#endif
explicit EIGEN_DEVICE_FUNC half(bool b) explicit EIGEN_DEVICE_FUNC half(bool b)
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
@ -136,72 +204,136 @@ struct half : public half_impl::half_base {
x = other.x; x = other.x;
return *this; return *this;
} }
}; };
} // end namespace Eigen
namespace std {
template<>
struct numeric_limits<Eigen::half> {
static const bool is_specialized = true;
static const bool is_signed = true;
static const bool is_integer = false;
static const bool is_exact = false;
static const bool has_infinity = true;
static const bool has_quiet_NaN = true;
static const bool has_signaling_NaN = true;
static const float_denorm_style has_denorm = denorm_present;
static const bool has_denorm_loss = false;
static const std::float_round_style round_style = std::round_to_nearest;
static const bool is_iec559 = false;
static const bool is_bounded = false;
static const bool is_modulo = false;
static const int digits = 11;
static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
static const int radix = 2;
static const int min_exponent = -13;
static const int min_exponent10 = -4;
static const int max_exponent = 16;
static const int max_exponent10 = 4;
static const bool traps = true;
static const bool tinyness_before = false;
static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
static Eigen::half round_error() { return Eigen::half(0.5); }
static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
};
// If std::numeric_limits<T> is specialized, should also specialize
// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
// std::numeric_limits<const volatile T>
// https://stackoverflow.com/a/16519653/
template<>
struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {};
template<>
struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {};
template<>
struct numeric_limits<const volatile Eigen::half> : numeric_limits<Eigen::half> {};
} // end namespace std
namespace Eigen {
namespace half_impl { namespace half_impl {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
// Intrinsics for native fp16 support. Note that on current hardware, // Intrinsics for native fp16 support. Note that on current hardware,
// these are no faster than fp32 arithmetic (you need to use the half2 // these are no faster than fp32 arithmetic (you need to use the half2
// versions to get the ALU speed increased), but you do save the // versions to get the ALU speed increased), but you do save the
// conversion steps back and forth. // conversion steps back and forth.
__device__ half operator + (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
return __hadd(::__half(a), ::__half(b));
#else
return __hadd(a, b); return __hadd(a, b);
#endif
} }
__device__ half operator * (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
return __hmul(a, b); return __hmul(a, b);
} }
__device__ half operator - (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
return __hsub(a, b); return __hsub(a, b);
} }
__device__ half operator / (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
return __hdiv(a, b);
#else
float num = __half2float(a); float num = __half2float(a);
float denom = __half2float(b); float denom = __half2float(b);
return __float2half(num / denom); return __float2half(num / denom);
#endif
} }
__device__ half operator - (const half& a) { EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
return __hneg(a); return __hneg(a);
} }
__device__ half& operator += (half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
a = a + b; a = a + b;
return a; return a;
} }
__device__ half& operator *= (half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
a = a * b; a = a * b;
return a; return a;
} }
__device__ half& operator -= (half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
a = a - b; a = a - b;
return a; return a;
} }
__device__ half& operator /= (half& a, const half& b) { EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
a = a / b; a = a / b;
return a; return a;
} }
__device__ bool operator == (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
return __heq(a, b); return __heq(a, b);
} }
__device__ bool operator != (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
return __hne(a, b); return __hne(a, b);
} }
__device__ bool operator < (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
return __hlt(a, b); return __hlt(a, b);
} }
__device__ bool operator <= (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
return __hle(a, b); return __hle(a, b);
} }
__device__ bool operator > (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
return __hgt(a, b); return __hgt(a, b);
} }
__device__ bool operator >= (const half& a, const half& b) { EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
return __hge(a, b); return __hge(a, b);
} }
#else // Emulate support for half floats #else // Emulate support for half floats
// Definitions for CPUs and older CUDA, mostly working through conversion // Definitions for CPUs and older HIP+CUDA, mostly working through conversion
// to/from fp32. // to/from fp32.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
@ -238,10 +370,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b)
return a; return a;
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
return float(a) == float(b); return numext::equal_strict(float(a),float(b));
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
return float(a) != float(b); return numext::not_equal_strict(float(a), float(b));
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
return float(a) < float(b); return float(a) < float(b);
@ -269,34 +401,36 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
// these in hardware. If we need more performance on older/other CPUs, they are // these in hardware. If we need more performance on older/other CPUs, they are
// also possible to vectorize directly. // also possible to vectorize directly.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) {
__half h; __half_raw h;
h.x = x; h.x = x;
return h; return h;
} }
union FP32 { union float32_bits {
unsigned int u; unsigned int u;
float f; float f;
}; };
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
return __float2half(ff); (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
__half tmp_ff = __float2half(ff);
return *(__half_raw*)&tmp_ff;
#elif defined(EIGEN_HAS_FP16_C) #elif defined(EIGEN_HAS_FP16_C)
__half h; __half_raw h;
h.x = _cvtss_sh(ff, 0); h.x = _cvtss_sh(ff, 0);
return h; return h;
#else #else
FP32 f; f.f = ff; float32_bits f; f.f = ff;
const FP32 f32infty = { 255 << 23 }; const float32_bits f32infty = { 255 << 23 };
const FP32 f16max = { (127 + 16) << 23 }; const float32_bits f16max = { (127 + 16) << 23 };
const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
unsigned int sign_mask = 0x80000000u; unsigned int sign_mask = 0x80000000u;
__half o; __half_raw o;
o.x = static_cast<unsigned short>(0x0u); o.x = static_cast<unsigned short>(0x0u);
unsigned int sign = f.u & sign_mask; unsigned int sign = f.u & sign_mask;
@ -335,17 +469,18 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
#endif #endif
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __half2float(h); return __half2float(h);
#elif defined(EIGEN_HAS_FP16_C) #elif defined(EIGEN_HAS_FP16_C)
return _cvtsh_ss(h.x); return _cvtsh_ss(h.x);
#else #else
const FP32 magic = { 113 << 23 }; const float32_bits magic = { 113 << 23 };
const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
FP32 o; float32_bits o;
o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
unsigned int exp = shifted_exp & o.u; // just the exponent unsigned int exp = shifted_exp & o.u; // just the exponent
@ -370,7 +505,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
return (a.x & 0x7fff) == 0x7c00; return (a.x & 0x7fff) == 0x7c00;
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __hisnan(a); return __hisnan(a);
#else #else
return (a.x & 0x7fff) > 0x7c00; return (a.x & 0x7fff) > 0x7c00;
@ -386,7 +522,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
return result; return result;
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 #if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hexp(a)); return half(hexp(a));
#else #else
return half(::expf(float(a))); return half(::expf(float(a)));
@ -396,7 +533,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
return half(numext::expm1(float(a))); return half(numext::expm1(float(a)));
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return half(::hlog(a)); return half(::hlog(a));
#else #else
return half(::logf(float(a))); return half(::logf(float(a)));
@ -409,7 +547,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
return half(::log10f(float(a))); return half(::log10f(float(a)));
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 #if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hsqrt(a)); return half(hsqrt(a));
#else #else
return half(::sqrtf(float(a))); return half(::sqrtf(float(a)));
@ -431,14 +570,16 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
return half(::tanhf(float(a))); return half(::tanhf(float(a)));
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 #if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hfloor(a)); return half(hfloor(a));
#else #else
return half(::floorf(float(a))); return half(::floorf(float(a)));
#endif #endif
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 #if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hceil(a)); return half(hceil(a));
#else #else
return half(::ceilf(float(a))); return half(::ceilf(float(a)));
@ -446,7 +587,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __hlt(b, a) ? b : a; return __hlt(b, a) ? b : a;
#else #else
const float f1 = static_cast<float>(a); const float f1 = static_cast<float>(a);
@ -455,7 +597,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
#endif #endif
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __hlt(a, b) ? b : a; return __hlt(a, b) ? b : a;
#else #else
const float f1 = static_cast<float>(a); const float f1 = static_cast<float>(a);
@ -496,6 +639,13 @@ template<> struct is_arithmetic<half> { enum { value = true }; };
template<> struct NumTraits<Eigen::half> template<> struct NumTraits<Eigen::half>
: GenericNumTraits<Eigen::half> : GenericNumTraits<Eigen::half>
{ {
enum {
IsSigned = true,
IsInteger = false,
IsComplex = false,
RequireInitialization = false
};
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
return half_impl::raw_uint16_to_half(0x0800); return half_impl::raw_uint16_to_half(0x0800);
} }
@ -526,7 +676,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
return Eigen::half(::expf(float(a))); return Eigen::half(::expf(float(a)));
} }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if (EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return Eigen::half(::hlog(a)); return Eigen::half(::hlog(a));
#else #else
return Eigen::half(::logf(float(a))); return Eigen::half(::logf(float(a)));
@ -560,14 +711,22 @@ struct hash<Eigen::half> {
// Add the missing shfl_xor intrinsic // Add the missing shfl_xor intrinsic
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 #if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
#if (EIGEN_CUDACC_VER < 90000) || \
defined(EIGEN_HAS_HIP_FP16)
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width)); return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
#else
return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
#endif
} }
#endif #endif
// ldg() has an overload for __half, but we also need one for Eigen::half. // ldg() has an overload for __half_raw, but we also need one for Eigen::half.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 #if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
return Eigen::half_impl::raw_uint16_to_half( return Eigen::half_impl::raw_uint16_to_half(
__ldg(reinterpret_cast<const unsigned short*>(ptr))); __ldg(reinterpret_cast<const unsigned short*>(ptr)));
@ -575,7 +734,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
#endif #endif
#if defined(__CUDA_ARCH__) #if defined(EIGEN_GPU_COMPILE_PHASE)
namespace Eigen { namespace Eigen {
namespace numext { namespace numext {
@ -601,4 +760,4 @@ bool (isfinite)(const Eigen::half& h) {
} // namespace numext } // namespace numext
#endif #endif
#endif // EIGEN_HALF_CUDA_H #endif // EIGEN_HALF_GPU_H

View File

@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed // Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H #ifndef EIGEN_MATH_FUNCTIONS_GPU_H
#define EIGEN_MATH_FUNCTIONS_CUDA_H #define EIGEN_MATH_FUNCTIONS_GPU_H
namespace Eigen { namespace Eigen {
@ -17,7 +17,7 @@ namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to // Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones // introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...) // we'll use on the host side (SSE, AVX, ...)
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) #if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 plog<float4>(const float4& a) float4 plog<float4>(const float4& a)
{ {
@ -100,4 +100,4 @@ double2 prsqrt<double2>(const double2& a)
} // end namespace Eigen } // end namespace Eigen
#endif // EIGEN_MATH_FUNCTIONS_CUDA_H #endif // EIGEN_MATH_FUNCTIONS_GPU_H

View File

@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed // Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_CUDA_H #ifndef EIGEN_PACKET_MATH_GPU_H
#define EIGEN_PACKET_MATH_CUDA_H #define EIGEN_PACKET_MATH_GPU_H
namespace Eigen { namespace Eigen {
@ -17,7 +17,7 @@ namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to // Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones // introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...) // we'll use on the host side (SSE, AVX, ...)
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) #if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
template<> struct is_arithmetic<float4> { enum { value = true }; }; template<> struct is_arithmetic<float4> { enum { value = true }; };
template<> struct is_arithmetic<double2> { enum { value = true }; }; template<> struct is_arithmetic<double2> { enum { value = true }; };
@ -44,7 +44,11 @@ template<> struct packet_traits<float> : default_packet_traits
HasPolygamma = 1, HasPolygamma = 1,
HasErf = 1, HasErf = 1,
HasErfc = 1, HasErfc = 1,
HasI0e = 1,
HasI1e = 1,
HasIGamma = 1, HasIGamma = 1,
HasIGammaDerA = 1,
HasGammaSampleDerAlpha = 1,
HasIGammac = 1, HasIGammac = 1,
HasBetaInc = 1, HasBetaInc = 1,
@ -73,7 +77,11 @@ template<> struct packet_traits<double> : default_packet_traits
HasPolygamma = 1, HasPolygamma = 1,
HasErf = 1, HasErf = 1,
HasErfc = 1, HasErfc = 1,
HasI0e = 1,
HasI1e = 1,
HasIGamma = 1, HasIGamma = 1,
HasIGammaDerA = 1,
HasGammaSampleDerAlpha = 1,
HasIGammac = 1, HasIGammac = 1,
HasBetaInc = 1, HasBetaInc = 1,
@ -167,10 +175,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const d
return make_double2(from[0], from[1]); return make_double2(from[0], from[1]);
} }
template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
return make_float4(from[0], from[0], from[1], from[1]); return make_float4(from[0], from[0], from[1], from[1]);
} }
template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
return make_double2(from[0], from[0]); return make_double2(from[0], from[0]);
} }
@ -196,7 +204,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
template<> template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 #if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return __ldg((const float4*)from); return __ldg((const float4*)from);
#else #else
return make_float4(from[0], from[1], from[2], from[3]); return make_float4(from[0], from[1], from[2], from[3]);
@ -204,7 +212,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
} }
template<> template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 #if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return __ldg((const double2*)from); return __ldg((const double2*)from);
#else #else
return make_double2(from[0], from[1]); return make_double2(from[0], from[1]);
@ -213,7 +221,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const
template<> template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 #if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
#else #else
return make_float4(from[0], from[1], from[2], from[3]); return make_float4(from[0], from[1], from[2], from[3]);
@ -221,7 +229,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const
} }
template<> template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 #if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return make_double2(__ldg(from+0), __ldg(from+1)); return make_double2(__ldg(from+0), __ldg(from+1));
#else #else
return make_double2(from[0], from[1]); return make_double2(from[0], from[1]);
@ -291,7 +299,7 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
EIGEN_DEVICE_FUNC inline void EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<float4,4>& kernel) { ptranspose(PacketBlock<float4,4>& kernel) {
double tmp = kernel.packet[0].y; float tmp = kernel.packet[0].y;
kernel.packet[0].y = kernel.packet[1].x; kernel.packet[0].y = kernel.packet[1].x;
kernel.packet[1].x = tmp; kernel.packet[1].x = tmp;
@ -330,4 +338,4 @@ ptranspose(PacketBlock<double2,2>& kernel) {
} // end namespace Eigen } // end namespace Eigen
#endif // EIGEN_PACKET_MATH_CUDA_H #endif // EIGEN_PACKET_MATH_GPU_H

View File

@ -7,15 +7,16 @@
// Public License v. 2.0. If a copy of the MPL was not distributed // Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H #ifndef EIGEN_PACKET_MATH_HALF_GPU_H
#define EIGEN_PACKET_MATH_HALF_CUDA_H #define EIGEN_PACKET_MATH_HALF_GPU_H
namespace Eigen { namespace Eigen {
namespace internal { namespace internal {
// Most of the following operations require arch >= 3.0 // Most of the following operations require arch >= 3.0
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE))
template<> struct is_arithmetic<half2> { enum { value = true }; }; template<> struct is_arithmetic<half2> { enum { value = true }; };
@ -42,70 +43,108 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
#if defined(EIGEN_HAS_OLD_HIP_FP16)
return half2half2(from);
#else
return __half2half2(from); return __half2half2(from);
#endif
#else // EIGEN_CUDA_ARCH
return __half2half2(from);
#endif
} }
template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
return *reinterpret_cast<const half2*>(from); return *reinterpret_cast<const half2*>(from);
} }
template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
return __halves2half2(from[0], from[1]); return __halves2half2(from[0], from[1]);
} }
template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
return __halves2half2(from[0], from[0]); return __halves2half2(from[0], from[0]);
} }
template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
*reinterpret_cast<half2*>(to) = from; *reinterpret_cast<half2*>(to) = from;
} }
template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
to[0] = __low2half(from); to[0] = __low2half(from);
to[1] = __high2half(from); to[1] = __high2half(from);
} }
template<> template<>
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
#if __CUDA_ARCH__ >= 350
#if defined(EIGEN_HIP_DEVICE_COMPILE)
#if defined(EIGEN_HAS_OLD_HIP_FP16)
return __halves2half2((*(from+0)), (*(from+1)));
#else
return __ldg((const half2*)from);
#endif
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 350
return __ldg((const half2*)from); return __ldg((const half2*)from);
#else #else
return __halves2half2(*(from+0), *(from+1)); return __halves2half2(*(from+0), *(from+1));
#endif #endif
#endif
} }
template<> template<>
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
#if __CUDA_ARCH__ >= 350
#if defined(EIGEN_HIP_DEVICE_COMPILE)
#if defined(EIGEN_HAS_OLD_HIP_FP16)
return __halves2half2((*(from+0)), (*(from+1)));
#else
return __halves2half2(__ldg(from+0), __ldg(from+1));
#endif
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 350
return __halves2half2(__ldg(from+0), __ldg(from+1)); return __halves2half2(__ldg(from+0), __ldg(from+1));
#else #else
return __halves2half2(*(from+0), *(from+1)); return __halves2half2(*(from+0), *(from+1));
#endif #endif
#endif
} }
template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
return __halves2half2(from[0*stride], from[1*stride]); return __halves2half2(from[0*stride], from[1*stride]);
} }
template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
to[stride*0] = __low2half(from); to[stride*0] = __low2half(from);
to[stride*1] = __high2half(from); to[stride*1] = __high2half(from);
} }
template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
return __low2half(a); return __low2half(a);
} }
template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
half2 result; half2 result;
result.x = a.x & 0x7FFF7FFF; unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
*(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
return result; return result;
} }
__device__ EIGEN_STRONG_INLINE void EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<half2,2>& kernel) { ptranspose(PacketBlock<half2,2>& kernel) {
__half a1 = __low2half(kernel.packet[0]); __half a1 = __low2half(kernel.packet[0]);
__half a2 = __high2half(kernel.packet[0]); __half a2 = __high2half(kernel.packet[0]);
@ -115,17 +154,31 @@ ptranspose(PacketBlock<half2,2>& kernel) {
kernel.packet[1] = __halves2half2(a2, b2); kernel.packet[1] = __halves2half2(a2, b2);
} }
template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
#if __CUDA_ARCH__ >= 530 #if defined(EIGEN_HIP_DEVICE_COMPILE)
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
return __halves2half2(a, __hadd(a, __float2half(1.0f))); return __halves2half2(a, __hadd(a, __float2half(1.0f)));
#else #else
float f = __half2float(a) + 1.0f; float f = __half2float(a) + 1.0f;
return __halves2half2(a, __float2half(f)); return __halves2half2(a, __float2half(f));
#endif #endif
#endif
} }
template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530 #if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hadd2(a, b);
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
return __hadd2(a, b); return __hadd2(a, b);
#else #else
float a1 = __low2float(a); float a1 = __low2float(a);
@ -136,10 +189,18 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, cons
float r2 = a2 + b2; float r2 = a2 + b2;
return __floats2half2_rn(r1, r2); return __floats2half2_rn(r1, r2);
#endif #endif
#endif
} }
template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530 #if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hsub2(a, b);
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
return __hsub2(a, b); return __hsub2(a, b);
#else #else
float a1 = __low2float(a); float a1 = __low2float(a);
@ -150,22 +211,38 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, cons
float r2 = a2 - b2; float r2 = a2 - b2;
return __floats2half2_rn(r1, r2); return __floats2half2_rn(r1, r2);
#endif #endif
#endif
} }
template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
#if __CUDA_ARCH__ >= 530 #if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hneg2(a);
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
return __hneg2(a); return __hneg2(a);
#else #else
float a1 = __low2float(a); float a1 = __low2float(a);
float a2 = __high2float(a); float a2 = __high2float(a);
return __floats2half2_rn(-a1, -a2); return __floats2half2_rn(-a1, -a2);
#endif #endif
#endif
} }
template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530 #if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hmul2(a, b);
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
return __hmul2(a, b); return __hmul2(a, b);
#else #else
float a1 = __low2float(a); float a1 = __low2float(a);
@ -176,10 +253,18 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, cons
float r2 = a2 * b2; float r2 = a2 * b2;
return __floats2half2_rn(r1, r2); return __floats2half2_rn(r1, r2);
#endif #endif
#endif
} }
template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
#if __CUDA_ARCH__ >= 530 #if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hfma2(a, b, c);
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
return __hfma2(a, b, c); return __hfma2(a, b, c);
#else #else
float a1 = __low2float(a); float a1 = __low2float(a);
@ -192,9 +277,21 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, con
float r2 = a2 * b2 + c2; float r2 = a2 * b2 + c2;
return __floats2half2_rn(r1, r2); return __floats2half2_rn(r1, r2);
#endif #endif
#endif
} }
template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
#if defined(EIGEN_HAS_OLD_HIP_FP16)
return h2div(a, b);
#else
return __h2div(a, b);
#endif
#else // EIGEN_CUDA_ARCH
float a1 = __low2float(a); float a1 = __low2float(a);
float a2 = __high2float(a); float a2 = __high2float(a);
float b1 = __low2float(b); float b1 = __low2float(b);
@ -202,9 +299,11 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, cons
float r1 = a1 / b1; float r1 = a1 / b1;
float r2 = a2 / b2; float r2 = a2 / b2;
return __floats2half2_rn(r1, r2); return __floats2half2_rn(r1, r2);
#endif
} }
template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
float a1 = __low2float(a); float a1 = __low2float(a);
float a2 = __high2float(a); float a2 = __high2float(a);
float b1 = __low2float(b); float b1 = __low2float(b);
@ -214,7 +313,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, cons
return __halves2half2(r1, r2); return __halves2half2(r1, r2);
} }
template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
float a1 = __low2float(a); float a1 = __low2float(a);
float a2 = __high2float(a); float a2 = __high2float(a);
float b1 = __low2float(b); float b1 = __low2float(b);
@ -224,18 +323,34 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, cons
return __halves2half2(r1, r2); return __halves2half2(r1, r2);
} }
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530 #if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hadd(__low2half(a), __high2half(a));
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
return __hadd(__low2half(a), __high2half(a)); return __hadd(__low2half(a), __high2half(a));
#else #else
float a1 = __low2float(a); float a1 = __low2float(a);
float a2 = __high2float(a); float a2 = __high2float(a);
return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2))); return Eigen::half(__float2half(a1 + a2));
#endif
#endif #endif
} }
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530 #if defined(EIGEN_HIP_DEVICE_COMPILE)
__half first = __low2half(a);
__half second = __high2half(a);
return __hgt(first, second) ? first : second;
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
__half first = __low2half(a); __half first = __low2half(a);
__half second = __high2half(a); __half second = __high2half(a);
return __hgt(first, second) ? first : second; return __hgt(first, second) ? first : second;
@ -244,10 +359,20 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const ha
float a2 = __high2float(a); float a2 = __high2float(a);
return a1 > a2 ? __low2half(a) : __high2half(a); return a1 > a2 ? __low2half(a) : __high2half(a);
#endif #endif
#endif
} }
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530 #if defined(EIGEN_HIP_DEVICE_COMPILE)
__half first = __low2half(a);
__half second = __high2half(a);
return __hlt(first, second) ? first : second;
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
__half first = __low2half(a); __half first = __low2half(a);
__half second = __high2half(a); __half second = __high2half(a);
return __hlt(first, second) ? first : second; return __hlt(first, second) ? first : second;
@ -256,19 +381,29 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const ha
float a2 = __high2float(a); float a2 = __high2float(a);
return a1 < a2 ? __low2half(a) : __high2half(a); return a1 < a2 ? __low2half(a) : __high2half(a);
#endif #endif
#endif
} }
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530 #if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hmul(__low2half(a), __high2half(a));
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
return __hmul(__low2half(a), __high2half(a)); return __hmul(__low2half(a), __high2half(a));
#else #else
float a1 = __low2float(a); float a1 = __low2float(a);
float a2 = __high2float(a); float a2 = __high2float(a);
return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2))); return Eigen::half(__float2half(a1 * a2));
#endif
#endif #endif
} }
template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
float a1 = __low2float(a); float a1 = __low2float(a);
float a2 = __high2float(a); float a2 = __high2float(a);
float r1 = log1pf(a1); float r1 = log1pf(a1);
@ -276,7 +411,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
return __floats2half2_rn(r1, r2); return __floats2half2_rn(r1, r2);
} }
template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
float a1 = __low2float(a); float a1 = __low2float(a);
float a2 = __high2float(a); float a2 = __high2float(a);
float r1 = expm1f(a1); float r1 = expm1f(a1);
@ -284,31 +419,32 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
return __floats2half2_rn(r1, r2); return __floats2half2_rn(r1, r2);
} }
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 #if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
template<> __device__ EIGEN_STRONG_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
half2 plog<half2>(const half2& a) { half2 plog<half2>(const half2& a) {
return h2log(a); return h2log(a);
} }
template<> __device__ EIGEN_STRONG_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
half2 pexp<half2>(const half2& a) { half2 pexp<half2>(const half2& a) {
return h2exp(a); return h2exp(a);
} }
template<> __device__ EIGEN_STRONG_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
half2 psqrt<half2>(const half2& a) { half2 psqrt<half2>(const half2& a) {
return h2sqrt(a); return h2sqrt(a);
} }
template<> __device__ EIGEN_STRONG_INLINE template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
half2 prsqrt<half2>(const half2& a) { half2 prsqrt<half2>(const half2& a) {
return h2rsqrt(a); return h2rsqrt(a);
} }
#else #else
template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
float a1 = __low2float(a); float a1 = __low2float(a);
float a2 = __high2float(a); float a2 = __high2float(a);
float r1 = logf(a1); float r1 = logf(a1);
@ -316,7 +452,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
return __floats2half2_rn(r1, r2); return __floats2half2_rn(r1, r2);
} }
template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
float a1 = __low2float(a); float a1 = __low2float(a);
float a2 = __high2float(a); float a2 = __high2float(a);
float r1 = expf(a1); float r1 = expf(a1);
@ -324,7 +460,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
return __floats2half2_rn(r1, r2); return __floats2half2_rn(r1, r2);
} }
template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
float a1 = __low2float(a); float a1 = __low2float(a);
float a2 = __high2float(a); float a2 = __high2float(a);
float r1 = sqrtf(a1); float r1 = sqrtf(a1);
@ -332,7 +468,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
return __floats2half2_rn(r1, r2); return __floats2half2_rn(r1, r2);
} }
template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
float a1 = __low2float(a); float a1 = __low2float(a);
float a2 = __high2float(a); float a2 = __high2float(a);
float r1 = rsqrtf(a1); float r1 = rsqrtf(a1);
@ -361,10 +497,10 @@ struct packet_traits<half> : default_packet_traits {
AlignedOnScalar = 1, AlignedOnScalar = 1,
size = 16, size = 16,
HasHalfPacket = 0, HasHalfPacket = 0,
HasAdd = 0, HasAdd = 1,
HasSub = 0, HasSub = 1,
HasMul = 0, HasMul = 1,
HasNegate = 0, HasNegate = 1,
HasAbs = 0, HasAbs = 0,
HasAbs2 = 0, HasAbs2 = 0,
HasMin = 0, HasMin = 0,
@ -406,11 +542,30 @@ template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* fr
} }
template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) { template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
_mm256_store_si256((__m256i*)to, from.x); // (void*) -> workaround clang warning:
// cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
_mm256_store_si256((__m256i*)(void*)to, from.x);
} }
template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) { template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
_mm256_storeu_si256((__m256i*)to, from.x); // (void*) -> workaround clang warning:
// cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
_mm256_storeu_si256((__m256i*)(void*)to, from.x);
}
template<> EIGEN_STRONG_INLINE Packet16h
ploaddup<Packet16h>(const Eigen::half* from) {
Packet16h result;
unsigned short a = from[0].x;
unsigned short b = from[1].x;
unsigned short c = from[2].x;
unsigned short d = from[3].x;
unsigned short e = from[4].x;
unsigned short f = from[5].x;
unsigned short g = from[6].x;
unsigned short h = from[7].x;
result.x = _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
return result;
} }
template<> EIGEN_STRONG_INLINE Packet16h template<> EIGEN_STRONG_INLINE Packet16h
@ -485,6 +640,13 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
#endif #endif
} }
template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
// FIXME we could do that with bit manipulation
Packet16f af = half2float(a);
Packet16f rf = pnegate(af);
return float2half(rf);
}
template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) { template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a); Packet16f af = half2float(a);
Packet16f bf = half2float(b); Packet16f bf = half2float(b);
@ -492,6 +654,13 @@ template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, con
return float2half(rf); return float2half(rf);
} }
template<> EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a);
Packet16f bf = half2float(b);
Packet16f rf = psub(af, bf);
return float2half(rf);
}
template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) { template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a); Packet16f af = half2float(a);
Packet16f bf = half2float(b); Packet16f bf = half2float(b);
@ -504,6 +673,57 @@ template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
return half(predux(from_float)); return half(predux(from_float));
} }
template<> EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
Packet16f from_float = half2float(from);
return half(predux_mul(from_float));
}
template<> EIGEN_STRONG_INLINE Packet16h preduxp<Packet16h>(const Packet16h* p) {
Packet16f pf[16];
pf[0] = half2float(p[0]);
pf[1] = half2float(p[1]);
pf[2] = half2float(p[2]);
pf[3] = half2float(p[3]);
pf[4] = half2float(p[4]);
pf[5] = half2float(p[5]);
pf[6] = half2float(p[6]);
pf[7] = half2float(p[7]);
pf[8] = half2float(p[8]);
pf[9] = half2float(p[9]);
pf[10] = half2float(p[10]);
pf[11] = half2float(p[11]);
pf[12] = half2float(p[12]);
pf[13] = half2float(p[13]);
pf[14] = half2float(p[14]);
pf[15] = half2float(p[15]);
Packet16f reduced = preduxp<Packet16f>(pf);
return float2half(reduced);
}
template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a)
{
__m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
Packet16h res;
res.x = _mm256_insertf128_si256(
_mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)),
_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), 1);
return res;
}
template<> EIGEN_STRONG_INLINE Packet16h pinsertfirst(const Packet16h& a, Eigen::half b)
{
Packet16h res;
res.x = _mm256_insert_epi16(a.x,b.x,0);
return res;
}
template<> EIGEN_STRONG_INLINE Packet16h pinsertlast(const Packet16h& a, Eigen::half b)
{
Packet16h res;
res.x = _mm256_insert_epi16(a.x,b.x,15);
return res;
}
template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride) template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
{ {
Packet16h result; Packet16h result;
@ -611,20 +831,20 @@ ptranspose(PacketBlock<Packet16h,16>& kernel) {
// NOTE: no unpacklo/hi instr in this case, so using permute instr. // NOTE: no unpacklo/hi instr in this case, so using permute instr.
__m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
__m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
__m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
__m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
__m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
__m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
__m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
__m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
__m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
__m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
__m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
__m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
__m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
__m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
__m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
__m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
kernel.packet[0].x = a_p_0; kernel.packet[0].x = a_p_0;
@ -729,10 +949,10 @@ struct packet_traits<Eigen::half> : default_packet_traits {
AlignedOnScalar = 1, AlignedOnScalar = 1,
size = 8, size = 8,
HasHalfPacket = 0, HasHalfPacket = 0,
HasAdd = 0, HasAdd = 1,
HasSub = 0, HasSub = 1,
HasMul = 0, HasMul = 1,
HasNegate = 0, HasNegate = 1,
HasAbs = 0, HasAbs = 0,
HasAbs2 = 0, HasAbs2 = 0,
HasMin = 0, HasMin = 0,
@ -781,6 +1001,17 @@ template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const
_mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x); _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x);
} }
template<> EIGEN_STRONG_INLINE Packet8h
ploaddup<Packet8h>(const Eigen::half* from) {
Packet8h result;
unsigned short a = from[0].x;
unsigned short b = from[1].x;
unsigned short c = from[2].x;
unsigned short d = from[3].x;
result.x = _mm_set_epi16(d, d, c, c, b, b, a, a);
return result;
}
template<> EIGEN_STRONG_INLINE Packet8h template<> EIGEN_STRONG_INLINE Packet8h
ploadquad<Packet8h>(const Eigen::half* from) { ploadquad<Packet8h>(const Eigen::half* from) {
Packet8h result; Packet8h result;
@ -834,6 +1065,13 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
// FIXME we could do that with bit manipulation
Packet8f af = half2float(a);
Packet8f rf = pnegate(af);
return float2half(rf);
}
template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) { template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
Packet8f af = half2float(a); Packet8f af = half2float(a);
Packet8f bf = half2float(b); Packet8f bf = half2float(b);
@ -841,6 +1079,13 @@ template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const
return float2half(rf); return float2half(rf);
} }
template<> EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
Packet8f af = half2float(a);
Packet8f bf = half2float(b);
Packet8f rf = psub(af, bf);
return float2half(rf);
}
template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) { template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
Packet8f af = half2float(a); Packet8f af = half2float(a);
Packet8f bf = half2float(b); Packet8f bf = half2float(b);
@ -893,6 +1138,52 @@ template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h&
return Eigen::half(reduced); return Eigen::half(reduced);
} }
template<> EIGEN_STRONG_INLINE Packet8h preduxp<Packet8h>(const Packet8h* p) {
Packet8f pf[8];
pf[0] = half2float(p[0]);
pf[1] = half2float(p[1]);
pf[2] = half2float(p[2]);
pf[3] = half2float(p[3]);
pf[4] = half2float(p[4]);
pf[5] = half2float(p[5]);
pf[6] = half2float(p[6]);
pf[7] = half2float(p[7]);
Packet8f reduced = preduxp<Packet8f>(pf);
return float2half(reduced);
}
template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a)
{
__m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
Packet8h res;
res.x = _mm_shuffle_epi8(a.x,m);
return res;
}
template<> EIGEN_STRONG_INLINE Packet8h pinsertfirst(const Packet8h& a, Eigen::half b)
{
Packet8h res;
res.x = _mm_insert_epi16(a.x,int(b.x),0);
return res;
}
template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::half b)
{
Packet8h res;
res.x = _mm_insert_epi16(a.x,int(b.x),7);
return res;
}
template<int Offset>
struct palign_impl<Offset,Packet8h>
{
static EIGEN_STRONG_INLINE void run(Packet8h& first, const Packet8h& second)
{
if (Offset!=0)
first.x = _mm_alignr_epi8(second.x,first.x, Offset*2);
}
};
EIGEN_STRONG_INLINE void EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet8h,8>& kernel) { ptranspose(PacketBlock<Packet8h,8>& kernel) {
__m128i a = kernel.packet[0].x; __m128i a = kernel.packet[0].x;
@ -1129,4 +1420,4 @@ ptranspose(PacketBlock<Packet4h,4>& kernel) {
} }
} }
#endif // EIGEN_PACKET_MATH_HALF_CUDA_H #endif // EIGEN_PACKET_MATH_HALF_GPU_H

View File

@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed // Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_TYPE_CASTING_CUDA_H #ifndef EIGEN_TYPE_CASTING_GPU_H
#define EIGEN_TYPE_CASTING_CUDA_H #define EIGEN_TYPE_CASTING_GPU_H
namespace Eigen { namespace Eigen {
@ -19,7 +19,8 @@ struct scalar_cast_op<float, Eigen::half> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef Eigen::half result_type; typedef Eigen::half result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __float2half(a); return __float2half(a);
#else #else
return Eigen::half(a); return Eigen::half(a);
@ -37,7 +38,8 @@ struct scalar_cast_op<int, Eigen::half> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef Eigen::half result_type; typedef Eigen::half result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __float2half(static_cast<float>(a)); return __float2half(static_cast<float>(a));
#else #else
return Eigen::half(static_cast<float>(a)); return Eigen::half(static_cast<float>(a));
@ -55,7 +57,8 @@ struct scalar_cast_op<Eigen::half, float> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef float result_type; typedef float result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __half2float(a); return __half2float(a);
#else #else
return static_cast<float>(a); return static_cast<float>(a);
@ -69,7 +72,8 @@ struct functor_traits<scalar_cast_op<Eigen::half, float> >
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
template <> template <>
struct type_casting_traits<Eigen::half, float> { struct type_casting_traits<Eigen::half, float> {
@ -209,4 +213,4 @@ template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f
} // end namespace Eigen } // end namespace Eigen
#endif // EIGEN_TYPE_CASTING_CUDA_H #endif // EIGEN_TYPE_CASTING_GPU_H

View File

@ -0,0 +1,23 @@
/*
* math_constants.h -
* HIP equivalent of the CUDA header of the same name
*/
#ifndef __MATH_CONSTANTS_H__
#define __MATH_CONSTANTS_H__
/* single precision constants */
#define HIPRT_INF_F __int_as_float(0x7f800000)
#define HIPRT_NAN_F __int_as_float(0x7fffffff)
#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001)
#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff)
#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000)
#define HIPRT_ZERO_F 0.0f
#define HIPRT_ONE_F 1.0f
/* double precision constants */
#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000)
#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000)
#endif

View File

@ -0,0 +1,759 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2018 Wave Computing, Inc.
// Written by:
// Chris Larsen
// Alexey Frunze (afrunze@wavecomp.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COMPLEX_MSA_H
#define EIGEN_COMPLEX_MSA_H
#include <iostream>
namespace Eigen {
namespace internal {
//---------- float ----------
struct Packet2cf {
EIGEN_STRONG_INLINE Packet2cf() {
}
EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex<float>& a,
const std::complex<float>& b) {
Packet4f t = { std::real(a), std::imag(a), std::real(b), std::imag(b) };
v = t;
}
EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {
}
EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) {
}
EIGEN_STRONG_INLINE Packet2cf& operator=(const Packet2cf& b) {
v = b.v;
return *this;
}
EIGEN_STRONG_INLINE Packet2cf conjugate(void) const {
return Packet2cf((Packet4f)__builtin_msa_bnegi_d((v2u64)v, 63));
}
EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
Packet4f v1, v2;
// Get the real values of a | a1_re | a1_re | a2_re | a2_re |
v1 = (Packet4f)__builtin_msa_ilvev_w((v4i32)v, (v4i32)v);
// Get the imag values of a | a1_im | a1_im | a2_im | a2_im |
v2 = (Packet4f)__builtin_msa_ilvod_w((v4i32)v, (v4i32)v);
// Multiply the real a with b
v1 = pmul(v1, b.v);
// Multiply the imag a with b
v2 = pmul(v2, b.v);
// Conjugate v2
v2 = Packet2cf(v2).conjugate().v;
// Swap real/imag elements in v2.
v2 = (Packet4f)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(1, 0, 3, 2));
// Add and return the result
v = padd(v1, v2);
return *this;
}
EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
return Packet2cf(*this) *= b;
}
EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
v = padd(v, b.v);
return *this;
}
EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
return Packet2cf(*this) += b;
}
EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
v = psub(v, b.v);
return *this;
}
EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
return Packet2cf(*this) -= b;
}
EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) {
*this *= b.conjugate();
Packet4f s = pmul<Packet4f>(b.v, b.v);
s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
v = pdiv(v, s);
return *this;
}
EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const {
return Packet2cf(*this) /= b;
}
EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
return Packet2cf(pnegate(v));
}
Packet4f v;
};
inline std::ostream& operator<<(std::ostream& os, const Packet2cf& value) {
os << "[ (" << value.v[0] << ", " << value.v[1]
<< "i),"
" ("
<< value.v[2] << ", " << value.v[3] << "i) ]";
return os;
}
template <>
struct packet_traits<std::complex<float> > : default_packet_traits {
typedef Packet2cf type;
typedef Packet2cf half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 2,
HasHalfPacket = 0,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0,
HasBlend = 1
};
};
template <>
struct unpacket_traits<Packet2cf> {
typedef std::complex<float> type;
enum { size = 2, alignment = Aligned16 };
typedef Packet2cf half;
};
template <>
EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
EIGEN_MSA_DEBUG;
float f0 = from.real(), f1 = from.imag();
Packet4f v0 = { f0, f0, f0, f0 };
Packet4f v1 = { f1, f1, f1, f1 };
return Packet2cf((Packet4f)__builtin_msa_ilvr_w((Packet4i)v1, (Packet4i)v0));
}
template <>
EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
EIGEN_MSA_DEBUG;
return a + b;
}
template <>
EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
EIGEN_MSA_DEBUG;
return a - b;
}
template <>
EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
EIGEN_MSA_DEBUG;
return -a;
}
template <>
EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
EIGEN_MSA_DEBUG;
return a.conjugate();
}
template <>
EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
EIGEN_MSA_DEBUG;
return a * b;
}
template <>
EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
EIGEN_MSA_DEBUG;
return Packet2cf(pand(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
EIGEN_MSA_DEBUG;
return Packet2cf(por(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
EIGEN_MSA_DEBUG;
return Packet2cf(pxor(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
EIGEN_MSA_DEBUG;
return Packet2cf(pandnot(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from));
}
template <>
EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from));
}
template <>
EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
EIGEN_MSA_DEBUG;
return pset1<Packet2cf>(*from);
}
template <>
EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to,
const Packet2cf& from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_ALIGNED_STORE pstore<float>((float*)to, from.v);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to,
const Packet2cf& from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_UNALIGNED_STORE pstoreu<float>((float*)to, from.v);
}
template <>
EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(
const std::complex<float>* from, Index stride) {
EIGEN_MSA_DEBUG;
return Packet2cf(from[0 * stride], from[1 * stride]);
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to,
const Packet2cf& from,
Index stride) {
EIGEN_MSA_DEBUG;
*to = std::complex<float>(from.v[0], from.v[1]);
to += stride;
*to = std::complex<float>(from.v[2], from.v[3]);
}
template <>
EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
EIGEN_MSA_DEBUG;
prefetch(reinterpret_cast<const float*>(addr));
}
template <>
EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
EIGEN_MSA_DEBUG;
return std::complex<float>(a.v[0], a.v[1]);
}
template <>
EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
EIGEN_MSA_DEBUG;
return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
}
template <>
EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
EIGEN_MSA_DEBUG;
return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
}
template <>
EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
EIGEN_MSA_DEBUG;
Packet4f value = (Packet4f)preverse((Packet2d)a.v);
value += a.v;
return std::complex<float>(value[0], value[1]);
}
template <>
EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs) {
EIGEN_MSA_DEBUG;
Packet4f sum1, sum2, sum;
// Add the first two 64-bit float32x2_t of vecs[0]
sum1 = (Packet4f)__builtin_msa_ilvr_d((v2i64)vecs[1].v, (v2i64)vecs[0].v);
sum2 = (Packet4f)__builtin_msa_ilvl_d((v2i64)vecs[1].v, (v2i64)vecs[0].v);
sum = padd(sum1, sum2);
return Packet2cf(sum);
}
template <>
EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
EIGEN_MSA_DEBUG;
return std::complex<float>((a.v[0] * a.v[2]) - (a.v[1] * a.v[3]),
(a.v[0] * a.v[3]) + (a.v[1] * a.v[2]));
}
template <int Offset>
struct palign_impl<Offset, Packet2cf> {
EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) {
if (Offset == 1) {
first.v = (Packet4f)__builtin_msa_sldi_b((v16i8)second.v, (v16i8)first.v, Offset * 8);
}
}
};
template <>
struct conj_helper<Packet2cf, Packet2cf, false, true> {
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
const Packet2cf& c) const {
return padd(pmul(x, y), c);
}
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
return internal::pmul(a, pconj(b));
}
};
template <>
struct conj_helper<Packet2cf, Packet2cf, true, false> {
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
const Packet2cf& c) const {
return padd(pmul(x, y), c);
}
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
return internal::pmul(pconj(a), b);
}
};
template <>
struct conj_helper<Packet2cf, Packet2cf, true, true> {
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
const Packet2cf& c) const {
return padd(pmul(x, y), c);
}
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
return pconj(internal::pmul(a, b));
}
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
template <>
EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
EIGEN_MSA_DEBUG;
return a / b;
}
inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2cf, 2>& value) {
os << "[ " << value.packet[0] << ", " << std::endl << " " << value.packet[1] << " ]";
return os;
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
EIGEN_MSA_DEBUG;
Packet4f tmp =
(Packet4f)__builtin_msa_ilvl_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
kernel.packet[0].v =
(Packet4f)__builtin_msa_ilvr_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
kernel.packet[1].v = tmp;
}
template <>
EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
const Packet2cf& elsePacket) {
return (Packet2cf)(Packet4f)pblend<Packet2d>(ifPacket, (Packet2d)thenPacket.v,
(Packet2d)elsePacket.v);
}
//---------- double ----------
struct Packet1cd {
EIGEN_STRONG_INLINE Packet1cd() {
}
EIGEN_STRONG_INLINE explicit Packet1cd(const std::complex<double>& a) {
v[0] = std::real(a);
v[1] = std::imag(a);
}
EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {
}
EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) {
}
EIGEN_STRONG_INLINE Packet1cd& operator=(const Packet1cd& b) {
v = b.v;
return *this;
}
EIGEN_STRONG_INLINE Packet1cd conjugate(void) const {
static const v2u64 p2ul_CONJ_XOR = { 0x0, 0x8000000000000000 };
return (Packet1cd)pxor(v, (Packet2d)p2ul_CONJ_XOR);
}
EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
Packet2d v1, v2;
// Get the real values of a | a1_re | a1_re
v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)v, (v2i64)v);
// Get the imag values of a | a1_im | a1_im
v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)v, (v2i64)v);
// Multiply the real a with b
v1 = pmul(v1, b.v);
// Multiply the imag a with b
v2 = pmul(v2, b.v);
// Conjugate v2
v2 = Packet1cd(v2).conjugate().v;
// Swap real/imag elements in v2.
v2 = (Packet2d)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
// Add and return the result
v = padd(v1, v2);
return *this;
}
EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
return Packet1cd(*this) *= b;
}
EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
v = padd(v, b.v);
return *this;
}
EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
return Packet1cd(*this) += b;
}
EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
v = psub(v, b.v);
return *this;
}
EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
return Packet1cd(*this) -= b;
}
EIGEN_STRONG_INLINE Packet1cd& operator/=(const Packet1cd& b) {
*this *= b.conjugate();
Packet2d s = pmul<Packet2d>(b.v, b.v);
s = padd(s, preverse<Packet2d>(s));
v = pdiv(v, s);
return *this;
}
EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const {
return Packet1cd(*this) /= b;
}
EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
return Packet1cd(pnegate(v));
}
Packet2d v;
};
inline std::ostream& operator<<(std::ostream& os, const Packet1cd& value) {
os << "[ (" << value.v[0] << ", " << value.v[1] << "i) ]";
return os;
}
template <>
struct packet_traits<std::complex<double> > : default_packet_traits {
typedef Packet1cd type;
typedef Packet1cd half;
enum {
Vectorizable = 1,
AlignedOnScalar = 0,
size = 1,
HasHalfPacket = 0,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0
};
};
template <>
struct unpacket_traits<Packet1cd> {
typedef std::complex<double> type;
enum { size = 1, alignment = Aligned16 };
typedef Packet1cd half;
};
template <>
EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
}
template <>
EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from));
}
template <>
EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) {
EIGEN_MSA_DEBUG;
return Packet1cd(from);
}
template <>
EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
EIGEN_MSA_DEBUG;
return a + b;
}
template <>
EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
EIGEN_MSA_DEBUG;
return a - b;
}
template <>
EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
EIGEN_MSA_DEBUG;
return -a;
}
template <>
EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
EIGEN_MSA_DEBUG;
return a.conjugate();
}
template <>
EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
EIGEN_MSA_DEBUG;
return a * b;
}
template <>
EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
EIGEN_MSA_DEBUG;
return Packet1cd(pand(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
EIGEN_MSA_DEBUG;
return Packet1cd(por(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
EIGEN_MSA_DEBUG;
return Packet1cd(pxor(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
EIGEN_MSA_DEBUG;
return Packet1cd(pandnot(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
EIGEN_MSA_DEBUG;
return pset1<Packet1cd>(*from);
}
template <>
EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to,
const Packet1cd& from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_ALIGNED_STORE pstore<double>((double*)to, from.v);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to,
const Packet1cd& from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_UNALIGNED_STORE pstoreu<double>((double*)to, from.v);
}
template <>
EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
EIGEN_MSA_DEBUG;
prefetch(reinterpret_cast<const double*>(addr));
}
template <>
EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(
const std::complex<double>* from, Index stride __attribute__((unused))) {
EIGEN_MSA_DEBUG;
Packet1cd res;
res.v[0] = std::real(from[0]);
res.v[1] = std::imag(from[0]);
return res;
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to,
const Packet1cd& from,
Index stride
__attribute__((unused))) {
EIGEN_MSA_DEBUG;
pstore(to, from);
}
template <>
EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
EIGEN_MSA_DEBUG;
return std::complex<double>(a.v[0], a.v[1]);
}
template <>
EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
EIGEN_MSA_DEBUG;
return a;
}
template <>
EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
EIGEN_MSA_DEBUG;
return pfirst(a);
}
template <>
EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) {
EIGEN_MSA_DEBUG;
return vecs[0];
}
template <>
EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
EIGEN_MSA_DEBUG;
return pfirst(a);
}
template <int Offset>
struct palign_impl<Offset, Packet1cd> {
static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) {
// FIXME is it sure we never have to align a Packet1cd?
// Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes
// boundary...
}
};
template <>
struct conj_helper<Packet1cd, Packet1cd, false, true> {
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
const Packet1cd& c) const {
return padd(pmul(x, y), c);
}
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
return internal::pmul(a, pconj(b));
}
};
template <>
struct conj_helper<Packet1cd, Packet1cd, true, false> {
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
const Packet1cd& c) const {
return padd(pmul(x, y), c);
}
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
return internal::pmul(pconj(a), b);
}
};
template <>
struct conj_helper<Packet1cd, Packet1cd, true, true> {
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
const Packet1cd& c) const {
return padd(pmul(x, y), c);
}
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
return pconj(internal::pmul(a, b));
}
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
template <>
EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
EIGEN_MSA_DEBUG;
return a / b;
}
EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
EIGEN_MSA_DEBUG;
return Packet1cd(preverse(Packet2d(x.v)));
}
inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet1cd, 2>& value) {
os << "[ " << value.packet[0] << ", " << std::endl << " " << value.packet[1] << " ]";
return os;
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
EIGEN_MSA_DEBUG;
Packet2d v1, v2;
v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
// Get the imag values of a
v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
kernel.packet[0].v = v1;
kernel.packet[1].v = v2;
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_COMPLEX_MSA_H

View File

@ -0,0 +1,387 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2007 Julien Pommier
// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// Copyright (C) 2018 Wave Computing, Inc.
// Written by:
// Chris Larsen
// Alexey Frunze (afrunze@wavecomp.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/* The sin, cos, exp, and log functions of this file come from
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
*/
/* The tanh function of this file is an adaptation of
* template<typename T> T generic_fast_tanh_float(const T&)
* from MathFunctionsImpl.h.
*/
#ifndef EIGEN_MATH_FUNCTIONS_MSA_H
#define EIGEN_MATH_FUNCTIONS_MSA_H
namespace Eigen {
namespace internal {
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
plog<Packet4f>(const Packet4f& _x) {
static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
// Convert negative argument into NAN (quiet negative, to be specific).
Packet4f zero = (Packet4f)__builtin_msa_ldi_w(0);
Packet4i neg_mask = __builtin_msa_fclt_w(_x, zero);
Packet4i zero_mask = __builtin_msa_fceq_w(_x, zero);
Packet4f non_neg_x_or_nan = padd(_x, (Packet4f)neg_mask); // Add 0.0 or NAN.
Packet4f x = non_neg_x_or_nan;
// Extract exponent from x = mantissa * 2**exponent, where 1.0 <= mantissa < 2.0.
// N.B. the exponent is one less of what frexpf() would return.
Packet4i e_int = __builtin_msa_ftint_s_w(__builtin_msa_flog2_w(x));
// Multiply x by 2**(-exponent-1) to get 0.5 <= x < 1.0 as from frexpf().
x = __builtin_msa_fexp2_w(x, (Packet4i)__builtin_msa_nori_b((v16u8)e_int, 0));
/*
if (x < SQRTHF) {
x = x + x - 1.0;
} else {
e += 1;
x = x - 1.0;
}
*/
Packet4f xx = padd(x, x);
Packet4i ge_mask = __builtin_msa_fcle_w(p4f_cephes_SQRTHF, x);
e_int = psub(e_int, ge_mask);
x = (Packet4f)__builtin_msa_bsel_v((v16u8)ge_mask, (v16u8)xx, (v16u8)x);
x = psub(x, p4f_1);
Packet4f e = __builtin_msa_ffint_s_w(e_int);
Packet4f x2 = pmul(x, x);
Packet4f x3 = pmul(x2, x);
Packet4f y, y1, y2;
y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
y = pmadd(y, x, p4f_cephes_log_p2);
y1 = pmadd(y1, x, p4f_cephes_log_p5);
y2 = pmadd(y2, x, p4f_cephes_log_p8);
y = pmadd(y, x3, y1);
y = pmadd(y, x3, y2);
y = pmul(y, x3);
y = pmadd(e, p4f_cephes_log_q1, y);
x = __builtin_msa_fmsub_w(x, x2, p4f_half);
x = padd(x, y);
x = pmadd(e, p4f_cephes_log_q2, x);
// x is now the logarithm result candidate. We still need to handle the
// extreme arguments of zero and positive infinity, though.
// N.B. if the argument is +INFINITY, x is NAN because the polynomial terms
// contain infinities of both signs (see the coefficients and code above).
// INFINITY - INFINITY is NAN.
// If the argument is +INFINITY, make it the new result candidate.
// To achieve that we choose the smaller of the result candidate and the
// argument.
// This is correct for all finite pairs of values (the logarithm is smaller
// than the argument).
// This is also correct in the special case when the argument is +INFINITY
// and the result candidate is NAN. This is because the fmin.df instruction
// prefers non-NANs to NANs.
x = __builtin_msa_fmin_w(x, non_neg_x_or_nan);
// If the argument is zero (including -0.0), the result becomes -INFINITY.
Packet4i neg_infs = __builtin_msa_slli_w(zero_mask, 23);
x = (Packet4f)__builtin_msa_bsel_v((v16u8)zero_mask, (v16u8)x, (v16u8)neg_infs);
return x;
}
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
pexp<Packet4f>(const Packet4f& _x) {
// Limiting single-precision pexp's argument to [-128, +128] lets pexp
// reach 0 and INFINITY naturally.
static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f);
static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, +128.0f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500e-4f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507e-3f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073e-3f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894e-2f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
Packet4f x = _x;
// Clamp x.
x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x,
(v16u8)p4f_exp_lo);
x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x,
(v16u8)p4f_exp_hi);
// Round to nearest integer by adding 0.5 (with x's sign) and truncating.
Packet4f x2_add = (Packet4f)__builtin_msa_binsli_w((v4u32)p4f_half, (v4u32)x, 0);
Packet4f x2 = pmadd(x, p4f_cephes_LOG2EF, x2_add);
Packet4i x2_int = __builtin_msa_ftrunc_s_w(x2);
Packet4f x2_int_f = __builtin_msa_ffint_s_w(x2_int);
x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C1);
x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C2);
Packet4f z = pmul(x, x);
Packet4f y = p4f_cephes_exp_p0;
y = pmadd(y, x, p4f_cephes_exp_p1);
y = pmadd(y, x, p4f_cephes_exp_p2);
y = pmadd(y, x, p4f_cephes_exp_p3);
y = pmadd(y, x, p4f_cephes_exp_p4);
y = pmadd(y, x, p4f_cephes_exp_p5);
y = pmadd(y, z, x);
y = padd(y, p4f_1);
// y *= 2**exponent.
y = __builtin_msa_fexp2_w(y, x2_int);
return y;
}
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
ptanh<Packet4f>(const Packet4f& _x) {
static _EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 1e-4f);
static _EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f);
// The monomial coefficients of the numerator polynomial (odd).
static _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-3f);
static _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-4f);
static _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-5f);
static _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-8f);
static _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f);
static _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f);
static _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f);
// The monomial coefficients of the denominator polynomial (even).
static _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-3f);
static _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-3f);
static _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-4f);
static _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-6f);
Packet4f x = pabs(_x);
Packet4i tiny_mask = __builtin_msa_fclt_w(x, p4f_tanh_tiny);
// Clamp the inputs to the range [-9, 9] since anything outside
// this range is -/+1.0f in single-precision.
x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x,
(v16u8)p4f_tanh_hi);
// Since the polynomials are odd/even, we need x**2.
Packet4f x2 = pmul(x, x);
// Evaluate the numerator polynomial p.
Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11);
p = pmadd(x2, p, p4f_alpha_9);
p = pmadd(x2, p, p4f_alpha_7);
p = pmadd(x2, p, p4f_alpha_5);
p = pmadd(x2, p, p4f_alpha_3);
p = pmadd(x2, p, p4f_alpha_1);
p = pmul(x, p);
// Evaluate the denominator polynomial q.
Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4);
q = pmadd(x2, q, p4f_beta_2);
q = pmadd(x2, q, p4f_beta_0);
// Divide the numerator by the denominator.
p = pdiv(p, q);
// Reinstate the sign.
p = (Packet4f)__builtin_msa_binsli_w((v4u32)p, (v4u32)_x, 0);
// When the argument is very small in magnitude it's more accurate to just return it.
p = (Packet4f)__builtin_msa_bsel_v((v16u8)tiny_mask, (v16u8)p, (v16u8)_x);
return p;
}
template <bool sine>
Packet4f psincos_inner_msa_float(const Packet4f& _x) {
static _EIGEN_DECLARE_CONST_Packet4f(sincos_max_arg, 13176795.0f); // Approx. (2**24) / (4/Pi).
static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f);
static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
static _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891e-4f);
static _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736e-3f);
static _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948e-5f);
static _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765e-3f);
static _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827e-2f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4/Pi.
static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
Packet4f x = pabs(_x);
// Translate infinite arguments into NANs.
Packet4f zero_or_nan_if_inf = psub(_x, _x);
x = padd(x, zero_or_nan_if_inf);
// Prevent sin/cos from generating values larger than 1.0 in magnitude
// for very large arguments by setting x to 0.0.
Packet4i small_or_nan_mask = __builtin_msa_fcult_w(x, p4f_sincos_max_arg);
x = pand(x, (Packet4f)small_or_nan_mask);
// Scale x by 4/Pi to find x's octant.
Packet4f y = pmul(x, p4f_cephes_FOPI);
// Get the octant. We'll reduce x by this number of octants or by one more than it.
Packet4i y_int = __builtin_msa_ftrunc_s_w(y);
// x's from even-numbered octants will translate to octant 0: [0, +Pi/4].
// x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
// Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1);
Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0);
y = __builtin_msa_ffint_s_w(y_int2);
// Compute the sign to apply to the polynomial.
Packet4i sign_mask = sine ? pxor(__builtin_msa_slli_w(y_int1, 29), (Packet4i)_x)
: __builtin_msa_slli_w(__builtin_msa_addvi_w(y_int, 3), 29);
// Get the polynomial selection mask.
// We'll calculate both (sin and cos) polynomials and then select from the two.
Packet4i poly_mask = __builtin_msa_ceqi_w(__builtin_msa_slli_w(y_int2, 30), 0);
// Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4.
// The magic pass: "Extended precision modular arithmetic"
// x = ((x - y * DP1) - y * DP2) - y * DP3
Packet4f tmp1 = pmul(y, p4f_minus_cephes_DP1);
Packet4f tmp2 = pmul(y, p4f_minus_cephes_DP2);
Packet4f tmp3 = pmul(y, p4f_minus_cephes_DP3);
x = padd(x, tmp1);
x = padd(x, tmp2);
x = padd(x, tmp3);
// Evaluate the cos(x) polynomial.
y = p4f_coscof_p0;
Packet4f z = pmul(x, x);
y = pmadd(y, z, p4f_coscof_p1);
y = pmadd(y, z, p4f_coscof_p2);
y = pmul(y, z);
y = pmul(y, z);
y = __builtin_msa_fmsub_w(y, z, p4f_half);
y = padd(y, p4f_1);
// Evaluate the sin(x) polynomial.
Packet4f y2 = p4f_sincof_p0;
y2 = pmadd(y2, z, p4f_sincof_p1);
y2 = pmadd(y2, z, p4f_sincof_p2);
y2 = pmul(y2, z);
y2 = pmadd(y2, x, x);
// Select the correct result from the two polynomials.
y = sine ? (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y, (v16u8)y2)
: (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y2, (v16u8)y);
// Update the sign.
sign_mask = pxor(sign_mask, (Packet4i)y);
y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0);
return y;
}
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
psin<Packet4f>(const Packet4f& x) {
return psincos_inner_msa_float</* sine */ true>(x);
}
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
pcos<Packet4f>(const Packet4f& x) {
return psincos_inner_msa_float</* sine */ false>(x);
}
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d
pexp<Packet2d>(const Packet2d& _x) {
// Limiting double-precision pexp's argument to [-1024, +1024] lets pexp
// reach 0 and INFINITY naturally.
static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0);
static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, +1024.0);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
static _EIGEN_DECLARE_CONST_Packet2d(1, 1.0);
static _EIGEN_DECLARE_CONST_Packet2d(2, 2.0);
Packet2d x = _x;
// Clamp x.
x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(x, p2d_exp_lo), (v16u8)x,
(v16u8)p2d_exp_lo);
x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(p2d_exp_hi, x), (v16u8)x,
(v16u8)p2d_exp_hi);
// Round to nearest integer by adding 0.5 (with x's sign) and truncating.
Packet2d x2_add = (Packet2d)__builtin_msa_binsli_d((v2u64)p2d_half, (v2u64)x, 0);
Packet2d x2 = pmadd(x, p2d_cephes_LOG2EF, x2_add);
Packet2l x2_long = __builtin_msa_ftrunc_s_d(x2);
Packet2d x2_long_d = __builtin_msa_ffint_s_d(x2_long);
x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C1);
x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C2);
x2 = pmul(x, x);
Packet2d px = p2d_cephes_exp_p0;
px = pmadd(px, x2, p2d_cephes_exp_p1);
px = pmadd(px, x2, p2d_cephes_exp_p2);
px = pmul(px, x);
Packet2d qx = p2d_cephes_exp_q0;
qx = pmadd(qx, x2, p2d_cephes_exp_q1);
qx = pmadd(qx, x2, p2d_cephes_exp_q2);
qx = pmadd(qx, x2, p2d_cephes_exp_q3);
x = pdiv(px, psub(qx, px));
x = pmadd(p2d_2, x, p2d_1);
// x *= 2**exponent.
x = __builtin_msa_fexp2_d(x, x2_long);
return x;
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_MATH_FUNCTIONS_MSA_H

File diff suppressed because it is too large Load Diff

View File

@ -67,7 +67,7 @@ template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type;
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{ {
float32x2_t r64; float32x2_t r64;
r64 = vld1_f32((float *)&from); r64 = vld1_f32((const float *)&from);
return Packet2cf(vcombine_f32(r64, r64)); return Packet2cf(vcombine_f32(r64, r64));
} }
@ -142,7 +142,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3)); to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
} }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((float *)addr); } template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((const float *)addr); }
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{ {
@ -265,6 +265,8 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
} }
}; };
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{ {
// TODO optimize it for NEON // TODO optimize it for NEON
@ -275,7 +277,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
s = vmulq_f32(b.v, b.v); s = vmulq_f32(b.v, b.v);
rev_s = vrev64q_f32(s); rev_s = vrev64q_f32(s);
return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); return Packet2cf(pdiv<Packet4f>(res.v, vaddq_f32(s,rev_s)));
} }
EIGEN_DEVICE_FUNC inline void EIGEN_DEVICE_FUNC inline void
@ -381,7 +383,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((double *)addr); } template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((const double *)addr); }
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride) template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
{ {
@ -456,6 +458,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
} }
}; };
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{ {
// TODO optimize it for NEON // TODO optimize it for NEON

View File

@ -84,6 +84,98 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
return y; return y;
} }
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f plog<Packet4f>(const Packet4f& _x)
{
Packet4f x = _x;
_EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
_EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
_EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
_EIGEN_DECLARE_CONST_Packet4i(inv_mant_mask, ~0x7f800000);
/* natural logarithm computed for 4 simultaneous float
return NaN for x <= 0
*/
_EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
Packet4ui invalid_mask = vcleq_f32(x, vdupq_n_f32(0));
Packet4i ux = vreinterpretq_s32_f32(x);
Packet4i emm0 = vshrq_n_s32(ux, 23);
/* keep only the fractional part */
ux = vandq_s32(ux, p4i_inv_mant_mask);
ux = vorrq_s32(ux, vreinterpretq_s32_f32(p4f_half));
x = vreinterpretq_f32_s32(ux);
emm0 = vsubq_s32(emm0, p4i_0x7f);
Packet4f e = vcvtq_f32_s32(emm0);
e = vaddq_f32(e, p4f_1);
/* part2:
if( x < SQRTHF ) {
e -= 1;
x = x + x - 1.0;
} else { x = x - 1.0; }
*/
Packet4ui mask = vcltq_f32(x, p4f_cephes_SQRTHF);
Packet4f tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
x = vsubq_f32(x, p4f_1);
e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(p4f_1), mask)));
x = vaddq_f32(x, tmp);
Packet4f z = vmulq_f32(x,x);
Packet4f y = p4f_cephes_log_p0;
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_log_p1);
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_log_p2);
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_log_p3);
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_log_p4);
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_log_p5);
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_log_p6);
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_log_p7);
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_log_p8);
y = vmulq_f32(y, x);
y = vmulq_f32(y, z);
tmp = vmulq_f32(e, p4f_cephes_log_q1);
y = vaddq_f32(y, tmp);
tmp = vmulq_f32(z, p4f_half);
y = vsubq_f32(y, tmp);
tmp = vmulq_f32(e, p4f_cephes_log_q2);
x = vaddq_f32(x, y);
x = vaddq_f32(x, tmp);
x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
return x;
}
} // end namespace internal } // end namespace internal
} // end namespace Eigen } // end namespace Eigen

View File

@ -36,12 +36,43 @@ namespace internal {
#endif #endif
#endif #endif
#if EIGEN_COMP_MSVC
// In MSVC's arm_neon.h header file, all NEON vector types
// are aliases to the same underlying type __n128.
// We thus have to wrap them to make them different C++ types.
// (See also bug 1428)
template<typename T,int unique_id>
struct eigen_packet_wrapper
{
operator T&() { return m_val; }
operator const T&() const { return m_val; }
eigen_packet_wrapper() {}
eigen_packet_wrapper(const T &v) : m_val(v) {}
eigen_packet_wrapper& operator=(const T &v) {
m_val = v;
return *this;
}
T m_val;
};
typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
typedef eigen_packet_wrapper<int32x4_t ,2> Packet4i;
typedef eigen_packet_wrapper<int32x2_t ,3> Packet2i;
typedef eigen_packet_wrapper<uint32x4_t ,4> Packet4ui;
#else
typedef float32x2_t Packet2f; typedef float32x2_t Packet2f;
typedef float32x4_t Packet4f; typedef float32x4_t Packet4f;
typedef int32x4_t Packet4i; typedef int32x4_t Packet4i;
typedef int32x2_t Packet2i; typedef int32x2_t Packet2i;
typedef uint32x4_t Packet4ui; typedef uint32x4_t Packet4ui;
#endif // EIGEN_COMP_MSVC
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
const Packet4f p4f_##NAME = pset1<Packet4f>(X) const Packet4f p4f_##NAME = pset1<Packet4f>(X)
@ -51,14 +82,17 @@ typedef uint32x4_t Packet4ui;
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
const Packet4i p4i_##NAME = pset1<Packet4i>(X) const Packet4i p4i_##NAME = pset1<Packet4i>(X)
// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function #if EIGEN_ARCH_ARM64
// which available on LLVM and GCC (at least) // __builtin_prefetch tends to do nothing on ARM64 compilers because the
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC // prefetch instructions there are too detailed for __builtin_prefetch to map
// meaningfully to them.
#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
#define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR); #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#elif defined __pld #elif defined __pld
#define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR) #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
#elif !EIGEN_ARCH_ARM64 #elif EIGEN_ARCH_ARM32
#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( " pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
#else #else
// by default no explicit prefetching // by default no explicit prefetching
#define EIGEN_ARM_PREFETCH(ADDR) #define EIGEN_ARM_PREFETCH(ADDR)
@ -78,7 +112,7 @@ template<> struct packet_traits<float> : default_packet_traits
// FIXME check the Has* // FIXME check the Has*
HasSin = 0, HasSin = 0,
HasCos = 0, HasCos = 0,
HasLog = 0, HasLog = 1,
HasExp = 1, HasExp = 1,
HasSqrt = 0 HasSqrt = 0
}; };
@ -113,7 +147,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from)
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
{ {
const float32_t f[] = {0, 1, 2, 3}; const float f[] = {0, 1, 2, 3};
Packet4f countdown = vld1q_f32(f); Packet4f countdown = vld1q_f32(f);
return vaddq_f32(pset1<Packet4f>(a), countdown); return vaddq_f32(pset1<Packet4f>(a), countdown);
} }

View File

@ -0,0 +1,48 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_TYPE_CASTING_NEON_H
#define EIGEN_TYPE_CASTING_NEON_H
namespace Eigen {
namespace internal {
template <>
struct type_casting_traits<float, int> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template <>
struct type_casting_traits<int, float> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
return vcvtq_s32_f32(a);
}
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
return vcvtq_f32_s32(a);
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_TYPE_CASTING_NEON_H

View File

@ -128,7 +128,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3))); _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
} }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{ {
@ -229,23 +229,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
} }
}; };
template<> struct conj_helper<Packet4f, Packet2cf, false,false> EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
{ return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
};
template<> struct conj_helper<Packet2cf, Packet4f, false,false>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
{ return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{ {
@ -340,7 +324,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); } template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); } template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{ {
@ -430,23 +414,7 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
} }
}; };
template<> struct conj_helper<Packet2d, Packet1cd, false,false> EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
{ return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
};
template<> struct conj_helper<Packet1cd, Packet2d, false,false>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
{ return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{ {

View File

@ -242,7 +242,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x); return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
} }
/* evaluation of 4 sines at onces, using SSE2 intrinsics. /* evaluation of 4 sines at once, using SSE2 intrinsics.
The code is the exact rewriting of the cephes sinf function. The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to Precision is excellent as long as x < 8192 (I did not bother to

View File

@ -461,10 +461,16 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double&
pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
} }
#if EIGEN_COMP_PGI
typedef const void * SsePrefetchPtrType;
#else
typedef const char * SsePrefetchPtrType;
#endif
#ifndef EIGEN_VECTORIZE_AVX #ifndef EIGEN_VECTORIZE_AVX
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif #endif
#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64 #if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
@ -657,7 +663,7 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
// TODO try to call _mm_mul_epu32 directly // TODO try to call _mm_mul_epu32 directly
EIGEN_ALIGN16 int aux[4]; EIGEN_ALIGN16 int aux[4];
pstore(aux, a); pstore(aux, a);
return (aux[0] * aux[1]) * (aux[2] * aux[3]);; return (aux[0] * aux[1]) * (aux[2] * aux[3]);
} }
// min // min
@ -928,4 +934,14 @@ template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, co
} // end namespace Eigen } // end namespace Eigen
#if EIGEN_COMP_PGI
// PGI++ does not define the following intrinsics in C++ mode.
static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); }
static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128d _mm_castps_pd (__m128 x) { return reinterpret_cast<__m128d&>(x); }
static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
#endif
#endif // EIGEN_PACKET_MATH_SSE_H #endif // EIGEN_PACKET_MATH_SSE_H

View File

@ -14,6 +14,7 @@ namespace Eigen {
namespace internal { namespace internal {
#ifndef EIGEN_VECTORIZE_AVX
template <> template <>
struct type_casting_traits<float, int> { struct type_casting_traits<float, int> {
enum { enum {
@ -23,11 +24,6 @@ struct type_casting_traits<float, int> {
}; };
}; };
template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
return _mm_cvttps_epi32(a);
}
template <> template <>
struct type_casting_traits<int, float> { struct type_casting_traits<int, float> {
enum { enum {
@ -37,11 +33,6 @@ struct type_casting_traits<int, float> {
}; };
}; };
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
return _mm_cvtepi32_ps(a);
}
template <> template <>
struct type_casting_traits<double, float> { struct type_casting_traits<double, float> {
enum { enum {
@ -51,10 +42,6 @@ struct type_casting_traits<double, float> {
}; };
}; };
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
}
template <> template <>
struct type_casting_traits<float, double> { struct type_casting_traits<float, double> {
enum { enum {
@ -63,6 +50,19 @@ struct type_casting_traits<float, double> {
TgtCoeffRatio = 2 TgtCoeffRatio = 2
}; };
}; };
#endif
template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
return _mm_cvttps_epi32(a);
}
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
return _mm_cvtepi32_ps(a);
}
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
}
template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) { template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
// Simply discard the second half of the input // Simply discard the second half of the input

View File

@ -0,0 +1,104 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* InteropHeaders.h
*
* \brief:
* InteropHeaders
*
*****************************************************************/
#ifndef EIGEN_INTEROP_HEADERS_SYCL_H
#define EIGEN_INTEROP_HEADERS_SYCL_H
#if defined EIGEN_USE_SYCL
namespace Eigen {
namespace internal {
#define SYCL_PACKET_TRAITS(packet_type, val, unpacket_type, lengths)\
template<> struct packet_traits<unpacket_type> : default_packet_traits\
{\
typedef packet_type type;\
typedef packet_type half;\
enum {\
Vectorizable = 1,\
AlignedOnScalar = 1,\
size=lengths,\
HasHalfPacket = 0,\
HasDiv = 1,\
HasLog = 1,\
HasExp = 1,\
HasSqrt = 1,\
HasRsqrt = 1,\
HasSin = 1,\
HasCos = 1,\
HasTan = 1,\
HasASin = 1,\
HasACos = 1,\
HasATan = 1,\
HasSinh = 1,\
HasCosh = 1,\
HasTanh = 1,\
HasLGamma = 0,\
HasDiGamma = 0,\
HasZeta = 0,\
HasPolygamma = 0,\
HasErf = 0,\
HasErfc = 0,\
HasIGamma = 0,\
HasIGammac = 0,\
HasBetaInc = 0,\
HasBlend = val,\
HasMax=1,\
HasMin=1,\
HasMul=1,\
HasAdd=1,\
HasFloor=1,\
HasRound=1,\
HasLog1p=1,\
HasExpm1=1,\
HasCeil=1,\
};\
};
SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4)
SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4)
SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2)
SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2)
#undef SYCL_PACKET_TRAITS
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#define SYCL_ARITHMETIC(packet_type) template<> struct is_arithmetic<packet_type> { enum { value = true }; };
SYCL_ARITHMETIC(cl::sycl::cl_float4)
SYCL_ARITHMETIC(cl::sycl::cl_double2)
#undef SYCL_ARITHMETIC
#define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)\
template<> struct unpacket_traits<packet_type> {\
typedef unpacket_type type;\
enum {size=lengths, alignment=Aligned16};\
typedef packet_type half;\
};
SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)
SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2)
#undef SYCL_UNPACKET_TRAITS
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_USE_SYCL
#endif // EIGEN_INTEROP_HEADERS_SYCL_H

View File

@ -0,0 +1,221 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* MathFunctions.h
*
* \brief:
* MathFunctions
*
*****************************************************************/
#ifndef EIGEN_MATH_FUNCTIONS_SYCL_H
#define EIGEN_MATH_FUNCTIONS_SYCL_H
namespace Eigen {
namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
//#if defined(__SYCL_DEVICE_ONLY__) && defined(EIGEN_USE_SYCL)
#define SYCL_PLOG(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type plog<packet_type>(const packet_type& a) { return cl::sycl::log(a); }
SYCL_PLOG(cl::sycl::cl_float4)
SYCL_PLOG(cl::sycl::cl_double2)
#undef SYCL_PLOG
#define SYCL_PLOG1P(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type plog1p<packet_type>(const packet_type& a) { return cl::sycl::log1p(a); }
SYCL_PLOG1P(cl::sycl::cl_float4)
SYCL_PLOG1P(cl::sycl::cl_double2)
#undef SYCL_PLOG1P
#define SYCL_PLOG10(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type plog10<packet_type>(const packet_type& a) { return cl::sycl::log10(a); }
SYCL_PLOG10(cl::sycl::cl_float4)
SYCL_PLOG10(cl::sycl::cl_double2)
#undef SYCL_PLOG10
#define SYCL_PEXP(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pexp<packet_type>(const packet_type& a) { return cl::sycl::exp(a); }
SYCL_PEXP(cl::sycl::cl_float4)
SYCL_PEXP(cl::sycl::cl_double2)
#undef SYCL_PEXP
#define SYCL_PEXPM1(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pexpm1<packet_type>(const packet_type& a) { return cl::sycl::expm1(a); }
SYCL_PEXPM1(cl::sycl::cl_float4)
SYCL_PEXPM1(cl::sycl::cl_double2)
#undef SYCL_PEXPM1
#define SYCL_PSQRT(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type psqrt<packet_type>(const packet_type& a) { return cl::sycl::sqrt(a); }
SYCL_PSQRT(cl::sycl::cl_float4)
SYCL_PSQRT(cl::sycl::cl_double2)
#undef SYCL_PSQRT
#define SYCL_PRSQRT(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type prsqrt<packet_type>(const packet_type& a) { return cl::sycl::rsqrt(a); }
SYCL_PRSQRT(cl::sycl::cl_float4)
SYCL_PRSQRT(cl::sycl::cl_double2)
#undef SYCL_PRSQRT
/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
#define SYCL_PSIN(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type psin<packet_type>(const packet_type& a) { return cl::sycl::sin(a); }
SYCL_PSIN(cl::sycl::cl_float4)
SYCL_PSIN(cl::sycl::cl_double2)
#undef SYCL_PSIN
/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
#define SYCL_PCOS(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pcos<packet_type>(const packet_type& a) { return cl::sycl::cos(a); }
SYCL_PCOS(cl::sycl::cl_float4)
SYCL_PCOS(cl::sycl::cl_double2)
#undef SYCL_PCOS
/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
#define SYCL_PTAN(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type ptan<packet_type>(const packet_type& a) { return cl::sycl::tan(a); }
SYCL_PTAN(cl::sycl::cl_float4)
SYCL_PTAN(cl::sycl::cl_double2)
#undef SYCL_PTAN
/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
#define SYCL_PASIN(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pasin<packet_type>(const packet_type& a) { return cl::sycl::asin(a); }
SYCL_PASIN(cl::sycl::cl_float4)
SYCL_PASIN(cl::sycl::cl_double2)
#undef SYCL_PASIN
/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
#define SYCL_PACOS(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pacos<packet_type>(const packet_type& a) { return cl::sycl::acos(a); }
SYCL_PACOS(cl::sycl::cl_float4)
SYCL_PACOS(cl::sycl::cl_double2)
#undef SYCL_PACOS
/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
#define SYCL_PATAN(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type patan<packet_type>(const packet_type& a) { return cl::sycl::atan(a); }
SYCL_PATAN(cl::sycl::cl_float4)
SYCL_PATAN(cl::sycl::cl_double2)
#undef SYCL_PATAN
/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
#define SYCL_PSINH(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type psinh<packet_type>(const packet_type& a) { return cl::sycl::sinh(a); }
SYCL_PSINH(cl::sycl::cl_float4)
SYCL_PSINH(cl::sycl::cl_double2)
#undef SYCL_PSINH
/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
#define SYCL_PCOSH(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pcosh<packet_type>(const packet_type& a) { return cl::sycl::cosh(a); }
SYCL_PCOSH(cl::sycl::cl_float4)
SYCL_PCOSH(cl::sycl::cl_double2)
#undef SYCL_PCOSH
/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
#define SYCL_PTANH(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type ptanh<packet_type>(const packet_type& a) { return cl::sycl::tanh(a); }
SYCL_PTANH(cl::sycl::cl_float4)
SYCL_PTANH(cl::sycl::cl_double2)
#undef SYCL_PTANH
#define SYCL_PCEIL(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pceil<packet_type>(const packet_type& a) { return cl::sycl::ceil(a); }
SYCL_PCEIL(cl::sycl::cl_float4)
SYCL_PCEIL(cl::sycl::cl_double2)
#undef SYCL_PCEIL
#define SYCL_PROUND(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pround<packet_type>(const packet_type& a) { return cl::sycl::round(a); }
SYCL_PROUND(cl::sycl::cl_float4)
SYCL_PROUND(cl::sycl::cl_double2)
#undef SYCL_PROUND
#define SYCL_FLOOR(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pfloor<packet_type>(const packet_type& a) { return cl::sycl::floor(a); }
SYCL_FLOOR(cl::sycl::cl_float4)
SYCL_FLOOR(cl::sycl::cl_double2)
#undef SYCL_FLOOR
#define SYCL_PMIN(packet_type, expr) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pmin<packet_type>(const packet_type& a, const packet_type& b) { return expr; }
SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b))
SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
#undef SYCL_PMIN
#define SYCL_PMAX(packet_type, expr) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pmax<packet_type>(const packet_type& a, const packet_type& b) { return expr; }
SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b))
SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
#undef SYCL_PMAX
//#endif
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_MATH_FUNCTIONS_CUDA_H

View File

@ -0,0 +1,458 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* PacketMath.h
*
* \brief:
* PacketMath
*
*****************************************************************/
#ifndef EIGEN_PACKET_MATH_SYCL_H
#define EIGEN_PACKET_MATH_SYCL_H
#include <type_traits>
#if defined EIGEN_USE_SYCL
namespace Eigen {
namespace internal {
#define SYCL_PLOADT_RO(address_space_target)\
template<typename packet_type, int Alignment>\
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
ploadt_ro(typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
cl::sycl::access::address_space::address_space_target>::pointer_t from) {\
typedef typename unpacket_traits<packet_type>::type scalar;\
typedef cl::sycl::multi_ptr<scalar, cl::sycl::access::address_space::address_space_target> multi_ptr;\
auto res=packet_type(static_cast<typename unpacket_traits<packet_type>::type>(0));\
res.load(0, multi_ptr(const_cast<typename multi_ptr::pointer_t>(from)));\
return res;\
}
SYCL_PLOADT_RO(global_space)
SYCL_PLOADT_RO(local_space)
#undef SYCL_PLOADT_RO
#define SYCL_PLOAD(address_space_target, Alignment, AlignedType)\
template<typename packet_type> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
pload##AlignedType(typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
cl::sycl::access::address_space::address_space_target>::pointer_t from) {\
return ploadt_ro<packet_type, Alignment>(from);\
}
// global space
SYCL_PLOAD(global_space, Unaligned, u)
SYCL_PLOAD(global_space, Aligned, )
// local space
SYCL_PLOAD(local_space, Unaligned, u)
SYCL_PLOAD(local_space, Aligned, )
// private space
//SYCL_PLOAD(private_space, Unaligned, u)
//SYCL_PLOAD(private_space, Aligned, )
#undef SYCL_PLOAD
/** \internal \returns a packet version of \a *from.
* The pointer \a from must be aligned on a \a Alignment bytes boundary. */
#define SYCL_PLOADT(address_space_target)\
template<typename packet_type, int Alignment>\
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt(\
typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
cl::sycl::access::address_space::address_space_target>::pointer_t from)\
{\
if(Alignment >= unpacket_traits<packet_type>::alignment)\
return pload<packet_type>(from);\
else\
return ploadu<packet_type>(from);\
}
// global space
SYCL_PLOADT(global_space)
// local space
SYCL_PLOADT(local_space)
//private_space
// There is no need to specialise it for private space as it can use the GenericPacketMath version
#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment)\
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
ploadt_ro<packet_type, Alignment>(const typename unpacket_traits<packet_type>::type * from) { \
typedef typename unpacket_traits<packet_type>::type scalar;\
auto res=packet_type(static_cast<scalar>(0));\
res. template load<cl::sycl::access::address_space::private_space>(0, const_cast<scalar*>(from));\
return res;\
}
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned)
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned)
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned)
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned)
#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
pload##alignment_type(const typename unpacket_traits<packet_type>::type * from) { \
typedef typename unpacket_traits<packet_type>::type scalar;\
auto res=packet_type(static_cast<scalar>(0));\
res. template load<cl::sycl::access::address_space::private_space>(0, const_cast<scalar*>(from));\
return res;\
}
SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4,)
SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2,)
SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u)
SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u)
#undef SYCL_PLOAD_SPECIAL
#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment)\
template<>\
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
typename cl::sycl::multi_ptr<scalar, cl::sycl::access::address_space::address_space_target>::pointer_t to, \
const packet_type& from) {\
typedef cl::sycl::multi_ptr<scalar, cl::sycl::access::address_space::address_space_target> multi_ptr;\
from.store(0, multi_ptr(to));\
}
// global space
SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, )
SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, )
SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u)
SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, )
SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, )
SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u)
SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, )
SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, )
SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u)
#define SYCL_PSTORE_T(scalar, packet_type, Alignment)\
template<>\
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret<scalar, packet_type, Alignment>(\
scalar* to,\
const packet_type& from) {\
if(Alignment)\
pstore(to, from);\
else\
pstoreu(to,from);\
}
SYCL_PSTORE_T(float, cl::sycl::cl_float4, Aligned)
SYCL_PSTORE_T(float, cl::sycl::cl_float4, Unaligned)
SYCL_PSTORE_T(double, cl::sycl::cl_double2, Aligned)
SYCL_PSTORE_T(double, cl::sycl::cl_double2, Unaligned)
#undef SYCL_PSTORE_T
#define SYCL_PSET1(packet_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>(\
const typename unpacket_traits<packet_type>::type& from) {\
return packet_type(from);\
}
// global space
SYCL_PSET1(cl::sycl::cl_float4)
SYCL_PSET1(cl::sycl::cl_double2)
#undef SYCL_PSET1
template <typename packet_type> struct get_base_packet {
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_ploaddup(sycl_multi_pointer ) {}
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_pgather(sycl_multi_pointer , Index ) {}
};
template <> struct get_base_packet <cl::sycl::cl_float4> {
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup(sycl_multi_pointer from) {
return cl::sycl::cl_float4(from[0], from[0], from[1], from[1]);
}
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_pgather(sycl_multi_pointer from, Index stride) {
return cl::sycl::cl_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
}
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to , const cl::sycl::cl_float4& from, Index stride) {
auto tmp = stride;
to[0] = from.x();
to[tmp] = from.y();
to[tmp += stride] = from.z();
to[tmp += stride] = from.w();
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 set_plset(const float& a) {
return cl::sycl::cl_float4(static_cast<float>(a), static_cast<float>(a+1), static_cast<float>(a+2), static_cast<float>(a+3));
}
};
template <> struct get_base_packet <cl::sycl::cl_double2> {
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_ploaddup(const sycl_multi_pointer from) {
return cl::sycl::cl_double2(from[0], from[0]);
}
template <typename sycl_multi_pointer, typename Index>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_pgather(const sycl_multi_pointer from, Index stride) {
return cl::sycl::cl_double2(from[0*stride], from[1*stride]);
}
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to , const cl::sycl::cl_double2& from, Index stride) {
to[0] = from.x();
to[stride] = from.y();
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 set_plset(const double& a) {
return cl::sycl::cl_double2(static_cast<double>(a), static_cast<double>(a + 1));
}
};
#define SYCL_PLOAD_DUP(address_space_target)\
template<typename packet_type> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
ploaddup(typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
cl::sycl::access::address_space::address_space_target>::pointer_t from)\
{\
return get_base_packet<packet_type>::get_ploaddup(from); \
}
// global space
SYCL_PLOAD_DUP(global_space)
// local_space
SYCL_PLOAD_DUP(local_space)
// private_space
//SYCL_PLOAD_DUP(private_space)
#undef SYCL_PLOAD_DUP
#define SYCL_PLOAD_DUP_SPECILIZE(packet_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
ploaddup<packet_type>(const typename unpacket_traits<packet_type>::type * from)\
{ \
return get_base_packet<packet_type>::get_ploaddup(from); \
}
SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4)
SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
#undef SYCL_PLOAD_DUP_SPECILIZE
#define SYCL_PLSET(packet_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type plset<packet_type>(const typename unpacket_traits<packet_type>::type& a) {\
return get_base_packet<packet_type>::set_plset(a);\
}
SYCL_PLSET(cl::sycl::cl_float4)
SYCL_PLSET(cl::sycl::cl_double2)
#undef SYCL_PLSET
#define SYCL_PGATHER(address_space_target)\
template<typename Scalar, typename packet_type> EIGEN_DEVICE_FUNC inline packet_type pgather(\
typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
cl::sycl::access::address_space::address_space_target>::pointer_t from, Index stride) {\
return get_base_packet<packet_type>::get_pgather(from, stride); \
}
// global space
SYCL_PGATHER(global_space)
// local space
SYCL_PGATHER(local_space)
// private space
//SYCL_PGATHER(private_space)
#undef SYCL_PGATHER
#define SYCL_PGATHER_SPECILIZE(scalar, packet_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
pgather<scalar, packet_type>(const typename unpacket_traits<packet_type>::type * from, Index stride)\
{ \
return get_base_packet<packet_type>::get_pgather(from, stride); \
}
SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4)
SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
#undef SYCL_PGATHER_SPECILIZE
#define SYCL_PSCATTER(address_space_target)\
template<typename Scalar, typename packet_type> EIGEN_DEVICE_FUNC inline void pscatter(\
typename cl::sycl::multi_ptr<typename unpacket_traits<packet_type>::type,\
cl::sycl::access::address_space::address_space_target>::pointer_t to,\
const packet_type& from, Index stride) {\
get_base_packet<packet_type>::set_pscatter(to, from, stride);\
}
// global space
SYCL_PSCATTER(global_space)
// local space
SYCL_PSCATTER(local_space)
// private space
//SYCL_PSCATTER(private_space)
#undef SYCL_PSCATTER
#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void \
pscatter<scalar, packet_type>(typename unpacket_traits<packet_type>::type * to, const packet_type& from, Index stride)\
{ \
get_base_packet<packet_type>::set_pscatter(to, from, stride);\
}
SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4)
SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
#undef SYCL_PSCATTER_SPECILIZE
#define SYCL_PMAD(packet_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pmadd( const packet_type& a,\
const packet_type& b, const packet_type& c){\
return cl::sycl::mad(a,b,c);\
}
SYCL_PMAD(cl::sycl::cl_float4)
SYCL_PMAD(cl::sycl::cl_double2)
#undef SYCL_PMAD
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return a.x();
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return a.x();
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return a.x() + a.y() + a.z() + a.w();
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return a.x() + a.y();
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()), cl::sycl::fmax(a.z(), a.w()));
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return cl::sycl::fmax(a.x(), a.y());
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()), cl::sycl::fmin(a.z(), a.w()));
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return cl::sycl::fmin(a.x(), a.y());
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return a.x() * a.y() * a.z() * a.w();
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return a.x() * a.y();
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()), cl::sycl::fabs(a.z()), cl::sycl::fabs(a.w()));
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 pabs<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return cl::sycl::cl_double2(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()));
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
ptranspose(PacketBlock<cl::sycl::cl_float4,4>& kernel) {
float tmp = kernel.packet[0].y();
kernel.packet[0].y() = kernel.packet[1].x();
kernel.packet[1].x() = tmp;
// std::swap(kernel.packet[0].y(), kernel.packet[1].x());
tmp = kernel.packet[0].z();
kernel.packet[0].z() = kernel.packet[2].x();
kernel.packet[2].x() = tmp;
//std::swap(kernel.packet[0].z(), kernel.packet[2].x());
tmp = kernel.packet[0].w();
kernel.packet[0].w() = kernel.packet[3].x();
kernel.packet[3].x() = tmp;
//std::swap(kernel.packet[0].w(), kernel.packet[3].x());
tmp = kernel.packet[1].z();
kernel.packet[1].z() = kernel.packet[2].y();
kernel.packet[2].y() = tmp;
// std::swap(kernel.packet[1].z(), kernel.packet[2].y());
tmp = kernel.packet[1].w();
kernel.packet[1].w() = kernel.packet[3].y();
kernel.packet[3].y() = tmp;
// std::swap(kernel.packet[1].w(), kernel.packet[3].y());
tmp = kernel.packet[2].w();
kernel.packet[2].w() = kernel.packet[3].z();
kernel.packet[3].z() = tmp;
// std::swap(kernel.packet[2].w(), kernel.packet[3].z());
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
ptranspose(PacketBlock<cl::sycl::cl_double2,2>& kernel) {
double tmp = kernel.packet[0].y();
kernel.packet[0].y() = kernel.packet[1].x();
kernel.packet[1].x() = tmp;
//std::swap(kernel.packet[0].y(), kernel.packet[1].x());
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
pblend(const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket,
const cl::sycl::cl_float4& thenPacket, const cl::sycl::cl_float4& elsePacket) {
cl::sycl::cl_int4 condition(ifPacket.select[0] ? 0 : -1,
ifPacket.select[1] ? 0 : -1,
ifPacket.select[2] ? 0 : -1,
ifPacket.select[3] ? 0 : -1);
return cl::sycl::select(thenPacket, elsePacket, condition);
}
template<> inline cl::sycl::cl_double2
pblend(const Selector<unpacket_traits<cl::sycl::cl_double2>::size>& ifPacket,
const cl::sycl::cl_double2& thenPacket, const cl::sycl::cl_double2& elsePacket) {
cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1,
ifPacket.select[1] ? 0 : -1);
return cl::sycl::select(thenPacket, elsePacket, condition);
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_USE_SYCL
#endif // EIGEN_PACKET_MATH_SYCL_H

Some files were not shown because too many files have changed in this diff Show More