merge with default Eigen

2025-07-29 16:22:03 +08:00 · 2018-09-21 11:51:49 +02:00 · 2018-09-21 11:51:49 +02:00 · a488d59787
commit a488d59787
parent 47720e7970 3ec2985914
717 changed files with 39462 additions and 10488 deletions
--- a/.hgignore
+++ b/.hgignore
@ -13,7 +13,7 @@ core
 core.*
 *.bak
 *~
-build*
+*build*
 *.moc.*
 *.moc
 ui_*
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -8,6 +8,7 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
  message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there. You may need to remove CMakeCache.txt. ")
 endif()
 # Alias Eigen_*_DIR to Eigen3_*_DIR:
 set(Eigen_SOURCE_DIR ${Eigen3_SOURCE_DIR})
@ -41,10 +42,13 @@ string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen_minor_
 set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}")
 set(EIGEN_VERSION_NUMBER ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})
-# if the mercurial program is absent, this will leave the EIGEN_HG_CHANGESET string empty,
+# only query mercurial if we are in a mercurial clone
-# but won't stop CMake.
+if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.hg)
-execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT)
+  # if the mercurial program is absent, this will leave the EIGEN_HG_CHANGESET string empty,
-execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT)
+  # but won't stop CMake.
  execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT)
  execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT)
 endif()
 # if this is the default (aka development) branch, extract the mercurial changeset number from the hg tip output...
 if(EIGEN_BRANCH_OUTPUT MATCHES "default")
@ -104,7 +108,7 @@ if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
  option(EIGEN_BUILD_PKGCONFIG "Build pkg-config .pc file for Eigen" ON)
 endif()
-set(CMAKE_INCLUDE_CURRENT_DIR ON)
+set(CMAKE_INCLUDE_CURRENT_DIR OFF)
 option(EIGEN_SPLIT_LARGE_TESTS "Split large tests into smaller executables" ON)
@ -153,11 +157,7 @@ if(NOT MSVC)
  ei_add_cxx_compiler_flag("-Wdouble-promotion")
 #  ei_add_cxx_compiler_flag("-Wconversion")
-  # -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6
+  ei_add_cxx_compiler_flag("-Wshadow")
  # if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0"))
  if(NOT CMAKE_COMPILER_IS_GNUCXX)
    ei_add_cxx_compiler_flag("-Wshadow")
  endif()
  ei_add_cxx_compiler_flag("-Wno-psabi")
  ei_add_cxx_compiler_flag("-Wno-variadic-macros")
@ -232,7 +232,10 @@ if(NOT MSVC)
  option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF)
  if(EIGEN_TEST_AVX512)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -fabi-version=6 -DEIGEN_ENABLE_AVX512")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -DEIGEN_ENABLE_AVX512")
    if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6")
    endif()
    message(STATUS "Enabling AVX512 in tests/examples")
  endif()
@ -254,6 +257,12 @@ if(NOT MSVC)
    message(STATUS "Enabling VSX in tests/examples")
  endif()
  option(EIGEN_TEST_MSA "Enable/Disable MSA in tests/examples" OFF)
  if(EIGEN_TEST_MSA)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmsa")
    message(STATUS "Enabling MSA in tests/examples")
  endif()
  option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
  if(EIGEN_TEST_NEON)
    if(EIGEN_TEST_FMA)
@ -271,12 +280,18 @@ if(NOT MSVC)
    message(STATUS "Enabling NEON in tests/examples")
  endif()
-  option(EIGEN_TEST_ZVECTOR "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF)
+  option(EIGEN_TEST_Z13 "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF)
-  if(EIGEN_TEST_ZVECTOR)
+  if(EIGEN_TEST_Z13)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z13 -mzvector")
    message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples")
  endif()
  # Optional S390X zEC14 vector support for tests/examples (disabled by default).
  option(EIGEN_TEST_Z14 "Enable/Disable S390X(zEC14) ZVECTOR in tests/examples" OFF)
  if(EIGEN_TEST_Z14)
    # Appended to the global C++ flags, like the other EIGEN_TEST_* switches in this file.
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z14 -mzvector")
    # Report the architecture actually selected (z14, not z13).
    message(STATUS "Enabling S390X(zEC14) ZVECTOR in tests/examples")
  endif()
  check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP)
  if(COMPILER_SUPPORT_OPENMP)
    option(EIGEN_TEST_OPENMP "Enable/Disable OpenMP in tests/examples" OFF)
@ -363,7 +378,7 @@ option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tens
 set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code")
-include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 # Backward compatibility support for EIGEN_INCLUDE_INSTALL_DIR
 if(EIGEN_INCLUDE_INSTALL_DIR)
@ -437,10 +452,17 @@ endif()
 # add SYCL
 option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
 option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation (ComputeCPP by default)." OFF)
 if(EIGEN_TEST_SYCL)
  set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}")
-  include(FindComputeCpp)
+  if(EIGEN_SYCL_TRISYCL)
-endif()
+    message(STATUS "Using triSYCL")
    include(FindTriSYCL)
  else(EIGEN_SYCL_TRISYCL)
    message(STATUS "Using ComputeCPP SYCL")
    include(FindComputeCpp)
  endif(EIGEN_SYCL_TRISYCL)
 endif(EIGEN_TEST_SYCL)
 add_subdirectory(unsupported)
@ -516,6 +538,7 @@ if (NOT CMAKE_VERSION VERSION_LESS 3.0)
  # Imported target support
  add_library (eigen INTERFACE)
  add_library (Eigen3::Eigen ALIAS eigen)
  target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS})
  target_include_directories (eigen INTERFACE
--- a/CTestConfig.cmake
+++ b/CTestConfig.cmake
@ -11,7 +11,7 @@ set(CTEST_DROP_METHOD "http")
 set(CTEST_DROP_SITE "manao.inria.fr")
 set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen")
 set(CTEST_DROP_SITE_CDASH TRUE)
-set(CTEST_PROJECT_SUBPROJECTS
+#set(CTEST_PROJECT_SUBPROJECTS
-Official
+#Official
-Unsupported
+#Unsupported
-)
+#)
--- a/CTestCustom.cmake.in
+++ b/CTestCustom.cmake.in
@ -1,3 +1,4 @@
 set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "2000")
 set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS   "2000")
 list(APPEND CTEST_CUSTOM_ERROR_EXCEPTION    @EIGEN_CTEST_ERROR_EXCEPTION@)
--- a/Eigen/Cholesky
+++ b/Eigen/Cholesky
@ -9,6 +9,7 @@
 #define EIGEN_CHOLESKY_MODULE_H
 #include "Core"
 #include "Jacobi"
 #include "src/Core/util/DisableStupidWarnings.h"
@ -31,7 +32,11 @@
 #include "src/Cholesky/LLT.h"
 #include "src/Cholesky/LDLT.h"
 #ifdef EIGEN_USE_LAPACKE
 #ifdef EIGEN_USE_MKL
 #include "mkl_lapacke.h"
 #else
 #include "src/misc/lapacke.h"
 #endif
 #include "src/Cholesky/LLT_LAPACKE.h"
 #endif
--- a/Eigen/Core
+++ b/Eigen/Core
@ -14,61 +14,26 @@
 // first thing Eigen does: stop the compiler from committing suicide
 #include "src/Core/util/DisableStupidWarnings.h"
-// Handle NVCC/CUDA/SYCL
+// then include this file where all our macros are defined. It's really important to do it first because
-#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
+// it's where we do all the compiler/OS/arch detections and define most defaults.
-  // Do not try asserts on CUDA and SYCL!
+#include "src/Core/util/Macros.h"
  #ifndef EIGEN_NO_DEBUG
  #define EIGEN_NO_DEBUG
  #endif
-  #ifdef EIGEN_INTERNAL_DEBUGGING
+// This detects SSE/AVX/NEON/etc. and configure alignment settings
-  #undef EIGEN_INTERNAL_DEBUGGING
+#include "src/Core/util/ConfigureVectorization.h"
  #endif
-  #ifdef EIGEN_EXCEPTIONS
+// We need cuda_runtime.h/hip_runtime.h to ensure that
-  #undef EIGEN_EXCEPTIONS
+// the EIGEN_USING_STD_MATH macro works properly on the device side
-  #endif
+#if defined(EIGEN_CUDACC)
-
+  #include <cuda_runtime.h>
-  // All functions callable from CUDA code must be qualified with __device__
+#elif defined(EIGEN_HIPCC)
-  #ifdef __CUDACC__
+  #include <hip/hip_runtime.h>
    // Do not try to vectorize on CUDA and SYCL!
    #ifndef EIGEN_DONT_VECTORIZE
    #define EIGEN_DONT_VECTORIZE
    #endif
    #define EIGEN_DEVICE_FUNC __host__ __device__
    // We need math_functions.hpp to ensure that the EIGEN_USING_STD_MATH macro
    // works properly on the device side
    #include <math_functions.hpp>
  #else
    #define EIGEN_DEVICE_FUNC
  #endif
 #else
  #define EIGEN_DEVICE_FUNC
 #endif
 // When compiling CUDA device code with NVCC, pull in math functions from the
 // global namespace.  In host mode, and when device code is compiled with clang, use the
 // std versions.
 #if defined(__CUDA_ARCH__) && defined(__NVCC__)
  #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
 #else
  #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
 #endif
 #if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
  #define EIGEN_EXCEPTIONS
 #endif
 #ifdef EIGEN_EXCEPTIONS
  #include <new>
 #endif
 // then include this file where all our macros are defined. It's really important to do it first because
 // it's where we do all the alignment settings (platform detection and honoring the user's will if he
 // defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization.
 #include "src/Core/util/Macros.h"
 // Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
 // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
 #if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6)
@ -81,169 +46,9 @@
 // and inclusion of their respective header files
 #include "src/Core/util/MKL_support.h"
 // if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
 // account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
 #if EIGEN_MAX_ALIGN_BYTES==0
  #ifndef EIGEN_DONT_VECTORIZE
    #define EIGEN_DONT_VECTORIZE
  #endif
 #endif
-#if EIGEN_COMP_MSVC
+#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
-  #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
+  #define EIGEN_HAS_GPU_FP16
  #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
    // Remember that usage of defined() in a #define is undefined by the standard.
    // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
    #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
      #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
    #endif
  #endif
 #else
  // Remember that usage of defined() in a #define is undefined by the standard
  #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
    #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
  #endif
 #endif
 #ifndef EIGEN_DONT_VECTORIZE
  #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
    // Defines symbols for compile-time detection of which instructions are
    // used.
    // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
    #define EIGEN_VECTORIZE
    #define EIGEN_VECTORIZE_SSE
    #define EIGEN_VECTORIZE_SSE2
    // Detect sse3/ssse3/sse4:
    // gcc and icc defines __SSE3__, ...
    // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
    // want to force the use of those instructions with msvc.
    #ifdef __SSE3__
      #define EIGEN_VECTORIZE_SSE3
    #endif
    #ifdef __SSSE3__
      #define EIGEN_VECTORIZE_SSSE3
    #endif
    #ifdef __SSE4_1__
      #define EIGEN_VECTORIZE_SSE4_1
    #endif
    #ifdef __SSE4_2__
      #define EIGEN_VECTORIZE_SSE4_2
    #endif
    #ifdef __AVX__
      #define EIGEN_VECTORIZE_AVX
      #define EIGEN_VECTORIZE_SSE3
      #define EIGEN_VECTORIZE_SSSE3
      #define EIGEN_VECTORIZE_SSE4_1
      #define EIGEN_VECTORIZE_SSE4_2
    #endif
    #ifdef __AVX2__
      #define EIGEN_VECTORIZE_AVX2
      #define EIGEN_VECTORIZE_AVX
      #define EIGEN_VECTORIZE_SSE3
      #define EIGEN_VECTORIZE_SSSE3
      #define EIGEN_VECTORIZE_SSE4_1
      #define EIGEN_VECTORIZE_SSE4_2
    #endif
    #ifdef __FMA__
      #define EIGEN_VECTORIZE_FMA
    #endif
    #if defined(__AVX512F__)
      #define EIGEN_VECTORIZE_AVX512
      #define EIGEN_VECTORIZE_AVX2
      #define EIGEN_VECTORIZE_AVX
      #define EIGEN_VECTORIZE_FMA
      #define EIGEN_VECTORIZE_SSE3
      #define EIGEN_VECTORIZE_SSSE3
      #define EIGEN_VECTORIZE_SSE4_1
      #define EIGEN_VECTORIZE_SSE4_2
      #ifdef __AVX512DQ__
        #define EIGEN_VECTORIZE_AVX512DQ
      #endif
    #endif
    // include files
    // This extern "C" works around a MINGW-w64 compilation issue
    // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
    // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
    // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
    // with conflicting linkage.  The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
    // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
    // notice that since these are C headers, the extern "C" is theoretically needed anyways.
    extern "C" {
      // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
      // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
      #if EIGEN_COMP_ICC >= 1110
        #include <immintrin.h>
      #else
        #include <mmintrin.h>
        #include <emmintrin.h>
        #include <xmmintrin.h>
        #ifdef  EIGEN_VECTORIZE_SSE3
        #include <pmmintrin.h>
        #endif
        #ifdef EIGEN_VECTORIZE_SSSE3
        #include <tmmintrin.h>
        #endif
        #ifdef EIGEN_VECTORIZE_SSE4_1
        #include <smmintrin.h>
        #endif
        #ifdef EIGEN_VECTORIZE_SSE4_2
        #include <nmmintrin.h>
        #endif
        #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
        #include <immintrin.h>
        #endif
      #endif
    } // end extern "C"
  #elif defined __VSX__
    #define EIGEN_VECTORIZE
    #define EIGEN_VECTORIZE_VSX
    #include <altivec.h>
    // We need to #undef all these ugly tokens defined in <altivec.h>
    // => use __vector instead of vector
    #undef bool
    #undef vector
    #undef pixel
  #elif defined __ALTIVEC__
    #define EIGEN_VECTORIZE
    #define EIGEN_VECTORIZE_ALTIVEC
    #include <altivec.h>
    // We need to #undef all these ugly tokens defined in <altivec.h>
    // => use __vector instead of vector
    #undef bool
    #undef vector
    #undef pixel
  #elif (defined  __ARM_NEON) || (defined __ARM_NEON__)
    #define EIGEN_VECTORIZE
    #define EIGEN_VECTORIZE_NEON
    #include <arm_neon.h>
  #elif (defined __s390x__ && defined __VEC__)
    #define EIGEN_VECTORIZE
    #define EIGEN_VECTORIZE_ZVECTOR
    #include <vecintrin.h>
  #endif
 #endif
 #if defined(__F16C__) && !defined(EIGEN_COMP_CLANG)
  // We can use the optimized fp16 to float and float to fp16 conversion routines
  #define EIGEN_HAS_FP16_C
 #endif
 #if defined __CUDACC__
  #define EIGEN_VECTORIZE_CUDA
  #include <vector_types.h>
  #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
    #define EIGEN_HAS_CUDA_FP16
  #endif
 #endif
 #if defined EIGEN_HAS_CUDA_FP16
  #include <host_defines.h>
  #include <cuda_fp16.h>
 #endif
 #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
@ -275,6 +80,10 @@
 // for min/max:
 #include <algorithm>
 #if EIGEN_HAS_CXX11
 #include <array>
 #endif
 // for std::is_nothrow_move_assignable
 #ifdef EIGEN_INCLUDE_TYPE_TRAITS
 #include <type_traits>
@ -299,38 +108,6 @@
  #include <SYCL/sycl.hpp>
 #endif
 /** \brief Namespace containing all symbols from the %Eigen library. */
 namespace Eigen {
 inline static const char *SimdInstructionSetsInUse(void) {
 #if defined(EIGEN_VECTORIZE_AVX512)
  return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
 #elif defined(EIGEN_VECTORIZE_AVX)
  return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
 #elif defined(EIGEN_VECTORIZE_SSE4_2)
  return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
 #elif defined(EIGEN_VECTORIZE_SSE4_1)
  return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
 #elif defined(EIGEN_VECTORIZE_SSSE3)
  return "SSE, SSE2, SSE3, SSSE3";
 #elif defined(EIGEN_VECTORIZE_SSE3)
  return "SSE, SSE2, SSE3";
 #elif defined(EIGEN_VECTORIZE_SSE2)
  return "SSE, SSE2";
 #elif defined(EIGEN_VECTORIZE_ALTIVEC)
  return "AltiVec";
 #elif defined(EIGEN_VECTORIZE_VSX)
  return "VSX";
 #elif defined(EIGEN_VECTORIZE_NEON)
  return "ARM NEON";
 #elif defined(EIGEN_VECTORIZE_ZVECTOR)
  return "S390X ZVECTOR";
 #else
  return "None";
 #endif
 }
 } // end namespace Eigen
 #if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT
 // This will generate an error message:
@ -339,7 +116,7 @@ inline static const char *SimdInstructionSetsInUse(void) {
 namespace Eigen {
-// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
+// we use size_t frequently and we'll never remember to prepend it with std:: every time just to
 // ensure QNX/QCC support
 using std::size_t;
 // gcc 4.6.0 wants std:: for ptrdiff_t
@ -366,11 +143,11 @@ using std::ptrdiff_t;
 #include "src/Core/util/IntegralConstant.h"
 #include "src/Core/util/SymbolicIndex.h"
 #include "src/Core/NumTraits.h"
 #include "src/Core/MathFunctions.h"
 #include "src/Core/GenericPacketMath.h"
 #include "src/Core/MathFunctionsImpl.h"
 #include "src/Core/arch/Default/ConjHelper.h"
 #if defined EIGEN_VECTORIZE_AVX512
  #include "src/Core/arch/SSE/PacketMath.h"
@ -388,6 +165,7 @@ using std::ptrdiff_t;
  #include "src/Core/arch/AVX/MathFunctions.h"
  #include "src/Core/arch/AVX/Complex.h"
  #include "src/Core/arch/AVX/TypeCasting.h"
  #include "src/Core/arch/SSE/TypeCasting.h"
 #elif defined EIGEN_VECTORIZE_SSE
  #include "src/Core/arch/SSE/PacketMath.h"
  #include "src/Core/arch/SSE/MathFunctions.h"
@ -401,22 +179,33 @@ using std::ptrdiff_t;
  #include "src/Core/arch/NEON/PacketMath.h"
  #include "src/Core/arch/NEON/MathFunctions.h"
  #include "src/Core/arch/NEON/Complex.h"
  #include "src/Core/arch/NEON/TypeCasting.h"
 #elif defined EIGEN_VECTORIZE_ZVECTOR
  #include "src/Core/arch/ZVector/PacketMath.h"
  #include "src/Core/arch/ZVector/MathFunctions.h"
  #include "src/Core/arch/ZVector/Complex.h"
 #elif defined EIGEN_VECTORIZE_MSA
  #include "src/Core/arch/MSA/PacketMath.h"
  #include "src/Core/arch/MSA/MathFunctions.h"
  #include "src/Core/arch/MSA/Complex.h"
 #endif
 // Half float support
-#include "src/Core/arch/CUDA/Half.h"
+#include "src/Core/arch/GPU/Half.h"
-#include "src/Core/arch/CUDA/PacketMathHalf.h"
+#include "src/Core/arch/GPU/PacketMathHalf.h"
-#include "src/Core/arch/CUDA/TypeCasting.h"
+#include "src/Core/arch/GPU/TypeCasting.h"
-#if defined EIGEN_VECTORIZE_CUDA
+#if defined EIGEN_VECTORIZE_GPU
-  #include "src/Core/arch/CUDA/PacketMath.h"
+  #include "src/Core/arch/GPU/PacketMath.h"
-  #include "src/Core/arch/CUDA/MathFunctions.h"
+  #include "src/Core/arch/GPU/MathFunctions.h"
 #endif
 #if defined EIGEN_VECTORIZE_SYCL
  #include "src/Core/arch/SYCL/InteropHeaders.h"
  #include "src/Core/arch/SYCL/PacketMath.h"
  #include "src/Core/arch/SYCL/MathFunctions.h"
  #include "src/Core/arch/SYCL/TypeCasting.h"
 #endif
 #include "src/Core/arch/Default/Settings.h"
 #include "src/Core/functors/TernaryFunctors.h"
@ -428,7 +217,9 @@ using std::ptrdiff_t;
 // Specialized functors to enable the processing of complex numbers
 // on CUDA devices
 #ifdef EIGEN_CUDACC
 #include "src/Core/arch/CUDA/Complex.h"
 #endif
 #include "src/Core/util/IndexedViewHelper.h"
 #include "src/Core/util/ReshapedHelper.h"
--- a/Eigen/Eigenvalues
+++ b/Eigen/Eigenvalues
@ -10,14 +10,14 @@
 #include "Core"
 #include "src/Core/util/DisableStupidWarnings.h"
 #include "Cholesky"
 #include "Jacobi"
 #include "Householder"
 #include "LU"
 #include "Geometry"
 #include "src/Core/util/DisableStupidWarnings.h"
 /** \defgroup Eigenvalues_Module Eigenvalues module
  *
  *
@ -45,7 +45,11 @@
 #include "src/Eigenvalues/GeneralizedEigenSolver.h"
 #include "src/Eigenvalues/MatrixBaseEigenvalues.h"
 #ifdef EIGEN_USE_LAPACKE
 #ifdef EIGEN_USE_MKL
 #include "mkl_lapacke.h"
 #else
 #include "src/misc/lapacke.h"
 #endif
 #include "src/Eigenvalues/RealSchur_LAPACKE.h"
 #include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
 #include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"
--- a/Eigen/Geometry
+++ b/Eigen/Geometry
@ -10,12 +10,12 @@
 #include "Core"
 #include "src/Core/util/DisableStupidWarnings.h"
 #include "SVD"
 #include "LU"
 #include <limits>
 #include "src/Core/util/DisableStupidWarnings.h"
 /** \defgroup Geometry_Module Geometry module
  *
  * This module provides support for:
--- a/Eigen/KLUSupport
+++ b/Eigen/KLUSupport
@ -0,0 +1,41 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #ifndef EIGEN_KLUSUPPORT_MODULE_H
 #define EIGEN_KLUSUPPORT_MODULE_H
 #include <Eigen/SparseCore>
 #include <Eigen/src/Core/util/DisableStupidWarnings.h>
 extern "C" {
 #include <btf.h>
 #include <klu.h>
   }
 /** \ingroup Support_modules
  * \defgroup KLUSupport_Module KLUSupport module
  *
  * This module provides an interface to the KLU library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
  * It provides the following factorization class:
  * - class KLU: a sparse LU factorization, well-suited for circuit simulation.
  *
  * \code
  * #include <Eigen/KLUSupport>
  * \endcode
  *
  * In order to use this module, the klu and btf headers must be accessible from the include paths, and your binary must be linked to the klu library and its dependencies.
  * The dependencies depend on how KLU has been compiled.
  * For a cmake based project, you can use our FindKLU.cmake module to help you in this task.
  *
  */
 #include "src/KLUSupport/KLUSupport.h"
 #include <Eigen/src/Core/util/ReenableStupidWarnings.h>
 #endif // EIGEN_KLUSUPPORT_MODULE_H
--- a/Eigen/LU
+++ b/Eigen/LU
@ -28,7 +28,11 @@
 #include "src/LU/FullPivLU.h"
 #include "src/LU/PartialPivLU.h"
 #ifdef EIGEN_USE_LAPACKE
 #ifdef EIGEN_USE_MKL
 #include "mkl_lapacke.h"
 #else
 #include "src/misc/lapacke.h"
 #endif
 #include "src/LU/PartialPivLU_LAPACKE.h"
 #endif
 #include "src/LU/Determinant.h"
--- a/Eigen/PaStiXSupport
+++ b/Eigen/PaStiXSupport
@ -36,6 +36,7 @@ extern "C" {
  * \endcode
  *
  * In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be linked to the PaSTiX library and its dependencies.
  * This wrapper requires PaStiX version 5.x compiled without MPI support.
  * The dependencies depend on how PaSTiX has been compiled.
  * For a cmake based project, you can use our FindPaSTiX.cmake module to help you in this task.
  *
--- a/Eigen/PardisoSupport
+++ b/Eigen/PardisoSupport
--- a/Eigen/QR
+++ b/Eigen/QR
@ -10,12 +10,12 @@
 #include "Core"
 #include "src/Core/util/DisableStupidWarnings.h"
 #include "Cholesky"
 #include "Jacobi"
 #include "Householder"
 #include "src/Core/util/DisableStupidWarnings.h"
 /** \defgroup QR_Module QR module
  *
  *
@ -36,7 +36,11 @@
 #include "src/QR/ColPivHouseholderQR.h"
 #include "src/QR/CompleteOrthogonalDecomposition.h"
 #ifdef EIGEN_USE_LAPACKE
 #ifdef EIGEN_USE_MKL
 #include "mkl_lapacke.h"
 #else
 #include "src/misc/lapacke.h"
 #endif
 #include "src/QR/HouseholderQR_LAPACKE.h"
 #include "src/QR/ColPivHouseholderQR_LAPACKE.h"
 #endif
--- a/Eigen/QtAlignedMalloc
+++ b/Eigen/QtAlignedMalloc
@ -27,7 +27,7 @@ void qFree(void *ptr)
 void *qRealloc(void *ptr, std::size_t size)
 {
  void* newPtr = Eigen::internal::aligned_malloc(size);
-  memcpy(newPtr, ptr, size);
+  std::memcpy(newPtr, ptr, size);
  Eigen::internal::aligned_free(ptr);
  return newPtr;
 }
--- a/Eigen/SVD
+++ b/Eigen/SVD
@ -37,7 +37,11 @@
 #include "src/SVD/JacobiSVD.h"
 #include "src/SVD/BDCSVD.h"
 #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
 #ifdef EIGEN_USE_MKL
 #include "mkl_lapacke.h"
 #else
 #include "src/misc/lapacke.h"
 #endif
 #include "src/SVD/JacobiSVD_LAPACKE.h"
 #endif
--- a/Eigen/SparseLU
+++ b/Eigen/SparseLU
@ -23,6 +23,8 @@
 // Ordering interface
 #include "OrderingMethods"
 #include "src/Core/util/DisableStupidWarnings.h"
 #include "src/SparseLU/SparseLU_gemm_kernel.h"
 #include "src/SparseLU/SparseLU_Structs.h"
@ -43,4 +45,6 @@
 #include "src/SparseLU/SparseLU_Utils.h"
 #include "src/SparseLU/SparseLU.h"
 #include "src/Core/util/ReenableStupidWarnings.h"
 #endif // EIGEN_SPARSELU_MODULE_H
--- a/Eigen/SparseQR
+++ b/Eigen/SparseQR
@ -28,7 +28,6 @@
  * 
  */
 #include "OrderingMethods"
 #include "src/SparseCore/SparseColEtree.h"
 #include "src/SparseQR/SparseQR.h"
--- a/Eigen/src/Cholesky/LDLT.h
+++ b/Eigen/src/Cholesky/LDLT.h
@ -247,8 +247,8 @@ template<typename _MatrixType, int _UpLo> class LDLT
    /** \brief Reports whether previous computation was successful.
      *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
-      *          \c NumericalIssue if the matrix.appears to be negative.
+      *          \c NumericalIssue if the factorization failed because of a zero pivot.
      */
    ComputationInfo info() const
    {
@ -258,7 +258,6 @@ template<typename _MatrixType, int _UpLo> class LDLT
    #ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename RhsType, typename DstType>
    EIGEN_DEVICE_FUNC
    void _solve_impl(const RhsType &rhs, DstType &dst) const;
    #endif
@ -376,6 +375,8 @@ template<> struct ldlt_inplace<Lower>
      if((rs>0) && pivot_is_valid)
        A21 /= realAkk;
      else if(rs>0)
        ret = ret && (A21.array()==Scalar(0)).all();
      if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed
      else if(!pivot_is_valid) found_zero_pivot = true;
@ -568,13 +569,14 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
  // more precisely, use pseudo-inverse of D (see bug 241)
  using std::abs;
  const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
-  // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
+  // In some previous versions, tolerance was set to the max of 1/highest (or rather numeric_limits::min())
-  // as motivated by LAPACK's xGELSS:
+  // and the maximal diagonal entry * epsilon as motivated by LAPACK's xGELSS:
  // RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
  // However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest
  // diagonal element is not well justified and leads to numerical issues in some cases.
  // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
-  RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
+  // Using numeric_limits::min() gives us more robustness to denormals.
  RealScalar tolerance = (std::numeric_limits<RealScalar>::min)();
  for (Index i = 0; i < vecD.size(); ++i)
  {
--- a/Eigen/src/Cholesky/LLT.h
+++ b/Eigen/src/Cholesky/LLT.h
@ -24,7 +24,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
  *
  * \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
  * \tparam _UpLo the triangular part that will be used for the decomposition: Lower (default) or Upper.
-  *             The other triangular part won't be read.
+  *               The other triangular part won't be read.
  *
  * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite
  * matrix A such that A = LL^* = U^*U, where L is lower triangular.
@ -41,14 +41,18 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
  * Example: \include LLT_example.cpp
  * Output: \verbinclude LLT_example.out
  *
  * \b Performance: for best performance, it is recommended to use a column-major storage format
  * with the Lower triangular part (the default), or, equivalently, a row-major storage format
  * with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization
  * step, and rank-updates can be up to 3 times slower.
  *
  * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
  *
  * Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered.
  * Therefore, the strict lower part does not have to store correct values.
  *
  * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
  */
 /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
  * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
  * the strict lower part does not have to store correct values.
  */
 template<typename _MatrixType, int _UpLo> class LLT
 {
  public:
@ -96,7 +100,7 @@ template<typename _MatrixType, int _UpLo> class LLT
      compute(matrix.derived());
    }
-    /** \brief Constructs a LDLT factorization from a given matrix
+    /** \brief Constructs a LLT factorization from a given matrix
      *
      * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
      * \c MatrixType is a Eigen::Ref.
@ -146,7 +150,7 @@ template<typename _MatrixType, int _UpLo> class LLT
    }
    template<typename Derived>
-    void solveInPlace(MatrixBase<Derived> &bAndX) const;
+    void solveInPlace(const MatrixBase<Derived> &bAndX) const;
    template<typename InputType>
    LLT& compute(const EigenBase<InputType>& matrix);
@ -176,8 +180,8 @@ template<typename _MatrixType, int _UpLo> class LLT
    /** \brief Reports whether previous computation was successful.
      *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
-      *          \c NumericalIssue if the matrix.appears to be negative.
+      *          \c NumericalIssue if the matrix appears not to be positive definite.
      */
    ComputationInfo info() const
    {
@ -196,11 +200,10 @@ template<typename _MatrixType, int _UpLo> class LLT
    inline Index cols() const { return m_matrix.cols(); }
    template<typename VectorType>
-    LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
+    LLT & rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
    #ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename RhsType, typename DstType>
    EIGEN_DEVICE_FUNC
    void _solve_impl(const RhsType &rhs, DstType &dst) const;
    #endif
@ -425,7 +428,8 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
  eigen_assert(a.rows()==a.cols());
  const Index size = a.rows();
  m_matrix.resize(size, size);
-  m_matrix = a.derived();
+  if (!internal::is_same_dense(m_matrix, a.derived()))
    m_matrix = a.derived();
  // Compute matrix L1 norm = max abs column sum.
  m_l1_norm = RealScalar(0);
@ -454,7 +458,7 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
  */
 template<typename _MatrixType, int _UpLo>
 template<typename VectorType>
-LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
+LLT<_MatrixType,_UpLo> & LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType);
  eigen_assert(v.size()==m_matrix.cols());
@ -485,11 +489,14 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
  *
  * This version avoids a copy when the right hand side matrix b is not needed anymore.
  *
  * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
  * This function will const_cast it, so constness isn't honored here.
  *
  * \sa LLT::solve(), MatrixBase::llt()
  */
 template<typename MatrixType, int _UpLo>
 template<typename Derived>
-void LLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
+void LLT<MatrixType,_UpLo>::solveInPlace(const MatrixBase<Derived> &bAndX) const
 {
  eigen_assert(m_isInitialized && "LLT is not initialized.");
  eigen_assert(m_matrix.rows()==bAndX.rows());
--- a/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/Eigen/src/CholmodSupport/CholmodSupport.h
@ -10,7 +10,7 @@
 #ifndef EIGEN_CHOLMODSUPPORT_H
 #define EIGEN_CHOLMODSUPPORT_H
-namespace Eigen { 
+namespace Eigen {
 namespace internal {
@ -79,12 +79,12 @@ cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> >
  res.dtype   = 0;
  res.stype   = -1;
-  
+
  if (internal::is_same<_StorageIndex,int>::value)
  {
    res.itype = CHOLMOD_INT;
  }
-  else if (internal::is_same<_StorageIndex,long>::value)
+  else if (internal::is_same<_StorageIndex,SuiteSparse_long>::value)
  {
    res.itype = CHOLMOD_LONG;
  }
@ -95,9 +95,9 @@ cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> >
  // setup res.xtype
  internal::cholmod_configure_matrix<_Scalar>::run(res);
-  
+
  res.stype = 0;
-  
+
  return res;
 }
@ -121,7 +121,7 @@ template<typename _Scalar, int _Options, typename _Index, unsigned int UpLo>
 cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<const SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
 {
  cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.matrix().const_cast_derived()));
-  
+
  if(UpLo==Upper) res.stype =  1;
  if(UpLo==Lower) res.stype = -1;
  // swap stype for rowmajor matrices (only works for real matrices)
@ -167,12 +167,12 @@ namespace internal {
 // template specializations for int and long that call the correct cholmod method
 #define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \
-    template<typename _StorageIndex> ret cm_ ## name       (cholmod_common &Common) { return cholmod_ ## name   (&Common); } \
+    template<typename _StorageIndex> inline ret cm_ ## name       (cholmod_common &Common) { return cholmod_ ## name   (&Common); } \
-    template<>                       ret cm_ ## name<long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); }
+    template<>                       inline ret cm_ ## name<SuiteSparse_long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); }
 #define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \
-    template<typename _StorageIndex> ret cm_ ## name       (t1& a1, cholmod_common &Common) { return cholmod_ ## name   (&a1, &Common); } \
+    template<typename _StorageIndex> inline ret cm_ ## name       (t1& a1, cholmod_common &Common) { return cholmod_ ## name   (&a1, &Common); } \
-    template<>                       ret cm_ ## name<long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); }
+    template<>                       inline ret cm_ ## name<SuiteSparse_long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); }
 EIGEN_CHOLMOD_SPECIALIZE0(int, start)
 EIGEN_CHOLMOD_SPECIALIZE0(int, finish)
@ -183,16 +183,16 @@ EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A)
 EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A)
-template<typename _StorageIndex> cholmod_dense*  cm_solve         (int sys, cholmod_factor& L, cholmod_dense&  B, cholmod_common &Common) { return cholmod_solve     (sys, &L, &B, &Common); }
+template<typename _StorageIndex> inline cholmod_dense*  cm_solve         (int sys, cholmod_factor& L, cholmod_dense&  B, cholmod_common &Common) { return cholmod_solve     (sys, &L, &B, &Common); }
-template<>                       cholmod_dense*  cm_solve<long>   (int sys, cholmod_factor& L, cholmod_dense&  B, cholmod_common &Common) { return cholmod_l_solve   (sys, &L, &B, &Common); }
+template<>                       inline cholmod_dense*  cm_solve<SuiteSparse_long>   (int sys, cholmod_factor& L, cholmod_dense&  B, cholmod_common &Common) { return cholmod_l_solve   (sys, &L, &B, &Common); }
-template<typename _StorageIndex> cholmod_sparse* cm_spsolve       (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve   (sys, &L, &B, &Common); }
+template<typename _StorageIndex> inline cholmod_sparse* cm_spsolve       (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve   (sys, &L, &B, &Common); }
-template<>                       cholmod_sparse* cm_spsolve<long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
+template<>                       inline cholmod_sparse* cm_spsolve<SuiteSparse_long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
 template<typename _StorageIndex>
-int  cm_factorize_p       (cholmod_sparse*  A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p   (A, beta, fset, fsize, L, &Common); }
+inline int  cm_factorize_p       (cholmod_sparse*  A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p   (A, beta, fset, fsize, L, &Common); }
 template<>
-int  cm_factorize_p<long> (cholmod_sparse*  A, double beta[2], long* fset,          std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
+inline int  cm_factorize_p<SuiteSparse_long> (cholmod_sparse*  A, double beta[2], SuiteSparse_long* fset,          std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
 #undef EIGEN_CHOLMOD_SPECIALIZE0
 #undef EIGEN_CHOLMOD_SPECIALIZE1
@ -254,10 +254,10 @@ class CholmodBase : public SparseSolverBase<Derived>
        internal::cm_free_factor<StorageIndex>(m_cholmodFactor, m_cholmod);
      internal::cm_finish<StorageIndex>(m_cholmod);
    }
-    
+
    inline StorageIndex cols() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
    inline StorageIndex rows() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
-    
+
    /** \brief Reports whether previous computation was successful.
      *
      * \returns \c Success if computation was successful,
@ -276,11 +276,11 @@ class CholmodBase : public SparseSolverBase<Derived>
      factorize(matrix);
      return derived();
    }
-    
+
    /** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
      *
      * This function is particularly useful when solving for several problems having the same structure.
-      * 
+      *
      * \sa factorize()
      */
    void analyzePattern(const MatrixType& matrix)
@ -292,13 +292,13 @@ class CholmodBase : public SparseSolverBase<Derived>
      }
      cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
      m_cholmodFactor = internal::cm_analyze<StorageIndex>(A, m_cholmod);
-      
+
      this->m_isInitialized = true;
      this->m_info = Success;
      m_analysisIsOk = true;
      m_factorizationIsOk = false;
    }
-    
+
    /** Performs a numeric decomposition of \a matrix
      *
      * The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed.
@ -315,11 +315,11 @@ class CholmodBase : public SparseSolverBase<Derived>
      this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue);
      m_factorizationIsOk = true;
    }
-    
+
    /** Returns a reference to the Cholmod's configuration structure to get a full control over the performed operations.
     *  See the Cholmod user guide for details. */
    cholmod_common& cholmod() { return m_cholmod; }
-    
+
    #ifndef EIGEN_PARSED_BY_DOXYGEN
    /** \internal */
    template<typename Rhs,typename Dest>
@ -329,7 +329,7 @@ class CholmodBase : public SparseSolverBase<Derived>
      const Index size = m_cholmodFactor->n;
      EIGEN_UNUSED_VARIABLE(size);
      eigen_assert(size==b.rows());
-      
+
      // Cholmod needs column-major storage without inner-stride, which corresponds to the default behavior of Ref.
      Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b.derived());
@ -345,7 +345,7 @@ class CholmodBase : public SparseSolverBase<Derived>
      dest = Matrix<Scalar,Dest::RowsAtCompileTime,Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),b.rows(),b.cols());
      internal::cm_free_dense<StorageIndex>(x_cd, m_cholmod);
    }
-    
+
    /** \internal */
    template<typename RhsDerived, typename DestDerived>
    void _solve_impl(const SparseMatrixBase<RhsDerived> &b, SparseMatrixBase<DestDerived> &dest) const
@ -370,8 +370,8 @@ class CholmodBase : public SparseSolverBase<Derived>
      internal::cm_free_sparse<StorageIndex>(x_cs, m_cholmod);
    }
    #endif // EIGEN_PARSED_BY_DOXYGEN
-    
+
-    
+
    /** Sets the shift parameter that will be used to adjust the diagonal coefficients during the numerical factorization.
      *
      * During the numerical factorization, an offset term is added to the diagonal coefficients:\n
@ -386,7 +386,7 @@ class CholmodBase : public SparseSolverBase<Derived>
      m_shiftOffset[0] = double(offset);
      return derived();
    }
-    
+
    /** \returns the determinant of the underlying matrix from the current factorization */
    Scalar determinant() const
    {
@ -441,7 +441,7 @@ class CholmodBase : public SparseSolverBase<Derived>
    template<typename Stream>
    void dumpMemory(Stream& /*s*/)
    {}
-    
+
  protected:
    mutable cholmod_common m_cholmod;
    cholmod_factor* m_cholmodFactor;
@ -478,11 +478,11 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
 {
    typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT> Base;
    using Base::m_cholmod;
-    
+
  public:
-    
+
    typedef _MatrixType MatrixType;
-    
+
    CholmodSimplicialLLT() : Base() { init(); }
    CholmodSimplicialLLT(const MatrixType& matrix) : Base()
@ -529,11 +529,11 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
 {
    typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT> Base;
    using Base::m_cholmod;
-    
+
  public:
-    
+
    typedef _MatrixType MatrixType;
-    
+
    CholmodSimplicialLDLT() : Base() { init(); }
    CholmodSimplicialLDLT(const MatrixType& matrix) : Base()
@ -578,11 +578,11 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
 {
    typedef CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT> Base;
    using Base::m_cholmod;
-    
+
  public:
-    
+
    typedef _MatrixType MatrixType;
-    
+
    CholmodSupernodalLLT() : Base() { init(); }
    CholmodSupernodalLLT(const MatrixType& matrix) : Base()
@ -629,11 +629,11 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
 {
    typedef CholmodBase<_MatrixType, _UpLo, CholmodDecomposition> Base;
    using Base::m_cholmod;
-    
+
  public:
-    
+
    typedef _MatrixType MatrixType;
-    
+
    CholmodDecomposition() : Base() { init(); }
    CholmodDecomposition(const MatrixType& matrix) : Base()
@ -643,7 +643,7 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
    }
    ~CholmodDecomposition() {}
-    
+
    void setMode(CholmodMode mode)
    {
      switch(mode)
--- a/Eigen/src/Core/ArithmeticSequence.h
+++ b/Eigen/src/Core/ArithmeticSequence.h
@ -29,17 +29,17 @@ template<int N> struct aseq_negate<FixedInt<N> > {
 template<> struct aseq_negate<FixedInt<DynamicIndex> > {};
 template<typename FirstType,typename SizeType,typename IncrType,
-         bool FirstIsSymbolic=Symbolic::is_symbolic<FirstType>::value,
+         bool FirstIsSymbolic=symbolic::is_symbolic<FirstType>::value,
-         bool SizeIsSymbolic =Symbolic::is_symbolic<SizeType>::value>
+         bool SizeIsSymbolic =symbolic::is_symbolic<SizeType>::value>
 struct aseq_reverse_first_type {
  typedef Index type;
 };
 template<typename FirstType,typename SizeType,typename IncrType>
 struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,true> {
-  typedef Symbolic::AddExpr<FirstType,
+  typedef symbolic::AddExpr<FirstType,
-                            Symbolic::ProductExpr<Symbolic::AddExpr<SizeType,Symbolic::ValueExpr<FixedInt<-1> > >,
+                            symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
-                                                  Symbolic::ValueExpr<IncrType> >
+                                                  symbolic::ValueExpr<IncrType> >
                           > type;
 };
@ -56,14 +56,14 @@ struct aseq_reverse_first_type_aux<SizeType,IncrType,typename internal::enable_i
 template<typename FirstType,typename SizeType,typename IncrType>
 struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,false> {
  typedef typename aseq_reverse_first_type_aux<SizeType,IncrType>::type Aux;
-  typedef Symbolic::AddExpr<FirstType,Symbolic::ValueExpr<Aux> > type;
+  typedef symbolic::AddExpr<FirstType,symbolic::ValueExpr<Aux> > type;
 };
 template<typename FirstType,typename SizeType,typename IncrType>
 struct aseq_reverse_first_type<FirstType,SizeType,IncrType,false,true> {
-  typedef Symbolic::AddExpr<Symbolic::ProductExpr<Symbolic::AddExpr<SizeType,Symbolic::ValueExpr<FixedInt<-1> > >,
+  typedef symbolic::AddExpr<symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
-                                                  Symbolic::ValueExpr<IncrType> >,
+                                                  symbolic::ValueExpr<IncrType> >,
-                            Symbolic::ValueExpr<> > type;
+                            symbolic::ValueExpr<> > type;
 };
 #endif
@ -225,10 +225,11 @@ auto seq(FirstType f, LastType l, IncrType incr)
               -typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr)) / CleanedIncrType(incr),
              CleanedIncrType(incr));
 }
-#else
+
 #else // EIGEN_HAS_CXX11
 template<typename FirstType,typename LastType>
-typename internal::enable_if<!(Symbolic::is_symbolic<FirstType>::value || Symbolic::is_symbolic<LastType>::value),
+typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
                             ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index> >::type
 seq(FirstType f, LastType l)
 {
@ -237,35 +238,35 @@ seq(FirstType f, LastType l)
 }
 template<typename FirstTypeDerived,typename LastType>
-typename internal::enable_if<!Symbolic::is_symbolic<LastType>::value,
+typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
-    ArithmeticSequence<FirstTypeDerived, Symbolic::AddExpr<Symbolic::AddExpr<Symbolic::NegateExpr<FirstTypeDerived>,Symbolic::ValueExpr<> >,
+    ArithmeticSequence<FirstTypeDerived, symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,symbolic::ValueExpr<> >,
-                                                            Symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
+                                                            symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
-seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, LastType l)
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l)
 {
  return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+fix<1>()));
 }
 template<typename FirstType,typename LastTypeDerived>
-typename internal::enable_if<!Symbolic::is_symbolic<FirstType>::value,
+typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
    ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
-                        Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived,Symbolic::ValueExpr<> >,
+                        symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
-                                          Symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
+                                          symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
-seq(FirstType f, const Symbolic::BaseExpr<LastTypeDerived> &l)
+seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l)
 {
  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),(l.derived()-typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>()));
 }
 template<typename FirstTypeDerived,typename LastTypeDerived>
 ArithmeticSequence<FirstTypeDerived,
-                    Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived,Symbolic::NegateExpr<FirstTypeDerived> >,Symbolic::ValueExpr<internal::FixedInt<1> > > >
+                    symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::NegateExpr<FirstTypeDerived> >,symbolic::ValueExpr<internal::FixedInt<1> > > >
-seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, const Symbolic::BaseExpr<LastTypeDerived> &l)
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l)
 {
  return seqN(f.derived(),(l.derived()-f.derived()+fix<1>()));
 }
 template<typename FirstType,typename LastType, typename IncrType>
-typename internal::enable_if<!(Symbolic::is_symbolic<FirstType>::value || Symbolic::is_symbolic<LastType>::value),
+typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
    ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index,typename internal::cleanup_seq_incr<IncrType>::type> >::type
 seq(FirstType f, LastType l, IncrType incr)
 {
@ -275,27 +276,27 @@ seq(FirstType f, LastType l, IncrType incr)
 }
 template<typename FirstTypeDerived,typename LastType, typename IncrType>
-typename internal::enable_if<!Symbolic::is_symbolic<LastType>::value,
+typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
    ArithmeticSequence<FirstTypeDerived,
-                        Symbolic::QuotientExpr<Symbolic::AddExpr<Symbolic::AddExpr<Symbolic::NegateExpr<FirstTypeDerived>,
+                        symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,
-                                                                                   Symbolic::ValueExpr<> >,
+                                                                                   symbolic::ValueExpr<> >,
-                                                                 Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                                                                 symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
-                                              Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                                              symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
                        typename internal::cleanup_seq_incr<IncrType>::type> >::type
-seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, LastType l, IncrType incr)
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l, IncrType incr)
 {
  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
  return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
 }
 template<typename FirstType,typename LastTypeDerived, typename IncrType>
-typename internal::enable_if<!Symbolic::is_symbolic<FirstType>::value,
+typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
    ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
-                        Symbolic::QuotientExpr<Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived,Symbolic::ValueExpr<> >,
+                        symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
-                                                                 Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                                                                 symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
-                                               Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                                               symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
                        typename internal::cleanup_seq_incr<IncrType>::type> >::type
-seq(FirstType f, const Symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
+seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
 {
  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
@ -304,26 +305,55 @@ seq(FirstType f, const Symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
 template<typename FirstTypeDerived,typename LastTypeDerived, typename IncrType>
 ArithmeticSequence<FirstTypeDerived,
-                    Symbolic::QuotientExpr<Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived,
+                    symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,
-                                                                               Symbolic::NegateExpr<FirstTypeDerived> >,
+                                                                               symbolic::NegateExpr<FirstTypeDerived> >,
-                                                             Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                                                             symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
-                                          Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
+                                          symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
                    typename internal::cleanup_seq_incr<IncrType>::type>
-seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, const Symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
+seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
 {
  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
  return seqN(f.derived(),(l.derived()-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
 }
-#endif
+#endif // EIGEN_HAS_CXX11
 #endif // EIGEN_PARSED_BY_DOXYGEN
 #if EIGEN_HAS_CXX11
 /** \cpp11
  * \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr.
  *
  * It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode
  * 
  * \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
 template<typename SizeType,typename IncrType>
 auto lastN(SizeType size, IncrType incr)
 -> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr))
 {
  return seqN(Eigen::last-(size-fix<1>())*incr, size, incr);
 }
 /** \cpp11
  * \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment.
  *
  *  It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode
  * 
  * \sa lastN(SizeType,IncrType), seqN(FirstType,SizeType), seq(FirstType,LastType) */
 template<typename SizeType>
 auto lastN(SizeType size)
 -> decltype(seqN(Eigen::last+fix<1>()-size, size))
 {
  return seqN(Eigen::last+fix<1>()-size, size);
 }
 #endif
 namespace internal {
 // Convert a symbolic span into a usable one (i.e., remove last/end "keywords")
 template<typename T>
 struct make_size_type {
-  typedef typename internal::conditional<Symbolic::is_symbolic<T>::value, Index, T>::type type;
+  typedef typename internal::conditional<symbolic::is_symbolic<T>::value, Index, T>::type type;
 };
 template<typename FirstType,typename SizeType,typename IncrType,int XprSize>
@ -345,6 +375,39 @@ struct get_compile_time_incr<ArithmeticSequence<FirstType,SizeType,IncrType> > {
 } // end namespace internal
 /** \namespace Eigen::indexing
  * \ingroup Core_Module
  * 
  * The sole purpose of this namespace is to be able to import all functions
  * and symbols that are expected to be used within operator() for indexing
  * and slicing. If you already imported the whole Eigen namespace:
  * \code using namespace Eigen; \endcode
  * then you are already all set. Otherwise, if you don't want/cannot import
  * the whole Eigen namespace, the following line:
  * \code using namespace Eigen::indexing; \endcode
  * is equivalent to:
  * \code
  using Eigen::all;
  using Eigen::seq;
  using Eigen::seqN;
  using Eigen::lastN; // c++11 only
  using Eigen::last;
  using Eigen::lastp1;
  using Eigen::fix;
  \endcode
  */
 namespace indexing {
  using Eigen::all;
  using Eigen::seq;
  using Eigen::seqN;
  #if EIGEN_HAS_CXX11
  using Eigen::lastN;
  #endif
  using Eigen::last;
  using Eigen::lastp1;
  using Eigen::fix;
 }
 } // end namespace Eigen
 #endif // EIGEN_ARITHMETIC_SEQUENCE_H
--- a/Eigen/src/Core/Array.h
+++ b/Eigen/src/Core/Array.h
@ -231,10 +231,16 @@ class Array
            : Base(other)
    { }
  private:
    struct PrivateType {};
  public:
    /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
+    EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other,
                              typename internal::enable_if<internal::is_convertible<typename OtherDerived::Scalar,Scalar>::value,
                                                           PrivateType>::type = PrivateType())
      : Base(other.derived())
    { }
--- a/Eigen/src/Core/ArrayBase.h
+++ b/Eigen/src/Core/ArrayBase.h
@ -175,7 +175,7 @@ template<typename Derived> class ArrayBase
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
 {
  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
@ -188,7 +188,7 @@ ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
 {
  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@ -201,7 +201,7 @@ ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
 {
  call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar,typename OtherDerived::Scalar>());
@ -214,7 +214,7 @@ ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
 {
  call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar,typename OtherDerived::Scalar>());
--- a/Eigen/src/Core/ArrayWrapper.h
+++ b/Eigen/src/Core/ArrayWrapper.h
@ -32,7 +32,8 @@ struct traits<ArrayWrapper<ExpressionType> >
  // Let's remove NestByRefBit
  enum {
    Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
-    Flags = Flags0 & ~NestByRefBit
+    LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
    Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
  };
 };
 }
@ -89,8 +90,8 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
    EIGEN_DEVICE_FUNC
    inline void evalTo(Dest& dst) const { dst = m_expression; }
    const typename internal::remove_all<NestedExpressionType>::type& 
    EIGEN_DEVICE_FUNC
    const typename internal::remove_all<NestedExpressionType>::type& 
    nestedExpression() const 
    {
      return m_expression;
@ -129,7 +130,8 @@ struct traits<MatrixWrapper<ExpressionType> >
  // Let's remove NestByRefBit
  enum {
    Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
-    Flags = Flags0 & ~NestByRefBit
+    LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
    Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
  };
 };
 }
--- a/Eigen/src/Core/Assign.h
+++ b/Eigen/src/Core/Assign.h
@ -16,7 +16,7 @@ namespace Eigen {
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
  ::lazyAssign(const DenseBase<OtherDerived>& other)
 {
  enum{
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@ -39,7 +39,7 @@ public:
  enum {
    DstAlignment = DstEvaluator::Alignment,
    SrcAlignment = SrcEvaluator::Alignment,
-    DstHasDirectAccess = DstFlags & DirectAccessBit,
+    DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit,
    JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
  };
@ -83,7 +83,7 @@ private:
                       && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
                       && (EIGEN_UNALIGNED_VECTORIZE  || int(JointAlignment)>=int(InnerRequiredAlignment)),
    MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
-    MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess
+    MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess)
                       && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
         so it's only good for large enough sizes. */
@ -97,7 +97,7 @@ private:
 public:
  enum {
-    Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal)
+    Traversal = (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal)
              : int(MayInnerVectorize)   ? int(InnerVectorizedTraversal)
              : int(MayLinearVectorize)  ? int(LinearVectorizedTraversal)
              : int(MaySliceVectorize)   ? int(SliceVectorizedTraversal)
@ -756,7 +756,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType
 // AssignmentKind must define a Kind typedef.
 template<typename DstShape, typename SrcShape> struct AssignmentKind;
-// Assignement kind defined in this file:
+// Assignment kind defined in this file:
 struct Dense2Dense {};
 struct EigenBase2EigenBase {};
@ -899,7 +899,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
    src.evalTo(dst);
  }
-  // NOTE The following two functions are templated to avoid their instanciation if not needed
+  // NOTE The following two functions are templated to avoid their instantiation if not needed
  //      This is needed because some expressions support evalTo only and/or have 'void' as scalar type.
  template<typename SrcScalarType>
  EIGEN_DEVICE_FUNC
--- a/Eigen/src/Core/Assign_MKL.h
+++ b/Eigen/src/Core/Assign_MKL.h
@ -84,7 +84,8 @@ class vml_assign_traits
  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE,EIGENTYPE>,   \
                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> {              \
    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType;                                            \
-    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) {                   \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) {                       \
      resize_if_allowed(dst, src, func);                                                                                        \
      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                       \
      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) {                                              \
        VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(),                                                        \
@ -144,7 +145,8 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil,  Ceil,   _)
                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> {            \
    typedef CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested,                                           \
                    const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> > SrcXprType;                         \
-    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) {                 \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) {                     \
      resize_if_allowed(dst, src, func);                                                                                      \
      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                     \
      VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.rhs().functor().m_other);                                       \
      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal)                                              \
--- a/Eigen/src/Core/BooleanRedux.h
+++ b/Eigen/src/Core/BooleanRedux.h
@ -76,7 +76,7 @@ struct any_unroller<Derived, Dynamic, Rows>
  * \sa any(), Cwise::operator<()
  */
 template<typename Derived>
-inline bool DenseBase<Derived>::all() const
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
 {
  typedef internal::evaluator<Derived> Evaluator;
  enum {
@ -100,7 +100,7 @@ inline bool DenseBase<Derived>::all() const
  * \sa all()
  */
 template<typename Derived>
-inline bool DenseBase<Derived>::any() const
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
 {
  typedef internal::evaluator<Derived> Evaluator;
  enum {
@ -124,7 +124,7 @@ inline bool DenseBase<Derived>::any() const
  * \sa all(), any()
  */
 template<typename Derived>
-inline Eigen::Index DenseBase<Derived>::count() const
+EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase<Derived>::count() const
 {
  return derived().template cast<bool>().template cast<Index>().sum();
 }
--- a/Eigen/src/Core/CommaInitializer.h
+++ b/Eigen/src/Core/CommaInitializer.h
@ -141,7 +141,7 @@ struct CommaInitializer
  * \sa CommaInitializer::finished(), class CommaInitializer
  */
 template<typename Derived>
-inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
 {
  return CommaInitializer<Derived>(*static_cast<Derived*>(this), s);
 }
@ -149,7 +149,7 @@ inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s
 /** \sa operator<<(const Scalar&) */
 template<typename Derived>
 template<typename OtherDerived>
-inline CommaInitializer<Derived>
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived>
 DenseBase<Derived>::operator<<(const DenseBase<OtherDerived>& other)
 {
  return CommaInitializer<Derived>(*static_cast<Derived *>(this), other);
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@ -134,19 +134,21 @@ private:
 // This helper allows m_outerStride to be eliminated entirely when it is known at compile time.
 template<typename Scalar,int OuterStride> class plainobjectbase_evaluator_data {
 public:
-  plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr)
+  EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr)
  {
-    EIGEN_ONLY_USED_FOR_DEBUG(outerStride);
+#ifndef EIGEN_INTERNAL_DEBUGGING
    EIGEN_UNUSED_VARIABLE(outerStride);
 #endif
    eigen_internal_assert(outerStride==OuterStride);
  }
-  Index outerStride() const { return OuterStride; }
+  EIGEN_DEVICE_FUNC Index outerStride() const { return OuterStride; }
  const Scalar *data;
 };
 template<typename Scalar> class plainobjectbase_evaluator_data<Scalar,Dynamic> {
 public:
-  plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {}
+  EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {}
-  Index outerStride() const { return m_outerStride; }
+  EIGEN_DEVICE_FUNC Index outerStride() const { return m_outerStride; }
  const Scalar *data;
 protected:
  Index m_outerStride;
@ -1034,7 +1036,7 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
    OuterStrideAtCompileTime = HasSameStorageOrderAsArgType
                             ? int(outer_stride_at_compile_time<ArgType>::ret)
                             : int(inner_stride_at_compile_time<ArgType>::ret),
-    MaskPacketAccessBit = (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0,
+    MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0,
    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,    
    FlagsRowMajorBit = XprType::Flags&RowMajorBit,
@ -1044,7 +1046,9 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
    Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit,
    PacketAlignment = unpacket_traits<PacketScalar>::alignment,
-    Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
+    Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic)
                             && (OuterStrideAtCompileTime!=0)
                             && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0)
  };
  typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
@ -1075,14 +1079,16 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block)
    : m_argImpl(block.nestedExpression()), 
      m_startRow(block.startRow()), 
-      m_startCol(block.startCol()) 
+      m_startCol(block.startCol()),
      m_linear_offset(ForwardLinearAccess?(ArgType::IsRowMajor ? block.startRow()*block.nestedExpression().cols() + block.startCol() : block.startCol()*block.nestedExpression().rows() + block.startRow()):0)
  { }
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  enum {
-    RowsAtCompileTime = XprType::RowsAtCompileTime
+    RowsAtCompileTime = XprType::RowsAtCompileTime,
    ForwardLinearAccess = (InnerPanel || int(XprType::IsRowMajor)==int(ArgType::IsRowMajor)) && bool(evaluator<ArgType>::Flags&LinearAccessBit)
  };
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@ -1094,7 +1100,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  CoeffReturnType coeff(Index index) const
  { 
-    return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+    if (ForwardLinearAccess)
      return m_argImpl.coeff(m_linear_offset.value() + index); 
    else
      return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@ -1106,7 +1115,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  Scalar& coeffRef(Index index)
  { 
-    return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+    if (ForwardLinearAccess)
      return m_argImpl.coeffRef(m_linear_offset.value() + index); 
    else
      return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
  }
  template<int LoadMode, typename PacketType>
@ -1120,8 +1132,11 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
  EIGEN_STRONG_INLINE
  PacketType packet(Index index) const 
  { 
-    return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+    if (ForwardLinearAccess)
-                                       RowsAtCompileTime == 1 ? index : 0);
+      return m_argImpl.template packet<LoadMode,PacketType>(m_linear_offset.value() + index);
    else
      return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
                                         RowsAtCompileTime == 1 ? index : 0);
  }
  template<int StoreMode, typename PacketType>
@ -1135,15 +1150,19 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
  EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketType& x) 
  {
-    return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+    if (ForwardLinearAccess)
-                                             RowsAtCompileTime == 1 ? index : 0,
+      return m_argImpl.template writePacket<StoreMode,PacketType>(m_linear_offset.value() + index, x);
-                                             x);
+    else
      return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
                                              RowsAtCompileTime == 1 ? index : 0,
                                              x);
  }
 protected:
  evaluator<ArgType> m_argImpl;
  const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
  const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
  const variable_if_dynamic<Index, ForwardLinearAccess ? Dynamic : 0> m_linear_offset;
 };
 // TODO: This evaluator does not actually use the child evaluator; 
--- a/Eigen/src/Core/CwiseBinaryOp.h
+++ b/Eigen/src/Core/CwiseBinaryOp.h
@ -158,7 +158,7 @@ public:
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
 {
  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
@ -171,7 +171,7 @@ MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
  */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
 {
  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@ -181,4 +181,3 @@ MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
 } // end namespace Eigen
 #endif // EIGEN_CWISE_BINARY_OP_H
--- a/Eigen/src/Core/CwiseNullaryOp.h
+++ b/Eigen/src/Core/CwiseNullaryOp.h
@ -105,7 +105,7 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
  */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
 {
  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
@ -131,7 +131,7 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
  */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@ -150,7 +150,7 @@ DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
  */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
 {
  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
@ -170,7 +170,7 @@ DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
  * \sa class CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
 {
  return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
@ -192,7 +192,7 @@ DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
  * \sa class CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(Index size, const Scalar& value)
 {
  return DenseBase<Derived>::NullaryExpr(size, internal::scalar_constant_op<Scalar>(value));
@ -208,7 +208,7 @@ DenseBase<Derived>::Constant(Index size, const Scalar& value)
  * \sa class CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(const Scalar& value)
 {
  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@ -220,7 +220,7 @@ DenseBase<Derived>::Constant(const Scalar& value)
  * \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@ -232,7 +232,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const
  * \sa LinSpaced(Scalar,Scalar)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@ -264,7 +264,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig
  * \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@ -276,7 +276,7 @@ DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
  * Special version for fixed size types which does not require the size parameter.
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@ -286,7 +286,7 @@ DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
 /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
 template<typename Derived>
-bool DenseBase<Derived>::isApproxToConstant
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApproxToConstant
 (const Scalar& val, const RealScalar& prec) const
 {
  typename internal::nested_eval<Derived,1>::type self(derived());
@ -301,7 +301,7 @@ bool DenseBase<Derived>::isApproxToConstant
  *
  * \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */
 template<typename Derived>
-bool DenseBase<Derived>::isConstant
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isConstant
 (const Scalar& val, const RealScalar& prec) const
 {
  return isApproxToConstant(val, prec);
@ -312,7 +312,7 @@ bool DenseBase<Derived>::isConstant
  * \sa setConstant(), Constant(), class CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
 {
  setConstant(val);
 }
@ -322,7 +322,7 @@ EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
  * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
 {
  return derived() = Constant(rows(), cols(), val);
 }
@ -337,7 +337,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
 {
  resize(size);
@ -356,7 +356,7 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
 {
  resize(rows, cols);
@ -380,7 +380,7 @@ PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
  * \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar>(low,high,newSize));
@ -400,7 +400,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, con
  * \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  return setLinSpaced(size(), low, high);
@ -423,7 +423,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low,
  * \sa Zero(), Zero(Index)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero(Index rows, Index cols)
 {
  return Constant(rows, cols, Scalar(0));
@ -446,7 +446,7 @@ DenseBase<Derived>::Zero(Index rows, Index cols)
  * \sa Zero(), Zero(Index,Index)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero(Index size)
 {
  return Constant(size, Scalar(0));
@ -463,7 +463,7 @@ DenseBase<Derived>::Zero(Index size)
  * \sa Zero(Index), Zero(Index,Index)
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Zero()
 {
  return Constant(Scalar(0));
@ -478,7 +478,7 @@ DenseBase<Derived>::Zero()
  * \sa class CwiseNullaryOp, Zero()
  */
 template<typename Derived>
-bool DenseBase<Derived>::isZero(const RealScalar& prec) const
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isZero(const RealScalar& prec) const
 {
  typename internal::nested_eval<Derived,1>::type self(derived());
  for(Index j = 0; j < cols(); ++j)
@ -496,7 +496,7 @@ bool DenseBase<Derived>::isZero(const RealScalar& prec) const
  * \sa class CwiseNullaryOp, Zero()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
 {
  return setConstant(Scalar(0));
 }
@ -511,7 +511,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
  * \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setZero(Index newSize)
 {
  resize(newSize);
@ -529,7 +529,7 @@ PlainObjectBase<Derived>::setZero(Index newSize)
  * \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setZero(Index rows, Index cols)
 {
  resize(rows, cols);
@ -553,7 +553,7 @@ PlainObjectBase<Derived>::setZero(Index rows, Index cols)
  * \sa Ones(), Ones(Index), isOnes(), class Ones
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones(Index rows, Index cols)
 {
  return Constant(rows, cols, Scalar(1));
@ -576,7 +576,7 @@ DenseBase<Derived>::Ones(Index rows, Index cols)
  * \sa Ones(), Ones(Index,Index), isOnes(), class Ones
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones(Index newSize)
 {
  return Constant(newSize, Scalar(1));
@ -593,7 +593,7 @@ DenseBase<Derived>::Ones(Index newSize)
  * \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Ones()
 {
  return Constant(Scalar(1));
@ -608,7 +608,7 @@ DenseBase<Derived>::Ones()
  * \sa class CwiseNullaryOp, Ones()
  */
 template<typename Derived>
-bool DenseBase<Derived>::isOnes
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isOnes
 (const RealScalar& prec) const
 {
  return isApproxToConstant(Scalar(1), prec);
@ -622,7 +622,7 @@ bool DenseBase<Derived>::isOnes
  * \sa class CwiseNullaryOp, Ones()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
 {
  return setConstant(Scalar(1));
 }
@ -637,7 +637,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
  * \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setOnes(Index newSize)
 {
  resize(newSize);
@ -655,7 +655,7 @@ PlainObjectBase<Derived>::setOnes(Index newSize)
  * \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
 PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
 {
  resize(rows, cols);
@ -679,7 +679,7 @@ PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
  * \sa Identity(), setIdentity(), isIdentity()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
 MatrixBase<Derived>::Identity(Index rows, Index cols)
 {
  return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_identity_op<Scalar>());
@ -696,7 +696,7 @@ MatrixBase<Derived>::Identity(Index rows, Index cols)
  * \sa Identity(Index,Index), setIdentity(), isIdentity()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
 MatrixBase<Derived>::Identity()
 {
  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@ -771,7 +771,7 @@ struct setIdentity_impl<Derived, true>
  * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
 {
  return internal::setIdentity_impl<Derived>::run(derived());
 }
@ -787,7 +787,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
  * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
 {
  derived().resize(rows, cols);
  return setIdentity();
@ -800,7 +800,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index
  * \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i);
@ -815,7 +815,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
  * \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
  return BasisReturnType(SquareMatrixType::Identity(),i);
@ -828,7 +828,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
 { return Derived::Unit(0); }
 /** \returns an expression of the Y axis unit vector (0,1{,0}^*)
@ -838,7 +838,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitZ(), MatrixBase::UnitW()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
 { return Derived::Unit(1); }
 /** \returns an expression of the Z axis unit vector (0,0,1{,0}^*)
@ -848,7 +848,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitW()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
 { return Derived::Unit(2); }
 /** \returns an expression of the W axis unit vector (0,0,0,1)
@ -858,9 +858,45 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
 { return Derived::Unit(3); }
 /** \brief Set the coefficients of \c *this to the i-th unit (basis) vector
  *
  * All coefficients are first set to zero, then coefficient \a i is set to \c Scalar(1).
  *
  * \param i index of the unique coefficient to be set to 1; must satisfy 0 <= \a i < size()
  *
  * \returns a reference to the derived object
  *
  * \only_for_vectors
  *
  * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
  */
 template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index i)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
  // Index is signed, so reject negative indices as well as indices past the end.
  eigen_assert(i>=0 && i<size());
  derived().setZero();
  derived().coeffRef(i) = Scalar(1);
  return derived();
 }
 /** \brief Resizes to the given \a newSize, and writes the i-th unit (basis) vector into *this.
  *
  * \param newSize the new size of the vector
  * \param i index of the unique coefficient to be set to 1; must satisfy 0 <= \a i < \a newSize
  *
  * \returns a reference to the derived object
  *
  * \only_for_vectors
  *
  * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
  */
 template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index newSize, Index i)
 {
  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
  // Validate the index before resizing; Index is signed, so negative values must be rejected too.
  eigen_assert(i>=0 && i<newSize);
  derived().resize(newSize);
  return setUnit(i);
 }
 } // end namespace Eigen
 #endif // EIGEN_CWISE_NULLARY_OP_H
--- a/Eigen/src/Core/DenseBase.h
+++ b/Eigen/src/Core/DenseBase.h
@ -157,6 +157,11 @@ template<typename Derived> class DenseBase
          * we are dealing with a column-vector (if there is only one column) or with
          * a row-vector (if there is only one row). */
      NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2,
        /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors, 
         * and 2 for matrices.
         */
      Flags = internal::traits<Derived>::Flags,
        /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
          * constructed from this one. See the \ref flags "list of flags".
@ -296,7 +301,7 @@ template<typename Derived> class DenseBase
    EIGEN_DEVICE_FUNC
    Derived& operator=(const ReturnByValue<OtherDerived>& func);
-    /** \ínternal
+    /** \internal
      * Copies \a other into *this without evaluating other. \returns a reference to *this.
      * \deprecated */
    template<typename OtherDerived>
@ -395,7 +400,7 @@ template<typename Derived> class DenseBase
      * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
      * a const reference, in order to avoid a useless copy.
      * 
-      * \warning Be carefull with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
+      * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
      */
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE EvalReturnType eval() const
@ -484,9 +489,9 @@ template<typename Derived> class DenseBase
      return derived().coeff(0,0);
    }
-    bool all() const;
+    EIGEN_DEVICE_FUNC bool all() const;
-    bool any() const;
+    EIGEN_DEVICE_FUNC bool any() const;
-    Index count() const;
+    EIGEN_DEVICE_FUNC Index count() const;
    typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
    typedef const VectorwiseOp<const Derived, Horizontal> ConstRowwiseReturnType;
--- a/Eigen/src/Core/DenseStorage.h
+++ b/Eigen/src/Core/DenseStorage.h
@ -61,7 +61,7 @@ struct plain_array
 #if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask)
 #elif EIGEN_GNUC_AT_LEAST(4,7) 
-  // GCC 4.7 is too aggressive in its optimizations and remove the alignement test based on the fact the array is declared to be aligned.
+  // GCC 4.7 is too aggressive in its optimizations and removes the alignment test based on the fact that the array is declared to be aligned.
  // See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900
  // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined:
  template<typename PtrType>
@ -207,7 +207,9 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
      EIGEN_UNUSED_VARIABLE(rows);
      EIGEN_UNUSED_VARIABLE(cols);
    }
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
      numext::swap(m_data, other.m_data);
    }
    // Both dimensions are the compile-time constants _Rows and _Cols, so the accessors are static.
    EIGEN_DEVICE_FUNC static Index rows()
    { return _Rows; }
    EIGEN_DEVICE_FUNC static Index cols()
    { return _Cols; }
    // No dimension is stored here: the three Index arguments are ignored.
    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index)
    {}
@ -267,7 +269,11 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
    }
    // Records the requested row and column counts; the first (unnamed) Index argument is ignored.
    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols)
      : m_rows(rows),
        m_cols(cols)
    {}
    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
-    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
+    {
      numext::swap(m_data,other.m_data);
      numext::swap(m_rows,other.m_rows);
      numext::swap(m_cols,other.m_cols);
    }
    // Both dimensions are stored at run time in m_rows / m_cols.
    EIGEN_DEVICE_FUNC Index rows() const
    { return m_rows; }
    EIGEN_DEVICE_FUNC Index cols() const
    { return m_cols; }
    // Only the stored dimensions are updated; the first (unnamed) Index argument is ignored.
    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols)
    {
      m_rows = rows;
      m_cols = cols;
    }
@ -296,7 +302,11 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
      return *this; 
    }
    // Records the requested row count; the first and last Index arguments are ignored.
    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index)
      : m_rows(rows)
    {}
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
    {
      numext::swap(m_data,other.m_data);
      numext::swap(m_rows,other.m_rows);
    }
    // The row count is stored in m_rows; the column count is the compile-time constant _Cols.
    EIGEN_DEVICE_FUNC Index rows() const
    { return m_rows; }
    EIGEN_DEVICE_FUNC Index cols() const
    { return _Cols; }
    // Only the row count can change; the first and last Index arguments are ignored.
    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index)
    {
      m_rows = rows;
    }
@ -325,11 +335,14 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
      return *this;
    }
    // Records the requested column count; the first two Index arguments are ignored.
    EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols)
      : m_cols(cols)
    {}
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
      numext::swap(m_data,other.m_data);
      numext::swap(m_cols,other.m_cols);
    }
    // The row count is the compile-time constant _Rows; the column count is stored in m_cols.
    EIGEN_DEVICE_FUNC Index rows() const
    { return _Rows; }
    EIGEN_DEVICE_FUNC Index cols() const
    { return m_cols; }
-    void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
-    void resize(Index, Index, Index cols) { m_cols = cols; }
+    EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; }
    // Read-only and mutable access to the coefficient buffer held in m_data.array.
    EIGEN_DEVICE_FUNC const T *data() const
    { return m_data.array; }
    EIGEN_DEVICE_FUNC T *data()
    { return m_data.array; }
 };
@ -381,16 +394,19 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
    EIGEN_DEVICE_FUNC
    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
    {
-      using std::swap;
+      numext::swap(m_data, other.m_data);
-      swap(m_data, other.m_data);
+      numext::swap(m_rows, other.m_rows);
-      swap(m_rows, other.m_rows);
+      numext::swap(m_cols, other.m_cols);
-      swap(m_cols, other.m_cols);
      return *this;
    }
 #endif
    // Releases m_data, passing m_rows*m_cols as the element count; the DontAlign bit of _Options
    // selects the boolean template argument of the delete helper.
    EIGEN_DEVICE_FUNC ~DenseStorage()
    {
      internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols);
    }
    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
-    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
+    {
      numext::swap(m_data,other.m_data);
      numext::swap(m_rows,other.m_rows);
      numext::swap(m_cols,other.m_cols);
    }
    // Both dimensions are stored at run time in m_rows / m_cols.
    EIGEN_DEVICE_FUNC Index rows() const
    { return m_rows; }
    EIGEN_DEVICE_FUNC Index cols() const
    { return m_cols; }
    void conservativeResize(Index size, Index rows, Index cols)
@ -459,14 +475,16 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
    EIGEN_DEVICE_FUNC
    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
    {
-      using std::swap;
+      numext::swap(m_data, other.m_data);
-      swap(m_data, other.m_data);
+      numext::swap(m_cols, other.m_cols);
-      swap(m_cols, other.m_cols);
      return *this;
    }
 #endif
    // Releases m_data, passing _Rows*m_cols as the element count; the DontAlign bit of _Options
    // selects the boolean template argument of the delete helper.
    EIGEN_DEVICE_FUNC ~DenseStorage()
    {
      internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols);
    }
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
      numext::swap(m_data,other.m_data);
      numext::swap(m_cols,other.m_cols);
    }
    // The row count is the compile-time constant _Rows (static accessor); the column count is stored in m_cols.
    EIGEN_DEVICE_FUNC static Index rows()
    { return _Rows; }
    EIGEN_DEVICE_FUNC Index cols() const
    { return m_cols; }
    EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
@ -533,14 +551,16 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
    EIGEN_DEVICE_FUNC
    DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
    {
-      using std::swap;
+      numext::swap(m_data, other.m_data);
-      swap(m_data, other.m_data);
+      numext::swap(m_rows, other.m_rows);
-      swap(m_rows, other.m_rows);
      return *this;
    }
 #endif
    // Releases m_data, passing _Cols*m_rows as the element count; the DontAlign bit of _Options
    // selects the boolean template argument of the delete helper.
    EIGEN_DEVICE_FUNC ~DenseStorage()
    {
      internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows);
    }
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
      numext::swap(m_data,other.m_data);
      numext::swap(m_rows,other.m_rows);
    }
    // The row count is stored in m_rows; the column count is the compile-time constant _Cols (static accessor).
    EIGEN_DEVICE_FUNC Index rows() const
    { return m_rows; }
    EIGEN_DEVICE_FUNC static Index cols()
    { return _Cols; }
    void conservativeResize(Index size, Index rows, Index)
--- a/Eigen/src/Core/Diagonal.h
+++ b/Eigen/src/Core/Diagonal.h
@ -70,7 +70,10 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
    EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)
    EIGEN_DEVICE_FUNC
-    explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
+    explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index)
    {
      eigen_assert( a_index <= m_matrix.cols() && -a_index <= m_matrix.rows() );
    }
    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)
@ -184,7 +187,7 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
  *
  * \sa class Diagonal */
 template<typename Derived>
-inline typename MatrixBase<Derived>::DiagonalReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalReturnType
 MatrixBase<Derived>::diagonal()
 {
  return DiagonalReturnType(derived());
@ -192,7 +195,7 @@ MatrixBase<Derived>::diagonal()
 /** This is the const version of diagonal(). */
 template<typename Derived>
-inline typename MatrixBase<Derived>::ConstDiagonalReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalReturnType
 MatrixBase<Derived>::diagonal() const
 {
  return ConstDiagonalReturnType(derived());
@ -210,7 +213,7 @@ MatrixBase<Derived>::diagonal() const
  *
  * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
-inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
 MatrixBase<Derived>::diagonal(Index index)
 {
  return DiagonalDynamicIndexReturnType(derived(), index);
@ -218,7 +221,7 @@ MatrixBase<Derived>::diagonal(Index index)
 /** This is the const version of diagonal(Index). */
 template<typename Derived>
-inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
 MatrixBase<Derived>::diagonal(Index index) const
 {
  return ConstDiagonalDynamicIndexReturnType(derived(), index);
@ -237,6 +240,7 @@ MatrixBase<Derived>::diagonal(Index index) const
  * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
 template<int Index_>
 EIGEN_DEVICE_FUNC
 inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal()
 {
@ -246,6 +250,7 @@ MatrixBase<Derived>::diagonal()
 /** This is the const version of diagonal<int>(). */
 template<typename Derived>
 template<int Index_>
 EIGEN_DEVICE_FUNC
 inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal() const
 {
--- a/Eigen/src/Core/DiagonalMatrix.h
+++ b/Eigen/src/Core/DiagonalMatrix.h
@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase<Derived>
    // Returns the derived diagonal expression converted to a DenseMatrixType value.
    EIGEN_DEVICE_FUNC
    DenseMatrixType toDenseMatrix() const
    {
      return derived();
    }
-    
+
    // Read-only access to the diagonal coefficients; forwards to the derived class' diagonal().
    EIGEN_DEVICE_FUNC
    inline const DiagonalVectorType& diagonal() const
    {
      return derived().diagonal();
    }
    EIGEN_DEVICE_FUNC
@ -273,7 +273,7 @@ class DiagonalWrapper
  * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal()
  **/
 template<typename Derived>
-inline const DiagonalWrapper<const Derived>
+EIGEN_DEVICE_FUNC inline const DiagonalWrapper<const Derived>
 MatrixBase<Derived>::asDiagonal() const
 {
  return DiagonalWrapper<const Derived>(derived());
--- a/Eigen/src/Core/DiagonalProduct.h
+++ b/Eigen/src/Core/DiagonalProduct.h
@ -17,7 +17,7 @@ namespace Eigen {
  */
 template<typename Derived>
 template<typename DiagonalDerived>
-inline const Product<Derived, DiagonalDerived, LazyProduct>
+EIGEN_DEVICE_FUNC inline const Product<Derived, DiagonalDerived, LazyProduct>
 MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
 {
  return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived());
--- a/Eigen/src/Core/Dot.h
+++ b/Eigen/src/Core/Dot.h
@ -31,7 +31,8 @@ struct dot_nocheck
  typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
  typedef typename conj_prod::result_type ResScalar;
  EIGEN_DEVICE_FUNC
-  static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
+  EIGEN_STRONG_INLINE
  static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
  {
    return a.template binaryExpr<conj_prod>(b).sum();
  }
@ -43,7 +44,8 @@ struct dot_nocheck<T, U, true>
  typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
  typedef typename conj_prod::result_type ResScalar;
  EIGEN_DEVICE_FUNC
-  static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
+  EIGEN_STRONG_INLINE
  static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
  {
    return a.transpose().template binaryExpr<conj_prod>(b).sum();
  }
@ -65,6 +67,7 @@ struct dot_nocheck<T, U, true>
 template<typename Derived>
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE
 typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
 MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
 {
@ -90,7 +93,7 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
  * \sa dot(), norm(), lpNorm()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
 {
  return numext::real((*this).cwiseAbs2().sum());
 }
@ -102,7 +105,7 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
  * \sa lpNorm(), dot(), squaredNorm()
  */
 template<typename Derived>
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
 {
  return numext::sqrt(squaredNorm());
 }
@ -117,7 +120,7 @@ inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real Matr
  * \sa norm(), normalize()
  */
 template<typename Derived>
-inline const typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::normalized() const
 {
  typedef typename internal::nested_eval<Derived,2>::type _Nested;
@ -139,7 +142,7 @@ MatrixBase<Derived>::normalized() const
  * \sa norm(), normalized()
  */
 template<typename Derived>
-inline void MatrixBase<Derived>::normalize()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
 {
  RealScalar z = squaredNorm();
  // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
@ -160,7 +163,7 @@ inline void MatrixBase<Derived>::normalize()
  * \sa stableNorm(), stableNormalize(), normalized()
  */
 template<typename Derived>
-inline const typename MatrixBase<Derived>::PlainObject
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::stableNormalized() const
 {
  typedef typename internal::nested_eval<Derived,3>::type _Nested;
@ -185,7 +188,7 @@ MatrixBase<Derived>::stableNormalized() const
  * \sa stableNorm(), stableNormalized(), normalize()
  */
 template<typename Derived>
-inline void MatrixBase<Derived>::stableNormalize()
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
 {
  RealScalar w = cwiseAbs().maxCoeff();
  RealScalar z = (derived()/w).squaredNorm();
@ -257,9 +260,9 @@ struct lpNorm_selector<Derived, Infinity>
 template<typename Derived>
 template<int p>
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+EIGEN_DEVICE_FUNC inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 #else
-MatrixBase<Derived>::RealScalar
+EIGEN_DEVICE_FUNC MatrixBase<Derived>::RealScalar
 #endif
 MatrixBase<Derived>::lpNorm() const
 {
--- a/Eigen/src/Core/EigenBase.h
+++ b/Eigen/src/Core/EigenBase.h
@ -14,6 +14,7 @@
 namespace Eigen {
 /** \class EigenBase
  * \ingroup Core_Module
  * 
  * Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
  *
@ -128,6 +129,7 @@ template<typename Derived> struct EigenBase
  */
 template<typename Derived>
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
 Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
 {
  call_assignment(derived(), other.derived());
@ -136,6 +138,7 @@ Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
 template<typename Derived>
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
 Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
 {
  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@ -144,6 +147,7 @@ Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
 template<typename Derived>
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
 Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
 {
  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
--- a/Eigen/src/Core/Fuzzy.h
+++ b/Eigen/src/Core/Fuzzy.h
@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector<Derived, true>
  */
 template<typename Derived>
 template<typename OtherDerived>
-bool DenseBase<Derived>::isApprox(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(
  const DenseBase<OtherDerived>& other,
  const RealScalar& prec
 ) const
@ -122,7 +122,7 @@ bool DenseBase<Derived>::isApprox(
  * \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const
  */
 template<typename Derived>
-bool DenseBase<Derived>::isMuchSmallerThan(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
  const typename NumTraits<Scalar>::Real& other,
  const RealScalar& prec
 ) const
@ -142,7 +142,7 @@ bool DenseBase<Derived>::isMuchSmallerThan(
  */
 template<typename Derived>
 template<typename OtherDerived>
-bool DenseBase<Derived>::isMuchSmallerThan(
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
  const DenseBase<OtherDerived>& other,
  const RealScalar& prec
 ) const
--- a/Eigen/src/Core/GeneralProduct.h
+++ b/Eigen/src/Core/GeneralProduct.h
@ -18,18 +18,33 @@ enum {
  Small = 3
 };
 // Define the threshold value to fallback from the generic matrix-matrix product
 // implementation (heavy) to the lightweight coeff-based product one.
 // See generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
 // in products/GeneralMatrixMatrix.h for more details.
 // TODO This threshold should also be used in the compile-time selector below.
 #ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD
 // This default value has been obtained on a Haswell architecture.
 #define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20
 #endif
 namespace internal {
 template<int Rows, int Cols, int Depth> struct product_type_selector;
 template<int Size, int MaxSize> struct product_size_category
 {
-  enum { is_large = MaxSize == Dynamic ||
+  enum {
-                    Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
+    #ifndef EIGEN_GPU_COMPILE_PHASE
-                    (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
+    is_large = MaxSize == Dynamic ||
-         value = is_large  ? Large
+               Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
-               : Size == 1 ? 1
+               (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
-                           : Small
+    #else
    is_large = 0,
    #endif
    value = is_large  ? Large
          : Size == 1 ? 1
                      : Small
  };
 };
@ -148,13 +163,13 @@ template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vect
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,false>
 {
-  EIGEN_STRONG_INLINE  Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
 };
 template<typename Scalar,int Size>
 struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 {
-  EIGEN_STRONG_INLINE Scalar* data() { return 0; }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; }
 };
 template<typename Scalar,int Size,int MaxSize>
@ -379,10 +394,9 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
  *
  * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
  */
 #ifndef __CUDACC__
 template<typename Derived>
 template<typename OtherDerived>
 EIGEN_DEVICE_FUNC
 inline const Product<Derived, OtherDerived>
 MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
 {
@ -412,8 +426,6 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
  return Product<Derived, OtherDerived>(derived(), other.derived());
 }
 #endif // __CUDACC__
 /** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
  *
  * The returned product will behave like any other expressions: the coefficients of the product will be
@ -428,7 +440,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
 template<typename Derived>
 template<typename OtherDerived>
 const Product<Derived,OtherDerived,LazyProduct>
-MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
+EIGEN_DEVICE_FUNC MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
 {
  enum {
    ProductIsValid =  Derived::ColsAtCompileTime==Dynamic
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@ -82,7 +82,11 @@ struct default_packet_traits
    HasPolygamma = 0,
    HasErf = 0,
    HasErfc = 0,
    HasI0e = 0,
    HasI1e = 0,
    HasIGamma = 0,
    HasIGammaDerA = 0,
    HasGammaSampleDerAlpha = 0,
    HasIGammac = 0,
    HasBetaInc = 0,
@ -231,7 +235,7 @@ pload1(const typename unpacket_traits<Packet>::type  *a) { return pset1<Packet>(
  * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
  * Currently, this function is only used for scalar * complex products.
  */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
 ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
 /** \internal \returns a packet with elements of \a *from quadrupled.
@ -279,7 +283,7 @@ inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
 }
 /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
 plset(const typename unpacket_traits<Packet>::type& a) { return a; }
 /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
@ -299,7 +303,9 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
 /** \internal tries to do cache prefetching of \a addr */
 template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
 {
-#ifdef __CUDA_ARCH__
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
  // do nothing
 #elif defined(EIGEN_CUDA_ARCH)
 #if defined(__LP64__)
  // 64-bit pointer operand constraint for inlined asm
  asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
@ -324,13 +330,13 @@ preduxp(const Packet* vecs) { return vecs[0]; }
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
 { return a; }
-/** \internal \returns the sum of the elements of \a a by block of 4 elements.
+/** \internal \returns the sum of the elements of upper and lower half of \a a if \a a is larger than 4.
  * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
  * For packet-size smaller or equal to 4, this boils down to a noop.
  */
 template<typename Packet> EIGEN_DEVICE_FUNC inline
 typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
-predux_downto4(const Packet& a)
+predux_half_dowto4(const Packet& a)
 { return a; }
 /** \internal \returns the product of the elements of \a a*/
@ -487,7 +493,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro
  * by the current computation.
  */
 template<typename Packet, int LoadMode>
-inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
 {
  return ploadt<Packet, LoadMode>(from);
 }
@ -526,7 +532,7 @@ inline void palign(PacketType& first, const PacketType& second)
 ***************************************************************************/
 // Eigen+CUDA does not support complexes.
-#ifndef __CUDACC__
+#if !defined(EIGEN_GPUCC)
 // Product of two std::complex<float> values written out component-wise:
 // real part = ar*br - ai*bi, imaginary part = ai*br + ar*bi.
 template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
 {
  const float re = real(a)*real(b) - imag(a)*imag(b);
  const float im = imag(a)*real(b) + real(a)*imag(b);
  return std::complex<float>(re, im);
 }
--- a/Eigen/src/Core/GlobalFunctions.h
+++ b/Eigen/src/Core/GlobalFunctions.h
@ -66,6 +66,7 @@ namespace Eigen
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf)
@ -89,7 +90,7 @@ namespace Eigen
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite)
  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign)
-  
+
  /** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent.
    *
    * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar).
@ -103,17 +104,18 @@ namespace Eigen
  inline const CwiseBinaryOp<internal::scalar_pow_op<Derived::Scalar,ScalarExponent>,Derived,Constant<ScalarExponent> >
  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent);
 #else
-  template<typename Derived,typename ScalarExponent>
+  template <typename Derived,typename ScalarExponent>
-  inline typename internal::enable_if<   !(internal::is_same<typename Derived::Scalar,ScalarExponent>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent),
+  EIGEN_DEVICE_FUNC inline
-          const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,ScalarExponent,pow) >::type
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
-  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent) {
+    const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<typename Derived::Scalar
-    return x.derived().pow(exponent);
+                                                 EIGEN_COMMA ScalarExponent EIGEN_COMMA
-  }
+                                                 EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type,pow))
-
+  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent)
-  template<typename Derived>
+  {
-  inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename Derived::Scalar,pow)
+    typedef typename internal::promote_scalar_arg<typename Derived::Scalar,ScalarExponent,
-  pow(const Eigen::ArrayBase<Derived>& x, const typename Derived::Scalar& exponent) {
+                                                  EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type PromotedExponent;
-    return x.derived().pow(exponent);
+    return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedExponent,pow)(x.derived(),
           typename internal::plain_constant_type<Derived,PromotedExponent>::type(x.derived().rows(), x.derived().cols(), internal::scalar_constant_op<PromotedExponent>(exponent)));
  }
 #endif
@ -123,21 +125,21 @@ namespace Eigen
    *
    * Example: \include Cwise_array_power_array.cpp
    * Output: \verbinclude Cwise_array_power_array.out
-    * 
+    *
    * \sa ArrayBase::pow()
    *
    * \relates ArrayBase
    */
  template<typename Derived,typename ExponentDerived>
  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
-  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents) 
+  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents)
  {
    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
      x.derived(),
      exponents.derived()
    );
  }
-  
+
  /** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents.
    *
    * This function computes the coefficient-wise power between a scalar and an array of exponents.
@ -146,7 +148,7 @@ namespace Eigen
    *
    * Example: \include Cwise_scalar_power_array.cpp
    * Output: \verbinclude Cwise_scalar_power_array.out
-    * 
+    *
    * \sa ArrayBase::pow()
    *
    * \relates ArrayBase
@ -156,21 +158,17 @@ namespace Eigen
  inline const CwiseBinaryOp<internal::scalar_pow_op<Scalar,Derived::Scalar>,Constant<Scalar>,Derived>
  pow(const Scalar& x,const Eigen::ArrayBase<Derived>& x);
 #else
-  template<typename Scalar, typename Derived>
+  template <typename Scalar, typename Derived>
-  inline typename internal::enable_if<   !(internal::is_same<typename Derived::Scalar,Scalar>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar),
+  EIGEN_DEVICE_FUNC inline
-          const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow) >::type
+  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
-  pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
+    const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<typename Derived::Scalar
-  {
+                                                 EIGEN_COMMA Scalar EIGEN_COMMA
-    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow)(
+                                                 EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type,Derived,pow))
-            typename internal::plain_constant_type<Derived,Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
+  pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents) {
-  }
+    typedef typename internal::promote_scalar_arg<typename Derived::Scalar,Scalar,
-
+                                                  EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type PromotedScalar;
-  template<typename Derived>
+    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedScalar,Derived,pow)(
-  inline const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)
+           typename internal::plain_constant_type<Derived,PromotedScalar>::type(exponents.derived().rows(), exponents.derived().cols(), internal::scalar_constant_op<PromotedScalar>(x)), exponents.derived());
  pow(const typename Derived::Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
  {
    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)(
      typename internal::plain_constant_type<Derived,typename Derived::Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
  }
 #endif
--- a/Eigen/src/Core/Map.h
+++ b/Eigen/src/Core/Map.h
@ -20,11 +20,17 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
 {
  typedef traits<PlainObjectType> TraitsBase;
  enum {
    PlainObjectTypeInnerSize = ((traits<PlainObjectType>::Flags&RowMajorBit)==RowMajorBit)
                             ? PlainObjectType::ColsAtCompileTime
                             : PlainObjectType::RowsAtCompileTime,
    InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
                             ? int(PlainObjectType::InnerStrideAtCompileTime)
                             : int(StrideType::InnerStrideAtCompileTime),
    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
-                             ? int(PlainObjectType::OuterStrideAtCompileTime)
+                             ? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic
                                ? Dynamic
                                : int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize))
                             : int(StrideType::OuterStrideAtCompileTime),
    Alignment = int(MapOptions)&int(AlignedMask),
    Flags0 = TraitsBase::Flags & (~NestByRefBit),
@ -108,9 +114,10 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
    inline Index outerStride() const
    {
      return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
-           : IsVectorAtCompileTime ? this->size()
+           : internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
-           : int(Flags)&RowMajorBit ? this->cols()
+           : IsVectorAtCompileTime ? (this->size() * innerStride())
-           : this->rows();
+           : int(Flags)&RowMajorBit ? (this->cols() * innerStride())
           : (this->rows() * innerStride());
    }
    /** Constructor in the fixed-size case.
--- a/Eigen/src/Core/MapBase.h
+++ b/Eigen/src/Core/MapBase.h
@ -43,6 +43,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
    enum {
      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
      InnerStrideAtCompileTime = internal::traits<Derived>::InnerStrideAtCompileTime,
      SizeAtCompileTime = Base::SizeAtCompileTime
    };
@ -187,8 +188,11 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
    void checkSanity(typename internal::enable_if<(internal::traits<T>::Alignment>0),void*>::type = 0) const
    {
 #if EIGEN_MAX_ALIGN_BYTES>0
      // innerStride() is not set yet when this function is called, so we optimistically assume the lowest plausible value:
      const Index minInnerStride = InnerStrideAtCompileTime == Dynamic ? 1 : Index(InnerStrideAtCompileTime);
      EIGEN_ONLY_USED_FOR_DEBUG(minInnerStride);
      eigen_assert((   ((internal::UIntPtr(m_data) % internal::traits<Derived>::Alignment) == 0)
-                    || (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
+                    || (cols() * rows() * minInnerStride * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
 #endif
    }
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@ -96,7 +96,7 @@ struct real_default_impl<Scalar,true>
 template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
-#ifdef __CUDA_ARCH__
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 template<typename T>
 struct real_impl<std::complex<T> >
 {
@ -144,7 +144,7 @@ struct imag_default_impl<Scalar,true>
 template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
-#ifdef __CUDA_ARCH__
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 template<typename T>
 struct imag_impl<std::complex<T> >
 {
@ -238,7 +238,7 @@ struct imag_ref_retval
 ****************************************************************************/
 template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
-struct conj_impl
+struct conj_default_impl
 {
  EIGEN_DEVICE_FUNC
  static inline Scalar run(const Scalar& x)
@ -248,7 +248,7 @@ struct conj_impl
 };
 template<typename Scalar>
-struct conj_impl<Scalar,true>
+struct conj_default_impl<Scalar,true>
 {
  EIGEN_DEVICE_FUNC
  static inline Scalar run(const Scalar& x)
@ -258,6 +258,20 @@ struct conj_impl<Scalar,true>
  }
 };
 template<typename Scalar> struct conj_impl : conj_default_impl<Scalar> {};
 #if defined(EIGEN_GPU_COMPILE_PHASE)
 // Partial specialization of conj_impl for std::complex<T>, compiled only when
 // EIGEN_GPU_COMPILE_PHASE is defined. The conjugate is assembled by hand from
 // the two components of the argument.
 template<typename T>
 struct conj_impl<std::complex<T> >
 {
  // Returns the complex number that has the same real part as x and the
  // negated imaginary part of x.
  EIGEN_DEVICE_FUNC
  static inline std::complex<T> run(const std::complex<T>& x)
  {
    const T real_part = x.real();
    const T flipped_imag = -x.imag();
    return std::complex<T>(real_part, flipped_imag);
  }
 };
 #endif
 template<typename Scalar>
 struct conj_retval
 {
@ -347,31 +361,7 @@ struct norm1_retval
 * Implementation of hypot                                                *
 ****************************************************************************/
-template<typename Scalar>
+template<typename Scalar> struct hypot_impl;
 struct hypot_impl
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
  static inline RealScalar run(const Scalar& x, const Scalar& y)
  {
    EIGEN_USING_STD_MATH(abs);
    EIGEN_USING_STD_MATH(sqrt);
    RealScalar _x = abs(x);
    RealScalar _y = abs(y);
    Scalar p, qp;
    if(_x>_y)
    {
      p = _x;
      qp = _y / p;
    }
    else
    {
      p = _y;
      qp = _x / p;
    }
    if(p==RealScalar(0)) return RealScalar(0);
    return p * sqrt(RealScalar(1) + qp*qp);
  }
 };
 template<typename Scalar>
 struct hypot_retval
@ -445,7 +435,12 @@ struct round_retval
  struct arg_impl {
    static inline Scalar run(const Scalar& x)
    {
      #if defined(EIGEN_HIP_DEVICE_COMPILE)
      // HIP does not seem to have a native device side implementation for the math routine "arg"
      using std::arg;
      #else 		  
      EIGEN_USING_STD_MATH(arg);
      #endif
      return arg(x);
    }
  };
@ -497,11 +492,11 @@ namespace std_fallback {
    EIGEN_USING_STD_MATH(exp);
    Scalar u = exp(x);
-    if (u == Scalar(1)) {
+    if (numext::equal_strict(u, Scalar(1))) {
      return x;
    }
    Scalar um1 = u - RealScalar(1);
-    if (um1 == Scalar(-1)) {
+    if (numext::equal_strict(um1, Scalar(-1))) {
      return RealScalar(-1);
    }
@ -512,7 +507,7 @@ namespace std_fallback {
 template<typename Scalar>
 struct expm1_impl {
-  static inline Scalar run(const Scalar& x)
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
  {
    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
    #if EIGEN_HAS_CXX11_MATH
@ -543,13 +538,13 @@ namespace std_fallback {
    typedef typename NumTraits<Scalar>::Real RealScalar;
    EIGEN_USING_STD_MATH(log);
    Scalar x1p = RealScalar(1) + x;
-    return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
+    return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
  }
 }
 template<typename Scalar>
 struct log1p_impl {
-  static inline Scalar run(const Scalar& x)
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
  {
    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
    #if EIGEN_HAS_CXX11_MATH
@ -689,20 +684,27 @@ struct random_default_impl<Scalar, false, true>
 {
  static inline Scalar run(const Scalar& x, const Scalar& y)
  {
-    typedef typename conditional<NumTraits<Scalar>::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX;
+    if (y <= x)
    if(y<x)
      return x;
-    // the following difference might overflow on a 32 bits system,
+    // ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself.
-    // but since y>=x the result converted to an unsigned long is still correct.
+    typedef typename make_unsigned<Scalar>::type ScalarU;
-    std::size_t range = ScalarX(y)-ScalarX(x);
+    // ScalarX is the widest of ScalarU and unsigned int.
-    std::size_t offset = 0;
+    // We'll deal only with ScalarX and unsigned int below thus avoiding signed
-    // rejection sampling
+    // types and arithmetic and signed overflows (which are undefined behavior).
-    std::size_t divisor = 1;
+    typedef typename conditional<(ScalarU(-1) > unsigned(-1)), ScalarU, unsigned>::type ScalarX;
-    std::size_t multiplier = 1;
+    // The following difference doesn't overflow, provided our integer types are two's
-    if(range<RAND_MAX) divisor = (std::size_t(RAND_MAX)+1)/(range+1);
+    // complement and have the same number of padding bits in signed and unsigned variants.
-    else               multiplier = 1 + range/(std::size_t(RAND_MAX)+1);
+    // This is the case in most modern implementations of C++.
    ScalarX range = ScalarX(y) - ScalarX(x);
    ScalarX offset = 0;
    ScalarX divisor = 1;
    ScalarX multiplier = 1;
    const unsigned rand_max = RAND_MAX;
    if (range <= rand_max) divisor = (rand_max + 1) / (range + 1);
    else                   multiplier = 1 + range / (rand_max + 1);
    // Rejection sampling.
    do {
-      offset = (std::size_t(std::rand()) * multiplier) / divisor;
+      offset = (unsigned(std::rand()) * multiplier) / divisor;
    } while (offset > range);
    return Scalar(ScalarX(x) + offset);
  }
@ -749,7 +751,7 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random()
  return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
 }
-// Implementatin of is* functions
+// Implementation of is* functions
 // std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang.
 #if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG)
@ -778,7 +780,7 @@ EIGEN_DEVICE_FUNC
 typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
 isfinite_impl(const T& x)
 {
-  #ifdef __CUDA_ARCH__
+  #if defined(EIGEN_GPU_COMPILE_PHASE)
    return (::isfinite)(x);
  #elif EIGEN_USE_STD_FPCLASSIFY
    using std::isfinite;
@ -793,7 +795,7 @@ EIGEN_DEVICE_FUNC
 typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
 isinf_impl(const T& x)
 {
-  #ifdef __CUDA_ARCH__
+  #if defined(EIGEN_GPU_COMPILE_PHASE)
    return (::isinf)(x);
  #elif EIGEN_USE_STD_FPCLASSIFY
    using std::isinf;
@ -808,7 +810,7 @@ EIGEN_DEVICE_FUNC
 typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
 isnan_impl(const T& x)
 {
-  #ifdef __CUDA_ARCH__
+  #if defined(EIGEN_GPU_COMPILE_PHASE)
    return (::isnan)(x);
  #elif EIGEN_USE_STD_FPCLASSIFY
    using std::isnan;
@ -874,7 +876,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
 namespace numext {
-#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
+#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) 
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
@ -890,84 +892,6 @@ EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
  EIGEN_USING_STD_MATH(max);
  return max EIGEN_NOT_A_MACRO (x,y);
 }
 #elif defined(__SYCL_DEVICE_ONLY__)
 template<typename T>
 EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
 {
  return y < x ? y : x;
 }
 template<typename T>
 EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
 {
  return x < y ? y : x;
 }
 EIGEN_ALWAYS_INLINE int mini(const int& x, const int& y)
 {
  return cl::sycl::min(x,y);
 }
 EIGEN_ALWAYS_INLINE int maxi(const int& x, const int& y)
 {
  return cl::sycl::max(x,y);
 }
 EIGEN_ALWAYS_INLINE unsigned int mini(const unsigned int& x, const unsigned int& y)
 {
  return cl::sycl::min(x,y);
 }
 EIGEN_ALWAYS_INLINE unsigned int maxi(const unsigned int& x, const unsigned int& y)
 {
  return cl::sycl::max(x,y);
 }
 EIGEN_ALWAYS_INLINE  long mini(const long & x, const long & y)
 {
  return cl::sycl::min(x,y);
 }
 EIGEN_ALWAYS_INLINE  long maxi(const long & x, const long & y)
 {
  return cl::sycl::max(x,y);
 }
 EIGEN_ALWAYS_INLINE unsigned long mini(const unsigned long& x, const unsigned long& y)
 {
  return cl::sycl::min(x,y);
 }
 EIGEN_ALWAYS_INLINE unsigned long maxi(const unsigned long& x, const unsigned long& y)
 {
  return cl::sycl::max(x,y);
 }
 EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
 {
  return cl::sycl::fmin(x,y);
 }
 EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
 {
  return cl::sycl::fmax(x,y);
 }
 EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
 {
  return cl::sycl::fmin(x,y);
 }
 EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
 {
  return cl::sycl::fmax(x,y);
 }
 #else
 template<typename T>
 EIGEN_DEVICE_FUNC
@ -981,6 +905,24 @@ EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
 {
  return fminf(x, y);
 }
 // Specialization of mini for double: forwards to the C math routine fmin.
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
 {
  const double lesser = fmin(x, y);
  return lesser;
 }
 // Specialization of mini for long double: forwards to fminl, except when
 // EIGEN_HIPCC is defined, where a plain comparison is used instead.
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
 {
 #if !defined(EIGEN_HIPCC)
  return fminl(x, y);
 #else
  // no "fminl" on HIP yet
  if (x < y) return x;
  return y;
 #endif
 }
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
@ -993,7 +935,93 @@ EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
 {
  return fmaxf(x, y);
 }
 // Specialization of maxi for double: forwards to the C math routine fmax.
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
 {
  const double greater = fmax(x, y);
  return greater;
 }
 // Specialization of maxi for long double: forwards to fmaxl, except when
 // EIGEN_HIPCC is defined, where a plain comparison is used instead.
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
 {
 #if !defined(EIGEN_HIPCC)
  return fmaxl(x, y);
 #else
  // no "fmaxl" on HIP yet
  if (x > y) return x;
  return y;
 #endif
 }
 #endif
 // Helper macros, active only when __SYCL_DEVICE_ONLY__ is defined, that stamp out
 // full (template<>) specializations of a function NAME forwarding to cl::sycl::FUNC.
 // Several macros below refer to macros that are #defined further down; this is fine
 // because macro bodies are only expanded where the macros are eventually invoked.
 #if defined(__SYCL_DEVICE_ONLY__)
 // Binary specializations for the signed integer types cl_char, cl_short, cl_int, cl_long.
 #define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char)   \
  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short)  \
  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int)    \
  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
 // Unary specializations for the signed integer types cl_char, cl_short, cl_int, cl_long.
 #define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char)   \
  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short)  \
  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int)    \
  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
 // Binary specializations for the unsigned integer types cl_uchar, cl_ushort, cl_uint, cl_ulong.
 #define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar)  \
  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint)   \
  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
 // Unary specializations for the unsigned integer types cl_uchar, cl_ushort, cl_uint, cl_ulong.
 #define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar)  \
  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint)   \
  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
 // Binary specializations for all eight integer types (signed followed by unsigned).
 #define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC) \
  SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
  SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC)
 // Unary specializations for all eight integer types (signed followed by unsigned).
 #define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC) \
  SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
  SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC)
 // Binary specializations for cl_float and cl_double.
 #define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC) \
  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC,cl::sycl::cl_double)
 // Unary specializations for cl_float and cl_double; the return type equals the argument type.
 #define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC) \
  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC,cl::sycl::cl_double)
 // Unary specializations for cl_float and cl_double with an explicitly chosen return type RET_TYPE.
 #define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(NAME, FUNC, RET_TYPE) \
  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float) \
  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double) 
 // Core unary generator: emits `template<> RET_TYPE NAME(const ARG_TYPE& x)` whose body
 // returns cl::sycl::FUNC(x).
 #define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
 template<>                                               \
  EIGEN_DEVICE_FUNC                                      \
  EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \
    return cl::sycl::FUNC(x);                            \
  }
 // Unary generator where the return type and the argument type are both TYPE.
 #define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) \
  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE)
 // Core binary generator: emits `template<> RET_TYPE NAME(const ARG_TYPE1& x, const ARG_TYPE2& y)`
 // whose body returns cl::sycl::FUNC(x, y).
 #define SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2) \
  template<>                                                                  \
  EIGEN_DEVICE_FUNC                                                           \
  EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x, const ARG_TYPE2& y) { \
    return cl::sycl::FUNC(x, y);                                              \
  }
 // Binary generator where both arguments share the same type ARG_TYPE.
 #define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
  SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE)
 // Binary generator where the return type and both argument types are TYPE.
 #define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) \
  SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE)
 // mini/maxi specializations: integer types forward to cl::sycl::min / cl::sycl::max,
 // floating types forward to cl::sycl::fmin / cl::sycl::fmax.
 SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min)
 SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin)
 SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max)
 SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax)
 #endif // defined(__SYCL_DEVICE_ONLY__)
 template<typename Scalar>
@ -1059,6 +1087,9 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
  return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
 }
 // Non-template overload of abs2 for bool: the argument is handed back unchanged.
 EIGEN_DEVICE_FUNC
 inline bool abs2(bool x)
 {
  return x;
 }
 template<typename Scalar>
 EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
@ -1073,6 +1104,10 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar&
  return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
 }
 // When __SYCL_DEVICE_ONLY__ is defined, generate hypot specializations for the SYCL
 // floating-point types through SYCL_SPECIALIZE_FLOATING_TYPES_BINARY; the second
 // macro argument names the cl::sycl function that the generated bodies call.
 #if defined(__SYCL_DEVICE_ONLY__)
  SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot)
 #endif // defined(__SYCL_DEVICE_ONLY__)
 template<typename Scalar>
 EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
@ -1081,11 +1116,10 @@ inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
 }
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   log1p(float x) { return cl::sycl::log1p(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p)
-EIGEN_ALWAYS_INLINE double  log1p(double x) { return cl::sycl::log1p(x); }
+#endif //defined(__SYCL_DEVICE_ONLY__)
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float log1p(const float &x) { return ::log1pf(x); }
@ -1101,8 +1135,7 @@ inline typename internal::pow_impl<ScalarX,ScalarY>::result_type pow(const Scala
 }
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   pow(float x, float y) { return cl::sycl::pow(x, y); }
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow)
 EIGEN_ALWAYS_INLINE double  pow(double x, double y) { return cl::sycl::pow(x, y); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
 template<typename T> EIGEN_DEVICE_FUNC bool (isnan)   (const T &x) { return internal::isnan_impl(x); }
@ -1110,12 +1143,9 @@ template<typename T> EIGEN_DEVICE_FUNC bool (isinf)   (const T &x) { return inte
 template<typename T> EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); }
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   isnan(float x) { return cl::sycl::isnan(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool)
-EIGEN_ALWAYS_INLINE double  isnan(double x) { return cl::sycl::isnan(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool)
-EIGEN_ALWAYS_INLINE float   isinf(float x) { return cl::sycl::isinf(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool)
 EIGEN_ALWAYS_INLINE double  isinf(double x) { return cl::sycl::isinf(x); }
 EIGEN_ALWAYS_INLINE float   isfinite(float x) { return cl::sycl::isfinite(x); }
 EIGEN_ALWAYS_INLINE double  isfinite(double x) { return cl::sycl::isfinite(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
 template<typename Scalar>
@ -1126,8 +1156,7 @@ inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x)
 }
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   round(float x) { return cl::sycl::round(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round)
 EIGEN_ALWAYS_INLINE double  round(double x) { return cl::sycl::round(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
 template<typename T>
@ -1139,11 +1168,10 @@ T (floor)(const T& x)
 }
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   floor(float x) { return cl::sycl::floor(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor)
 EIGEN_ALWAYS_INLINE double  floor(double x) { return cl::sycl::floor(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float floor(const float &x) { return ::floorf(x); }
@ -1160,11 +1188,10 @@ T (ceil)(const T& x)
 }
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   ceil(float x) { return cl::sycl::ceil(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil)
 EIGEN_ALWAYS_INLINE double  ceil(double x) { return cl::sycl::ceil(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float ceil(const float &x) { return ::ceilf(x); }
@ -1205,8 +1232,7 @@ T sqrt(const T &x)
 }
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   sqrt(float x) { return cl::sycl::sqrt(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
 EIGEN_ALWAYS_INLINE double  sqrt(double x) { return cl::sycl::sqrt(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
 template<typename T>
@ -1217,12 +1243,11 @@ T log(const T &x) {
 }
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   log(float x) { return cl::sycl::log(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log)
 EIGEN_ALWAYS_INLINE double  log(double x) { return cl::sycl::log(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float log(const float &x) { return ::logf(x); }
@ -1232,17 +1257,25 @@ double log(const double &x) { return ::log(x); }
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-typename NumTraits<T>::Real abs(const T &x) {
+typename internal::enable_if<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex,typename NumTraits<T>::Real>::type
 abs(const T &x) {
  EIGEN_USING_STD_MATH(abs);
  return abs(x);
 }
 // Overload of abs that enable_if selects when NumTraits<T> reports the type as
 // neither signed nor complex. No abs routine is called: the argument itself is
 // returned, converted to NumTraits<T>::Real.
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 typename internal::enable_if<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex),typename NumTraits<T>::Real>::type
 abs(const T &x) {
  return x;
 }
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   abs(float x) { return cl::sycl::fabs(x); }
+SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs)
-EIGEN_ALWAYS_INLINE double  abs(double x) { return cl::sycl::fabs(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs)
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float abs(const float &x) { return ::fabsf(x); }
@ -1268,16 +1301,31 @@ T exp(const T &x) {
 }
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   exp(float x) { return cl::sycl::exp(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp)
 EIGEN_ALWAYS_INLINE double  exp(double x) { return cl::sycl::exp(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float exp(const float &x) { return ::expf(x); }
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double exp(const double &x) { return ::exp(x); }
 // Specialization of exp for std::complex<float>, built from the real-valued
 // routines expf, cosf and sinf: result = expf(re) * (cosf(im), sinf(im)).
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 std::complex<float> exp(const std::complex<float>& x)
 {
  const float scale = ::expf(x.real());
  const float out_re = scale * ::cosf(x.imag());
  const float out_im = scale * ::sinf(x.imag());
  return std::complex<float>(out_re, out_im);
 }
 // Specialization of exp for std::complex<double>, built from the real-valued
 // routines exp, cos and sin: result = exp(re) * (cos(im), sin(im)).
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 std::complex<double> exp(const std::complex<double>& x)
 {
  const double scale = ::exp(x.real());
  const double out_re = scale * ::cos(x.imag());
  const double out_im = scale * ::sin(x.imag());
  return std::complex<double>(out_re, out_im);
 }
 #endif
 template<typename Scalar>
@ -1288,11 +1336,10 @@ inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar) expm1(const Scalar& x)
 }
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   expm1(float x) { return cl::sycl::expm1(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1)
 EIGEN_ALWAYS_INLINE double  expm1(double x) { return cl::sycl::expm1(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float expm1(const float &x) { return ::expm1f(x); }
@ -1308,11 +1355,10 @@ T cos(const T &x) {
 }
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   cos(float x) { return cl::sycl::cos(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos,cos)
 EIGEN_ALWAYS_INLINE double  cos(double x) { return cl::sycl::cos(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float cos(const float &x) { return ::cosf(x); }
@ -1328,11 +1374,10 @@ T sin(const T &x) {
 }
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   sin(float x) { return cl::sycl::sin(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin)
 EIGEN_ALWAYS_INLINE double  sin(double x) { return cl::sycl::sin(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float sin(const float &x) { return ::sinf(x); }
@ -1348,11 +1393,10 @@ T tan(const T &x) {
 }
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   tan(float x) { return cl::sycl::tan(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan)
 EIGEN_ALWAYS_INLINE double  tan(double x) { return cl::sycl::tan(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tan(const float &x) { return ::tanf(x); }
@ -1367,12 +1411,21 @@ T acos(const T &x) {
  return acos(x);
 }
 #if EIGEN_HAS_CXX11_MATH
 // Inverse hyperbolic cosine of x; only compiled when EIGEN_HAS_CXX11_MATH is non-zero.
 // The call is deliberately unqualified: it resolves among whatever
 // EIGEN_USING_STD_MATH(acosh) brings into scope plus overloads found for T.
 // NOTE(review): EIGEN_USING_STD_MATH is defined elsewhere - presumably a
 // using-declaration for std::acosh; confirm.
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T acosh(const T &x) {
  EIGEN_USING_STD_MATH(acosh);
  return acosh(x);
 }
 #endif
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   acos(float x) { return cl::sycl::acos(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos)
-EIGEN_ALWAYS_INLINE double  acos(double x) { return cl::sycl::acos(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh)
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float acos(const float &x) { return ::acosf(x); }
@ -1387,12 +1440,21 @@ T asin(const T &x) {
  return asin(x);
 }
 #if EIGEN_HAS_CXX11_MATH
 // Inverse hyperbolic sine of x; only compiled when EIGEN_HAS_CXX11_MATH is non-zero.
 // The call is deliberately unqualified: it resolves among whatever
 // EIGEN_USING_STD_MATH(asinh) brings into scope plus overloads found for T.
 // NOTE(review): EIGEN_USING_STD_MATH is defined elsewhere - presumably a
 // using-declaration for std::asinh; confirm.
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T asinh(const T &x) {
  EIGEN_USING_STD_MATH(asinh);
  return asinh(x);
 }
 #endif
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   asin(float x) { return cl::sycl::asin(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin)
-EIGEN_ALWAYS_INLINE double  asin(double x) { return cl::sycl::asin(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh)
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float asin(const float &x) { return ::asinf(x); }
@ -1407,12 +1469,21 @@ T atan(const T &x) {
  return atan(x);
 }
 #if EIGEN_HAS_CXX11_MATH
 // Inverse hyperbolic tangent of x; only compiled when EIGEN_HAS_CXX11_MATH is non-zero.
 // The call is deliberately unqualified: it resolves among whatever
 // EIGEN_USING_STD_MATH(atanh) brings into scope plus overloads found for T.
 // NOTE(review): EIGEN_USING_STD_MATH is defined elsewhere - presumably a
 // using-declaration for std::atanh; confirm.
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T atanh(const T &x) {
  EIGEN_USING_STD_MATH(atanh);
  return atanh(x);
 }
 #endif
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   atan(float x) { return cl::sycl::atan(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan)
-EIGEN_ALWAYS_INLINE double  atan(double x) { return cl::sycl::atan(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh)
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float atan(const float &x) { return ::atanf(x); }
@ -1429,11 +1500,10 @@ T cosh(const T &x) {
 }
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   cosh(float x) { return cl::sycl::cosh(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh)
 EIGEN_ALWAYS_INLINE double  cosh(double x) { return cl::sycl::cosh(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float cosh(const float &x) { return ::coshf(x); }
@ -1449,11 +1519,10 @@ T sinh(const T &x) {
 }
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   sinh(float x) { return cl::sycl::sinh(x); }
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, sinh)
 EIGEN_ALWAYS_INLINE double  sinh(double x) { return cl::sycl::sinh(x); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float sinh(const float &x) { return ::sinhf(x); }
@ -1468,15 +1537,16 @@ T tanh(const T &x) {
  return tanh(x);
 }
-#if defined(__SYCL_DEVICE_ONLY__)
+#if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && (!defined(__SYCL_DEVICE_ONLY__))
 EIGEN_ALWAYS_INLINE float   tanh(float x) { return cl::sycl::tanh(x); }
 EIGEN_ALWAYS_INLINE double  tanh(double x) { return cl::sycl::tanh(x); }
 #elif (!defined(__CUDACC__)) && EIGEN_FAST_MATH
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tanh(float x) { return internal::generic_fast_tanh_float(x); }
 #endif
-#ifdef __CUDACC__
+#if defined(__SYCL_DEVICE_ONLY__)
 SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh)
 #endif // defined(__SYCL_DEVICE_ONLY__)
 #if defined(EIGEN_GPUCC)
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tanh(const float &x) { return ::tanhf(x); }
@ -1492,11 +1562,10 @@ T fmod(const T& a, const T& b) {
 }
 #if defined(__SYCL_DEVICE_ONLY__)
-EIGEN_ALWAYS_INLINE float   fmod(float x, float y) { return cl::sycl::fmod(x, y); }
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod)
 EIGEN_ALWAYS_INLINE double  fmod(double x, double y) { return cl::sycl::fmod(x, y); }
 #endif // defined(__SYCL_DEVICE_ONLY__)
-#ifdef __CUDACC__
+#if defined(EIGEN_GPUCC)
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float fmod(const float& a, const float& b) {
@ -1510,6 +1579,23 @@ double fmod(const double& a, const double& b) {
 }
 #endif
 #if defined(__SYCL_DEVICE_ONLY__)
 // Remove the SYCL specialization helper macros so that none of them remains
 // defined past this point. Each helper is listed exactly once; the
 // INTEGER_TYPES pair covers both the BINARY and the UNARY variant.
 #undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY
 #undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY
 #undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY
 #undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
 #undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY
 #undef SYCL_SPECIALIZE_INTEGER_TYPES_UNARY
 #undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY
 #undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY
 #undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE
 #undef SYCL_SPECIALIZE_GEN_UNARY_FUNC
 #undef SYCL_SPECIALIZE_UNARY_FUNC
 #undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC
 #undef SYCL_SPECIALIZE_GEN2_BINARY_FUNC
 #undef SYCL_SPECIALIZE_BINARY_FUNC
 #endif // defined(__SYCL_DEVICE_ONLY__)
 } // end namespace numext
 namespace internal {
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@ -66,6 +66,30 @@ T generic_fast_tanh_float(const T& a_x)
  return pdiv(p, q);
 }
 // Returns sqrt(x*x + y*y) for two non-negative real values.
 //
 // The larger operand p is factored out, so only the ratio min/max is squared
 // rather than x and y themselves. Special values are resolved before that
 // arithmetic runs:
 //  - if either operand is infinite the result is +infinity (this also covers
 //    the inf/inf case, where the ratio would otherwise evaluate to NaN);
 //  - otherwise, if either operand is NaN the result is NaN (the max/min
 //    selection below is not guaranteed to propagate it);
 //  - if both operands are zero the result is zero (avoids dividing by zero).
 template<typename RealScalar>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
 {
  // Infinity is tested first so that it wins over a NaN in the other operand.
  if ((numext::isinf)(x) || (numext::isinf)(y))
    return NumTraits<RealScalar>::infinity();
  if ((numext::isnan)(x) || (numext::isnan)(y))
    return NumTraits<RealScalar>::quiet_NaN();
  EIGEN_USING_STD_MATH(sqrt);
  RealScalar p, qp;
  p = numext::maxi(x,y);
  if(p==RealScalar(0)) return RealScalar(0);
  qp = numext::mini(y,x) / p;
  return p * sqrt(RealScalar(1) + qp*qp);
 }
 // Generic hypot: takes the absolute value of each operand and hands the two
 // magnitudes to positive_real_hypot, returning the real type of Scalar.
 template<typename Scalar>
 struct hypot_impl
 {
  typedef typename NumTraits<Scalar>::Real RealScalar;
  static EIGEN_DEVICE_FUNC
  inline RealScalar run(const Scalar& x, const Scalar& y)
  {
    EIGEN_USING_STD_MATH(abs);
    const RealScalar magnitude_x = abs(x);
    const RealScalar magnitude_y = abs(y);
    return positive_real_hypot<RealScalar>(magnitude_x, magnitude_y);
  }
 };
 } // end namespace internal
 } // end namespace Eigen
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@ -160,20 +160,11 @@ template<typename Derived> class MatrixBase
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    Derived& operator-=(const MatrixBase<OtherDerived>& other);
 #ifdef __CUDACC__
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
    const Product<Derived,OtherDerived,LazyProduct>
    operator*(const MatrixBase<OtherDerived> &other) const
    { return this->lazyProduct(other); }
 #else
    template<typename OtherDerived>
    const Product<Derived,OtherDerived>
    operator*(const MatrixBase<OtherDerived> &other) const;
 #endif
    template<typename OtherDerived>
    EIGEN_DEVICE_FUNC
    const Product<Derived,OtherDerived,LazyProduct>
@ -277,6 +268,8 @@ template<typename Derived> class MatrixBase
    Derived& setIdentity();
    EIGEN_DEVICE_FUNC
    Derived& setIdentity(Index rows, Index cols);
    EIGEN_DEVICE_FUNC Derived& setUnit(Index i);
    EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i);
    bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
    bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
@ -294,7 +287,7 @@ template<typename Derived> class MatrixBase
      *          fuzzy comparison such as isApprox()
      * \sa isApprox(), operator!= */
    template<typename OtherDerived>
-    inline bool operator==(const MatrixBase<OtherDerived>& other) const
+    EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase<OtherDerived>& other) const
    { return cwiseEqual(other).all(); }
    /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.
@ -302,10 +295,10 @@ template<typename Derived> class MatrixBase
      *          fuzzy comparison such as isApprox()
      * \sa isApprox(), operator== */
    template<typename OtherDerived>
-    inline bool operator!=(const MatrixBase<OtherDerived>& other) const
+    EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
    { return cwiseNotEqual(other).any(); }
-    NoAlias<Derived,Eigen::MatrixBase > noalias();
+    NoAlias<Derived,Eigen::MatrixBase > EIGEN_DEVICE_FUNC noalias();
    // TODO forceAlignedAccess is temporarily disabled
    // Need to find a nicer workaround.
@ -335,6 +328,7 @@ template<typename Derived> class MatrixBase
    inline const PartialPivLU<PlainObject> lu() const;
    EIGEN_DEVICE_FUNC
    inline const Inverse<Derived> inverse() const;
    template<typename ResultType>
@ -344,12 +338,15 @@ template<typename Derived> class MatrixBase
      bool& invertible,
      const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
    ) const;
    template<typename ResultType>
    inline void computeInverseWithCheck(
      ResultType& inverse,
      bool& invertible,
      const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
    ) const;
    EIGEN_DEVICE_FUNC
    Scalar determinant() const;
 /////////// Cholesky module ///////////
@ -421,15 +418,19 @@ template<typename Derived> class MatrixBase
 ////////// Householder module ///////////
    EIGEN_DEVICE_FUNC
    void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
    template<typename EssentialPart>
    EIGEN_DEVICE_FUNC
    void makeHouseholder(EssentialPart& essential,
                         Scalar& tau, RealScalar& beta) const;
    template<typename EssentialPart>
    EIGEN_DEVICE_FUNC
    void applyHouseholderOnTheLeft(const EssentialPart& essential,
                                   const Scalar& tau,
                                   Scalar* workspace);
    template<typename EssentialPart>
    EIGEN_DEVICE_FUNC
    void applyHouseholderOnTheRight(const EssentialPart& essential,
                                    const Scalar& tau,
                                    Scalar* workspace);
@ -437,8 +438,10 @@ template<typename Derived> class MatrixBase
 ///////// Jacobi module /////////
    template<typename OtherScalar>
    EIGEN_DEVICE_FUNC
    void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j);
    template<typename OtherScalar>
    EIGEN_DEVICE_FUNC
    void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
 ///////// SparseCore module /////////
--- a/Eigen/src/Core/NestByValue.h
+++ b/Eigen/src/Core/NestByValue.h
@ -67,25 +67,25 @@ template<typename ExpressionType> class NestByValue
    }
    template<int LoadMode>
-    inline const PacketScalar packet(Index row, Index col) const
+    EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index row, Index col) const
    {
      return m_expression.template packet<LoadMode>(row, col);
    }
    template<int LoadMode>
-    inline void writePacket(Index row, Index col, const PacketScalar& x)
+    EIGEN_DEVICE_FUNC inline void writePacket(Index row, Index col, const PacketScalar& x)
    {
      m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
    }
    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
+    EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index index) const
    {
      return m_expression.template packet<LoadMode>(index);
    }
    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
+    EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& x)
    {
      m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
    }
@ -99,7 +99,7 @@ template<typename ExpressionType> class NestByValue
 /** \returns an expression of the temporary version of *this.
  */
 template<typename Derived>
-inline const NestByValue<Derived>
+EIGEN_DEVICE_FUNC inline const NestByValue<Derived>
 DenseBase<Derived>::nestByValue() const
 {
  return NestByValue<Derived>(derived());
--- a/Eigen/src/Core/NoAlias.h
+++ b/Eigen/src/Core/NoAlias.h
@ -33,6 +33,7 @@ class NoAlias
  public:
    typedef typename ExpressionType::Scalar Scalar;
    EIGEN_DEVICE_FUNC
    explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
    template<typename OtherDerived>
@ -74,10 +75,10 @@ class NoAlias
  *
  * More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag.
  * Currently, even though several expressions may alias, only product
-  * expressions have this flag. Therefore, noalias() is only usefull when
+  * expressions have this flag. Therefore, noalias() is only useful when
  * the source expression contains a matrix product.
  *
-  * Here are some examples where noalias is usefull:
+  * Here are some examples where noalias is useful:
  * \code
  * D.noalias()  = A * B;
  * D.noalias() += A.transpose() * B;
@ -98,7 +99,7 @@ class NoAlias
  * \sa class NoAlias
  */
 template<typename Derived>
-NoAlias<Derived,MatrixBase> MatrixBase<Derived>::noalias()
+NoAlias<Derived,MatrixBase> EIGEN_DEVICE_FUNC MatrixBase<Derived>::noalias()
 {
  return NoAlias<Derived, Eigen::MatrixBase >(derived());
 }
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@ -21,12 +21,14 @@ template< typename T,
          bool is_integer = NumTraits<T>::IsInteger>
 struct default_digits10_impl
 {
  EIGEN_DEVICE_FUNC
  static int run() { return std::numeric_limits<T>::digits10; }
 };
 template<typename T>
 struct default_digits10_impl<T,false,false> // Floating point
 {
  EIGEN_DEVICE_FUNC
  static int run() {
    using std::log10;
    using std::ceil;
@ -38,6 +40,38 @@ struct default_digits10_impl<T,false,false> // Floating point
 // Integer specialization of default_digits10_impl: always reports 0.
 template<typename T>
 struct default_digits10_impl<T,false,true>
 {
  EIGEN_DEVICE_FUNC
  static int run()
  {
    return 0;
  }
 };
 // Default implementation of digits().
 // Primary template: forwards to std::numeric_limits<T>::digits. The
 // specializations selected by the two boolean parameters cover types for
 // which use_numeric_limits is false: 0 for integer types, and
 // ceil(-log(epsilon()) / log(2)) otherwise.
 template< typename T,
           bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
           bool is_integer = NumTraits<T>::IsInteger>
 struct default_digits_impl
 {
  EIGEN_DEVICE_FUNC
  static int run()
  {
    return std::numeric_limits<T>::digits;
  }
 };
 // Floating-point specialization used when std::numeric_limits is not
 // specialized for T: derives the digit count from the epsilon of the
 // underlying real type as ceil(-log(epsilon) / log(2)).
 template<typename T>
 struct default_digits_impl<T,false,false>
 {
  EIGEN_DEVICE_FUNC
  static int run()
  {
    using std::ceil;
    using std::log;
    typedef typename NumTraits<T>::Real Real;
    const Real neg_log_eps = -log(NumTraits<Real>::epsilon());
    const Real log_two = log(static_cast<Real>(2));
    return int(ceil(neg_log_eps / log_two));
  }
 };
 // Integer specialization used when std::numeric_limits is not specialized
 // for T: always reports 0.
 template<typename T>
 struct default_digits_impl<T,false,true>
 {
  EIGEN_DEVICE_FUNC
  static int run()
  {
    return 0;
  }
 };
@ -118,6 +152,12 @@ template<typename T> struct GenericNumTraits
    return internal::default_digits10_impl<T>::run();
  }
  // Returns the digit count of T as computed by internal::default_digits_impl<T>,
  // which picks its implementation from T's numeric_limits / IsInteger traits.
  EIGEN_DEVICE_FUNC
  static inline int digits()
  {
    return internal::default_digits_impl<T>::run();
  }
  EIGEN_DEVICE_FUNC
  static inline Real dummy_precision()
  {
@ -215,6 +255,8 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
  static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
  EIGEN_DEVICE_FUNC
  static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
  static inline int digits10() { return NumTraits<Scalar>::digits10(); }
 };
 template<> struct NumTraits<std::string>
--- a/Eigen/src/Core/PermutationMatrix.h
+++ b/Eigen/src/Core/PermutationMatrix.h
@ -99,13 +99,13 @@ class PermutationBase : public EigenBase<Derived>
    #endif
    /** \returns the number of rows */
-    inline Index rows() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); }
    /** \returns the number of columns */
-    inline Index cols() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); }
    /** \returns the size of a side of the respective square matrix, i.e., the number of indices */
-    inline Index size() const { return Index(indices().size()); }
+    inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); }
    #ifndef EIGEN_PARSED_BY_DOXYGEN
    template<typename DenseDerived>
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@ -577,6 +577,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      * while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
      * \a data pointers.
      *
      * Here is an example using strides:
      * \include Matrix_Map_stride.cpp
      * Output: \verbinclude Matrix_Map_stride.out
      *
      * \see class Map
      */
    //@{
@ -776,7 +780,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
      resize(size);
    }
-    // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitely converted)
+    // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted)
    template<typename T>
    EIGEN_DEVICE_FUNC
    EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1 && internal::is_convertible<T, Scalar>::value,T>::type* = 0)
@ -917,13 +921,19 @@ namespace internal {
 template <typename Derived, typename OtherDerived, bool IsVector>
 struct conservative_resize_like_impl
 {
  #if EIGEN_HAS_TYPE_TRAITS
  static const bool IsRelocatable = std::is_trivially_copyable<typename Derived::Scalar>::value;
  #else
  static const bool IsRelocatable = !NumTraits<typename Derived::Scalar>::RequireInitialization;
  #endif
  static void run(DenseBase<Derived>& _this, Index rows, Index cols)
  {
    if (_this.rows() == rows && _this.cols() == cols) return;
    EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
-    if ( ( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows
+    if ( IsRelocatable
-         (!Derived::IsRowMajor && _this.rows() == rows) )  // column-major and we change only the number of columns
+          && (( Derived::IsRowMajor && _this.cols() == cols) ||  // row-major and we change only the number of rows
              (!Derived::IsRowMajor && _this.rows() == rows) ))  // column-major and we change only the number of columns
    {
      internal::check_rows_cols_for_overflow<Derived::MaxSizeAtCompileTime>::run(rows, cols);
      _this.derived().m_storage.conservativeResize(rows*cols,rows,cols);
@ -951,8 +961,9 @@ struct conservative_resize_like_impl
    EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
    EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived)
-    if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows
+    if ( IsRelocatable &&
-         (!Derived::IsRowMajor && _this.rows() == other.rows()) )  // column-major and we change only the number of columns
+          (( Derived::IsRowMajor && _this.cols() == other.cols()) ||  // row-major and we change only the number of rows
           (!Derived::IsRowMajor && _this.rows() == other.rows()) ))  // column-major and we change only the number of columns
    {
      const Index new_rows = other.rows() - _this.rows();
      const Index new_cols = other.cols() - _this.cols();
@ -980,13 +991,18 @@ template <typename Derived, typename OtherDerived>
 struct conservative_resize_like_impl<Derived,OtherDerived,true>
  : conservative_resize_like_impl<Derived,OtherDerived,false>
 {
-  using conservative_resize_like_impl<Derived,OtherDerived,false>::run;
+  typedef conservative_resize_like_impl<Derived,OtherDerived,false> Base;
  using Base::run;
  using Base::IsRelocatable;
  static void run(DenseBase<Derived>& _this, Index size)
  {
    const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size;
    const Index new_cols = Derived::RowsAtCompileTime==1 ? size : 1;
-    _this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
+    if(IsRelocatable)
      _this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
    else
      Base::run(_this.derived(), new_rows, new_cols);
  }
  static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other)
@ -997,7 +1013,10 @@ struct conservative_resize_like_impl<Derived,OtherDerived,true>
    const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows();
    const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1;
-    _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
+    if(IsRelocatable)
      _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
    else
      Base::run(_this.derived(), new_rows, new_cols);
    if (num_new_elements > 0)
      _this.tail(num_new_elements) = other.tail(num_new_elements);
--- a/Eigen/src/Core/Product.h
+++ b/Eigen/src/Core/Product.h
@ -97,8 +97,8 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
    }
-    EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
-    EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
    EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
    EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
@ -116,7 +116,7 @@ class dense_product_base
 : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
 {};
-/** Convertion to scalar for inner-products */
+/** Conversion to scalar for inner-products */
 template<typename Lhs, typename Rhs, int Option>
 class dense_product_base<Lhs, Rhs, Option, InnerProduct>
 : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
@ -127,7 +127,7 @@ public:
  using Base::derived;
  typedef typename Base::Scalar Scalar;
-  operator const Scalar() const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const
  {
    return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
  }
@ -162,7 +162,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
  public:
-    EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const
    {
      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
@ -170,7 +170,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
      return internal::evaluator<Derived>(derived()).coeff(row,col);
    }
-    EIGEN_DEVICE_FUNC Scalar coeff(Index i) const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index i) const
    {
      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@ -20,7 +20,7 @@ namespace internal {
 /** \internal
  * Evaluator of a product expression.
  * Since products require special treatments to handle all possible cases,
-  * we simply deffer the evaluation logic to a product_evaluator class
+  * we simply defer the evaluation logic to a product_evaluator class
  * which offers more partial specialization possibilities.
  * 
  * \sa class product_evaluator
@ -32,7 +32,7 @@ struct evaluator<Product<Lhs, Rhs, Options> >
  typedef Product<Lhs, Rhs, Options> XprType;
  typedef product_evaluator<XprType> Base;
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
 // Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
@ -55,7 +55,7 @@ struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
                               const Product<Lhs, Rhs, DefaultProduct> > XprType;
  typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> > Base;
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
    : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs())
  {}
 };
@ -68,7 +68,7 @@ struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
  typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
  typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
    : Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
        Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
        xpr.index() ))
@ -128,7 +128,7 @@ protected:
  PlainObject m_result;
 };
-// The following three shortcuts are enabled only if the scalar types match excatly.
+// The following three shortcuts are enabled only if the scalar types match exactly.
 // TODO: we could enable them for different scalar types when the product is not vectorized.
 // Dense = Product
@ -137,7 +137,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal
  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
  typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static EIGEN_STRONG_INLINE
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
  {
    Index dstRows = src.rows();
@ -155,7 +155,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<
  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
  typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static EIGEN_STRONG_INLINE
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
  {
    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@ -170,7 +170,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<
  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
  typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static EIGEN_STRONG_INLINE
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
  {
    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@ -190,7 +190,7 @@ struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBi
  typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
                        const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
                        const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
-  static EIGEN_STRONG_INLINE
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
  {
    call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func);
@ -207,11 +207,17 @@ struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename
  static const bool value = true;
 };
 // Specialization for a dense coefficient-wise difference whose right operand
 // is a default product, i.e. expressions of the form "OtherXpr - (Lhs * Rhs)":
 // sets value to true so that such an expression is treated as potentially
 // aliasing its destination.
 template<typename OtherXpr, typename Lhs, typename Rhs>
 struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_difference_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
                                                const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
  static const bool value = true;
 };
 template<typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
 struct assignment_from_xpr_op_product
 {
  template<typename SrcXprType, typename InitialFunc>
-  static EIGEN_STRONG_INLINE
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/)
  {
    call_assignment_no_alias(dst, src.lhs(), Func1());
@ -240,19 +246,19 @@ template<typename Lhs, typename Rhs>
 struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
 {
  template<typename Dst>
-  static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
  }
  template<typename Dst>
-  static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
  }
  template<typename Dst>
-  static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
 };
@ -263,10 +269,10 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
 // Column major result
 template<typename Dst, typename Lhs, typename Rhs, typename Func>
-void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
+void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
 {
  evaluator<Rhs> rhsEval(rhs);
-  typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
+  ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs);
  // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
  // FIXME not very good if rhs is real and lhs complex while alpha is real too
  const Index cols = dst.cols();
@ -276,10 +282,10 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const
 // Row major result
 template<typename Dst, typename Lhs, typename Rhs, typename Func>
-void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
+void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
 {
  evaluator<Lhs> lhsEval(lhs);
-  typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
+  ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs);
  // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
  // FIXME not very good if lhs is real and rhs complex while alpha is real too
  const Index rows = dst.rows();
@ -294,37 +300,37 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
  // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
-  struct set  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
+  struct set  { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
-  struct add  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
+  struct add  { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
-  struct sub  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
+  struct sub  { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
  struct adds {
    Scalar m_scale;
    explicit adds(const Scalar& s) : m_scale(s) {}
-    template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
+    template<typename Dst, typename Src> void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const {
      dst.const_cast_derived() += m_scale * src;
    }
  };
  template<typename Dst>
-  static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
  }
  template<typename Dst>
-  static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
  }
  template<typename Dst>
-  static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
  }
  template<typename Dst>
-  static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  {
    internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
  }
@ -339,19 +345,19 @@ struct generic_product_impl_base
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
  template<typename Dst>
-  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
  template<typename Dst>
-  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); }
  template<typename Dst>
-  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
  template<typename Dst>
-  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
 };
@ -367,7 +373,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
  typedef typename internal::remove_all<typename internal::conditional<int(Side)==OnTheRight,LhsNested,RhsNested>::type>::type MatrixType;
  template<typename Dest>
-  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  {
    LhsNested actual_lhs(lhs);
    RhsNested actual_rhs(rhs);
@ -384,26 +390,52 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
  template<typename Dst>
-  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    // Same as: dst.noalias() = lhs.lazyProduct(rhs);
    // but easier on the compiler side
    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
  }
-  
+
  template<typename Dst>
-  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    // dst.noalias() += lhs.lazyProduct(rhs);
    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
  }
  template<typename Dst>
-  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
  {
    // dst.noalias() -= lhs.lazyProduct(rhs);
    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
  }
  // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
  //    dst {,+,-}= s * (A.lazyProduct(B))
  // This is a huge benefit for heap-allocated matrix types as it saves one costly allocation.
  // For them, this strategy is also faster than simply by-passing the heap allocation through
  // stack allocation.
  // For fixed-size matrices, this is less obvious: it is sometimes x2 faster, but sometimes x3 slower,
  // and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only,
  // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
  template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
                                           const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
  {
    call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
  }
  // Here, we always have LhsT==Lhs, but we need to make it a template type to make the above
  // overload more specialized.
  template<typename Dst, typename LhsT, typename Func>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
  {
    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
  }
 //   template<typename Dst>
 //   static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
@ -735,7 +767,8 @@ struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
  template<typename Dest>
-  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_DEVICE_FUNC
  void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
  {
    selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
  }
@ -779,7 +812,11 @@ public:
    _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
    _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
    Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
-    Alignment = evaluator<MatrixType>::Alignment
+    Alignment = evaluator<MatrixType>::Alignment,
    AsScalarProduct =     (DiagonalType::SizeAtCompileTime==1)
                      ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
                      ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
  };
  diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
@ -791,7 +828,10 @@ public:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
  {
-    return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
+    if(AsScalarProduct)
      return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
    else
      return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
  }
 protected:
@ -845,7 +885,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
    return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
  }
-#ifndef __CUDACC__
+#ifndef EIGEN_GPUCC
  template<int LoadMode,typename PacketType>
  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
  {
@ -889,7 +929,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
    return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
  }
-#ifndef __CUDACC__
+#ifndef EIGEN_GPUCC
  template<int LoadMode,typename PacketType>
  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
  {
--- a/Eigen/src/Core/Random.h
+++ b/Eigen/src/Core/Random.h
@ -128,7 +128,7 @@ DenseBase<Derived>::Random()
  * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index)
  */
 template<typename Derived>
-inline Derived& DenseBase<Derived>::setRandom()
+EIGEN_DEVICE_FUNC inline Derived& DenseBase<Derived>::setRandom()
 {
  return *this = Random(rows(), cols());
 }
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@ -23,22 +23,22 @@ namespace internal {
 * Part 1 : the logic deciding a strategy for vectorization and unrolling
 ***************************************************************************/
-template<typename Func, typename Derived>
+template<typename Func, typename Evaluator>
 struct redux_traits
 {
 public:
-    typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType;
+    typedef typename find_best_packet<typename Evaluator::Scalar,Evaluator::SizeAtCompileTime>::type PacketType;
  enum {
    PacketSize = unpacket_traits<PacketType>::size,
-    InnerMaxSize = int(Derived::IsRowMajor)
+    InnerMaxSize = int(Evaluator::IsRowMajor)
-                 ? Derived::MaxColsAtCompileTime
+                 ? Evaluator::MaxColsAtCompileTime
-                 : Derived::MaxRowsAtCompileTime
+                 : Evaluator::MaxRowsAtCompileTime
  };
  enum {
-    MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
+    MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit)
                  && (functor_traits<Func>::PacketAccess),
-    MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit),
+    MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit),
    MaySliceVectorize  = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize
  };
@ -51,8 +51,8 @@ public:
 public:
  enum {
-    Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost
+    Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
-         : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
+         : Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
    UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
  };
@ -64,9 +64,9 @@ public:
 #ifdef EIGEN_DEBUG_ASSIGN
  static void debug()
  {
-    std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl;
+    std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl;
    std::cerr.setf(std::ios::hex, std::ios::basefield);
-    EIGEN_DEBUG_VAR(Derived::Flags)
+    EIGEN_DEBUG_VAR(Evaluator::Flags)
    std::cerr.unsetf(std::ios::hex);
    EIGEN_DEBUG_VAR(InnerMaxSize)
    EIGEN_DEBUG_VAR(PacketSize)
@ -87,88 +87,88 @@ public:
 /*** no vectorization ***/
-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length>
 struct redux_novec_unroller
 {
  enum {
    HalfLength = Length/2
  };
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func)
  {
-    return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
+    return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
-                redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func));
+                redux_novec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func));
  }
 };
-template<typename Func, typename Derived, int Start>
+template<typename Func, typename Evaluator, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 1>
+struct redux_novec_unroller<Func, Evaluator, Start, 1>
 {
  enum {
-    outer = Start / Derived::InnerSizeAtCompileTime,
+    outer = Start / Evaluator::InnerSizeAtCompileTime,
-    inner = Start % Derived::InnerSizeAtCompileTime
+    inner = Start % Evaluator::InnerSizeAtCompileTime
  };
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&)
  {
-    return mat.coeffByOuterInner(outer, inner);
+    return eval.coeffByOuterInner(outer, inner);
  }
 };
 // This is actually dead code and will never be called. It is required
 // to prevent false warnings regarding failed inlining though
 // for 0 length run() will never be called at all.
-template<typename Func, typename Derived, int Start>
+template<typename Func, typename Evaluator, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 0>
+struct redux_novec_unroller<Func, Evaluator, Start, 0>
 {
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
  EIGEN_DEVICE_FUNC 
-  static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
+  static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
 };
 /*** vectorization ***/
-template<typename Func, typename Derived, int Start, int Length>
+template<typename Func, typename Evaluator, int Start, int Length>
 struct redux_vec_unroller
 {
  enum {
-    PacketSize = redux_traits<Func, Derived>::PacketSize,
+    PacketSize = redux_traits<Func, Evaluator>::PacketSize,
    HalfLength = Length/2
  };
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
-  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
+  static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func& func)
  {
    return func.packetOp(
-            redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
+            redux_vec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
-            redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) );
+            redux_vec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func) );
  }
 };
-template<typename Func, typename Derived, int Start>
+template<typename Func, typename Evaluator, int Start>
-struct redux_vec_unroller<Func, Derived, Start, 1>
+struct redux_vec_unroller<Func, Evaluator, Start, 1>
 {
  enum {
-    index = Start * redux_traits<Func, Derived>::PacketSize,
+    index = Start * redux_traits<Func, Evaluator>::PacketSize,
-    outer = index / int(Derived::InnerSizeAtCompileTime),
+    outer = index / int(Evaluator::InnerSizeAtCompileTime),
-    inner = index % int(Derived::InnerSizeAtCompileTime),
+    inner = index % int(Evaluator::InnerSizeAtCompileTime),
-    alignment = Derived::Alignment
+    alignment = Evaluator::Alignment
  };
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
-  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
+  static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func&)
  {
-    return mat.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
+    return eval.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
  }
 };
@ -176,53 +176,65 @@ struct redux_vec_unroller<Func, Derived, Start, 1>
 * Part 3 : implementation of all cases
 ***************************************************************************/
-template<typename Func, typename Derived,
+template<typename Func, typename Evaluator,
-         int Traversal = redux_traits<Func, Derived>::Traversal,
+         int Traversal = redux_traits<Func, Evaluator>::Traversal,
-         int Unrolling = redux_traits<Func, Derived>::Unrolling
+         int Unrolling = redux_traits<Func, Evaluator>::Unrolling
 >
 struct redux_impl;
-template<typename Func, typename Derived>
+template<typename Func, typename Evaluator>
-struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
+struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
-  EIGEN_DEVICE_FUNC
+
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+  template<typename XprType>
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
  Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
  {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
    Scalar res;
-    res = mat.coeffByOuterInner(0, 0);
+    res = eval.coeffByOuterInner(0, 0);
-    for(Index i = 1; i < mat.innerSize(); ++i)
+    for(Index i = 1; i < xpr.innerSize(); ++i)
-      res = func(res, mat.coeffByOuterInner(0, i));
+      res = func(res, eval.coeffByOuterInner(0, i));
-    for(Index i = 1; i < mat.outerSize(); ++i)
+    for(Index i = 1; i < xpr.outerSize(); ++i)
-      for(Index j = 0; j < mat.innerSize(); ++j)
+      for(Index j = 0; j < xpr.innerSize(); ++j)
-        res = func(res, mat.coeffByOuterInner(i, j));
+        res = func(res, eval.coeffByOuterInner(i, j));
    return res;
  }
 };
-template<typename Func, typename Derived>
+template<typename Func, typename Evaluator>
-struct redux_impl<Func,Derived, DefaultTraversal, CompleteUnrolling>
+struct redux_impl<Func,Evaluator, DefaultTraversal, CompleteUnrolling>
-  : public redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime>
+  : redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime>
 {};
 template<typename Func, typename Derived>
 struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
+  typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename Evaluator::Scalar Scalar;
-
+  template<typename XprType>
-  static Scalar run(const Derived &mat, const Func& func)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
  Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/)
  {
-    const Index size = mat.size();
+    return Base::run(eval,func);
  }
 };
 template<typename Func, typename Evaluator>
 struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
 {
  typedef typename Evaluator::Scalar Scalar;
  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
  template<typename XprType>
  static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
  {
    const Index size = xpr.size();
-    const Index packetSize = redux_traits<Func, Derived>::PacketSize;
+    const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
    const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
    enum {
-      alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
+      alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
-      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
+      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
    };
-    const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
+    const Index alignedStart = internal::first_default_aligned(xpr);
    const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
    const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
    const Index alignedEnd2 = alignedStart + alignedSize2;
@ -230,34 +242,34 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
    Scalar res;
    if(alignedSize)
    {
-      PacketScalar packet_res0 = mat.template packet<alignment,PacketScalar>(alignedStart);
+      PacketScalar packet_res0 = eval.template packet<alignment,PacketScalar>(alignedStart);
      if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
      {
-        PacketScalar packet_res1 = mat.template packet<alignment,PacketScalar>(alignedStart+packetSize);
+        PacketScalar packet_res1 = eval.template packet<alignment,PacketScalar>(alignedStart+packetSize);
        for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
        {
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(index));
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(index));
-          packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment,PacketScalar>(index+packetSize));
+          packet_res1 = func.packetOp(packet_res1, eval.template packet<alignment,PacketScalar>(index+packetSize));
        }
        packet_res0 = func.packetOp(packet_res0,packet_res1);
        if(alignedEnd>alignedEnd2)
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(alignedEnd2));
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(alignedEnd2));
      }
      res = func.predux(packet_res0);
      for(Index index = 0; index < alignedStart; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
      for(Index index = alignedEnd; index < size; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
    }
    else // too small to vectorize anything.
         // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
    {
-      res = mat.coeff(0);
+      res = eval.coeff(0);
      for(Index index = 1; index < size; ++index)
-        res = func(res,mat.coeff(index));
+        res = func(res,eval.coeff(index));
    }
    return res;
@ -265,130 +277,106 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
 };
 // NOTE: for SliceVectorizedTraversal we simply bypass unrolling
-template<typename Func, typename Derived, int Unrolling>
+template<typename Func, typename Evaluator, int Unrolling>
-struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
+struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
 {
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketType;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
-  EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
+  template<typename XprType>
  EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
  {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
-    const Index innerSize = mat.innerSize();
+    const Index innerSize = xpr.innerSize();
-    const Index outerSize = mat.outerSize();
+    const Index outerSize = xpr.outerSize();
    enum {
-      packetSize = redux_traits<Func, Derived>::PacketSize
+      packetSize = redux_traits<Func, Evaluator>::PacketSize
    };
    const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
    Scalar res;
    if(packetedInnerSize)
    {
-      PacketType packet_res = mat.template packet<Unaligned,PacketType>(0,0);
+      PacketType packet_res = eval.template packet<Unaligned,PacketType>(0,0);
      for(Index j=0; j<outerSize; ++j)
        for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
-          packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned,PacketType>(j,i));
+          packet_res = func.packetOp(packet_res, eval.template packetByOuterInner<Unaligned,PacketType>(j,i));
      res = func.predux(packet_res);
      for(Index j=0; j<outerSize; ++j)
        for(Index i=packetedInnerSize; i<innerSize; ++i)
-          res = func(res, mat.coeffByOuterInner(j,i));
+          res = func(res, eval.coeffByOuterInner(j,i));
    }
    else // too small to vectorize anything.
         // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
    {
-      res = redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>::run(mat, func);
+      res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func, xpr);
    }
    return res;
  }
 };
-template<typename Func, typename Derived>
+template<typename Func, typename Evaluator>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
 {
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
-  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
  enum {
-    PacketSize = redux_traits<Func, Derived>::PacketSize,
+    PacketSize = redux_traits<Func, Evaluator>::PacketSize,
-    Size = Derived::SizeAtCompileTime,
+    Size = Evaluator::SizeAtCompileTime,
    VectorizedSize = (Size / PacketSize) * PacketSize
  };
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
+
  template<typename XprType>
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
  Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr)
  {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    EIGEN_ONLY_USED_FOR_DEBUG(xpr)
    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
    if (VectorizedSize > 0) {
-      Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+      Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::run(eval,func));
      if (VectorizedSize != Size)
-        res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
+        res = func(res,redux_novec_unroller<Func, Evaluator, VectorizedSize, Size-VectorizedSize>::run(eval,func));
      return res;
    }
    else {
-      return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
+      return redux_novec_unroller<Func, Evaluator, 0, Size>::run(eval,func);
    }
  }
 };
 // evaluator adaptor
 template<typename _XprType>
-class redux_evaluator
+class redux_evaluator : public internal::evaluator<_XprType>
 {
  typedef internal::evaluator<_XprType> Base;
 public:
  typedef _XprType XprType;
-  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
+  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename XprType::PacketScalar PacketScalar;
  typedef typename XprType::PacketReturnType PacketReturnType;
  enum {
    MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
    MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
    // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
-    Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
+    Flags = Base::Flags & ~DirectAccessBit,
    IsRowMajor = XprType::IsRowMajor,
    SizeAtCompileTime = XprType::SizeAtCompileTime,
-    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
+    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
    CoeffReadCost = evaluator<XprType>::CoeffReadCost,
    Alignment = evaluator<XprType>::Alignment
  };
  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
  EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
  EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
  EIGEN_DEVICE_FUNC
  CoeffReturnType coeff(Index row, Index col) const
  { return m_evaluator.coeff(row, col); }
  EIGEN_DEVICE_FUNC
  CoeffReturnType coeff(Index index) const
  { return m_evaluator.coeff(index); }
  template<int LoadMode, typename PacketType>
  PacketType packet(Index row, Index col) const
  { return m_evaluator.template packet<LoadMode,PacketType>(row, col); }
  template<int LoadMode, typename PacketType>
  PacketType packet(Index index) const
  { return m_evaluator.template packet<LoadMode,PacketType>(index); }
  EIGEN_DEVICE_FUNC
  CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
-  { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
  template<int LoadMode, typename PacketType>
  PacketType packetByOuterInner(Index outer, Index inner) const
-  { return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  { return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
  const XprType & nestedExpression() const { return m_xpr; }
 protected:
  internal::evaluator<XprType> m_evaluator;
  const XprType &m_xpr;
 };
 } // end namespace internal
@ -407,7 +395,7 @@ protected:
  */
 template<typename Derived>
 template<typename Func>
-typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::redux(const Func& func) const
 {
  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
@ -415,14 +403,16 @@ DenseBase<Derived>::redux(const Func& func) const
  typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
  ThisEvaluator thisEval(derived());
-  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
+  // The initial expression is passed to the reducer as an additional argument instead of
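  // passing it as a member of redux_evaluator to help the compiler. NOTE(review): the original sentence was truncated after "to help" -- confirm the intended rationale.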
  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
 }
 /** \returns the minimum of all coefficients of \c *this.
  * \warning the result is undefined if \c *this contains NaN.
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff() const
 {
  return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar>());
@ -432,7 +422,7 @@ DenseBase<Derived>::minCoeff() const
  * \warning the result is undefined if \c *this contains NaN.
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff() const
 {
  return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar>());
@ -445,7 +435,7 @@ DenseBase<Derived>::maxCoeff() const
  * \sa trace(), prod(), mean()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::sum() const
 {
  if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
@ -458,7 +448,7 @@ DenseBase<Derived>::sum() const
 * \sa trace(), prod(), sum()
 */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::mean() const
 {
 #ifdef __INTEL_COMPILER
@ -479,7 +469,7 @@ DenseBase<Derived>::mean() const
  * \sa sum(), mean(), trace()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::prod() const
 {
  if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
@ -494,7 +484,7 @@ DenseBase<Derived>::prod() const
  * \sa diagonal(), sum()
  */
 template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 MatrixBase<Derived>::trace() const
 {
  return derived().diagonal().sum();
--- a/Eigen/src/Core/Ref.h
+++ b/Eigen/src/Core/Ref.h
@ -95,6 +95,8 @@ protected:
  template<typename Expression>
  EIGEN_DEVICE_FUNC void construct(Expression& expr)
  {
    EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression);
    if(PlainObjectType::RowsAtCompileTime==1)
    {
      eigen_assert(expr.rows()==1 || expr.cols()==1);
--- a/Eigen/src/Core/Replicate.h
+++ b/Eigen/src/Core/Replicate.h
@ -115,7 +115,7 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
  */
 template<typename Derived>
 template<int RowFactor, int ColFactor>
-const Replicate<Derived,RowFactor,ColFactor>
+EIGEN_DEVICE_FUNC const Replicate<Derived,RowFactor,ColFactor>
 DenseBase<Derived>::replicate() const
 {
  return Replicate<Derived,RowFactor,ColFactor>(derived());
@ -130,7 +130,7 @@ DenseBase<Derived>::replicate() const
  * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate
  */
 template<typename ExpressionType, int Direction>
-const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
+EIGEN_DEVICE_FUNC const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
 VectorwiseOp<ExpressionType,Direction>::replicate(Index factor) const
 {
  return typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
--- a/Eigen/src/Core/ReturnByValue.h
+++ b/Eigen/src/Core/ReturnByValue.h
@ -79,7 +79,7 @@ template<typename Derived> class ReturnByValue
 template<typename Derived>
 template<typename OtherDerived>
-Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
+EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
 {
  other.evalTo(derived());
  return derived();
--- a/Eigen/src/Core/Reverse.h
+++ b/Eigen/src/Core/Reverse.h
@ -114,7 +114,7 @@ template<typename MatrixType, int Direction> class Reverse
  *
  */
 template<typename Derived>
-inline typename DenseBase<Derived>::ReverseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ReverseReturnType
 DenseBase<Derived>::reverse()
 {
  return ReverseReturnType(derived());
@ -136,7 +136,7 @@ DenseBase<Derived>::reverse()
  *
  * \sa VectorwiseOp::reverseInPlace(), reverse() */
 template<typename Derived>
-inline void DenseBase<Derived>::reverseInPlace()
+EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::reverseInPlace()
 {
  if(cols()>rows())
  {
@ -201,7 +201,7 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
  *
  * \sa DenseBase::reverseInPlace(), reverse() */
 template<typename ExpressionType, int Direction>
-void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
+EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
 {
  // All the work is delegated to the helper specialized on Direction; it is handed
  // the wrapped expression obtained through const_cast_derived().
  internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
 }
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@ -71,7 +71,9 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
    /** Builds a view that stores \a matrix in \c m_matrix.
      * The new version of the constructor adds a compile-time check that
      * \c UpLo is either \c Lower or \c Upper. */
    EIGEN_DEVICE_FUNC
    explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
-    {}
+    {
      // Any other value of UpLo is rejected at compile time.
      EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY);
    }
    /** \returns the number of rows reported by the nested expression \c m_matrix */
    EIGEN_DEVICE_FUNC
    inline Index rows() const { return m_matrix.rows(); }
@ -189,7 +191,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
                                   TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2);
    }
-    typedef SelfAdjointView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
+    typedef SelfAdjointView<const MatrixConjugateReturnType,UpLo> ConjugateReturnType;
    /** \sa MatrixBase::conjugate() const */
    EIGEN_DEVICE_FUNC
    inline const ConjugateReturnType conjugate() const
@ -322,7 +324,7 @@ public:
 /** This is the const version of MatrixBase::selfadjointView() */
 template<typename Derived>
 template<unsigned int UpLo>
-typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView() const
 {
  return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
@ -339,7 +341,7 @@ MatrixBase<Derived>::selfadjointView() const
  */
 template<typename Derived>
 template<unsigned int UpLo>
-typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView()
 {
  return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
--- a/Eigen/src/Core/SelfCwiseBinaryOp.h
+++ b/Eigen/src/Core/SelfCwiseBinaryOp.h
@ -15,33 +15,29 @@ namespace Eigen {
 // TODO generalize the scalar type of 'other'
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
 {
  // The scalar is expanded into a constant expression with the same rows()/cols() as
  // *this, which is then combined into *this with mul_assign_op.
  typedef typename Derived::PlainObject PlainObject;
  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>());
  // Return the derived object so that the compound assignment can be chained.
  return derived();
 }
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
 {
  // The scalar is expanded into a constant expression with the same rows()/cols() as
  // *this, which is then combined into *this with add_assign_op.
  typedef typename Derived::PlainObject PlainObject;
  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>());
  // Return the derived object so that the compound assignment can be chained.
  return derived();
 }
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
 {
  // The scalar is expanded into a constant expression with the same rows()/cols() as
  // *this, which is then combined into *this with sub_assign_op.
  typedef typename Derived::PlainObject PlainObject;
  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>());
  // Return the derived object so that the compound assignment can be chained.
  return derived();
 }
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
 {
  // The scalar is expanded into a constant expression with the same rows()/cols() as
  // *this, which is then combined into *this with div_assign_op.
  typedef typename Derived::PlainObject PlainObject;
  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>());
  // Return the derived object so that the compound assignment can be chained.
  return derived();
 }
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h
@ -34,12 +34,12 @@ template<typename Decomposition, typename RhsType,typename StorageKind> struct s
 // Specialization of solve_traits for the Dense storage kind. It only defines
 // PlainObject, the plain type associated with Solve<Decomposition,RhsType>.
 template<typename Decomposition, typename RhsType>
 struct solve_traits<Decomposition,RhsType,Dense>
 {
  // Template arguments, in order: the scalar type of the right-hand side, the
  // decomposition's column count, the right-hand side's column count, the
  // right-hand side's storage options, and the two matching Max*AtCompileTime bounds.
  // The change replaces a direct Matrix<...> by make_proper_matrix_type<...>::type
  // with the same argument list.
  // NOTE(review): make_proper_matrix_type is defined outside this hunk; presumably it
  // adjusts the storage options for vector-shaped results -- confirm.
-  typedef Matrix<typename RhsType::Scalar,
+  typedef typename make_proper_matrix_type<typename RhsType::Scalar,
                 Decomposition::ColsAtCompileTime,
                 RhsType::ColsAtCompileTime,
                 RhsType::PlainObject::Options,
                 Decomposition::MaxColsAtCompileTime,
-                 RhsType::MaxColsAtCompileTime> PlainObject;  
+                 RhsType::MaxColsAtCompileTime>::type PlainObject;
 };
 template<typename Decomposition, typename RhsType>
@ -181,7 +181,7 @@ struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<t
  }
 };
-} // end namepsace internal
+} // end namespace internal
 } // end namespace Eigen
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@ -164,7 +164,7 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename MatrixType, unsigned int Mode>
 template<int Side, typename OtherDerived>
-void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
 {
  OtherDerived& other = _other.const_cast_derived();
  eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
--- a/Eigen/src/Core/SolverBase.h
+++ b/Eigen/src/Core/SolverBase.h
@ -56,7 +56,8 @@ class SolverBase : public EigenBase<Derived>
      MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
                                                             internal::traits<Derived>::MaxColsAtCompileTime>::ret),
      IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
-                           || internal::traits<Derived>::MaxColsAtCompileTime == 1
+                           || internal::traits<Derived>::MaxColsAtCompileTime == 1,
      NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2
    };
    /** Default constructor */
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@ -50,6 +50,71 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc
    ssq += (bl*invScale).squaredNorm();
 }
 /** \internal
   * Feeds the coefficients of \a vec to stable_norm_kernel() piece by piece, updating the
   * running accumulators \a ssq, \a scale and \a invScale in place.
   *
   * \a vec is first bound to \c copy through nested_eval. A leading piece made of the
   * first first_default_aligned(copy) coefficients (if any) is processed on its own;
   * the remainder is processed in consecutive segments of at most \c blockSize (4096)
   * coefficients each.
   */
 template<typename VectorType, typename RealScalar>
 void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale)
 {
  typedef typename VectorType::Scalar Scalar;
  const Index blockSize = 4096;
  typedef typename internal::nested_eval<VectorType,2>::type VectorTypeCopy;
  typedef typename internal::remove_all<VectorTypeCopy>::type VectorTypeCopyClean;
  const VectorTypeCopy copy(vec);
  // CanAlign decides how each segment is handed to the kernel (see SegmentWrapper below).
  // It requires direct access or a non-zero evaluator alignment, a block that fits within
  // EIGEN_STACK_ALLOCATION_LIMIT, and static alignment being enabled.
  enum {
    CanAlign = (   (int(VectorTypeCopyClean::Flags)&DirectAccessBit)
                || (int(internal::evaluator<VectorTypeCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
               ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
                 && (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
  };
  // When CanAlign holds, each segment is wrapped in a Ref to a column vector whose maximal
  // size is blockSize; otherwise the segment expression type is used as is.
  typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<VectorTypeCopyClean>::Alignment>,
                                                   typename VectorTypeCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
  Index n = vec.size();
  // bi: index returned by first_default_aligned(); everything before it is handled separately.
  Index bi = internal::first_default_aligned(copy);
  if (bi>0)
    internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
  // Remaining coefficients, blockSize at a time; the last segment is shortened to n - bi.
  for (; bi<n; bi+=blockSize)
    internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
 }
 /** \internal
   * Overload enabled (through enable_if) when \c VectorType::IsVectorAtCompileTime is true.
   *
   * \returns abs(vec.coeff(0)) when \a vec has exactly one coefficient; otherwise
   * scale * sqrt(ssq), where \c scale and \c ssq are the accumulators filled by
   * stable_norm_impl_inner_step().
   */
 template<typename VectorType>
 typename VectorType::RealScalar
 stable_norm_impl(const VectorType &vec, typename enable_if<VectorType::IsVectorAtCompileTime>::type* = 0 )
 {
  // Make the std overloads visible to the unqualified sqrt/abs calls below.
  using std::sqrt;
  using std::abs;
  Index n = vec.size();
  // Single coefficient: its absolute value is returned directly, no accumulation needed.
  if(n==1)
    return abs(vec.coeff(0));
  typedef typename VectorType::RealScalar RealScalar;
  // Initial state of the accumulators passed to the inner step.
  RealScalar scale(0);
  RealScalar invScale(1);
  RealScalar ssq(0); // sum of squares
  stable_norm_impl_inner_step(vec, ssq, scale, invScale);
  return scale * sqrt(ssq);
 }
 /** \internal
   * Overload enabled (through enable_if) when \c MatrixType::IsVectorAtCompileTime is false.
   *
   * Runs stable_norm_impl_inner_step() on every inner vector of \a mat, sharing one set
   * of accumulators across all of them, and \returns scale * sqrt(ssq).
   */
 template<typename MatrixType>
 typename MatrixType::RealScalar
 stable_norm_impl(const MatrixType &mat, typename enable_if<!MatrixType::IsVectorAtCompileTime>::type* = 0 )
 {
  // Make the std overload visible to the unqualified sqrt call below.
  using std::sqrt;
  typedef typename MatrixType::RealScalar RealScalar;
  // Initial state of the accumulators; they persist across the loop iterations.
  RealScalar scale(0);
  RealScalar invScale(1);
  RealScalar ssq(0); // sum of squares
  // One inner vector per outer index j.
  for(Index j=0; j<mat.outerSize(); ++j)
    stable_norm_impl_inner_step(mat.innerVector(j), ssq, scale, invScale);
  return scale * sqrt(ssq);
 }
 template<typename Derived>
 inline typename NumTraits<typename traits<Derived>::Scalar>::Real
 blueNorm_impl(const EigenBase<Derived>& _vec)
@ -74,7 +139,7 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
    // are used. For any specific computer, each of the assignment
    // statements can be replaced
    ibeta = std::numeric_limits<RealScalar>::radix;                 // base for floating-point numbers
-    it    = std::numeric_limits<RealScalar>::digits;                // number of base-beta digits in mantissa
+    it    = NumTraits<RealScalar>::digits();                        // number of base-beta digits in mantissa
    iemin = std::numeric_limits<RealScalar>::min_exponent;          // minimum exponent
    iemax = std::numeric_limits<RealScalar>::max_exponent;          // maximum exponent
    rbig  = (std::numeric_limits<RealScalar>::max)();               // largest floating-point number
@ -98,12 +163,16 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
  RealScalar asml = RealScalar(0);
  RealScalar amed = RealScalar(0);
  RealScalar abig = RealScalar(0);
-  for(typename Derived::InnerIterator it(vec, 0); it; ++it)
+
  for(Index j=0; j<vec.outerSize(); ++j)
  {
-    RealScalar ax = abs(it.value());
+    for(typename Derived::InnerIterator it(vec, j); it; ++it)
-    if(ax > ab2)     abig += numext::abs2(ax*s2m);
+    {
-    else if(ax < b1) asml += numext::abs2(ax*s1m);
+      RealScalar ax = abs(it.value());
-    else             amed += numext::abs2(ax);
+      if(ax > ab2)     abig += numext::abs2(ax*s2m);
      else if(ax < b1) asml += numext::abs2(ax*s1m);
      else             amed += numext::abs2(ax);
    }
  }
  if(amed!=amed)
    return amed;  // we got a NaN
@ -156,35 +225,7 @@ template<typename Derived>
 inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 MatrixBase<Derived>::stableNorm() const
 {
-  using std::sqrt;
+  return internal::stable_norm_impl(derived());
  using std::abs;
  const Index blockSize = 4096;
  RealScalar scale(0);
  RealScalar invScale(1);
  RealScalar ssq(0); // sum of square
  typedef typename internal::nested_eval<Derived,2>::type DerivedCopy;
  typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean;
  DerivedCopy copy(derived());
  enum {
    CanAlign = (   (int(DerivedCopyClean::Flags)&DirectAccessBit)
                || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
               ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT) // ifwe cannot allocate on the stack, then let's not bother about this optimization
  };
  typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
                                                   typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
  Index n = size();
  if(n==1)
    return abs(this->coeff(0));
  Index bi = internal::first_default_aligned(copy);
  if (bi>0)
    internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
  for (; bi<n; bi+=blockSize)
    internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
  return scale * sqrt(ssq);
 }
 /** \returns the \em l2 norm of \c *this using the Blue's algorithm.
@ -212,7 +253,10 @@ template<typename Derived>
 inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 MatrixBase<Derived>::hypotNorm() const
 {
-  return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
+  if(size()==1)
    return numext::abs(coeff(0,0));
  else
    return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
 }
 } // end namespace Eigen
--- a/Eigen/src/Core/Transpose.h
+++ b/Eigen/src/Core/Transpose.h
@ -79,6 +79,7 @@ template<typename MatrixType> class Transpose
    nestedExpression() { return m_matrix; }
    /** \internal */
    EIGEN_DEVICE_FUNC
    void resize(Index nrows, Index ncols) {
      // The nested expression is resized with the two dimensions swapped.
      m_matrix.resize(ncols,nrows);
    }
@ -168,7 +169,7 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
  *
  * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
-inline Transpose<Derived>
+EIGEN_DEVICE_FUNC inline Transpose<Derived>
 DenseBase<Derived>::transpose()
 {
  return TransposeReturnType(derived());
@ -180,7 +181,7 @@ DenseBase<Derived>::transpose()
  *
  * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
-inline typename DenseBase<Derived>::ConstTransposeReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ConstTransposeReturnType
 DenseBase<Derived>::transpose() const
 {
  return ConstTransposeReturnType(derived());
@ -206,7 +207,7 @@ DenseBase<Derived>::transpose() const
  *
  * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */
 template<typename Derived>
-inline const typename MatrixBase<Derived>::AdjointReturnType
+EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::AdjointReturnType
 MatrixBase<Derived>::adjoint() const
 {
  return AdjointReturnType(this->transpose());
@ -281,7 +282,7 @@ struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non squ
  *
  * \sa transpose(), adjoint(), adjointInPlace() */
 template<typename Derived>
-inline void DenseBase<Derived>::transposeInPlace()
+EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::transposeInPlace()
 {
  eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic))
               && "transposeInPlace() called on a non-square non-resizable matrix");
@ -312,7 +313,7 @@ inline void DenseBase<Derived>::transposeInPlace()
  *
  * \sa transpose(), adjoint(), transposeInPlace() */
 template<typename Derived>
-inline void MatrixBase<Derived>::adjointInPlace()
+EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::adjointInPlace()
 {
  // eval() is called on the adjoint expression before the result is assigned back to
  // derived(). NOTE(review): presumably this materializes a temporary so the assignment
  // does not read from the object it is writing to -- confirm.
  derived() = adjoint().eval();
 }
--- a/Eigen/src/Core/Transpositions.h
+++ b/Eigen/src/Core/Transpositions.h
@ -84,7 +84,7 @@ class TranspositionsBase
    }
    // FIXME: do we want such methods ?
-    // might be usefull when the target matrix expression is complex, e.g.:
+    // might be useful when the target matrix expression is complex, e.g.:
    // object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..);
    /*
    template<typename MatrixType>
@ -384,7 +384,7 @@ class Transpose<TranspositionsBase<TranspositionsDerived> >
    const Product<OtherDerived, Transpose, AliasFreeProduct>
    operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trt)
    {
-      return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt.derived());
+      return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt);
    }
    /** \returns the \a matrix with the inverse transpositions applied to the rows.
--- a/Eigen/src/Core/TriangularMatrix.h
+++ b/Eigen/src/Core/TriangularMatrix.h
@ -65,6 +65,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
    inline Index innerStride() const { return derived().innerStride(); }
    // dummy resize function
    EIGEN_DEVICE_FUNC
    void resize(Index rows, Index cols)
    {
      EIGEN_UNUSED_VARIABLE(rows);
@ -470,7 +471,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
      * \a Side==OnTheLeft (the default), or the right-inverse-multiply  \a other * inverse(\c *this) if
      * \a Side==OnTheRight.
      *
-      * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
+      * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
      *
      * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
      * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
@ -488,7 +489,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
      * \sa TriangularView::solveInPlace()
      */
    template<int Side, typename Other>
    EIGEN_DEVICE_FUNC
    inline const internal::triangular_solve_retval<Side,TriangularViewType, Other>
    solve(const MatrixBase<Other>& other) const;
@ -497,7 +497,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
      * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
      * This function will const_cast it, so constness isn't honored here.
      *
-      * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
+      * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
      *
      * See TriangularView:solve() for the details.
      */
@ -554,7 +554,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
 // FIXME should we keep that possibility
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-inline TriangularView<MatrixType, Mode>&
+EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
 TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other)
 {
  internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
@ -564,7 +564,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDer
 // FIXME should we keep that possibility
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
 {
  // The dense argument is first viewed through triangularView<Mode>() (same Mode as this
  // view) and that view is assigned to derived() with the no-alias assignment routine.
  internal::call_assignment_no_alias(derived(), other.template triangularView<Mode>());
 }
@ -573,7 +573,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<Ot
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-inline TriangularView<MatrixType, Mode>&
+EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
 TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<OtherDerived>& other)
 {
  eigen_assert(Mode == int(OtherDerived::Mode));
@ -583,7 +583,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<Othe
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
 {
  eigen_assert(Mode == int(OtherDerived::Mode));
  internal::call_assignment_no_alias(derived(), other.derived());
@ -598,7 +598,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBas
  * If the matrix is triangular, the opposite part is set to zero. */
 template<typename Derived>
 template<typename DenseDerived>
-void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
+EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
 {
  // Thin forwarder: the evaluation itself is performed by evalToLazy() on the derived
  // type of the destination.
  evalToLazy(other.derived());
 }
@ -624,6 +624,7 @@ void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
  */
 template<typename Derived>
 template<unsigned int Mode>
 EIGEN_DEVICE_FUNC
 typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView()
 {
@ -633,6 +634,7 @@ MatrixBase<Derived>::triangularView()
 /** This is the const version of MatrixBase::triangularView() */
 template<typename Derived>
 template<unsigned int Mode>
 EIGEN_DEVICE_FUNC
 typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView() const
 {
@ -715,6 +717,7 @@ struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
 {
  typedef TriangularView<MatrixType,Mode> XprType;
  typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
  EIGEN_DEVICE_FUNC
  unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
 };
@ -930,7 +933,7 @@ struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
  * If the matrix is triangular, the opposite part is set to zero. */
 template<typename Derived>
 template<typename DenseDerived>
-void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
+EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
 {
  other.derived().resize(this->rows(), this->cols());
  internal::call_triangular_assignment_loop<Derived::Mode,(Derived::Mode&SelfAdjoint)==0 /* SetOpposite */>(other.derived(), derived().nestedExpression());
--- a/Eigen/src/Core/VectorBlock.h
+++ b/Eigen/src/Core/VectorBlock.h
@ -35,7 +35,7 @@ struct traits<VectorBlock<VectorType, Size> >
  * It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment<int>(Index) and
  * most of the time this is the only way it is used.
  *
-  * However, if you want to directly maniputate sub-vector expressions,
+  * However, if you want to directly manipulate sub-vector expressions,
  * for instance if you want to write a function returning such an expression, you
  * will need to use this class.
  *
--- a/Eigen/src/Core/VectorwiseOp.h
+++ b/Eigen/src/Core/VectorwiseOp.h
@ -670,7 +670,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
  * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
  */
 template<typename Derived>
-inline typename DenseBase<Derived>::ColwiseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ColwiseReturnType
 DenseBase<Derived>::colwise()
 {
  return ColwiseReturnType(derived());
@ -684,7 +684,7 @@ DenseBase<Derived>::colwise()
  * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
  */
 template<typename Derived>
-inline typename DenseBase<Derived>::RowwiseReturnType
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::RowwiseReturnType
 DenseBase<Derived>::rowwise()
 {
  return RowwiseReturnType(derived());
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@ -204,23 +204,7 @@ template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
  }
 };
-template<> struct conj_helper<Packet8f, Packet4cf, false,false>
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f)
 {
  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
  { return padd(c, pmul(x,y)); }
  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
  { return Packet4cf(Eigen::internal::pmul(x, y.v)); }
 };
 template<> struct conj_helper<Packet4cf, Packet8f, false,false>
 {
  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
  { return padd(c, pmul(x,y)); }
  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
  { return Packet4cf(Eigen::internal::pmul(x.v, y)); }
 };
 template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
 {
@ -400,23 +384,7 @@ template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
  }
 };
-template<> struct conj_helper<Packet4d, Packet2cd, false,false>
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d)
 {
  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
  { return padd(c, pmul(x,y)); }
  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
  { return Packet2cd(Eigen::internal::pmul(x, y.v)); }
 };
 template<> struct conj_helper<Packet2cd, Packet4d, false,false>
 {
  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
  { return padd(c, pmul(x,y)); }
  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
  { return Packet2cd(Eigen::internal::pmul(x.v, y)); }
 };
 template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
 {
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@ -318,9 +318,9 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
 }
 #ifndef EIGEN_VECTORIZE_AVX512
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
 #endif
 template<> EIGEN_STRONG_INLINE float  pfirst<Packet8f>(const Packet8f& a) {
@ -343,9 +343,12 @@ template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
 {
   __m256d tmp = _mm256_shuffle_pd(a,a,5);
  return _mm256_permute2f128_pd(tmp, tmp, 1);
-
+  #if 0
  // This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
  // exhibit the same latency/throughput, but it is here for future reference/benchmarking...
  __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
    return _mm256_permute_pd(swap_halves,5);
  #endif
 }
 // pabs should be ok
@ -412,7 +415,7 @@ template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
  return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
 }
-template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
+template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a)
 {
  // Element-wise sum of the two 128-bit halves of a: the lower half is obtained with
  // _mm256_castps256_ps128 and the upper half with _mm256_extractf128_ps(a,1).
  // Only the function name differs between the old and the new line.
  return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
 }
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@ -88,9 +88,9 @@ plog<Packet16f>(const Packet16f& _x) {
  //     x = x + x - 1.0;
  //   } else { x = x - 1.0; }
  __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ);
-  Packet16f tmp = _mm512_mask_blend_ps(mask, x, _mm512_setzero_ps());
+  Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x);
  x = psub(x, p16f_1);
-  e = psub(e, _mm512_mask_blend_ps(mask, p16f_1, _mm512_setzero_ps()));
+  e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1));
  x = padd(x, tmp);
  Packet16f x2 = pmul(x, x);
@ -119,8 +119,9 @@ plog<Packet16f>(const Packet16f& _x) {
  x = padd(x, y2);
  // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
-  return _mm512_mask_blend_ps(iszero_mask, p16f_minus_inf,
+  return _mm512_mask_blend_ps(iszero_mask,
-                              _mm512_mask_blend_ps(invalid_mask, p16f_nan, x));
+                              _mm512_mask_blend_ps(invalid_mask, x, p16f_nan),
                              p16f_minus_inf);
 }
 #endif
@ -257,50 +258,39 @@ pexp<Packet8d>(const Packet8d& _x) {
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
 psqrt<Packet16f>(const Packet16f& _x) {
-  _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
+  Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
-  _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
+  __mmask16 denormal_mask = _mm512_kand(
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
+      _mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
                        _CMP_LT_OQ),
      _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
-  Packet16f neg_half = pmul(_x, p16f_minus_half);
+  Packet16f x = _mm512_rsqrt14_ps(_x);
  // select only the inverse sqrt of positive normal inputs (denormals are
  // flushed to zero and cause infs as well).
  __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
  Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x),
                                     _mm512_setzero_ps());
  // Do a single step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
-  // Multiply the original _x by it's reciprocal square root to extract the
+  // Flush results for denormals to zero.
-  // square root.
+  return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
  return pmul(_x, x);
 }
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
 psqrt<Packet8d>(const Packet8d& _x) {
-  _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
+  Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5f));
-  _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
+  __mmask16 denormal_mask = _mm512_kand(
-  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
+      _mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
                        _CMP_LT_OQ),
      _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
-  Packet8d neg_half = pmul(_x, p8d_minus_half);
+  Packet8d x = _mm512_rsqrt14_pd(_x);
-  // select only the inverse sqrt of positive normal inputs (denormals are
+  // Do a single step of Newton's iteration.
-  // flushed to zero and cause infs as well).
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5f)));
  __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
  Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x),
                                    _mm512_setzero_pd());
  // Do a first step of Newton's iteration.
  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
  // Do a second step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5f)));
-  // Multiply the original _x by it's reciprocal square root to extract the
+  return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
  // square root.
  return pmul(_x, x);
 }
 #else
 template <>
@ -333,20 +323,18 @@ prsqrt<Packet16f>(const Packet16f& _x) {
  // select only the inverse sqrt of positive normal inputs (denormals are
  // flushed to zero and cause infs as well).
  __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
-  Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(),
+  Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
                                     _mm512_rsqrt14_ps(_x));
  // Fill in NaNs and Infs for the negative/zero entries.
  __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
  Packet16f infs_and_nans = _mm512_mask_blend_ps(
-      neg_mask, p16f_nan,
+      neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
      _mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps()));
  // Do a single step of Newton's iteration.
  x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
  // Insert NaNs and Infs in all the right places.
-  return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x);
+  return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
 }
 template <>
@ -363,14 +351,12 @@ prsqrt<Packet8d>(const Packet8d& _x) {
  // select only the inverse sqrt of positive normal inputs (denormals are
  // flushed to zero and cause infs as well).
  __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
-  Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(),
+  Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
                                    _mm512_rsqrt14_pd(_x));
  // Fill in NaNs and Infs for the negative/zero entries.
  __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
  Packet8d infs_and_nans = _mm512_mask_blend_pd(
-      neg_mask, p8d_nan,
+      neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
      _mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd()));
  // Do a first step of Newton's iteration.
  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
@ -379,9 +365,9 @@ prsqrt<Packet8d>(const Packet8d& _x) {
  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
  // Insert NaNs and Infs in all the right places.
-  return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x);
+  return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
 }
-#else
+#elif defined(EIGEN_VECTORIZE_AVX512ER)
 template <>
 EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
  return _mm512_rsqrt28_ps(x);
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@ -54,6 +54,7 @@ template<> struct packet_traits<float>  : default_packet_traits
    AlignedOnScalar = 1,
    size = 16,
    HasHalfPacket = 1,
    HasBlend = 0,
 #if EIGEN_GNUC_AT_LEAST(5, 3)
 #ifdef EIGEN_VECTORIZE_AVX512DQ
    HasLog = 1,
@ -470,6 +471,8 @@ EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
  __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
  return pairs;
 }
 #ifdef EIGEN_VECTORIZE_AVX512DQ
 // Loads 4 doubles from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3,
 // a3}
 template <>
@ -481,6 +484,17 @@ EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
  x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
  return x;
 }
 #else
 // Variant of ploaddup<Packet8d> compiled in the #else branch of the
 // EIGEN_VECTORIZE_AVX512DQ check: reads the 4 doubles from[0..3] and returns
 // the packet {from[0], from[0], from[1], from[1], ..., from[3], from[3]}.
 template <>
 EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
  // Start from an all-zero packet; every lane is overwritten below.
  __m512d x = _mm512_setzero_pd();
  // Each masked broadcast writes the scalar from[i] into the two lanes
  // selected by the 2-bit mask 0x3<<(2*i) and keeps the other lanes of x.
  x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0));
  x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1));
  x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2));
  x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3));
  return x;
 }
 #endif
 // Loads 4 floats from memory and returns the packet
 // {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
@ -537,7 +551,7 @@ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
 template <>
 EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
                                                             Index stride) {
-  Packet16i stride_vector = _mm512_set1_epi32(stride);
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
  Packet16i stride_multiplier =
      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@ -547,7 +561,7 @@ EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
 template <>
 EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
                                                            Index stride) {
-  Packet8i stride_vector = _mm256_set1_epi32(stride);
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
@ -558,7 +572,7 @@ template <>
 EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
                                                         const Packet16f& from,
                                                         Index stride) {
-  Packet16i stride_vector = _mm512_set1_epi32(stride);
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
  Packet16i stride_multiplier =
      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@ -568,7 +582,7 @@ template <>
 EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
                                                         const Packet8d& from,
                                                         Index stride) {
-  Packet8i stride_vector = _mm256_set1_epi32(stride);
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
  _mm512_i32scatter_pd(to, indices, from, 8);
@ -590,9 +604,9 @@ EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
  pstore(to, pa);
 }
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
 template <>
 EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
@ -620,13 +634,13 @@ template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
 template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
 {
  // _mm512_abs_ps intrinsic not found, so hack around it
-  return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff));
+  return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
  // _mm512_abs_pd intrinsic not found, so hack around it
-  return (__m512d)_mm512_and_si512((__m512i)a,
+  return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
-                                   _mm512_set1_epi64(0x7fffffffffffffff));
+                                   _mm512_set1_epi64(0x7fffffffffffffff)));
 }
 #ifdef EIGEN_VECTORIZE_AVX512DQ
@ -646,8 +660,7 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
 #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
-  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0);        \
+  OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1);
 #else
 #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB)                    \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
@ -841,7 +854,7 @@ template<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs)
  final_1 = _mm256_add_pd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC));
-  __m512d final_output = _mm512_insertf64x4(final_output, final_0, 0);
+  __m512d final_output = _mm512_castpd256_pd512(final_0);
  return _mm512_insertf64x4(final_output, final_1, 1);
 }
@ -874,7 +887,7 @@ EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
 }
 template <>
-EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
+EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
  __m256 lane0 = _mm512_extractf32x8_ps(a, 0);
  __m256 lane1 = _mm512_extractf32x8_ps(a, 1);
@ -890,7 +903,7 @@ EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
 #endif
 }
 template <>
-EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
+EIGEN_STRONG_INLINE Packet4d predux_half_dowto4<Packet8d>(const Packet8d& a) {
  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
  __m256d res = _mm256_add_pd(lane0, lane1);
@ -1272,11 +1285,38 @@ EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
  return Packet16f();
 }
 template <>
-EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/,
+EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket,
-                                    const Packet8d& /*thenPacket*/,
+                                    const Packet8d& thenPacket,
-                                    const Packet8d& /*elsePacket*/) {
+                                    const Packet8d& elsePacket) {
-  assert(false && "To be implemented");
+  __mmask8 m = (ifPacket.select[0]   )
-  return Packet8d();
+             | (ifPacket.select[1]<<1)
             | (ifPacket.select[2]<<2)
             | (ifPacket.select[3]<<3)
             | (ifPacket.select[4]<<4)
             | (ifPacket.select[5]<<5)
             | (ifPacket.select[6]<<6)
             | (ifPacket.select[7]<<7);
  return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
 }
 // Returns a copy of `a` whose element 0 is replaced by `b`: the write mask (1)
 // selects only lane 0 for the broadcast value, all other lanes come from `a`.
 template<> EIGEN_STRONG_INLINE Packet16f pinsertfirst(const Packet16f& a, float b)
 {
  return _mm512_mask_broadcastss_ps(a, (1), _mm_load_ss(&b));
 }
 // Returns a copy of `a` whose element 0 is replaced by `b`: the write mask (1)
 // selects only lane 0 for the broadcast value, all other lanes come from `a`.
 template<> EIGEN_STRONG_INLINE Packet8d pinsertfirst(const Packet8d& a, double b)
 {
  return _mm512_mask_broadcastsd_pd(a, (1), _mm_load_sd(&b));
 }
 // Returns a copy of `a` whose element 15 (the last of 16 floats) is replaced
 // by `b`: the write mask (1<<15) selects only that lane for the broadcast value.
 template<> EIGEN_STRONG_INLINE Packet16f pinsertlast(const Packet16f& a, float b)
 {
  return _mm512_mask_broadcastss_ps(a, (1<<15), _mm_load_ss(&b));
 }
 // Returns a copy of `a` whose element 7 (the last of 8 doubles) is replaced
 // by `b`: the write mask (1<<7) selects only that lane for the broadcast value.
 template<> EIGEN_STRONG_INLINE Packet8d pinsertlast(const Packet8d& a, double b)
 {
  return _mm512_mask_broadcastsd_pd(a, (1<<7), _mm_load_sd(&b));
 }
 } // end namespace internal
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@ -224,23 +224,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
  }
 };
-template<> struct conj_helper<Packet4f, Packet2cf, false,false>
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
 {
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
  { return padd(c, pmul(x,y)); }
  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
  { return Packet2cf(internal::pmul<Packet4f>(x, y.v)); }
 };
 template<> struct conj_helper<Packet2cf, Packet4f, false,false>
 {
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
  { return padd(c, pmul(x,y)); }
  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
  { return Packet2cf(internal::pmul<Packet4f>(x.v, y)); }
 };
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
@ -416,23 +400,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
    return pconj(internal::pmul(a, b));
  }
 };
 template<> struct conj_helper<Packet2d, Packet1cd, false,false>
 {
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
  { return padd(c, pmul(x,y)); }
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
  { return Packet1cd(internal::pmul<Packet2d>(x, y.v)); }
 };
 template<> struct conj_helper<Packet1cd, Packet2d, false,false>
 {
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
  { return padd(c, pmul(x,y)); }
  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
  { return Packet1cd(internal::pmul<Packet2d>(x.v, y)); }
 };
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@ -103,7 +103,7 @@ static Packet16uc p16uc_PSET32_WODD   = vec_sld((Packet16uc) vec_splat((Packet4u
 static Packet16uc p16uc_PSET32_WEVEN  = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
 static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
 #else
-static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; 
+static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
 static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
 static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
 static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
@ -388,10 +388,30 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
  #ifdef __VSX__
  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
  Packet4f ret;
  __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
  return ret;
  #else
  return vec_min(a, b);
  #endif
 }
 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
  #ifdef __VSX__
  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
  Packet4f ret;
  __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
  return ret;
  #else
  return vec_max(a, b);
  #endif
 }
 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
@ -434,7 +454,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
  return (Packet4i) vec_perm(MSQ, LSQ, mask);    // align the data
 }
 #else
-// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
+// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
 {
  EIGEN_DEBUG_UNALIGNED_LOAD
@ -500,7 +520,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& f
  vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
 }
 #else
-// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
+// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from)
 {
  EIGEN_DEBUG_ALIGNED_STORE
@ -764,7 +784,7 @@ typedef __vector __bool long         Packet2bl;
 static Packet2l  p2l_ONE  = { 1, 1 };
 static Packet2l  p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
-static Packet2d  p2d_ONE  = { 1.0, 1.0 }; 
+static Packet2d  p2d_ONE  = { 1.0, 1.0 };
 static Packet2d  p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
 static Packet2d  p2d_MZERO = { -0.0, -0.0 };
@ -910,9 +930,21 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const
 // for some weird reasons, it has to be overloaded for packets of integers
 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
 {
  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
  Packet2d ret;
  __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
  return ret;
 }
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
 {
  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
  Packet2d ret;
  __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
  return ret;
 }
 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
@ -969,7 +1001,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
  Packet2d v[2], sum;
  v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
  v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
- 
+
 #ifdef _BIG_ENDIAN
  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
 #else
@ -1022,7 +1054,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
  Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
-  Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
+  Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
  return vec_sel(elsePacket, thenPacket, mask);
 }
 #endif // __VSX__
--- a/Eigen/src/Core/arch/CUDA/Complex.h
+++ b/Eigen/src/Core/arch/CUDA/Complex.h
@ -16,7 +16,7 @@ namespace Eigen {
 namespace internal {
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
 // Many std::complex methods such as operator+, operator-, operator* and
 // operator/ are not constexpr. Due to this, clang does not treat them as device
@ -55,7 +55,7 @@ template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T
 // Product
 template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
  enum {
-    Vectorizable = packet_traits<std::complex<T>>::HasMul
+    Vectorizable = packet_traits<std::complex<T> >::HasMul
  };
  typedef typename std::complex<T> result_type;
@ -76,7 +76,7 @@ template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> >
 // Quotient
 template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
  enum {
-    Vectorizable = packet_traits<std::complex<T>>::HasDiv
+    Vectorizable = packet_traits<std::complex<T> >::HasDiv
  };
  typedef typename std::complex<T> result_type;
--- a/Eigen/src/Core/arch/Default/ConjHelper.h
+++ b/Eigen/src/Core/arch/Default/ConjHelper.h
@ -0,0 +1,29 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #ifndef EIGEN_ARCH_CONJ_HELPER_H
 #define EIGEN_ARCH_CONJ_HELPER_H
 // Generates the two conj_helper specializations for products that mix a real
 // packet (PACKET_REAL) with a complex packet (PACKET_CPLX), one per operand
 // order. Both boolean template arguments are false (presumably the
 // "conjugate lhs/rhs" flags -- confirm against the primary conj_helper template).
 //  - pmul multiplies the real packet with the complex packet's `.v` member
 //    using pmul<PACKET_REAL> and wraps the result in a PACKET_CPLX;
 //  - pmadd returns padd(c, pmul(x, y)).
 // The expansion defines full specializations, so it must appear where
 // conj_helper, padd and pmul are visible (presumably namespace Eigen::internal).
 #define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL)                                                          \
  template<> struct conj_helper<PACKET_REAL, PACKET_CPLX, false,false> {                                          \
    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \
    { return padd(c, pmul(x,y)); }                                                                                \
    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const                        \
    { return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v)); }                                           \
  };                                                                                                              \
                                                                                                                  \
  template<> struct conj_helper<PACKET_CPLX, PACKET_REAL, false,false> {                                          \
    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \
    { return padd(c, pmul(x,y)); }                                                                                \
    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const                        \
    { return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y)); }                                           \
  };
 #endif // EIGEN_ARCH_CONJ_HELPER_H
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@ -13,7 +13,7 @@
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted.
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
@ -26,15 +26,15 @@
 // Standard 16-bit float type, mostly useful for GPUs. Defines a new
-// type Eigen::half (inheriting from CUDA's __half struct) with
+// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with
 // operator overloads such that it behaves basically as an arithmetic
 // type. It will be quite slow on CPUs (so it is recommended to stay
 // in fp32 for CPUs, except for simple parameter conversions, I/O
 // to disk and the likes), but fast on GPUs.
-#ifndef EIGEN_HALF_CUDA_H
+#ifndef EIGEN_HALF_GPU_H
-#define EIGEN_HALF_CUDA_H
+#define EIGEN_HALF_GPU_H
 #if __cplusplus > 199711L
 #define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
@ -49,39 +49,107 @@ struct half;
 namespace half_impl {
-#if !defined(EIGEN_HAS_CUDA_FP16)
+#if !defined(EIGEN_HAS_GPU_FP16)
-
+// Make our own __half_raw definition that is similar to CUDA's.
-// Make our own __half definition that is similar to CUDA's.
+struct __half_raw {
-struct __half {
+  EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
-  EIGEN_DEVICE_FUNC __half() : x(0) {}
+  explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
  explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
  unsigned short x;
 };
 #elif defined(EIGEN_HAS_HIP_FP16)
 #if defined(EIGEN_HAS_OLD_HIP_FP16)
 // Make a __half_raw definition that is
 // ++ compatible with that of Eigen and
 // ++ add an implicit conversion to the native __half of the old HIP implementation.
 //
 // Keeping ".x" as "unsigned short" keeps the interface the same between the Eigen and HIP implementation.
 //
 // In the old HIP implementation,
 //   ++ __half is a typedef of __fp16
 //   ++ the "__h*" routines take "__half" arguments
 // so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make
 // that conversion in each call to a "__h*" routine; that is why we have the "operator __half" routine.
 struct __half_raw {
  EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
  explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
  union {
    unsigned short x;
    __half data;
  };
  operator __half(void) const { return data; }
 };
 #endif
 #elif defined(EIGEN_HAS_CUDA_FP16)
 #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
 // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
 typedef __half __half_raw;
 #endif // defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
 #elif defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
 typedef cl::sycl::half __half_raw;
 #endif
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
-struct half_base : public __half {
+struct half_base : public __half_raw {
  EIGEN_DEVICE_FUNC half_base() {}
-  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {}
+  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
-  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {}
+  EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
 #if defined(EIGEN_HAS_GPU_FP16)
 #if defined(EIGEN_HAS_HIP_FP16)
  #if defined(EIGEN_HAS_OLD_HIP_FP16)
  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(__half_as_ushort(h)) {}
  #else
  EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); }
  #endif
 #elif defined(EIGEN_HAS_CUDA_FP16)
  #if (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000)
  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
  #endif
 #endif    
 #endif
 };
 } // namespace half_impl
 // Class definition.
 struct half : public half_impl::half_base {
-  #if !defined(EIGEN_HAS_CUDA_FP16)
+
-    typedef half_impl::__half __half;
+  // Writing this out as separate #if-else blocks to make the code easier to follow
-  #endif
+  // The same applies to most #if-else blocks in this file
 #if !defined(EIGEN_HAS_GPU_FP16)
  typedef half_impl::__half_raw __half_raw;
 #elif defined(EIGEN_HAS_HIP_FP16)
 #if defined(EIGEN_HAS_OLD_HIP_FP16)
  typedef half_impl::__half_raw __half_raw;
 #endif
 #elif defined(EIGEN_HAS_CUDA_FP16)
  // Note that EIGEN_CUDACC_VER is set to 0 even when compiling with HIP, so (EIGEN_CUDACC_VER < 90000) is true even for HIP!
  // So keeping this within #if defined(EIGEN_HAS_CUDA_FP16) is needed
 #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000  
  typedef half_impl::__half_raw __half_raw;
 #endif
 #endif
  EIGEN_DEVICE_FUNC half() {}
-  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+  EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
  EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
 #if defined(EIGEN_HAS_GPU_FP16)
 #if defined(EIGEN_HAS_HIP_FP16)
  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
 #elif defined(EIGEN_HAS_CUDA_FP16)
  #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
  #endif
 #endif
 #endif
  explicit EIGEN_DEVICE_FUNC half(bool b)
      : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
@ -136,72 +204,136 @@ struct half : public half_impl::half_base {
    x = other.x;
    return *this;
  }
 };
 } // end namespace Eigen
 namespace std {
 // std::numeric_limits specialization for Eigen::half. The raw bit patterns
 // used below follow the IEEE 754 binary16 layout: 1 sign bit, 5 exponent bits
 // (bias 15) and 10 stored mantissa bits.
 template<>
 struct numeric_limits<Eigen::half> {
  static const bool is_specialized = true;
  static const bool is_signed = true;
  static const bool is_integer = false;
  static const bool is_exact = false;
  static const bool has_infinity = true;
  static const bool has_quiet_NaN = true;
  static const bool has_signaling_NaN = true;
  static const float_denorm_style has_denorm = denorm_present;
  static const bool has_denorm_loss = false;
  static const std::float_round_style round_style = std::round_to_nearest;
  static const bool is_iec559 = false;
  // The set of representable values is finite, so the type is bounded.
  static const bool is_bounded = true;
  static const bool is_modulo = false;
  static const int digits = 11;       // 10 stored mantissa bits plus the implicit leading bit
  static const int digits10 = 3;      // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
  static const int max_digits10 = 5;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
  static const int radix = 2;
  static const int min_exponent = -13;
  static const int min_exponent10 = -4;
  static const int max_exponent = 16;
  static const int max_exponent10 = 4;
  static const bool traps = true;
  static const bool tinyness_before = false;
  // 0x0400: exponent field 1, mantissa 0 -> smallest positive normal value (2^-14).
  static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
  // 0xfbff: sign bit set, largest finite exponent, full mantissa -> most negative finite value.
  static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
  // 0x7bff: largest finite exponent, full mantissa -> largest finite value.
  static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
  // 0x1400: exponent field 5, mantissa 0 -> 2^-10, the gap between 1 and the
  // next representable value when 10 mantissa bits are stored.
  static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x1400); }
  static Eigen::half round_error() { return Eigen::half(0.5); }
  // 0x7c00: exponent field all ones, mantissa 0.
  static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
  // 0x7e00: exponent field all ones, top mantissa bit (the quiet bit) set.
  static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
  // 0x7d00: exponent field all ones, quiet bit clear, non-zero payload, so the
  // pattern differs from quiet_NaN().
  static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); }
  // 0x0001: exponent field 0, lowest mantissa bit set -> smallest positive subnormal.
  static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
 };
 // If std::numeric_limits<T> is specialized, should also specialize
 // std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
 // std::numeric_limits<const volatile T>
 // https://stackoverflow.com/a/16519653/
 // const-qualified form: inherits every member of numeric_limits<Eigen::half>.
 template<>
 struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {};
 // volatile-qualified form: inherits every member of numeric_limits<Eigen::half>.
 template<>
 struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {};
 // const volatile-qualified form: inherits every member of numeric_limits<Eigen::half>.
 template<>
 struct numeric_limits<const volatile Eigen::half> : numeric_limits<Eigen::half> {};
 } // end namespace std
 namespace Eigen {
 namespace half_impl {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
  (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
 // NOTE(review): the HIP half of the guard above tests HIP_DEVICE_COMPILE, while
 // the other guards in this file test EIGEN_HIP_DEVICE_COMPILE -- confirm which
 // macro is intended, since an undefined macro keeps HIP on the emulated path.
 // Intrinsics for native fp16 support. Note that on current hardware,
 // these are no faster than fp32 arithmetic (you need to use the half2
 // versions to get the ALU speed increased), but you do save the
 // conversion steps back and forth.
-__device__ half operator + (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
  // Native sum through the __hadd intrinsic. With EIGEN_CUDACC_VER >= 90000 both
  // operands are converted explicitly to ::__half first; otherwise they are
  // passed to the intrinsic as they are.
 #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
  return __hadd(::__half(a), ::__half(b));
 #else
  return __hadd(a, b);
 #endif
 }
-__device__ half operator * (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
  // Native product through the __hmul intrinsic.
  return __hmul(a, b);
 }
-__device__ half operator - (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
  // Native difference through the __hsub intrinsic.
  return __hsub(a, b);
 }
-__device__ half operator / (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
  // Uses the __hdiv intrinsic when EIGEN_CUDACC_VER >= 90000. Otherwise both
  // operands are widened with __half2float, divided as floats, and the
  // quotient is narrowed again with __float2half.
 #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
  return __hdiv(a, b);
 #else
  float num = __half2float(a);
  float denom = __half2float(b);
  return __float2half(num / denom);
 #endif
 }
-__device__ half operator - (const half& a) {
+EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
  // Native negation through the __hneg intrinsic.
  return __hneg(a);
 }
-__device__ half& operator += (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
  // Implemented in terms of the binary operator +; returns the updated left operand.
  a = a + b;
  return a;
 }
-__device__ half& operator *= (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
  // Implemented in terms of the binary operator *; returns the updated left operand.
  a = a * b;
  return a;
 }
-__device__ half& operator -= (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
  // Implemented in terms of the binary operator -; returns the updated left operand.
  a = a - b;
  return a;
 }
-__device__ half& operator /= (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
  // Implemented in terms of the binary operator /; returns the updated left operand.
  a = a / b;
  return a;
 }
-__device__ bool operator == (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
  // Native equality test through the __heq intrinsic.
  return __heq(a, b);
 }
-__device__ bool operator != (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
  // Native inequality test through the __hne intrinsic.
  return __hne(a, b);
 }
-__device__ bool operator < (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
  // Native less-than test through the __hlt intrinsic.
  return __hlt(a, b);
 }
-__device__ bool operator <= (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
  // Native less-or-equal test through the __hle intrinsic.
  return __hle(a, b);
 }
-__device__ bool operator > (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
  // Native greater-than test through the __hgt intrinsic.
  return __hgt(a, b);
 }
-__device__ bool operator >= (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
  // Native greater-or-equal test through the __hge intrinsic.
  return __hge(a, b);
 }
 #else  // Emulate support for half floats
-// Definitions for CPUs and older CUDA, mostly working through conversion
+// Definitions for CPUs and older HIP+CUDA, mostly working through conversion
 // to/from fp32.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
@ -238,10 +370,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b)
  return a;
 }
 // Emulated equality: both operands are converted to float before comparing.
 // The patch swaps the raw == for numext::equal_strict on the converted values.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
-  return float(a) == float(b);
+  return numext::equal_strict(float(a),float(b));
 }
 // Emulated inequality: both operands are converted to float before comparing.
 // The patch swaps the raw != for numext::not_equal_strict on the converted values.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
-  return float(a) != float(b);
+  return numext::not_equal_strict(float(a), float(b));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
  return float(a) < float(b);
@ -269,34 +401,36 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
 // these in hardware. If we need more performance on older/other CPUs, they are
 // also possible to vectorize directly.
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) {
  // Wraps a raw 16-bit pattern: the argument is stored unchanged in the .x
  // member of the returned value; no numeric conversion takes place.
-  __half h;
+  __half_raw h;
  h.x = x;
  return h;
 }
-union FP32 {
+union float32_bits {
  // The two members overlay the same storage: u exposes the value as an
  // unsigned int, f as a float.
  unsigned int u;
  float f;
 };
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-  return __float2half(ff);
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
  __half tmp_ff = __float2half(ff);
  return *(__half_raw*)&tmp_ff;
 #elif defined(EIGEN_HAS_FP16_C)
-  __half h;
+  __half_raw h;
  h.x = _cvtss_sh(ff, 0);
  return h;
 #else
-  FP32 f; f.f = ff;
+  float32_bits f; f.f = ff;
-  const FP32 f32infty = { 255 << 23 };
+  const float32_bits f32infty = { 255 << 23 };
-  const FP32 f16max = { (127 + 16) << 23 };
+  const float32_bits f16max = { (127 + 16) << 23 };
-  const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
+  const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
  unsigned int sign_mask = 0x80000000u;
-  __half o;
+  __half_raw o;
  o.x = static_cast<unsigned short>(0x0u);
  unsigned int sign = f.u & sign_mask;
@ -335,17 +469,18 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
 #endif
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
  return __half2float(h);
 #elif defined(EIGEN_HAS_FP16_C)
  return _cvtsh_ss(h.x);
 #else
-  const FP32 magic = { 113 << 23 };
+  const float32_bits magic = { 113 << 23 };
  const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
-  FP32 o;
+  float32_bits o;
  o.u = (h.x & 0x7fff) << 13;             // exponent/mantissa bits
  unsigned int exp = shifted_exp & o.u;   // just the exponent
@ -370,7 +505,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
  return (a.x & 0x7fff) == 0x7c00;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
  return __hisnan(a);
 #else
  return (a.x & 0x7fff) > 0x7c00;
@ -386,7 +522,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
  return result;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
  defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hexp(a));
 #else
   return half(::expf(float(a)));
@ -396,7 +533,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
  return half(numext::expm1(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
  return half(::hlog(a));
 #else
  return half(::logf(float(a)));
@ -409,7 +547,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
  return half(::log10f(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
  defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hsqrt(a));
 #else
    return half(::sqrtf(float(a)));
@ -431,14 +570,16 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
  return half(::tanhf(float(a)));
 }
 // floor for half. Calls hfloor when compiling with EIGEN_CUDACC_VER >= 80000 for
 // EIGEN_CUDA_ARCH >= 300, or when EIGEN_HIP_DEVICE_COMPILE is defined; otherwise
 // converts to float, calls ::floorf and converts the result back to half.
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
   defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hfloor(a));
 #else
  return half(::floorf(float(a)));
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
  defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hceil(a));
 #else
  return half(::ceilf(float(a)));
@ -446,7 +587,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
  return __hlt(b, a) ? b : a;
 #else
  const float f1 = static_cast<float>(a);
@ -455,7 +597,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
  return __hlt(a, b) ? b : a;
 #else
  const float f1 = static_cast<float>(a);
@ -496,6 +639,13 @@ template<> struct is_arithmetic<half> { enum { value = true }; };
 template<> struct NumTraits<Eigen::half>
    : GenericNumTraits<Eigen::half>
 {
  enum {
    IsSigned = true,
    IsInteger = false,
    IsComplex = false,
    RequireInitialization = false
  };
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
    return half_impl::raw_uint16_to_half(0x0800);
  }
@ -526,7 +676,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
  return Eigen::half(::expf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
  defined(EIGEN_HIP_DEVICE_COMPILE)
  return Eigen::half(::hlog(a));
 #else
  return Eigen::half(::logf(float(a)));
@ -560,14 +711,22 @@ struct hash<Eigen::half> {
 // Add the missing shfl_xor intrinsic
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
  defined(EIGEN_HIP_DEVICE_COMPILE)
 // __shfl_xor overload for Eigen::half. The value is cast to float, shuffled with
 // the float intrinsic, and the result is cast back to Eigen::half. width
 // defaults to warpSize.
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
  // EIGEN_CUDACC_VER < 90000 or HIP FP16: plain __shfl_xor.
  #if (EIGEN_CUDACC_VER < 90000) || \
    defined(EIGEN_HAS_HIP_FP16)
  return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
  #else
  // Otherwise: __shfl_xor_sync with the mask argument 0xFFFFFFFF.
  return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
  #endif
 }
 #endif
-// ldg() has an overload for __half, but we also need one for Eigen::half.
+// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \
  defined(EIGEN_HIP_DEVICE_COMPILE)
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
  return Eigen::half_impl::raw_uint16_to_half(
      __ldg(reinterpret_cast<const unsigned short*>(ptr)));
@ -575,7 +734,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
 #endif
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 namespace Eigen {
 namespace numext {
@ -601,4 +760,4 @@ bool (isfinite)(const Eigen::half& h) {
 }  // namespace numext
 #endif
-#endif // EIGEN_HALF_CUDA_H
+#endif // EIGEN_HALF_GPU_H
--- a/Eigen/src/Core/arch/CUDA/MathFunctions.h
+++ b/Eigen/src/Core/arch/CUDA/MathFunctions.h
@ -7,8 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
+#ifndef EIGEN_MATH_FUNCTIONS_GPU_H
-#define EIGEN_MATH_FUNCTIONS_CUDA_H
+#define EIGEN_MATH_FUNCTIONS_GPU_H
 namespace Eigen {
@ -17,7 +17,7 @@ namespace internal {
 // Make sure this is only available when targeting a GPU: we don't want to
 // introduce conflicts between these packet_traits definitions and the ones
 // we'll use on the host side (SSE, AVX, ...)
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 plog<float4>(const float4& a)
 {
@ -100,4 +100,4 @@ double2 prsqrt<double2>(const double2& a)
 } // end namespace Eigen
-#endif // EIGEN_MATH_FUNCTIONS_CUDA_H
+#endif // EIGEN_MATH_FUNCTIONS_GPU_H
--- a/Eigen/src/Core/arch/CUDA/PacketMath.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMath.h
@ -7,8 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_PACKET_MATH_CUDA_H
+#ifndef EIGEN_PACKET_MATH_GPU_H
-#define EIGEN_PACKET_MATH_CUDA_H
+#define EIGEN_PACKET_MATH_GPU_H
 namespace Eigen {
@ -17,7 +17,7 @@ namespace internal {
 // Make sure this is only available when targeting a GPU: we don't want to
 // introduce conflicts between these packet_traits definitions and the ones
 // we'll use on the host side (SSE, AVX, ...)
-#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
 // Mark the float4 and double2 packet types as arithmetic (value = true).
 template<> struct is_arithmetic<float4>  { enum { value = true }; };
 template<> struct is_arithmetic<double2> { enum { value = true }; };
@ -44,7 +44,11 @@ template<> struct packet_traits<float> : default_packet_traits
    HasPolygamma = 1,
    HasErf = 1,
    HasErfc = 1,
    HasI0e = 1,
    HasI1e = 1,
    HasIGamma = 1,
    HasIGammaDerA = 1,
    HasGammaSampleDerAlpha = 1,
    HasIGammac = 1,
    HasBetaInc = 1,
@ -73,7 +77,11 @@ template<> struct packet_traits<double> : default_packet_traits
    HasPolygamma = 1,
    HasErf = 1,
    HasErfc = 1,
    HasI0e = 1,
    HasI1e = 1,
    HasIGamma = 1,
    HasIGammaDerA = 1,
    HasGammaSampleDerAlpha = 1,
    HasIGammac = 1,
    HasBetaInc = 1,
@ -167,10 +175,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const d
  return make_double2(from[0], from[1]);
 }
-template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float*   from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float*   from) {
  // Reads two floats and duplicates each: {from[0], from[0], from[1], from[1]}.
  return make_float4(from[0], from[0], from[1], from[1]);
 }
-template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double*  from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double*  from) {
  // Reads one double and places it in both lanes: {from[0], from[0]}.
  return make_double2(from[0], from[0]);
 }
@ -196,7 +204,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
  return __ldg((const float4*)from);
 #else
  return make_float4(from[0], from[1], from[2], from[3]);
@ -204,7 +212,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
 }
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
  return __ldg((const double2*)from);
 #else
  return make_double2(from[0], from[1]);
@ -213,7 +221,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
  return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
 #else
  return make_float4(from[0], from[1], from[2], from[3]);
@ -221,7 +229,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const
 }
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
  return make_double2(__ldg(from+0), __ldg(from+1));
 #else
  return make_double2(from[0], from[1]);
@ -291,7 +299,7 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<float4,4>& kernel) {
-  double tmp = kernel.packet[0].y;
+  float tmp = kernel.packet[0].y;
  kernel.packet[0].y = kernel.packet[1].x;
  kernel.packet[1].x = tmp;
@ -330,4 +338,4 @@ ptranspose(PacketBlock<double2,2>& kernel) {
 } // end namespace Eigen
-#endif // EIGEN_PACKET_MATH_CUDA_H
+#endif // EIGEN_PACKET_MATH_GPU_H
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@ -7,15 +7,16 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
+#ifndef EIGEN_PACKET_MATH_HALF_GPU_H
-#define EIGEN_PACKET_MATH_HALF_CUDA_H
+#define EIGEN_PACKET_MATH_HALF_GPU_H
 namespace Eigen {
 namespace internal {
 // Most of the following operations require arch >= 3.0
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE))
 // Mark the half2 packet type as arithmetic (value = true).
 template<> struct is_arithmetic<half2> { enum { value = true }; };
@ -42,70 +43,108 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
 // Traits of the half2 packet: scalar type Eigen::half, size 2, alignment
 // Aligned16; the nested `half` typedef names half2 itself.
 template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
-template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
  // Builds a half2 from a single half. The old HIP FP16 headers use
  // half2half2; the newer HIP headers and the CUDA branch use __half2half2.
 #if defined(EIGEN_HIP_DEVICE_COMPILE)
 #if defined(EIGEN_HAS_OLD_HIP_FP16)
  return half2half2(from);
 #else  
  return __half2half2(from);
 #endif
 #else // EIGEN_CUDA_ARCH
  return __half2half2(from);
 #endif
 }
-template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
  // Reinterprets the pointer as const half2* and reads one packet through it.
  return *reinterpret_cast<const half2*>(from);
 }
-template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
  // Reads from[0] and from[1] individually and combines them with __halves2half2.
  return __halves2half2(from[0], from[1]);
 }
-template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half*  from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half*  from) {
  // Reads one half and passes it as both arguments of __halves2half2.
  return __halves2half2(from[0], from[0]);
 }
-template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
  // Reinterprets the destination as half2* and writes the whole packet at once.
  *reinterpret_cast<half2*>(to) = from;
 }
-template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
  // Writes the two halves separately: __low2half(from) to to[0] and
  // __high2half(from) to to[1].
  to[0] = __low2half(from);
  to[1] = __high2half(from);
 }
 // Read-only aligned load of a half2 packet.
 //  - HIP device compile, old FP16 headers: combines *(from+0) and *(from+1)
 //    with __halves2half2.
 //  - HIP device compile, newer headers: __ldg on the pointer cast to const half2*.
 //  - CUDA branch: __ldg when EIGEN_CUDA_ARCH >= 350, otherwise __halves2half2
 //    of the two elements.
 template<>
- __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
-#if __CUDA_ARCH__ >= 350
+
 #if defined(EIGEN_HIP_DEVICE_COMPILE)
 #if defined(EIGEN_HAS_OLD_HIP_FP16)
  return __halves2half2((*(from+0)), (*(from+1)));
 #else
  return __ldg((const half2*)from);
 #endif
 #else  // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 350
   return __ldg((const half2*)from);
 #else
  return __halves2half2(*(from+0), *(from+1));
 #endif
 #endif
 }
 // Read-only unaligned load of a half2 packet; the two elements are always
 // fetched one at a time and combined with __halves2half2.
 //  - HIP device compile, old FP16 headers: plain dereferences of from+0 and from+1.
 //  - HIP device compile, newer headers: __ldg on each element.
 //  - CUDA branch: __ldg on each element when EIGEN_CUDA_ARCH >= 350, otherwise
 //    plain dereferences.
 template<>
-__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
-#if __CUDA_ARCH__ >= 350
+
 #if defined(EIGEN_HIP_DEVICE_COMPILE)
 #if defined(EIGEN_HAS_OLD_HIP_FP16)
  return __halves2half2((*(from+0)), (*(from+1)));
 #else
  return __halves2half2(__ldg(from+0), __ldg(from+1));
 #endif
 #else  // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 350
   return __halves2half2(__ldg(from+0), __ldg(from+1));
 #else
  return __halves2half2(*(from+0), *(from+1));
 #endif
 #endif
 }
-template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
  // Strided gather: combines from[0] and from[stride] into one packet.
  return __halves2half2(from[0*stride], from[1*stride]);
 }
-template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
  // Strided scatter: __low2half(from) goes to to[0], __high2half(from) to to[stride].
  to[stride*0] = __low2half(from);
  to[stride*1] = __high2half(from);
 }
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
  // Returns the half extracted by __low2half.
  return __low2half(a);
 }
-template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
  // Absolute value by bit masking: the packet is read as one unsigned word and
  // ANDed with 0x7FFF7FFF, which clears bit 15 of each 16-bit half of the word.
  // The masked word is written into `result` through the same kind of cast.
  half2 result;
-  result.x = a.x & 0x7FFF7FFF;
+  unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
  *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
  return result;
 }
-__device__ EIGEN_STRONG_INLINE void
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<half2,2>& kernel) {
  __half a1 = __low2half(kernel.packet[0]);
  __half a2 = __high2half(kernel.packet[0]);
@ -115,17 +154,31 @@ ptranspose(PacketBlock<half2,2>& kernel) {
  kernel.packet[1] = __halves2half2(a2, b2);
 }
-template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
  // Builds the packet {a, a + 1}. On HIP device compiles and on
  // EIGEN_CUDA_ARCH >= 530 the increment uses __hadd on halves; on older CUDA
  // architectures it is computed in float and converted back with __float2half.
-#if __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
 #else  // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
 #else
  float f = __half2float(a) + 1.0f;
  return __halves2half2(a, __float2half(f));
 #endif
 #endif
 }
-template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
-#if __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
  return __hadd2(a, b);
 #else  // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
  return __hadd2(a, b);
 #else
  float a1 = __low2float(a);
@ -136,10 +189,18 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, cons
  float r2 = a2 + b2;
  return __floats2half2_rn(r1, r2);
 #endif
 #endif
 }
-template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
-#if __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
  return __hsub2(a, b);
 #else  // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
  return __hsub2(a, b);
 #else
  float a1 = __low2float(a);
@ -150,22 +211,38 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, cons
  float r2 = a2 - b2;
  return __floats2half2_rn(r1, r2);
 #endif
 #endif
 }
-template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
  // Negates both halves. On HIP device compiles and on EIGEN_CUDA_ARCH >= 530
  // this is a single __hneg2; on older CUDA architectures each half is widened
  // to float, negated, and the pair is repacked with __floats2half2_rn.
-#if __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
  return __hneg2(a);
 #else  // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
  return __hneg2(a);
 #else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  return __floats2half2_rn(-a1, -a2);
 #endif
 #endif
 }
-template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }  // conjugation returns the packet unchanged
-template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
-#if __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
  return __hmul2(a, b);
 #else  // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
  return __hmul2(a, b);
 #else
  float a1 = __low2float(a);
@ -176,10 +253,18 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, cons
  float r2 = a2 * b2;
  return __floats2half2_rn(r1, r2);
 #endif
 #endif
 }
-template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
-#if __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
   return __hfma2(a, b, c);
 #else  // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
   return __hfma2(a, b, c);
 #else
  float a1 = __low2float(a);
@ -192,9 +277,21 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, con
  float r2 = a2 * b2 + c2;
  return __floats2half2_rn(r1, r2);
 #endif
 #endif
 }
-template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
 #if defined(EIGEN_HIP_DEVICE_COMPILE)
 #if defined(EIGEN_HAS_OLD_HIP_FP16)
  return h2div(a, b);
 #else
  return __h2div(a, b);
 #endif
 #else // EIGEN_CUDA_ARCH
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
@ -202,9 +299,11 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, cons
  float r1 = a1 / b1;
  float r2 = a2 / b2;
  return __floats2half2_rn(r1, r2);
 #endif
 }
-template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
@ -214,7 +313,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, cons
  return __halves2half2(r1, r2);
 }
-template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float b1 = __low2float(b);
@ -224,18 +323,34 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, cons
  return __halves2half2(r1, r2);
 }
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
-#if __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
  return __hadd(__low2half(a), __high2half(a));
 #else  // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
  return __hadd(__low2half(a), __high2half(a));
 #else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
-  return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2)));
+  return Eigen::half(__float2half(a1 + a2));
 #endif
 #endif
 }
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
-#if __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
  __half first = __low2half(a);
  __half second = __high2half(a);
  return __hgt(first, second) ? first : second;
 #else  // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
  __half first = __low2half(a);
  __half second = __high2half(a);
  return __hgt(first, second) ? first : second;
@ -244,10 +359,20 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const ha
  float a2 = __high2float(a);
  return a1 > a2 ? __low2half(a) : __high2half(a);
 #endif
 #endif
 }
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
-#if __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
  __half first = __low2half(a);
  __half second = __high2half(a);
  return __hlt(first, second) ? first : second;
 #else  // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
  __half first = __low2half(a);
  __half second = __high2half(a);
  return __hlt(first, second) ? first : second;
@ -256,19 +381,29 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const ha
  float a2 = __high2float(a);
  return a1 < a2 ? __low2half(a) : __high2half(a);
 #endif
 #endif
 }
-template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
-#if __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
  return __hmul(__low2half(a), __high2half(a));
 #else  // EIGEN_CUDA_ARCH
 #if EIGEN_CUDA_ARCH >= 530
  return __hmul(__low2half(a), __high2half(a));
 #else
  float a1 = __low2float(a);
  float a2 = __high2float(a);
-  return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2)));
+  return Eigen::half(__float2half(a1 * a2));
 #endif
 #endif
 }
-template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = log1pf(a1);
@ -276,7 +411,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
  return __floats2half2_rn(r1, r2);
 }
-template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = expm1f(a1);
@ -284,31 +419,32 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
  return __floats2half2_rn(r1, r2);
 }
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
  defined(EIGEN_HIP_DEVICE_COMPILE)
-template<>  __device__ EIGEN_STRONG_INLINE
+template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 half2 plog<half2>(const half2& a) {
  return h2log(a);
 }
-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 half2 pexp<half2>(const half2& a) {
  return h2exp(a);
 }
-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 half2 psqrt<half2>(const half2& a) {
  return h2sqrt(a);
 }
-template<> __device__ EIGEN_STRONG_INLINE
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 half2 prsqrt<half2>(const half2& a) {
  return h2rsqrt(a);
 }
 #else
-template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = logf(a1);
@ -316,7 +452,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
  return __floats2half2_rn(r1, r2);
 }
-template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = expf(a1);
@ -324,7 +460,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
  return __floats2half2_rn(r1, r2);
 }
-template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = sqrtf(a1);
@ -332,7 +468,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
  return __floats2half2_rn(r1, r2);
 }
-template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
  float a1 = __low2float(a);
  float a2 = __high2float(a);
  float r1 = rsqrtf(a1);
@ -361,10 +497,10 @@ struct packet_traits<half> : default_packet_traits {
    AlignedOnScalar = 1,
    size = 16,
    HasHalfPacket = 0,
-    HasAdd    = 0,
+    HasAdd    = 1,
-    HasSub    = 0,
+    HasSub    = 1,
-    HasMul    = 0,
+    HasMul    = 1,
-    HasNegate = 0,
+    HasNegate = 1,
    HasAbs    = 0,
    HasAbs2   = 0,
    HasMin    = 0,
@ -406,11 +542,30 @@ template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* fr
 }
 template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
-  _mm256_store_si256((__m256i*)to, from.x);
+  // (void*) -> workaround clang warning:
  // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
  _mm256_store_si256((__m256i*)(void*)to, from.x);
 }
 template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
-  _mm256_storeu_si256((__m256i*)to, from.x);
+  // (void*) -> workaround clang warning:
  // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
  _mm256_storeu_si256((__m256i*)(void*)to, from.x);
 }
 // Loads the first eight halves at `from` and returns a packet in which each of
 // them appears twice in a row: { a, a, b, b, ..., h, h }.
 template<> EIGEN_STRONG_INLINE Packet16h
 ploaddup<Packet16h>(const Eigen::half*  from) {
  Packet16h result;
  // Work on the raw 16-bit storage of each half.
  unsigned short a = from[0].x;
  unsigned short b = from[1].x;
  unsigned short c = from[2].x;
  unsigned short d = from[3].x;
  unsigned short e = from[4].x;
  unsigned short f = from[5].x;
  unsigned short g = from[6].x;
  unsigned short h = from[7].x;
  // _mm256_set_epi16 lists its arguments from the highest lane down to the
  // lowest, hence the reversed order: lane 0 and lane 1 both receive `a`.
  result.x = _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
  return result;
 }
 template<> EIGEN_STRONG_INLINE Packet16h
@ -485,6 +640,13 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
 #endif
 }
 template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
  // Negation is carried out in single precision: widen, negate, narrow back.
  // FIXME: this could be done with bit manipulation instead.
  const Packet16f widened = half2float(a);
  return float2half(pnegate(widened));
 }
 template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
  Packet16f af = half2float(a);
  Packet16f bf = half2float(b);
@ -492,6 +654,13 @@ template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, con
  return float2half(rf);
 }
 template<> EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
  // Subtract in single precision and convert the difference back to half.
  const Packet16f lhs = half2float(a);
  const Packet16f rhs = half2float(b);
  return float2half(psub(lhs, rhs));
 }
 template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
  Packet16f af = half2float(a);
  Packet16f bf = half2float(b);
@ -504,6 +673,57 @@ template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
  return half(predux(from_float));
 }
 template<> EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
  // The product of all lanes is formed on the float version of the packet and
  // the scalar result is then converted to half.
  const Packet16f widened = half2float(from);
  return half(predux_mul(widened));
 }
 template<> EIGEN_STRONG_INLINE Packet16h preduxp<Packet16h>(const Packet16h* p) {
  // Widen each of the 16 input packets to float, delegate the packet-wise
  // reduction to the Packet16f implementation, then narrow the result.
  Packet16f pf[16];
  for (int i = 0; i < 16; ++i) {
    pf[i] = half2float(p[i]);
  }
  return float2half(preduxp<Packet16f>(pf));
 }
 // Returns `a` with its sixteen 16-bit elements in reverse order.
 template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a)
 {
  // Byte shuffle mask that reverses the eight 16-bit elements of one 128-bit
  // half while keeping the two bytes of each element in their original order.
  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
  Packet16h res;
  // The reversed upper half of `a` becomes the lower half of the result and
  // the reversed lower half is inserted as the upper half.
  res.x = _mm256_insertf128_si256(
                    _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)),
                                           _mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), 1);
  return res;
 }
 template<> EIGEN_STRONG_INLINE Packet16h pinsertfirst(const Packet16h& a, Eigen::half b)
 {
  // Copy of `a` whose element 0 is overwritten with the raw bits of `b`.
  Packet16h updated;
  updated.x = _mm256_insert_epi16(a.x,b.x,0);
  return updated;
 }
 template<> EIGEN_STRONG_INLINE Packet16h pinsertlast(const Packet16h& a, Eigen::half b)
 {
  // Copy of `a` whose element 15 (the last one) is overwritten with the raw
  // bits of `b`.
  Packet16h updated;
  updated.x = _mm256_insert_epi16(a.x,b.x,15);
  return updated;
 }
 template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
 {
  Packet16h result;
@ -611,20 +831,20 @@ ptranspose(PacketBlock<Packet16h,16>& kernel) {
  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
  __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
-  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
-  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
-  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
-  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
-  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
-  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
-  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
-  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
-  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
-  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
-  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
-  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
-  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
+  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
-  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
  __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
  kernel.packet[0].x = a_p_0;
@ -729,10 +949,10 @@ struct packet_traits<Eigen::half> : default_packet_traits {
    AlignedOnScalar = 1,
    size = 8,
    HasHalfPacket = 0,
-    HasAdd    = 0,
+    HasAdd    = 1,
-    HasSub    = 0,
+    HasSub    = 1,
-    HasMul    = 0,
+    HasMul    = 1,
-    HasNegate = 0,
+    HasNegate = 1,
    HasAbs    = 0,
    HasAbs2   = 0,
    HasMin    = 0,
@ -781,6 +1001,17 @@ template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const
  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x);
 }
 // Loads the first four halves at `from` and returns a packet in which each of
 // them appears twice in a row: { a, a, b, b, c, c, d, d }.
 template<> EIGEN_STRONG_INLINE Packet8h
 ploaddup<Packet8h>(const Eigen::half*  from) {
  Packet8h result;
  // Work on the raw 16-bit storage of each half.
  unsigned short a = from[0].x;
  unsigned short b = from[1].x;
  unsigned short c = from[2].x;
  unsigned short d = from[3].x;
  // _mm_set_epi16 lists its arguments from the highest lane down to the
  // lowest, hence the reversed order.
  result.x = _mm_set_epi16(d, d, c, c, b, b, a, a);
  return result;
 }
 template<> EIGEN_STRONG_INLINE Packet8h
 ploadquad<Packet8h>(const Eigen::half* from) {
  Packet8h result;
@ -834,6 +1065,13 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
 // Conjugation returns the packet unchanged for this real-valued packet type.
 template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
  // Negation is carried out in single precision: widen, negate, narrow back.
  // FIXME: this could be done with bit manipulation instead.
  const Packet8f widened = half2float(a);
  return float2half(pnegate(widened));
 }
 template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
  Packet8f af = half2float(a);
  Packet8f bf = half2float(b);
@ -841,6 +1079,13 @@ template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const
  return float2half(rf);
 }
 template<> EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
  // Subtract in single precision and convert the difference back to half.
  const Packet8f lhs = half2float(a);
  const Packet8f rhs = half2float(b);
  return float2half(psub(lhs, rhs));
 }
 template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
  Packet8f af = half2float(a);
  Packet8f bf = half2float(b);
@ -893,6 +1138,52 @@ template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h&
  return Eigen::half(reduced);
 }
 template<> EIGEN_STRONG_INLINE Packet8h preduxp<Packet8h>(const Packet8h* p) {
  // Widen each of the 8 input packets to float, delegate the packet-wise
  // reduction to the Packet8f implementation, then narrow the result.
  Packet8f pf[8];
  for (int i = 0; i < 8; ++i) {
    pf[i] = half2float(p[i]);
  }
  return float2half(preduxp<Packet8f>(pf));
 }
 // Returns `a` with its eight 16-bit elements in reverse order.
 template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a)
 {
  // Byte shuffle mask: element 7 (bytes 14,15) goes first, element 0 (bytes
  // 0,1) goes last; the two bytes of each element keep their relative order.
  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
  Packet8h res;
  res.x = _mm_shuffle_epi8(a.x,m);
  return res;
 }
 template<> EIGEN_STRONG_INLINE Packet8h pinsertfirst(const Packet8h& a, Eigen::half b)
 {
  // Copy of `a` whose element 0 is overwritten with the raw bits of `b`.
  Packet8h updated;
  updated.x = _mm_insert_epi16(a.x,int(b.x),0);
  return updated;
 }
 template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::half b)
 {
  // Copy of `a` whose element 7 (the last one) is overwritten with the raw
  // bits of `b`.
  Packet8h updated;
  updated.x = _mm_insert_epi16(a.x,int(b.x),7);
  return updated;
 }
 // Alignment helper for Packet8h: for a non-zero Offset, `first` is replaced by
 // the 16-byte window taken from the concatenation of `first` and `second`
 // after shifting by Offset elements.
 template<int Offset>
 struct palign_impl<Offset,Packet8h>
 {
  static EIGEN_STRONG_INLINE void run(Packet8h& first, const Packet8h& second)
  {
    // Offset is expressed in half elements; _mm_alignr_epi8 expects a byte
    // count, hence the factor 2. Offset == 0 leaves `first` untouched.
    if (Offset!=0)
      first.x = _mm_alignr_epi8(second.x,first.x, Offset*2);
  }
 };
 EIGEN_STRONG_INLINE void
 ptranspose(PacketBlock<Packet8h,8>& kernel) {
  __m128i a = kernel.packet[0].x;
@ -1129,4 +1420,4 @@ ptranspose(PacketBlock<Packet4h,4>& kernel) {
 }
 }
-#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
+#endif // EIGEN_PACKET_MATH_HALF_GPU_H
--- a/Eigen/src/Core/arch/CUDA/TypeCasting.h
+++ b/Eigen/src/Core/arch/CUDA/TypeCasting.h
@ -7,8 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_TYPE_CASTING_CUDA_H
+#ifndef EIGEN_TYPE_CASTING_GPU_H
-#define EIGEN_TYPE_CASTING_CUDA_H
+#define EIGEN_TYPE_CASTING_GPU_H
 namespace Eigen {
@ -19,7 +19,8 @@ struct scalar_cast_op<float, Eigen::half> {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
  typedef Eigen::half result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
-    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+    #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
      (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
      return __float2half(a);
    #else
      return Eigen::half(a);
@ -37,7 +38,8 @@ struct scalar_cast_op<int, Eigen::half> {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
  typedef Eigen::half result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
-    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+    #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
      (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
      return __float2half(static_cast<float>(a));
    #else
      return Eigen::half(static_cast<float>(a));
@ -55,7 +57,8 @@ struct scalar_cast_op<Eigen::half, float> {
  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
  typedef float result_type;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
-    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+    #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
      (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
      return __half2float(a);
    #else
      return static_cast<float>(a);
@ -69,7 +72,8 @@ struct functor_traits<scalar_cast_op<Eigen::half, float> >
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
 template <>
 struct type_casting_traits<Eigen::half, float> {
@ -209,4 +213,4 @@ template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f
 } // end namespace Eigen
-#endif // EIGEN_TYPE_CASTING_CUDA_H
+#endif // EIGEN_TYPE_CASTING_GPU_H
--- a/Eigen/src/Core/arch/HIP/hcc/math_constants.h
+++ b/Eigen/src/Core/arch/HIP/hcc/math_constants.h
@ -0,0 +1,23 @@
 /*
 * math_constants.h - 
 *  HIP equivalent of the CUDA header of the same name
 */
 #ifndef __MATH_CONSTANTS_H__
 #define __MATH_CONSTANTS_H__
 /* single precision constants */
 /* Each value is written as an IEEE-754 binary32 bit pattern handed to
    __int_as_float (presumably a bit-level reinterpretation - confirm against
    the HIP device headers). */
 #define HIPRT_INF_F        __int_as_float(0x7f800000)  /* +infinity */
 #define HIPRT_NAN_F        __int_as_float(0x7fffffff)  /* NaN, all mantissa bits set */
 #define HIPRT_MIN_DENORM_F __int_as_float(0x00000001)  /* smallest positive subnormal */
 #define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff)  /* largest finite value */
 #define HIPRT_NEG_ZERO_F   __int_as_float(0x80000000)  /* -0.0f */
 #define HIPRT_ZERO_F       0.0f
 #define HIPRT_ONE_F        1.0f
 /* double precision constants */
 /* The two arguments look like the high and low 32-bit words of the binary64
    bit pattern (inferred from the helper's name - confirm). */
 #define HIPRT_INF          __hiloint2double(0x7ff00000, 0x00000000)
 #define HIPRT_NAN          __hiloint2double(0xfff80000, 0x00000000)
 #endif
--- a/Eigen/src/Core/arch/MSA/Complex.h
+++ b/Eigen/src/Core/arch/MSA/Complex.h
@ -0,0 +1,759 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2018 Wave Computing, Inc.
 // Written by:
 //   Chris Larsen
 //   Alexey Frunze (afrunze@wavecomp.com)
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #ifndef EIGEN_COMPLEX_MSA_H
 #define EIGEN_COMPLEX_MSA_H
 #include <iostream>
 namespace Eigen {
 namespace internal {
 //---------- float ----------
 // Packet of two std::complex<float> values stored in a single Packet4f as
 // { re(a), im(a), re(b), im(b) } (see the two-argument constructor).
 struct Packet2cf {
  // Leaves `v` uninitialised.
  EIGEN_STRONG_INLINE Packet2cf() {
  }
  // Builds the packet { a, b } from two scalar complex values.
  EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex<float>& a,
                                         const std::complex<float>& b) {
    Packet4f t = { std::real(a), std::imag(a), std::real(b), std::imag(b) };
    v = t;
  }
  // Wraps an existing float vector; its lanes are taken as-is.
  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {
  }
  EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) {
  }
  EIGEN_STRONG_INLINE Packet2cf& operator=(const Packet2cf& b) {
    v = b.v;
    return *this;
  }
  // Complex conjugate of both elements. Bit 63 of each 64-bit lane is flipped;
  // with the (re, im) layout above that is presumably the sign bit of the
  // imaginary part (assumes little-endian element order - TODO confirm).
  EIGEN_STRONG_INLINE Packet2cf conjugate(void) const {
    return Packet2cf((Packet4f)__builtin_msa_bnegi_d((v2u64)v, 63));
  }
  // Element-wise complex product: (a_re*b_re - a_im*b_im, a_re*b_im + a_im*b_re).
  EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
    Packet4f v1, v2;
    // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
    v1 = (Packet4f)__builtin_msa_ilvev_w((v4i32)v, (v4i32)v);
    // Get the imag values of a | a1_im | a1_im | a2_im | a2_im |
    v2 = (Packet4f)__builtin_msa_ilvod_w((v4i32)v, (v4i32)v);
    // Multiply the real a with b
    v1 = pmul(v1, b.v);
    // Multiply the imag a with b
    v2 = pmul(v2, b.v);
    // Conjugate v2
    v2 = Packet2cf(v2).conjugate().v;
    // Swap real/imag elements in v2.
    v2 = (Packet4f)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(1, 0, 3, 2));
    // Add and return the result
    v = padd(v1, v2);
    return *this;
  }
  EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
    return Packet2cf(*this) *= b;
  }
  // Addition and subtraction act lane-wise on the underlying float vector.
  EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
    v = padd(v, b.v);
    return *this;
  }
  EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
    return Packet2cf(*this) += b;
  }
  EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
    v = psub(v, b.v);
    return *this;
  }
  EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
    return Packet2cf(*this) -= b;
  }
  // Element-wise complex division, computed as a * conj(b) / |b|^2.
  EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) {
    *this *= b.conjugate();
    // s holds the squared components of b; adding its re/im-swapped copy puts
    // re^2 + im^2 into both lanes of each complex element.
    Packet4f s = pmul<Packet4f>(b.v, b.v);
    s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
    v = pdiv(v, s);
    return *this;
  }
  EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const {
    return Packet2cf(*this) /= b;
  }
  // Unary minus: negates every float lane.
  EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
    return Packet2cf(pnegate(v));
  }
  // Underlying storage: { re0, im0, re1, im1 }.
  Packet4f v;
 };
 // Streams the packet as "[ (re0, im0i),  (re1, im1i) ]".
 inline std::ostream& operator<<(std::ostream& os, const Packet2cf& value) {
  // First complex element, followed by the separator.
  os << "[ (" << value.v[0] << ", " << value.v[1]
     << "i),"
        "  (";
  // Second complex element and the closing bracket.
  os << value.v[2] << ", " << value.v[3] << "i) ]";
  return os;
 }
 // Vectorisation traits for std::complex<float>: the packet type is Packet2cf
 // (two complex values per packet).
 template <>
 struct packet_traits<std::complex<float> > : default_packet_traits {
  typedef Packet2cf type;
  // No smaller packet is declared (HasHalfPacket = 0), so `half` names the
  // full packet type.
  typedef Packet2cf half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,
    HasHalfPacket = 0,
    // Arithmetic flagged as available in packet form.
    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasDiv = 1,
    HasNegate = 1,
    // Operations flagged as unavailable for this packet type.
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 0,
    HasMax = 0,
    HasSetLinear = 0,
    HasBlend = 1
  };
 };
 // Reverse mapping from the packet type to its scalar type, element count and
 // required alignment.
 template <>
 struct unpacket_traits<Packet2cf> {
  typedef std::complex<float> type;
  enum { size = 2, alignment = Aligned16 };
  typedef Packet2cf half;
 };
 // Returns a packet whose two complex elements both equal `from`.
 template <>
 EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
  EIGEN_MSA_DEBUG;
  float f0 = from.real(), f1 = from.imag();
  // v0 broadcasts the real part, v1 the imaginary part.
  Packet4f v0 = { f0, f0, f0, f0 };
  Packet4f v1 = { f1, f1, f1, f1 };
  // Interleaving the two broadcasts is expected to give { re, im, re, im }
  // (per the MSA ILVR.W element ordering - confirm against the ISA manual).
  return Packet2cf((Packet4f)__builtin_msa_ilvr_w((Packet4i)v1, (Packet4i)v0));
 }
 // Lane-wise complex addition.
 template <>
 EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
  EIGEN_MSA_DEBUG;
  Packet2cf sum(a);
  sum += b;
  return sum;
 }
 // Lane-wise complex subtraction.
 template <>
 EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
  EIGEN_MSA_DEBUG;
  Packet2cf difference(a);
  difference -= b;
  return difference;
 }
 // Negates both complex elements by negating every float lane.
 template <>
 EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
  EIGEN_MSA_DEBUG;
  return Packet2cf(pnegate(a.v));
 }
 // Complex conjugate of both elements; delegates to Packet2cf::conjugate().
 template <>
 EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
  EIGEN_MSA_DEBUG;
  const Packet2cf conjugated = a.conjugate();
  return conjugated;
 }
 // Element-wise complex multiplication.
 template <>
 EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
  EIGEN_MSA_DEBUG;
  Packet2cf product(a);
  product *= b;
  return product;
 }
 // Bitwise AND, forwarded to the Packet4f implementation on the raw storage.
 template <>
 EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
  EIGEN_MSA_DEBUG;
  const Packet4f bits = pand(a.v, b.v);
  return Packet2cf(bits);
 }
 // Bitwise OR, forwarded to the Packet4f implementation on the raw storage.
 template <>
 EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
  EIGEN_MSA_DEBUG;
  const Packet4f bits = por(a.v, b.v);
  return Packet2cf(bits);
 }
 // Bitwise XOR, forwarded to the Packet4f implementation on the raw storage.
 template <>
 EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
  EIGEN_MSA_DEBUG;
  const Packet4f bits = pxor(a.v, b.v);
  return Packet2cf(bits);
 }
 // Bitwise and-not, forwarded to the Packet4f implementation on the raw storage.
 template <>
 EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
  EIGEN_MSA_DEBUG;
  const Packet4f bits = pandnot(a.v, b.v);
  return Packet2cf(bits);
 }
 // Aligned load of two complex values, performed as a load of four floats.
 template <>
 EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
  EIGEN_MSA_DEBUG;
  const float* raw = (const float*)from;
  EIGEN_DEBUG_ALIGNED_LOAD
  return Packet2cf(pload<Packet4f>(raw));
 }
 // Unaligned load of two complex values, performed as a load of four floats.
 template <>
 EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
  EIGEN_MSA_DEBUG;
  const float* raw = (const float*)from;
  EIGEN_DEBUG_UNALIGNED_LOAD
  return Packet2cf(ploadu<Packet4f>(raw));
 }
 // Reads one complex value and replicates it into both packet elements.
 template <>
 EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
  EIGEN_MSA_DEBUG;
  const std::complex<float>& value = *from;
  return pset1<Packet2cf>(value);
 }
 // Aligned store of the packet, performed as a store of four floats.
 template <>
 EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to,
                                                       const Packet2cf& from) {
  EIGEN_MSA_DEBUG;
  float* raw = (float*)to;
  EIGEN_DEBUG_ALIGNED_STORE
  pstore<float>(raw, from.v);
 }
 // Unaligned store of the packet, performed as a store of four floats.
 template <>
 EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to,
                                                        const Packet2cf& from) {
  EIGEN_MSA_DEBUG;
  float* raw = (float*)to;
  EIGEN_DEBUG_UNALIGNED_STORE
  pstoreu<float>(raw, from.v);
 }
 // Gathers the complex values at from[0] and from[stride] into one packet.
 template <>
 EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(
     const std::complex<float>* from, Index stride) {
  EIGEN_MSA_DEBUG;
  const std::complex<float>& lower = from[0 * stride];
  const std::complex<float>& upper = from[1 * stride];
  return Packet2cf(lower, upper);
 }
 // Writes the packet's two complex elements to to[0] and to[stride].
 template <>
 EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to,
                                                                        const Packet2cf& from,
                                                                        Index stride) {
  EIGEN_MSA_DEBUG;
  to[0] = std::complex<float>(from.v[0], from.v[1]);
  to[stride] = std::complex<float>(from.v[2], from.v[3]);
 }
 // Prefetch for complex data, forwarded to the float overload on the same address.
 template <>
 EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
  EIGEN_MSA_DEBUG;
  const float* raw = reinterpret_cast<const float*>(addr);
  prefetch(raw);
 }
 // Extracts the first complex element (lanes 0 and 1) of the packet.
 template <>
 EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
  EIGEN_MSA_DEBUG;
  const float re = a.v[0];
  const float im = a.v[1];
  return std::complex<float>(re, im);
 }
 // Swaps the two complex elements of the packet.
 template <>
 EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
  EIGEN_MSA_DEBUG;
  // Word shuffle with selector (2, 3, 0, 1): the upper float pair moves to the
  // front, and each (re, im) pair stays in order.
  return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
 }
 // Swaps the real and imaginary parts inside each complex element.
 template <>
 EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
  EIGEN_MSA_DEBUG;
  // Word shuffle with selector (1, 0, 3, 2): neighbouring floats trade places.
  return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
 }
 // Horizontal sum: returns the sum of the packet's two complex elements.
 template <>
 EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
  EIGEN_MSA_DEBUG;
  // Reinterpreting as Packet2d makes each complex one 64-bit lane, so the
  // reverse swaps the two complex elements as whole units.
  Packet4f value = (Packet4f)preverse((Packet2d)a.v);
  // After the add, lanes 0 and 1 hold re0 + re1 and im0 + im1.
  value += a.v;
  return std::complex<float>(value[0], value[1]);
 }
 // Packet-wise reduction over two packets: element i of the result is the sum
 // of the two complex elements of vecs[i].
 template <>
 EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs) {
  EIGEN_MSA_DEBUG;
  Packet4f sum1, sum2, sum;
  // sum1 collects one 64-bit half (one complex element) from each of vecs[0]
  // and vecs[1]; sum2 collects the other half from each. Presumably sum1 holds
  // the first elements and sum2 the second ones (per ILVR.D / ILVL.D - confirm).
  sum1 = (Packet4f)__builtin_msa_ilvr_d((v2i64)vecs[1].v, (v2i64)vecs[0].v);
  sum2 = (Packet4f)__builtin_msa_ilvl_d((v2i64)vecs[1].v, (v2i64)vecs[0].v);
  sum = padd(sum1, sum2);
  return Packet2cf(sum);
 }
 // Horizontal product: returns the complex product of the packet's two elements.
 template <>
 EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
  EIGEN_MSA_DEBUG;
  // (x_re + i*x_im) * (y_re + i*y_im), expanded by hand.
  const float real_part = (a.v[0] * a.v[2]) - (a.v[1] * a.v[3]);
  const float imag_part = (a.v[0] * a.v[3]) + (a.v[1] * a.v[2]);
  return std::complex<float>(real_part, imag_part);
 }
 // Alignment helper for Packet2cf. Only Offset == 1 changes `first`; every
 // other offset leaves it untouched.
 template <int Offset>
 struct palign_impl<Offset, Packet2cf> {
  EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) {
    if (Offset == 1) {
      // Byte slide by Offset * 8: one std::complex<float> is two 4-byte floats.
      // Expected result is { first[1], second[0] } - confirm against the MSA
      // SLDI.B definition.
      first.v = (Packet4f)__builtin_msa_sldi_b((v16i8)second.v, (v16i8)first.v, Offset * 8);
    }
  }
 };
 // Product helper that conjugates only the right-hand operand: a * conj(b).
 template <>
 struct conj_helper<Packet2cf, Packet2cf, false, true> {
  // Returns pmul(x, y) + c, where pmul is the member below.
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
                                      const Packet2cf& c) const {
    return padd(pmul(x, y), c);
  }
  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
    return internal::pmul(a, pconj(b));
  }
 };
 // Product helper that conjugates only the left-hand operand: conj(a) * b.
 template <>
 struct conj_helper<Packet2cf, Packet2cf, true, false> {
  // Returns pmul(x, y) + c, where pmul is the member below.
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
                                      const Packet2cf& c) const {
    return padd(pmul(x, y), c);
  }
  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
    return internal::pmul(pconj(a), b);
  }
 };
 // Product helper that conjugates both operands; implemented as conj(a * b).
 template <>
 struct conj_helper<Packet2cf, Packet2cf, true, true> {
  // Returns pmul(x, y) + c, where pmul is the member below.
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
                                      const Packet2cf& c) const {
    return padd(pmul(x, y), c);
  }
  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
    return pconj(internal::pmul(a, b));
  }
 };
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
 // Element-wise complex division.
 template <>
 EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
  EIGEN_MSA_DEBUG;
  Packet2cf quotient(a);
  quotient /= b;
  return quotient;
 }
 // Streams both packets of the block, one per line, wrapped in brackets.
 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2cf, 2>& value) {
  os << "[ " << value.packet[0] << ", " << std::endl;
  os << "  " << value.packet[1] << " ]";
  return os;
 }
 // In-place transpose of a 2x2 block of complex values held in two packets.
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
  EIGEN_MSA_DEBUG;
  // tmp is computed first because packet[0] is overwritten below while
  // packet[1] is still needed as an input.
  Packet4f tmp =
      (Packet4f)__builtin_msa_ilvl_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
  // Each interleave combines one 64-bit half (one complex value) of packet[0]
  // with the matching half of packet[1].
  kernel.packet[0].v =
      (Packet4f)__builtin_msa_ilvr_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
  kernel.packet[1].v = tmp;
 }
 // Per-element select between thenPacket and elsePacket, driven by ifPacket.
 template <>
 EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
                                     const Packet2cf& elsePacket) {
  // Both inputs are reinterpreted as Packet2d so that each complex value is one
  // 64-bit lane and the two-entry selector is applied by the Packet2d blend.
  return (Packet2cf)(Packet4f)pblend<Packet2d>(ifPacket, (Packet2d)thenPacket.v,
                                               (Packet2d)elsePacket.v);
 }
 //---------- double ----------
 // Packet holding a single std::complex<double> in a Packet2d as { re, im }.
 struct Packet1cd {
  // Leaves `v` uninitialised.
  EIGEN_STRONG_INLINE Packet1cd() {
  }
  // Builds the packet from a scalar complex value.
  EIGEN_STRONG_INLINE explicit Packet1cd(const std::complex<double>& a) {
    v[0] = std::real(a);
    v[1] = std::imag(a);
  }
  // Wraps an existing double vector; its lanes are taken as-is.
  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {
  }
  EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) {
  }
  EIGEN_STRONG_INLINE Packet1cd& operator=(const Packet1cd& b) {
    v = b.v;
    return *this;
  }
  // Complex conjugate: XOR with a mask whose only set bit is the sign bit of
  // lane 1, the imaginary part.
  EIGEN_STRONG_INLINE Packet1cd conjugate(void) const {
    static const v2u64 p2ul_CONJ_XOR = { 0x0, 0x8000000000000000 };
    return (Packet1cd)pxor(v, (Packet2d)p2ul_CONJ_XOR);
  }
  // Complex product: (a_re*b_re - a_im*b_im, a_re*b_im + a_im*b_re).
  EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
    Packet2d v1, v2;
    // Get the real values of a | a1_re | a1_re
    v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)v, (v2i64)v);
    // Get the imag values of a | a1_im | a1_im
    v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)v, (v2i64)v);
    // Multiply the real a with b
    v1 = pmul(v1, b.v);
    // Multiply the imag a with b
    v2 = pmul(v2, b.v);
    // Conjugate v2
    v2 = Packet1cd(v2).conjugate().v;
    // Swap real/imag elements in v2.
    // (A 32-bit word shuffle with selector (2, 3, 0, 1) exchanges the two
    // 64-bit lanes.)
    v2 = (Packet2d)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
    // Add and return the result
    v = padd(v1, v2);
    return *this;
  }
  EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
    return Packet1cd(*this) *= b;
  }
  // Addition and subtraction act lane-wise on the underlying double vector.
  EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
    v = padd(v, b.v);
    return *this;
  }
  EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
    return Packet1cd(*this) += b;
  }
  EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
    v = psub(v, b.v);
    return *this;
  }
  EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
    return Packet1cd(*this) -= b;
  }
  // Complex division, computed as a * conj(b) / |b|^2.
  EIGEN_STRONG_INLINE Packet1cd& operator/=(const Packet1cd& b) {
    *this *= b.conjugate();
    // s holds { re^2, im^2 }; adding its reversed copy puts re^2 + im^2 in
    // both lanes.
    Packet2d s = pmul<Packet2d>(b.v, b.v);
    s = padd(s, preverse<Packet2d>(s));
    v = pdiv(v, s);
    return *this;
  }
  EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const {
    return Packet1cd(*this) /= b;
  }
  // Unary minus: negates both lanes.
  EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
    return Packet1cd(pnegate(v));
  }
  // Underlying storage: { re, im }.
  Packet2d v;
 };
 // Debug printer for a Packet1cd: writes "[ (re, imi) ]".
 inline std::ostream& operator<<(std::ostream& os, const Packet1cd& value) {
  const double re = value.v[0];
  const double im = value.v[1];
  os << "[ (" << re << ", " << im << "i) ]";
  return os;
 }
 // Vectorization traits for std::complex<double>: one scalar per packet (Packet1cd),
 // with no distinct half packet.
 template <>
 struct packet_traits<std::complex<double> > : default_packet_traits {
  typedef Packet1cd type;
  typedef Packet1cd half;
  enum {
    // Packet layout.
    Vectorizable = 1,
    AlignedOnScalar = 0,
    size = 1,
    HasHalfPacket = 0,

    // Arithmetic provided for this packet type.
    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasDiv = 1,
    HasNegate = 1,

    // Operations explicitly disabled.
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 0,
    HasMax = 0,
    HasSetLinear = 0
  };
 };
 // Reverse mapping from Packet1cd to its scalar type, packet size and alignment.
 template <>
 struct unpacket_traits<Packet1cd> {
  typedef std::complex<double> type;
  typedef Packet1cd half;
  enum {
    size = 1,
    alignment = Aligned16
  };
 };
 // Aligned load of one complex<double>, performed as a Packet2d load of its two doubles.
 template <>
 EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
  EIGEN_MSA_DEBUG;
  const double* raw = reinterpret_cast<const double*>(from);
  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>(raw));
 }
 // Unaligned load of one complex<double>, performed as a Packet2d load of its two doubles.
 template <>
 EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
  EIGEN_MSA_DEBUG;
  const double* raw = reinterpret_cast<const double*>(from);
  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>(raw));
 }
 // Builds a packet from a single scalar; the packet holds exactly one complex value.
 template <>
 EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) {
  EIGEN_MSA_DEBUG;
  Packet1cd packed(from);
  return packed;
 }
 // Complex addition of two packets.
 template <>
 EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
  EIGEN_MSA_DEBUG;
  Packet1cd sum(a);
  sum += b;
  return sum;
 }
 // Complex subtraction of two packets.
 template <>
 EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
  EIGEN_MSA_DEBUG;
  Packet1cd difference(a);
  difference -= b;
  return difference;
 }
 // Negates both the real and the imaginary part.
 template <>
 EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
  EIGEN_MSA_DEBUG;
  const Packet1cd negated = -a;
  return negated;
 }
 // Complex conjugate of the packet.
 template <>
 EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
  EIGEN_MSA_DEBUG;
  const Packet1cd conjugated = a.conjugate();
  return conjugated;
 }
 // Complex multiplication of two packets.
 template <>
 EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
  EIGEN_MSA_DEBUG;
  Packet1cd product(a);
  product *= b;
  return product;
 }
 // Bitwise AND of the underlying Packet2d storage.
 template <>
 EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
  EIGEN_MSA_DEBUG;
  const Packet2d bits = pand(a.v, b.v);
  return Packet1cd(bits);
 }
 // Bitwise OR of the underlying Packet2d storage.
 template <>
 EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
  EIGEN_MSA_DEBUG;
  const Packet2d bits = por(a.v, b.v);
  return Packet1cd(bits);
 }
 // Bitwise XOR of the underlying Packet2d storage.
 template <>
 EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
  EIGEN_MSA_DEBUG;
  const Packet2d bits = pxor(a.v, b.v);
  return Packet1cd(bits);
 }
 // Bitwise and-not of the underlying Packet2d storage (delegates to the Packet2d pandnot).
 template <>
 EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
  EIGEN_MSA_DEBUG;
  const Packet2d bits = pandnot(a.v, b.v);
  return Packet1cd(bits);
 }
 // With one scalar per packet there is nothing to duplicate: same as pset1 of *from.
 template <>
 EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
  EIGEN_MSA_DEBUG;
  const std::complex<double>& scalar = *from;
  return pset1<Packet1cd>(scalar);
 }
 // Aligned store of the packet, performed as a store of its two doubles.
 template <>
 EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to,
                                                        const Packet1cd& from) {
  EIGEN_MSA_DEBUG;
  double* raw = reinterpret_cast<double*>(to);
  EIGEN_DEBUG_ALIGNED_STORE pstore<double>(raw, from.v);
 }
 // Unaligned store of the packet, performed as a store of its two doubles.
 template <>
 EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to,
                                                         const Packet1cd& from) {
  EIGEN_MSA_DEBUG;
  double* raw = reinterpret_cast<double*>(to);
  EIGEN_DEBUG_UNALIGNED_STORE pstoreu<double>(raw, from.v);
 }
 // Prefetch for complex<double> data: forwards to the double prefetch on the same address.
 template <>
 EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
  EIGEN_MSA_DEBUG;
  const double* raw = reinterpret_cast<const double*>(addr);
  prefetch(raw);
 }
 // Gather for a one-element packet: only from[0] is read, so the stride is unused.
 template <>
 EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(
    const std::complex<double>* from, Index stride __attribute__((unused))) {
  EIGEN_MSA_DEBUG;
  // The scalar constructor stores real(from[0]) in lane 0 and imag(from[0]) in lane 1.
  return Packet1cd(from[0]);
 }
 // Scatter for a one-element packet: a single pstore to `to`, so the stride is unused.
 template <>
 EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to,
                                                                         const Packet1cd& from,
                                                                         Index stride
                                                                         __attribute__((unused))) {
  EIGEN_MSA_DEBUG;
  pstore<std::complex<double> >(to, from);
 }
 // Extracts the packet's only element as a scalar complex value.
 template <>
 EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
  EIGEN_MSA_DEBUG;
  const double re = a.v[0];
  const double im = a.v[1];
  return std::complex<double>(re, im);
 }
 // Reversing a one-element packet is the identity.
 template <>
 EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
  EIGEN_MSA_DEBUG;
  Packet1cd unchanged(a);
  return unchanged;
 }
 // Sum-reduction of a one-element packet: the result is that element.
 template <>
 EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
  EIGEN_MSA_DEBUG;
  const std::complex<double> only = pfirst(a);
  return only;
 }
 // Packet-wise reduction for one-element packets: returns the first packet unchanged.
 template <>
 EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) {
  EIGEN_MSA_DEBUG;
  return *vecs;
 }
 // Product-reduction of a one-element packet: the result is that element.
 template <>
 EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
  EIGEN_MSA_DEBUG;
  const std::complex<double> only = pfirst(a);
  return only;
 }
 // palign for Packet1cd is intentionally a no-op for every Offset.
 template <int Offset>
 struct palign_impl<Offset, Packet1cd> {
  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) {
    // FIXME: is it certain that a Packet1cd never needs to be aligned?
    // A std::complex<double> occupies 16 bytes, but that does not guarantee
    // it sits on a 16-byte boundary.
  }
 };
 // conj_helper with only the right-hand operand conjugated: pmul(a, b) = a * conj(b).
 template <>
 struct conj_helper<Packet1cd, Packet1cd, false, true> {
  // Multiply-add built on this helper's own pmul: x * conj(y) + c.
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
                                      const Packet1cd& c) const {
    const Packet1cd product = this->pmul(x, y);
    return padd(product, c);
  }
  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
    const Packet1cd b_conj = pconj(b);
    return internal::pmul(a, b_conj);
  }
 };
 // conj_helper with only the left-hand operand conjugated: pmul(a, b) = conj(a) * b.
 template <>
 struct conj_helper<Packet1cd, Packet1cd, true, false> {
  // Multiply-add built on this helper's own pmul: conj(x) * y + c.
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
                                      const Packet1cd& c) const {
    const Packet1cd product = this->pmul(x, y);
    return padd(product, c);
  }
  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
    const Packet1cd a_conj = pconj(a);
    return internal::pmul(a_conj, b);
  }
 };
 // conj_helper with both operands conjugated: pmul(a, b) = conj(a * b).
 template <>
 struct conj_helper<Packet1cd, Packet1cd, true, true> {
  // Multiply-add built on this helper's own pmul: conj(x * y) + c.
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
                                      const Packet1cd& c) const {
    const Packet1cd product = this->pmul(x, y);
    return padd(product, c);
  }
  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
    const Packet1cd plain_product = internal::pmul(a, b);
    return pconj(plain_product);
  }
 };
 // NOTE(review): the macro is defined outside this file; presumably it generates the
 // conj_helper specializations for mixed complex (Packet1cd) / real (Packet2d) products - confirm.
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
 // Complex division of two packets.
 template <>
 EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
  EIGEN_MSA_DEBUG;
  Packet1cd quotient(a);
  quotient /= b;
  return quotient;
 }
 // Swaps the real and imaginary parts by reversing the two double lanes.
 EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
  EIGEN_MSA_DEBUG;
  const Packet2d lanes = x.v;
  return Packet1cd(preverse(lanes));
 }
 // Debug printer for a block of two Packet1cd: one packet per output line, wrapped in brackets.
 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet1cd, 2>& value) {
  // First row; std::endl ends the line and flushes the stream.
  os << "[ " << value.packet[0] << ", " << std::endl;
  // Second row, indented to line up under the first.
  os << "  " << value.packet[1] << " ]";
  return os;
 }
 // Regroups the double lanes of the two packets in place: the first output packet collects
 // the even (index 0) lanes of both inputs, the second collects the odd (index 1) lanes.
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
  EIGEN_MSA_DEBUG;
  // Snapshot both rows as 64-bit lanes before either one is overwritten.
  const v2i64 row0 = (v2i64)kernel.packet[0].v;
  const v2i64 row1 = (v2i64)kernel.packet[1].v;
  // Even lanes of both rows.
  kernel.packet[0].v = (Packet2d)__builtin_msa_ilvev_d(row0, row1);
  // Odd lanes of both rows.
  kernel.packet[1].v = (Packet2d)__builtin_msa_ilvod_d(row0, row1);
 }
 }  // end namespace internal
 }  // end namespace Eigen
 #endif  // EIGEN_COMPLEX_MSA_H
--- a/Eigen/src/Core/arch/MSA/MathFunctions.h
+++ b/Eigen/src/Core/arch/MSA/MathFunctions.h
@ -0,0 +1,387 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2007 Julien Pommier
 // Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
 // Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // Copyright (C) 2018 Wave Computing, Inc.
 // Written by:
 //   Chris Larsen
 //   Alexey Frunze (afrunze@wavecomp.com)
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 /* The sin, cos, exp, and log functions of this file come from
 * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
 */
 /* The tanh function of this file is an adaptation of
 * template<typename T> T generic_fast_tanh_float(const T&)
 * from MathFunctionsImpl.h.
 */
 #ifndef EIGEN_MATH_FUNCTIONS_MSA_H
 #define EIGEN_MATH_FUNCTIONS_MSA_H
 namespace Eigen {
 namespace internal {
 // Natural logarithm of four packed floats (MSA).
 // Special cases handled explicitly below: negative arguments give NAN, zero (either sign)
 // gives -INFINITY and +INFINITY gives +INFINITY. Other inputs go through the Cephes-style
 // range reduction and polynomial.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
 plog<Packet4f>(const Packet4f& _x) {
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740e-1f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846e-1f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787e-1f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665e-1f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765e-1f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993e-1f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174e-1f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
  static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
  static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
  // Convert negative argument into NAN (quiet negative, to be specific).
  // The comparison masks are all-ones in lanes where the test holds and zero elsewhere;
  // an all-ones lane reinterpreted as float is a NAN, so the padd below adds 0.0 or NAN.
  Packet4f zero = (Packet4f)__builtin_msa_ldi_w(0);
  Packet4i neg_mask = __builtin_msa_fclt_w(_x, zero);
  Packet4i zero_mask = __builtin_msa_fceq_w(_x, zero);
  Packet4f non_neg_x_or_nan = padd(_x, (Packet4f)neg_mask);  // Add 0.0 or NAN.
  Packet4f x = non_neg_x_or_nan;
  // Extract exponent from x = mantissa * 2**exponent, where 1.0 <= mantissa < 2.0.
  // N.B. the exponent is one less of what frexpf() would return.
  Packet4i e_int = __builtin_msa_ftint_s_w(__builtin_msa_flog2_w(x));
  // Multiply x by 2**(-exponent-1) to get 0.5 <= x < 1.0 as from frexpf().
  // NOR with an all-zero immediate is a bitwise NOT, and ~e == -e - 1 in two's complement.
  x = __builtin_msa_fexp2_w(x, (Packet4i)__builtin_msa_nori_b((v16u8)e_int, 0));
  /*
     if (x < SQRTHF) {
       x = x + x - 1.0;
     } else {
       e += 1;
       x = x - 1.0;
     }
  */
  // Branch-free form of the pseudo-code above: ge_mask is -1 in lanes where x >= SQRTHF,
  // so subtracting it adds 1 to e_int there, and the bit-select keeps x in those lanes
  // and takes x + x in the others.
  Packet4f xx = padd(x, x);
  Packet4i ge_mask = __builtin_msa_fcle_w(p4f_cephes_SQRTHF, x);
  e_int = psub(e_int, ge_mask);
  x = (Packet4f)__builtin_msa_bsel_v((v16u8)ge_mask, (v16u8)xx, (v16u8)x);
  x = psub(x, p4f_1);
  Packet4f e = __builtin_msa_ffint_s_w(e_int);
  Packet4f x2 = pmul(x, x);
  Packet4f x3 = pmul(x2, x);
  // Evaluate the degree-8 polynomial p0..p8 as three independent 3-term chains
  // (y, y1, y2) that are then combined with powers of x**3.
  Packet4f y, y1, y2;
  y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
  y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
  y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
  y = pmadd(y, x, p4f_cephes_log_p2);
  y1 = pmadd(y1, x, p4f_cephes_log_p5);
  y2 = pmadd(y2, x, p4f_cephes_log_p8);
  y = pmadd(y, x3, y1);
  y = pmadd(y, x3, y2);
  y = pmul(y, x3);
  // Add the exponent contribution in two parts (q1 first, then q2).
  y = pmadd(e, p4f_cephes_log_q1, y);
  x = __builtin_msa_fmsub_w(x, x2, p4f_half);
  x = padd(x, y);
  x = pmadd(e, p4f_cephes_log_q2, x);
  // x is now the logarithm result candidate. We still need to handle the
  // extreme arguments of zero and positive infinity, though.
  // N.B. if the argument is +INFINITY, x is NAN because the polynomial terms
  // contain infinities of both signs (see the coefficients and code above).
  // INFINITY - INFINITY is NAN.
  // If the argument is +INFINITY, make it the new result candidate.
  // To achieve that we choose the smaller of the result candidate and the
  // argument.
  // This is correct for all finite pairs of values (the logarithm is smaller
  // than the argument).
  // This is also correct in the special case when the argument is +INFINITY
  // and the result candidate is NAN. This is because the fmin.df instruction
  // prefers non-NANs to NANs.
  x = __builtin_msa_fmin_w(x, non_neg_x_or_nan);
  // If the argument is zero (including -0.0), the result becomes -INFINITY.
  // An all-ones lane shifted left by 23 is 0xFF800000, the bit pattern of -INFINITY.
  Packet4i neg_infs = __builtin_msa_slli_w(zero_mask, 23);
  x = (Packet4f)__builtin_msa_bsel_v((v16u8)zero_mask, (v16u8)x, (v16u8)neg_infs);
  return x;
 }
 // Exponential of four packed floats (MSA), Cephes-style:
 // clamp the argument, split it as x = n*ln(2) + r with integer n, evaluate a polynomial
 // in r, then scale the result by 2**n.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
 pexp<Packet4f>(const Packet4f& _x) {
  // Limiting single-precision pexp's argument to [-128, +128] lets pexp
  // reach 0 and INFINITY naturally.
  static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f);
  static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, +128.0f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500e-4f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507e-3f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073e-3f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894e-2f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459e-1f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201e-1f);
  static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
  static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
  Packet4f x = _x;
  // Clamp x.
  // Each bit-select replaces x with the bound in the lanes where the comparison mask is set.
  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x,
                                     (v16u8)p4f_exp_lo);
  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x,
                                     (v16u8)p4f_exp_hi);
  // Round to nearest integer by adding 0.5 (with x's sign) and truncating.
  // binsli inserts the top bit (the sign) of x into 0.5, giving +/-0.5.
  Packet4f x2_add = (Packet4f)__builtin_msa_binsli_w((v4u32)p4f_half, (v4u32)x, 0);
  Packet4f x2 = pmadd(x, p4f_cephes_LOG2EF, x2_add);
  Packet4i x2_int = __builtin_msa_ftrunc_s_w(x2);
  Packet4f x2_int_f = __builtin_msa_ffint_s_w(x2_int);
  // Subtract n*ln(2) in two steps (C1 then C2) to form the reduced argument.
  x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C1);
  x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C2);
  Packet4f z = pmul(x, x);
  // Horner evaluation of the polynomial p0..p5 in x, then y = poly * x**2 + x + 1.
  Packet4f y = p4f_cephes_exp_p0;
  y = pmadd(y, x, p4f_cephes_exp_p1);
  y = pmadd(y, x, p4f_cephes_exp_p2);
  y = pmadd(y, x, p4f_cephes_exp_p3);
  y = pmadd(y, x, p4f_cephes_exp_p4);
  y = pmadd(y, x, p4f_cephes_exp_p5);
  y = pmadd(y, z, x);
  y = padd(y, p4f_1);
  // y *= 2**exponent.
  y = __builtin_msa_fexp2_w(y, x2_int);
  return y;
 }
 // Hyperbolic tangent of four packed floats (MSA).
 // Works on |x| clamped to 9, evaluates a rational approximation p(x)/q(x) with an odd
 // numerator and an even denominator, restores the sign of the input, and returns the
 // input itself for arguments of very small magnitude.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
 ptanh<Packet4f>(const Packet4f& _x) {
  static _EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 1e-4f);
  static _EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f);
  // The monomial coefficients of the numerator polynomial (odd).
  static _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-3f);
  static _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-4f);
  static _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-5f);
  static _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-8f);
  static _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f);
  static _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f);
  static _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f);
  // The monomial coefficients of the denominator polynomial (even).
  static _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-3f);
  static _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-3f);
  static _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-4f);
  static _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-6f);
  // Work on the magnitude; the sign is put back at the end.
  Packet4f x = pabs(_x);
  // Lanes with |x| < 1e-4 are remembered so the input can be returned unchanged for them.
  Packet4i tiny_mask = __builtin_msa_fclt_w(x, p4f_tanh_tiny);
  // Clamp the inputs to the range [-9, 9] since anything outside
  // this range is -/+1.0f in single-precision.
  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x,
                                     (v16u8)p4f_tanh_hi);
  // Since the polynomials are odd/even, we need x**2.
  Packet4f x2 = pmul(x, x);
  // Evaluate the numerator polynomial p.
  Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11);
  p = pmadd(x2, p, p4f_alpha_9);
  p = pmadd(x2, p, p4f_alpha_7);
  p = pmadd(x2, p, p4f_alpha_5);
  p = pmadd(x2, p, p4f_alpha_3);
  p = pmadd(x2, p, p4f_alpha_1);
  p = pmul(x, p);
  // Evaluate the denominator polynomial q.
  Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4);
  q = pmadd(x2, q, p4f_beta_2);
  q = pmadd(x2, q, p4f_beta_0);
  // Divide the numerator by the denominator.
  p = pdiv(p, q);
  // Reinstate the sign.
  // binsli copies the top bit (the sign) of _x into p.
  p = (Packet4f)__builtin_msa_binsli_w((v4u32)p, (v4u32)_x, 0);
  // When the argument is very small in magnitude it's more accurate to just return it.
  p = (Packet4f)__builtin_msa_bsel_v((v16u8)tiny_mask, (v16u8)p, (v16u8)_x);
  return p;
 }
 // Shared implementation of psin/pcos for four packed floats (MSA).
 // Template parameter `sine`: true computes the sine, false the cosine.
 // Steps: take |x|, reduce it by a whole number of octants (Pi/4) into [-Pi/4, +Pi/4],
 // evaluate both the sine and the cosine polynomial on the reduced argument, pick the one
 // matching the octant, and finally fix up the sign.
 // Infinite arguments produce NAN; arguments at or above sincos_max_arg are treated as 0.0.
 template <bool sine>
 Packet4f psincos_inner_msa_float(const Packet4f& _x) {
  static _EIGEN_DECLARE_CONST_Packet4f(sincos_max_arg, 13176795.0f);  // Approx. (2**24) / (4/Pi).
  static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f);
  static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
  static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
  static _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891e-4f);
  static _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736e-3f);
  static _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611e-1f);
  static _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948e-5f);
  static _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765e-3f);
  static _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827e-2f);
  static _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f);  // 4/Pi.
  static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
  static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
  Packet4f x = pabs(_x);
  // Translate infinite arguments into NANs.
  // _x - _x is 0.0 for finite lanes and NAN for infinite ones.
  Packet4f zero_or_nan_if_inf = psub(_x, _x);
  x = padd(x, zero_or_nan_if_inf);
  // Prevent sin/cos from generating values larger than 1.0 in magnitude
  // for very large arguments by setting x to 0.0.
  // The "unordered or less than" compare keeps NAN lanes selected, so NANs survive the AND.
  Packet4i small_or_nan_mask = __builtin_msa_fcult_w(x, p4f_sincos_max_arg);
  x = pand(x, (Packet4f)small_or_nan_mask);
  // Scale x by 4/Pi to find x's octant.
  Packet4f y = pmul(x, p4f_cephes_FOPI);
  // Get the octant. We'll reduce x by this number of octants or by one more than it.
  Packet4i y_int = __builtin_msa_ftrunc_s_w(y);
  // x's from even-numbered octants will translate to octant 0: [0, +Pi/4].
  // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
  // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
  Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1);
  Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0);
  y = __builtin_msa_ffint_s_w(y_int2);
  // Compute the sign to apply to the polynomial.
  // Shifting left by 29 moves bit 2 of the octant counter into the sign-bit position;
  // for the sine the sign of the original argument is XOR-ed in as well.
  Packet4i sign_mask = sine ? pxor(__builtin_msa_slli_w(y_int1, 29), (Packet4i)_x)
                            : __builtin_msa_slli_w(__builtin_msa_addvi_w(y_int, 3), 29);
  // Get the polynomial selection mask.
  // We'll calculate both (sin and cos) polynomials and then select from the two.
  // Shifting left by 30 leaves only bits 0-1 of y_int2, so the mask is set when
  // bit 1 is clear (bit 0 was already cleared above).
  Packet4i poly_mask = __builtin_msa_ceqi_w(__builtin_msa_slli_w(y_int2, 30), 0);
  // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4.
  // The magic pass: "Extended precision modular arithmetic"
  // x = ((x - y * DP1) - y * DP2) - y * DP3
  Packet4f tmp1 = pmul(y, p4f_minus_cephes_DP1);
  Packet4f tmp2 = pmul(y, p4f_minus_cephes_DP2);
  Packet4f tmp3 = pmul(y, p4f_minus_cephes_DP3);
  x = padd(x, tmp1);
  x = padd(x, tmp2);
  x = padd(x, tmp3);
  // Evaluate the cos(x) polynomial.
  // With z = x**2: y = poly(z) * z**2 - z/2 + 1.
  y = p4f_coscof_p0;
  Packet4f z = pmul(x, x);
  y = pmadd(y, z, p4f_coscof_p1);
  y = pmadd(y, z, p4f_coscof_p2);
  y = pmul(y, z);
  y = pmul(y, z);
  y = __builtin_msa_fmsub_w(y, z, p4f_half);
  y = padd(y, p4f_1);
  // Evaluate the sin(x) polynomial.
  // y2 = poly(z) * z * x + x.
  Packet4f y2 = p4f_sincof_p0;
  y2 = pmadd(y2, z, p4f_sincof_p1);
  y2 = pmadd(y2, z, p4f_sincof_p2);
  y2 = pmul(y2, z);
  y2 = pmadd(y2, x, x);
  // Select the correct result from the two polynomials.
  // The two variants use swapped operands: sine and cosine need opposite polynomials
  // for the same octant.
  y = sine ? (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y, (v16u8)y2)
           : (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y2, (v16u8)y);
  // Update the sign.
  // XOR the polynomial's own sign into sign_mask, then copy only the top (sign) bit into y.
  sign_mask = pxor(sign_mask, (Packet4i)y);
  y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0);
  return y;
 }
 // Sine of four packed floats: the shared sin/cos kernel in sine mode.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
 psin<Packet4f>(const Packet4f& x) {
  const bool compute_sine = true;
  return psincos_inner_msa_float<compute_sine>(x);
 }
 // Cosine of four packed floats: the shared sin/cos kernel in cosine mode.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
 pcos<Packet4f>(const Packet4f& x) {
  const bool compute_sine = false;
  return psincos_inner_msa_float<compute_sine>(x);
 }
 // Exponential of two packed doubles (MSA), Cephes-style:
 // clamp the argument, split it as x = n*ln(2) + r with integer n, evaluate the rational
 // approximation 1 + 2*px/(qx - px) on r, then scale the result by 2**n.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d
 pexp<Packet2d>(const Packet2d& _x) {
  // Limiting double-precision pexp's argument to [-1024, +1024] lets pexp
  // reach 0 and INFINITY naturally.
  static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0);
  static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, +1024.0);
  static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
  static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
  static _EIGEN_DECLARE_CONST_Packet2d(1, 1.0);
  static _EIGEN_DECLARE_CONST_Packet2d(2, 2.0);
  Packet2d x = _x;
  // Clamp x.
  // Each bit-select replaces x with the bound in the lanes where the comparison mask is set.
  x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(x, p2d_exp_lo), (v16u8)x,
                                     (v16u8)p2d_exp_lo);
  x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(p2d_exp_hi, x), (v16u8)x,
                                     (v16u8)p2d_exp_hi);
  // Round to nearest integer by adding 0.5 (with x's sign) and truncating.
  // binsli inserts the top bit (the sign) of x into 0.5, giving +/-0.5.
  Packet2d x2_add = (Packet2d)__builtin_msa_binsli_d((v2u64)p2d_half, (v2u64)x, 0);
  Packet2d x2 = pmadd(x, p2d_cephes_LOG2EF, x2_add);
  Packet2l x2_long = __builtin_msa_ftrunc_s_d(x2);
  Packet2d x2_long_d = __builtin_msa_ffint_s_d(x2_long);
  // Subtract n*ln(2) in two steps (C1 then C2) to form the reduced argument.
  x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C1);
  x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C2);
  // x2 is reused from here on to hold x**2.
  x2 = pmul(x, x);
  // Numerator: px = x * P(x**2).
  Packet2d px = p2d_cephes_exp_p0;
  px = pmadd(px, x2, p2d_cephes_exp_p1);
  px = pmadd(px, x2, p2d_cephes_exp_p2);
  px = pmul(px, x);
  // Denominator polynomial: qx = Q(x**2).
  Packet2d qx = p2d_cephes_exp_q0;
  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
  // exp(r) ~= 1 + 2 * px / (qx - px).
  x = pdiv(px, psub(qx, px));
  x = pmadd(p2d_2, x, p2d_1);
  // x *= 2**exponent.
  x = __builtin_msa_fexp2_d(x, x2_long);
  return x;
 }
 }  // end namespace internal
 }  // end namespace Eigen
 #endif  // EIGEN_MATH_FUNCTIONS_MSA_H
--- a/Eigen/src/Core/arch/MSA/PacketMath.h
+++ b/Eigen/src/Core/arch/MSA/PacketMath.h
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@ -67,7 +67,7 @@ template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type;
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
  float32x2_t r64;
-  r64 = vld1_f32((float *)&from);
+  r64 = vld1_f32((const float *)&from);
  return Packet2cf(vcombine_f32(r64, r64));
 }
@ -142,7 +142,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
  to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
 }
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { EIGEN_ARM_PREFETCH((float *)addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { EIGEN_ARM_PREFETCH((const float *)addr); }
 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
@ -265,6 +265,8 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
  }
 };
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
  // TODO optimize it for NEON
@ -275,7 +277,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
  s = vmulq_f32(b.v, b.v);
  rev_s = vrev64q_f32(s);
-  return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
+  return Packet2cf(pdiv<Packet4f>(res.v, vaddq_f32(s,rev_s)));
 }
 EIGEN_DEVICE_FUNC inline void
@ -381,7 +383,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { EIGEN_ARM_PREFETCH((double *)addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { EIGEN_ARM_PREFETCH((const double *)addr); }
 template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
 {
@ -456,6 +458,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
  }
 };
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
  // TODO optimize it for NEON
--- a/Eigen/src/Core/arch/NEON/MathFunctions.h
+++ b/Eigen/src/Core/arch/NEON/MathFunctions.h
@ -84,6 +84,98 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
  return y;
 }
 // Natural logarithm of four packed floats (NEON), Cephes-style:
 // split x into exponent and mantissa by bit manipulation, evaluate a degree-8 polynomial
 // on the reduced mantissa, then add exponent * ln(2) (split into q1 + q2).
 // Every lane whose input is <= 0 (zero included) is returned as NaN.
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f plog<Packet4f>(const Packet4f& _x)
 {
  Packet4f x = _x;
  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
  // Mask that clears the 8 exponent bits of an IEEE single and keeps sign and mantissa.
  _EIGEN_DECLARE_CONST_Packet4i(inv_mant_mask, ~0x7f800000);
  /* natural logarithm computed for 4 simultaneous float
    return NaN for x <= 0
  */
  _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
  x = vmaxq_f32(x, vdupq_n_f32(0)); /* clamp negative inputs to zero. NOTE(review): the original comment said this flushes denormals to zero, which a max with 0 does not do by itself - confirm intent */
  // After the clamp, x <= 0 holds exactly for the lanes whose input was zero or negative.
  Packet4ui invalid_mask = vcleq_f32(x, vdupq_n_f32(0));
  Packet4i ux = vreinterpretq_s32_f32(x);
  // Biased exponent: shift the exponent field down to bit 0 (x is non-negative here).
  Packet4i emm0 = vshrq_n_s32(ux, 23);
  /* keep only the fractional part */
  // OR-ing in the bits of 0.5 sets the exponent so the mantissa lands in [0.5, 1).
  ux = vandq_s32(ux, p4i_inv_mant_mask);
  ux = vorrq_s32(ux, vreinterpretq_s32_f32(p4f_half));
  x = vreinterpretq_f32_s32(ux);
  // Remove the exponent bias (0x7f), convert to float and add 1.
  emm0 = vsubq_s32(emm0, p4i_0x7f);
  Packet4f e = vcvtq_f32_s32(emm0);
  e = vaddq_f32(e, p4f_1);
  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  // Branch-free form of the pseudo-code above: `mask` is all-ones where x < SQRTHF, so the
  // AND yields x (resp. 1.0) in those lanes and 0.0 elsewhere.
  Packet4ui mask = vcltq_f32(x, p4f_cephes_SQRTHF);
  Packet4f tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
  x = vsubq_f32(x, p4f_1);
  e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(p4f_1), mask)));
  x = vaddq_f32(x, tmp);
  Packet4f z = vmulq_f32(x,x);
  // Horner evaluation of the polynomial p0..p8 in x, followed by a multiply by x * x**2.
  Packet4f y = p4f_cephes_log_p0;
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, p4f_cephes_log_p1);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, p4f_cephes_log_p2);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, p4f_cephes_log_p3);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, p4f_cephes_log_p4);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, p4f_cephes_log_p5);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, p4f_cephes_log_p6);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, p4f_cephes_log_p7);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, p4f_cephes_log_p8);
  y = vmulq_f32(y, x);
  y = vmulq_f32(y, z);
  // y += e * q1; y -= z / 2; result = x + y + e * q2.
  tmp = vmulq_f32(e, p4f_cephes_log_q1);
  y = vaddq_f32(y, tmp);
  tmp = vmulq_f32(z, p4f_half);
  y = vsubq_f32(y, tmp);
  tmp = vmulq_f32(e, p4f_cephes_log_q2);
  x = vaddq_f32(x, y);
  x = vaddq_f32(x, tmp);
  x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // lanes with input <= 0 (zero included) become all-ones bits, i.e. NaN
  return x;
 }
 } // end namespace internal
 } // end namespace Eigen
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@ -36,12 +36,43 @@ namespace internal {
 #endif
 #endif
 #if EIGEN_COMP_MSVC
 // In MSVC's arm_neon.h header file, all NEON vector types
 // are aliases to the same underlying type __n128.
 // We thus have to wrap them to make them different C++ types.
 // (See also bug 1428)
 // Thin wrapper that turns a native vector type T into a distinct C++ type.
 // `unique_id` only serves to make otherwise identical instantiations different types;
 // the wrapper converts implicitly to and from T.
 template<typename T,int unique_id>
 struct eigen_packet_wrapper
 {
  // The wrapped native value.
  T m_val;

  // Default construction does not explicitly initialize m_val.
  eigen_packet_wrapper() {}
  // Implicit conversion from the native type.
  eigen_packet_wrapper(const T &v) : m_val(v) {}
  // Assignment from the native type.
  eigen_packet_wrapper& operator=(const T &v) {
    m_val = v;
    return *this;
  }

  // Implicit conversions back to the native type, so intrinsics accept the wrapper directly.
  operator T&() { return m_val; }
  operator const T&() const { return m_val; }
 };
 typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
 typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
 typedef eigen_packet_wrapper<int32x4_t  ,2> Packet4i;
 typedef eigen_packet_wrapper<int32x2_t  ,3> Packet2i;
 typedef eigen_packet_wrapper<uint32x4_t ,4> Packet4ui;
 #else
 typedef float32x2_t Packet2f;
 typedef float32x4_t Packet4f;
 typedef int32x4_t   Packet4i;
 typedef int32x2_t   Packet2i;
 typedef uint32x4_t  Packet4ui;
 #endif // EIGEN_COMP_MSVC
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)
@ -51,14 +82,17 @@ typedef uint32x4_t  Packet4ui;
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
-// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
+#if EIGEN_ARCH_ARM64
-// which available on LLVM and GCC (at least)
+  // __builtin_prefetch tends to do nothing on ARM64 compilers because the
-#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+  // prefetch instructions there are too detailed for __builtin_prefetch to map
  // meaningfully to them.
  #define EIGEN_ARM_PREFETCH(ADDR)  __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
 #elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
  #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
 #elif defined __pld
  #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
-#elif !EIGEN_ARCH_ARM64
+#elif EIGEN_ARCH_ARM32
-  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( "   pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
 #else
  // by default no explicit prefetching
  #define EIGEN_ARM_PREFETCH(ADDR)
@ -78,7 +112,7 @@ template<> struct packet_traits<float>  : default_packet_traits
    // FIXME check the Has*
    HasSin  = 0,
    HasCos  = 0,
-    HasLog  = 0,
+    HasLog  = 1,
    HasExp  = 1,
    HasSqrt = 0
  };
@ -113,7 +147,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t&    from)
 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
 {
-  const float32_t f[] = {0, 1, 2, 3};
+  const float f[] = {0, 1, 2, 3};
  Packet4f countdown = vld1q_f32(f);
  return vaddq_f32(pset1<Packet4f>(a), countdown);
 }
--- a/Eigen/src/Core/arch/NEON/TypeCasting.h
+++ b/Eigen/src/Core/arch/NEON/TypeCasting.h
@ -0,0 +1,48 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 #ifndef EIGEN_TYPE_CASTING_NEON_H
 #define EIGEN_TYPE_CASTING_NEON_H
 namespace Eigen {
 namespace internal {
 // Cast traits for float -> int on NEON: the cast is flagged as vectorized
 // and both coefficient ratios are 1.
 template <>
 struct type_casting_traits<float, int> {
  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
 };
 // Cast traits for int -> float on NEON: the cast is flagged as vectorized
 // and both coefficient ratios are 1.
 template <>
 struct type_casting_traits<int, float> {
  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
 };
 // Packet4f -> Packet4i conversion through the NEON vcvtq_s32_f32 intrinsic.
 // NOTE(review): the rounding mode is whatever vcvtq_s32_f32 specifies
 // (presumably round-toward-zero) - confirm against the Arm intrinsics reference.
 template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
  const Packet4i as_int = vcvtq_s32_f32(a);
  return as_int;
 }
 // Packet4i -> Packet4f conversion through the NEON vcvtq_f32_s32 intrinsic.
 template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
  const Packet4f as_float = vcvtq_f32_s32(a);
  return as_float;
 }
 } // end namespace internal
 } // end namespace Eigen
 #endif // EIGEN_TYPE_CASTING_NEON_H
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@ -128,7 +128,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
 }
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
@ -229,23 +229,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
  }
 };
-template<> struct conj_helper<Packet4f, Packet2cf, false,false>
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
 {
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
  { return padd(c, pmul(x,y)); }
  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
  { return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
 };
 template<> struct conj_helper<Packet2cf, Packet4f, false,false>
 {
  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
  { return padd(c, pmul(x,y)); }
  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
  { return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
 };
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
@ -340,7 +324,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
 {
@ -430,23 +414,7 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
  }
 };
-template<> struct conj_helper<Packet2d, Packet1cd, false,false>
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
 {
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
  { return padd(c, pmul(x,y)); }
  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
 };
 template<> struct conj_helper<Packet1cd, Packet2d, false,false>
 {
  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
  { return padd(c, pmul(x,y)); }
  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
 };
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@ -242,7 +242,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
  return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
 }
-/* evaluation of 4 sines at onces, using SSE2 intrinsics.
+/* evaluation of 4 sines at once, using SSE2 intrinsics.
   The code is the exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@ -461,10 +461,16 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double&
  pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
 }
 // Pointer type the prefetch() specializations cast their address to before
 // handing it to _mm_prefetch: const void* when EIGEN_COMP_PGI is set,
 // const char* for every other compiler.
 #if EIGEN_COMP_PGI
 typedef const void * SsePrefetchPtrType;
 #else
 typedef const char * SsePrefetchPtrType;
 #endif
 #ifndef EIGEN_VECTORIZE_AVX
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
 #endif
 #if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
@ -657,7 +663,7 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
  // TODO try to call _mm_mul_epu32 directly
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
-  return  (aux[0] * aux[1]) * (aux[2] * aux[3]);;
+  return  (aux[0] * aux[1]) * (aux[2] * aux[3]);
 }
 // min
@ -928,4 +934,14 @@ template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, co
 } // end namespace Eigen
 #if EIGEN_COMP_PGI
 // PGI++ does not define the following intrinsics in C++ mode.
 // Each replacement takes its argument by value and returns that same storage
 // reinterpreted as the target vector type through a reference cast, i.e. the
 // bits are reused as-is and no numeric conversion is performed.
 static inline __m128  _mm_castpd_ps   (__m128d x) { return reinterpret_cast<__m128&>(x);  }
 static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
 static inline __m128d _mm_castps_pd   (__m128  x) { return reinterpret_cast<__m128d&>(x); }
 static inline __m128i _mm_castps_si128(__m128  x) { return reinterpret_cast<__m128i&>(x); }
 static inline __m128  _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x);  }
 static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
 #endif
 #endif // EIGEN_PACKET_MATH_SSE_H
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@ -14,6 +14,7 @@ namespace Eigen {
 namespace internal {
 #ifndef EIGEN_VECTORIZE_AVX
 template <>
 struct type_casting_traits<float, int> {
  enum {
@ -23,11 +24,6 @@ struct type_casting_traits<float, int> {
  };
 };
 template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
  return _mm_cvttps_epi32(a);
 }
 template <>
 struct type_casting_traits<int, float> {
  enum {
@ -37,11 +33,6 @@ struct type_casting_traits<int, float> {
  };
 };
 template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
  return _mm_cvtepi32_ps(a);
 }
 template <>
 struct type_casting_traits<double, float> {
  enum {
@ -51,10 +42,6 @@ struct type_casting_traits<double, float> {
  };
 };
 template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
  return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
 }
 template <>
 struct type_casting_traits<float, double> {
  enum {
@ -63,6 +50,19 @@ struct type_casting_traits<float, double> {
    TgtCoeffRatio = 2
  };
 };
 #endif
 // Packet4f -> Packet4i conversion through the SSE2 _mm_cvttps_epi32 intrinsic
 // (the "tt" variant, i.e. conversion with truncation).
 template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
  const Packet4i truncated = _mm_cvttps_epi32(a);
  return truncated;
 }
 // Packet4i -> Packet4f conversion through the SSE2 _mm_cvtepi32_ps intrinsic.
 template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
  const Packet4f as_float = _mm_cvtepi32_ps(a);
  return as_float;
 }
 // Two Packet2d -> one Packet4f. Each input is narrowed with _mm_cvtpd_ps and
 // the two results are merged by _mm_shuffle_ps with the immediate
 // (1 << 2) | (1 << 6), i.e. 0x44.
 // NOTE(review): 0x44 presumably selects the two low floats of each narrowed
 // half (a first, then b) - confirm against the _mm_shuffle_ps documentation.
 template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
  const __m128 from_a = _mm_cvtpd_ps(a);
  const __m128 from_b = _mm_cvtpd_ps(b);
  return _mm_shuffle_ps(from_a, from_b, (1 << 2) | (1 << 6));
 }
 template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
  // Simply discard the second half of the input
--- a/Eigen/src/Core/arch/SYCL/InteropHeaders.h
+++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
@ -0,0 +1,104 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Mehdi Goli    Codeplay Software Ltd.
 // Ralph Potter  Codeplay Software Ltd.
 // Luke Iwanski  Codeplay Software Ltd.
 // Contact: <eigen@codeplay.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 /*****************************************************************
 * InteropHeaders.h
 *
 * \brief:
 *  InteropHeaders
 *
 *****************************************************************/
 #ifndef EIGEN_INTEROP_HEADERS_SYCL_H
 #define EIGEN_INTEROP_HEADERS_SYCL_H
 #if defined EIGEN_USE_SYCL
 namespace Eigen {
 namespace internal {
 // Generates the packet_traits specialization for one scalar type.
 //   packet_type   : packet type used for both the `type` and `half` typedefs
 //   val           : value stored in HasBlend
 //   unpacket_type : scalar type the specialization is written for
 //   lengths       : value stored in `size`
 // All other Has* flags are fixed by the macro body below.
 // It is instantiated for float / const float (cl_float4, size 4, HasBlend 1)
 // and double / const double (cl_double2, size 2, HasBlend 0), then undefined.
 #define SYCL_PACKET_TRAITS(packet_type, val, unpacket_type, lengths)\
  template<> struct packet_traits<unpacket_type> : default_packet_traits\
  {\
    typedef packet_type type;\
    typedef packet_type half;\
    enum {\
      Vectorizable = 1,\
      AlignedOnScalar = 1,\
      size=lengths,\
      HasHalfPacket = 0,\
      HasDiv  = 1,\
      HasLog  = 1,\
      HasExp  = 1,\
      HasSqrt = 1,\
      HasRsqrt = 1,\
      HasSin    = 1,\
      HasCos    = 1,\
      HasTan    = 1,\
      HasASin   = 1,\
      HasACos   = 1,\
      HasATan   = 1,\
      HasSinh   = 1,\
      HasCosh   = 1,\
      HasTanh   = 1,\
      HasLGamma = 0,\
      HasDiGamma = 0,\
      HasZeta = 0,\
      HasPolygamma = 0,\
      HasErf = 0,\
      HasErfc = 0,\
      HasIGamma = 0,\
      HasIGammac = 0,\
      HasBetaInc = 0,\
      HasBlend = val,\
      HasMax=1,\
      HasMin=1,\
      HasMul=1,\
      HasAdd=1,\
      HasFloor=1,\
      HasRound=1,\
      HasLog1p=1,\
      HasExpm1=1,\
      HasCeil=1,\
    };\
  };
 SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4)
 SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4)
 SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2)
 SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2)
 #undef SYCL_PACKET_TRAITS
 // Make sure this is only available when targeting a GPU: we don't want to
 // introduce conflicts between these packet_traits definitions and the ones
 // we'll use on the host side (SSE, AVX, ...)
 // Generates an is_arithmetic specialization whose value is true for the given
 // packet type; applied to cl_float4 and cl_double2, then undefined.
 #define SYCL_ARITHMETIC(packet_type) template<> struct is_arithmetic<packet_type>  { enum { value = true }; };
 SYCL_ARITHMETIC(cl::sycl::cl_float4)
 SYCL_ARITHMETIC(cl::sycl::cl_double2)
 #undef SYCL_ARITHMETIC
 // Generates the unpacket_traits specialization for one packet type.
 //   packet_type   : packet being described (also used as its own `half`)
 //   unpacket_type : scalar type exposed as `type`
 //   lengths       : value stored in `size`
 // The alignment is always Aligned16.
 #define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)\
 template<> struct unpacket_traits<packet_type>  {\
  typedef unpacket_type  type;\
  enum {size=lengths, alignment=Aligned16};\
  typedef packet_type half;\
 };
 SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)
 SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2)
 #undef SYCL_UNPACKET_TRAITS
 } // end namespace internal
 } // end namespace Eigen
 #endif // EIGEN_USE_SYCL
 #endif // EIGEN_INTEROP_HEADERS_SYCL_H
--- a/Eigen/src/Core/arch/SYCL/MathFunctions.h
+++ b/Eigen/src/Core/arch/SYCL/MathFunctions.h
@ -0,0 +1,221 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Mehdi Goli    Codeplay Software Ltd.
 // Ralph Potter  Codeplay Software Ltd.
 // Luke Iwanski  Codeplay Software Ltd.
 // Contact: <eigen@codeplay.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 /*****************************************************************
 * MathFunctions.h
 *
 * \brief:
 *  MathFunctions
 *
 *****************************************************************/
 #ifndef EIGEN_MATH_FUNCTIONS_SYCL_H
 #define EIGEN_MATH_FUNCTIONS_SYCL_H
 namespace Eigen {
 namespace internal {
 // Make sure this is only available when targeting a GPU: we don't want to
 // introduce conflicts between these packet_traits definitions and the ones
 // we'll use on the host side (SSE, AVX, ...)
 //#if defined(__SYCL_DEVICE_ONLY__) && defined(EIGEN_USE_SYCL)
 /** \internal \returns the natural logarithm of \a a (coeff-wise), by forwarding to cl::sycl::log */
 #define SYCL_PLOG(packet_type) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type plog<packet_type>(const packet_type& a) { return cl::sycl::log(a); }
 SYCL_PLOG(cl::sycl::cl_float4)
 SYCL_PLOG(cl::sycl::cl_double2)
 #undef SYCL_PLOG
 /** \internal \returns log(1 + \a a) (coeff-wise), by forwarding to cl::sycl::log1p */
 #define SYCL_PLOG1P(packet_type) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type plog1p<packet_type>(const packet_type& a) { return cl::sycl::log1p(a); }
 SYCL_PLOG1P(cl::sycl::cl_float4)
 SYCL_PLOG1P(cl::sycl::cl_double2)
 #undef SYCL_PLOG1P
 /** \internal \returns the base-10 logarithm of \a a (coeff-wise), by forwarding to cl::sycl::log10 */
 #define SYCL_PLOG10(packet_type) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type plog10<packet_type>(const packet_type& a) { return cl::sycl::log10(a); }
 SYCL_PLOG10(cl::sycl::cl_float4)
 SYCL_PLOG10(cl::sycl::cl_double2)
 #undef SYCL_PLOG10
 /** \internal \returns the exponential of \a a (coeff-wise), by forwarding to cl::sycl::exp */
 #define SYCL_PEXP(packet_type) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type pexp<packet_type>(const packet_type& a) { return cl::sycl::exp(a); }
 SYCL_PEXP(cl::sycl::cl_float4)
 SYCL_PEXP(cl::sycl::cl_double2)
 #undef SYCL_PEXP
 /** \internal \returns exp(\a a) - 1 (coeff-wise), by forwarding to cl::sycl::expm1 */
 #define SYCL_PEXPM1(packet_type) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type pexpm1<packet_type>(const packet_type& a) { return cl::sycl::expm1(a); }
 SYCL_PEXPM1(cl::sycl::cl_float4)
 SYCL_PEXPM1(cl::sycl::cl_double2)
 #undef SYCL_PEXPM1
 /** \internal \returns the square root of \a a (coeff-wise), by forwarding to cl::sycl::sqrt */
 #define SYCL_PSQRT(packet_type) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type psqrt<packet_type>(const packet_type& a) { return cl::sycl::sqrt(a); }
 SYCL_PSQRT(cl::sycl::cl_float4)
 SYCL_PSQRT(cl::sycl::cl_double2)
 #undef SYCL_PSQRT
 /** \internal \returns the reciprocal square root of \a a (coeff-wise), by forwarding to cl::sycl::rsqrt */
 #define SYCL_PRSQRT(packet_type) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type prsqrt<packet_type>(const packet_type& a) { return cl::sycl::rsqrt(a); }
 SYCL_PRSQRT(cl::sycl::cl_float4)
 SYCL_PRSQRT(cl::sycl::cl_double2)
 #undef SYCL_PRSQRT
 /** \internal \returns the sine of \a a (coeff-wise), by forwarding to cl::sycl::sin */
 #define SYCL_PSIN(packet_type) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type psin<packet_type>(const packet_type& a)  { return cl::sycl::sin(a); }
 SYCL_PSIN(cl::sycl::cl_float4)
 SYCL_PSIN(cl::sycl::cl_double2)
 #undef SYCL_PSIN
 /** \internal \returns the cosine of \a a (coeff-wise), by forwarding to cl::sycl::cos */
 #define SYCL_PCOS(packet_type) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type pcos<packet_type>(const packet_type& a)  { return cl::sycl::cos(a); }
 SYCL_PCOS(cl::sycl::cl_float4)
 SYCL_PCOS(cl::sycl::cl_double2)
 #undef SYCL_PCOS
 /** \internal \returns the tangent of \a a (coeff-wise), by forwarding to cl::sycl::tan */
 #define SYCL_PTAN(packet_type) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type ptan<packet_type>(const packet_type& a) {  return cl::sycl::tan(a); }
 SYCL_PTAN(cl::sycl::cl_float4)
 SYCL_PTAN(cl::sycl::cl_double2)
 #undef SYCL_PTAN
 /** \internal \returns the arc sine of \a a (coeff-wise), by forwarding to cl::sycl::asin */
 #define SYCL_PASIN(packet_type) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type pasin<packet_type>(const packet_type& a)  { return cl::sycl::asin(a); }
 SYCL_PASIN(cl::sycl::cl_float4)
 SYCL_PASIN(cl::sycl::cl_double2)
 #undef SYCL_PASIN
 /** \internal \returns the arc cosine of \a a (coeff-wise), by forwarding to cl::sycl::acos */
 #define SYCL_PACOS(packet_type) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type pacos<packet_type>(const packet_type& a)  { return cl::sycl::acos(a); }
 SYCL_PACOS(cl::sycl::cl_float4)
 SYCL_PACOS(cl::sycl::cl_double2)
 #undef SYCL_PACOS
 /** \internal \returns the arc tangent of \a a (coeff-wise), by forwarding to cl::sycl::atan */
 #define SYCL_PATAN(packet_type) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type patan<packet_type>(const packet_type& a) {  return cl::sycl::atan(a); }
 SYCL_PATAN(cl::sycl::cl_float4)
 SYCL_PATAN(cl::sycl::cl_double2)
 #undef SYCL_PATAN
 /** \internal \returns the hyperbolic sine of \a a (coeff-wise), by forwarding to cl::sycl::sinh */
 #define SYCL_PSINH(packet_type) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type psinh<packet_type>(const packet_type& a)  { return cl::sycl::sinh(a); }
 SYCL_PSINH(cl::sycl::cl_float4)
 SYCL_PSINH(cl::sycl::cl_double2)
 #undef SYCL_PSINH
 /** \internal \returns the hyperbolic cosine of \a a (coeff-wise), by forwarding to cl::sycl::cosh */
 #define SYCL_PCOSH(packet_type) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type pcosh<packet_type>(const packet_type& a)  { return cl::sycl::cosh(a); }
 SYCL_PCOSH(cl::sycl::cl_float4)
 SYCL_PCOSH(cl::sycl::cl_double2)
 #undef SYCL_PCOSH
 /** \internal \returns the hyperbolic tangent of \a a (coeff-wise), by forwarding to cl::sycl::tanh */
 #define SYCL_PTANH(packet_type) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type ptanh<packet_type>(const packet_type& a) {  return cl::sycl::tanh(a); }
 SYCL_PTANH(cl::sycl::cl_float4)
 SYCL_PTANH(cl::sycl::cl_double2)
 #undef SYCL_PTANH
 /** \internal \returns the ceiling of \a a (coeff-wise), by forwarding to cl::sycl::ceil */
 #define SYCL_PCEIL(packet_type) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type pceil<packet_type>(const packet_type& a) { return cl::sycl::ceil(a); }
 SYCL_PCEIL(cl::sycl::cl_float4)
 SYCL_PCEIL(cl::sycl::cl_double2)
 #undef SYCL_PCEIL
 /** \internal \returns \a a rounded (coeff-wise), by forwarding to cl::sycl::round */
 #define SYCL_PROUND(packet_type) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type pround<packet_type>(const packet_type& a) { return cl::sycl::round(a); }
 SYCL_PROUND(cl::sycl::cl_float4)
 SYCL_PROUND(cl::sycl::cl_double2)
 #undef SYCL_PROUND
 /** \internal \returns the floor of \a a (coeff-wise), by forwarding to cl::sycl::floor */
 #define SYCL_FLOOR(packet_type) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type pfloor<packet_type>(const packet_type& a) { return cl::sycl::floor(a); }
 SYCL_FLOOR(cl::sycl::cl_float4)
 SYCL_FLOOR(cl::sycl::cl_double2)
 #undef SYCL_FLOOR
 /** \internal Generates the pmin specialization for \a packet_type; the body
  * returns \a expr, which both instantiations below set to cl::sycl::fmin(a, b). */
 #define SYCL_PMIN(packet_type, expr) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type pmin<packet_type>(const packet_type& a, const packet_type& b) { return expr; }
 SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b))
 SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
 #undef SYCL_PMIN
 /** \internal Generates the pmax specialization for \a packet_type; the body
  * returns \a expr, which both instantiations below set to cl::sycl::fmax(a, b). */
 #define SYCL_PMAX(packet_type, expr) \
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
 packet_type pmax<packet_type>(const packet_type& a, const packet_type& b) { return expr; }
 SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b))
 SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
 #undef SYCL_PMAX
 //#endif
 } // end namespace internal
 } // end namespace Eigen
 #endif // EIGEN_MATH_FUNCTIONS_SYCL_H
--- a/Eigen/src/Core/arch/SYCL/PacketMath.h
+++ b/Eigen/src/Core/arch/SYCL/PacketMath.h
@ -0,0 +1,458 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Mehdi Goli    Codeplay Software Ltd.
 // Ralph Potter  Codeplay Software Ltd.
 // Luke Iwanski  Codeplay Software Ltd.
 // Contact: <eigen@codeplay.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 /*****************************************************************
 * PacketMath.h
 *
 * \brief:
 *  PacketMath
 *
 *****************************************************************/
 #ifndef EIGEN_PACKET_MATH_SYCL_H
 #define EIGEN_PACKET_MATH_SYCL_H
 #include <type_traits>
 #if defined EIGEN_USE_SYCL
 namespace Eigen {
 namespace internal {
 /** \internal Generates a ploadt_ro overload whose source pointer lives in the
  * \a address_space_target address space.
  * The result packet is first constructed from the scalar 0 and then filled with
  * res.load(0, ...) from a multi_ptr wrapping \a from (constness is cast away).
  * The Alignment template argument is not used inside the body.
  * Instantiated for global_space and local_space, then undefined. */
 #define SYCL_PLOADT_RO(address_space_target)\
 template<typename packet_type, int Alignment>\
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
 ploadt_ro(typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
   cl::sycl::access::address_space::address_space_target>::pointer_t from) {\
   typedef typename unpacket_traits<packet_type>::type scalar;\
   typedef cl::sycl::multi_ptr<scalar, cl::sycl::access::address_space::address_space_target> multi_ptr;\
   auto res=packet_type(static_cast<typename unpacket_traits<packet_type>::type>(0));\
   res.load(0, multi_ptr(const_cast<typename multi_ptr::pointer_t>(from)));\
   return res;\
 }
 SYCL_PLOADT_RO(global_space)
 SYCL_PLOADT_RO(local_space)
 #undef SYCL_PLOADT_RO
 /** \internal Generates pload (empty \a AlignedType) or ploadu (\a AlignedType == u)
  * for pointers in the \a address_space_target address space. The generated
  * function only forwards to ploadt_ro<packet_type, Alignment>(from). */
 #define SYCL_PLOAD(address_space_target, Alignment, AlignedType)\
 template<typename packet_type> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
 pload##AlignedType(typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
   cl::sycl::access::address_space::address_space_target>::pointer_t from) {\
   return ploadt_ro<packet_type, Alignment>(from);\
 }
 // global space
 SYCL_PLOAD(global_space, Unaligned, u)
 SYCL_PLOAD(global_space, Aligned, )
 // local space
 SYCL_PLOAD(local_space, Unaligned, u)
 SYCL_PLOAD(local_space, Aligned, )
 // private space
 //SYCL_PLOAD(private_space, Unaligned, u)
 //SYCL_PLOAD(private_space, Aligned, )
 #undef SYCL_PLOAD
 /** \internal \returns a packet version of \a *from.
  * The pointer \a from must be aligned on a \a Alignment bytes boundary.
  *
  * The generated ploadt overload takes a pointer in the \a address_space_target
  * address space and calls pload when Alignment is at least
  * unpacket_traits<packet_type>::alignment, and ploadu otherwise.
  *
  * NOTE(review): unlike the neighbouring generator macros, SYCL_PLOADT has no
  * matching #undef in the visible part of this file - confirm whether leaving
  * it defined is intentional. */
 #define SYCL_PLOADT(address_space_target)\
 template<typename packet_type, int Alignment>\
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt(\
  typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
  cl::sycl::access::address_space::address_space_target>::pointer_t from)\
 {\
  if(Alignment >= unpacket_traits<packet_type>::alignment)\
    return pload<packet_type>(from);\
  else\
    return ploadu<packet_type>(from);\
 }
 // global space
 SYCL_PLOADT(global_space)
 // local space
 SYCL_PLOADT(local_space)
 //private_space
 // There is no need to specialise it for private space as it can use the GenericPacketMath version
 /** \internal Generates the explicit specialization ploadt_ro<packet_type, Alignment>
  * for a plain const scalar pointer. The packet is built from the scalar 0 and
  * then filled with load<private_space>(0, ...) after casting constness away.
  * The body does not depend on \a Alignment, so the Aligned and Unaligned
  * instantiations below perform the same load.
  *
  * NOTE(review): no #undef SYCL_PLOADT_RO_SPECIAL is visible in this part of the
  * file - confirm whether that is intentional. */
 #define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment)\
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
  ploadt_ro<packet_type, Alignment>(const typename unpacket_traits<packet_type>::type * from) { \
    typedef typename unpacket_traits<packet_type>::type scalar;\
   auto res=packet_type(static_cast<scalar>(0));\
   res. template load<cl::sycl::access::address_space::private_space>(0, const_cast<scalar*>(from));\
   return res;\
  }
 SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned)
 SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned)
 SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned)
 SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned)
 /** \internal Generates the explicit specialization of pload (empty
  * \a alignment_type) or ploadu (\a alignment_type == u) for a plain const scalar
  * pointer. The packet is built from the scalar 0 and then filled with
  * load<private_space>(0, ...) after casting constness away; the aligned and
  * unaligned variants share the same body. */
 #define SYCL_PLOAD_SPECIAL(packet_type, alignment_type)\
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
  pload##alignment_type(const typename unpacket_traits<packet_type>::type * from) { \
    typedef typename unpacket_traits<packet_type>::type scalar;\
   auto res=packet_type(static_cast<scalar>(0));\
   res. template load<cl::sycl::access::address_space::private_space>(0, const_cast<scalar*>(from));\
   return res;\
  }
 SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4,)
 SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2,)
 SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u)
 SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u)
 #undef SYCL_PLOAD_SPECIAL
 /** \internal Generates the specialization of pstore (empty \a alignment) or
  * pstoreu (\a alignment == u) that writes \a from through a pointer in the
  * \a address_space_target address space, using from.store(0, multi_ptr(to)).
  * The aligned and unaligned variants share the same body.
  *
  * NOTE(review): no #undef SYCL_PSTORE is visible in this part of the file -
  * confirm whether that is intentional. */
 #define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment)\
 template<>\
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
   typename cl::sycl::multi_ptr<scalar, cl::sycl::access::address_space::address_space_target>::pointer_t to, \
   const packet_type& from) {\
     typedef cl::sycl::multi_ptr<scalar, cl::sycl::access::address_space::address_space_target> multi_ptr;\
     from.store(0, multi_ptr(to));\
 }
 // global space
 SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, )
 SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u)
 SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, )
 SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u)
 // local space
 SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, )
 SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u)
 SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, )
 SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u)
 // private space
 SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, )
 SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u)
 SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, )
 SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u)
 /** \internal Generates the specialization pstoret<scalar, packet_type, Alignment>
  * for a plain scalar pointer: a non-zero \a Alignment dispatches to pstore,
  * a zero \a Alignment dispatches to pstoreu. */
 #define SYCL_PSTORE_T(scalar, packet_type, Alignment)\
 template<>\
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret<scalar, packet_type, Alignment>(\
  scalar* to,\
  const packet_type& from) {\
  if(Alignment)\
  pstore(to, from);\
  else\
  pstoreu(to,from);\
 }
 SYCL_PSTORE_T(float, cl::sycl::cl_float4, Aligned)
 SYCL_PSTORE_T(float, cl::sycl::cl_float4, Unaligned)
 SYCL_PSTORE_T(double, cl::sycl::cl_double2, Aligned)
 SYCL_PSTORE_T(double, cl::sycl::cl_double2, Unaligned)
 #undef SYCL_PSTORE_T
 /** \internal Generates the pset1 specialization for \a packet_type: the
  * packet is constructed directly from the scalar \a from. */
 #define SYCL_PSET1(packet_type)\
 template<>  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>(\
  const typename unpacket_traits<packet_type>::type&  from) {\
  return packet_type(from);\
 }
 // pset1 takes a scalar by reference, so there is no per-address-space variant:
 // one instantiation per SYCL packet type.
 SYCL_PSET1(cl::sycl::cl_float4)
 SYCL_PSET1(cl::sycl::cl_double2)
 #undef SYCL_PSET1
 /** \internal Primary template of the per-packet helper used by the SYCL
  * ploaddup / pgather wrappers. The real implementations are the explicit
  * specializations for cl::sycl::cl_float4 and cl::sycl::cl_double2; this
  * generic version is only a fallback for any other packet type.
  *
  * Both members ignore their arguments and return a value-initialized packet.
  * They previously had empty bodies, which made every call flow off the end of
  * a non-void function (undefined behaviour). */
 template <typename packet_type> struct get_base_packet {
  /** Fallback for ploaddup: the pointer argument is ignored. */
  template <typename sycl_multi_pointer>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_ploaddup(sycl_multi_pointer ) {
    return packet_type();
  }
  /** Fallback for pgather: the pointer and stride arguments are ignored. */
  template <typename sycl_multi_pointer>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_pgather(sycl_multi_pointer , Index ) {
    return packet_type();
  }
 };
 /** \internal Element-wise helpers for the 4-float SYCL packet. */
 template <> struct get_base_packet <cl::sycl::cl_float4>  {
  /** Builds (from[0], from[0], from[1], from[1]). */
  template <typename sycl_multi_pointer>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup(sycl_multi_pointer from) {
    const cl::sycl::cl_float4 duplicated(from[0], from[0], from[1], from[1]);
    return duplicated;
  }
  /** Builds a packet from the four elements at offsets 0, stride, 2*stride and 3*stride. */
  template <typename sycl_multi_pointer>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_pgather(sycl_multi_pointer from, Index stride) {
    const cl::sycl::cl_float4 gathered(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
    return gathered;
  }
  /** Writes x, y, z, w of \a from to offsets 0, stride, 2*stride and 3*stride of \a to. */
  template <typename sycl_multi_pointer>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to , const cl::sycl::cl_float4& from, Index stride) {
    to[0] = from.x();
    to[stride] = from.y();
    to[2*stride] = from.z();
    to[3*stride] = from.w();
  }
  /** Builds (a, a+1, a+2, a+3). */
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 set_plset(const float& a) {
    return cl::sycl::cl_float4(
        static_cast<float>(a),
        static_cast<float>(a+1),
        static_cast<float>(a+2),
        static_cast<float>(a+3));
  }
 };
 /** \internal Element-wise helpers for the 2-double SYCL packet. */
 template <> struct get_base_packet <cl::sycl::cl_double2>  {
  /** Builds (from[0], from[0]). */
  template <typename sycl_multi_pointer>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_ploaddup(const sycl_multi_pointer from) {
    const cl::sycl::cl_double2 duplicated(from[0], from[0]);
    return duplicated;
  }
  /** Builds a packet from the two elements at offsets 0 and stride.
   *  Note that Index is a template parameter of this member. */
  template <typename sycl_multi_pointer, typename Index>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_pgather(const sycl_multi_pointer from, Index stride) {
    const cl::sycl::cl_double2 gathered(from[0*stride], from[1*stride]);
    return gathered;
  }
  /** Writes x and y of \a from to offsets 0 and stride of \a to. */
  template <typename sycl_multi_pointer>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to , const cl::sycl::cl_double2& from, Index stride) {
    to[0] = from.x();
    to[stride] = from.y();
  }
  /** Builds (a, a+1). */
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 set_plset(const double& a) {
    return cl::sycl::cl_double2(
        static_cast<double>(a),
        static_cast<double>(a + 1));
  }
 };
 /** \internal Generates a ploaddup overload for pointers in the
  * \a address_space_target address space; the work is delegated to
  * get_base_packet<packet_type>::get_ploaddup(from). */
 #define SYCL_PLOAD_DUP(address_space_target)\
 template<typename packet_type> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
 ploaddup(typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
  cl::sycl::access::address_space::address_space_target>::pointer_t from)\
 {\
  return get_base_packet<packet_type>::get_ploaddup(from); \
 }
 // global space
 SYCL_PLOAD_DUP(global_space)
 // local_space
 SYCL_PLOAD_DUP(local_space)
 // private_space
 //SYCL_PLOAD_DUP(private_space)
 #undef SYCL_PLOAD_DUP
 /** \internal Generates the explicit specialization ploaddup<packet_type> for a
  * plain const scalar pointer; the work is delegated to
  * get_base_packet<packet_type>::get_ploaddup(from). */
 #define SYCL_PLOAD_DUP_SPECILIZE(packet_type)\
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
 ploaddup<packet_type>(const typename unpacket_traits<packet_type>::type * from)\
 { \
  return get_base_packet<packet_type>::get_ploaddup(from); \
 }
 SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4)
 SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
 #undef SYCL_PLOAD_DUP_SPECILIZE
 /** \internal Generates the plset specialization for \a packet_type; the work
  * is delegated to get_base_packet<packet_type>::set_plset(a). */
 #define SYCL_PLSET(packet_type)\
 template<>  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type plset<packet_type>(const typename unpacket_traits<packet_type>::type& a) {\
  return get_base_packet<packet_type>::set_plset(a);\
 }
 SYCL_PLSET(cl::sycl::cl_float4)
 SYCL_PLSET(cl::sycl::cl_double2)
 #undef SYCL_PLSET
 /** \internal Generates a pgather overload for pointers in the
  * \a address_space_target address space; the work is delegated to
  * get_base_packet<packet_type>::get_pgather(from, stride). The Scalar
  * template parameter is not used in the body. */
 #define SYCL_PGATHER(address_space_target)\
 template<typename Scalar, typename packet_type> EIGEN_DEVICE_FUNC inline packet_type pgather(\
  typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
  cl::sycl::access::address_space::address_space_target>::pointer_t from, Index stride) {\
    return get_base_packet<packet_type>::get_pgather(from, stride); \
 }
 // global space
 SYCL_PGATHER(global_space)
 // local space
 SYCL_PGATHER(local_space)
 // private space
 //SYCL_PGATHER(private_space)
 #undef SYCL_PGATHER
 // Generates the explicit pgather specialisation that takes a plain const
 // pointer plus a stride and forwards both to
 // get_base_packet<pkt>::get_pgather.
 #define SYCL_PGATHER_SPECILIZE(scalar_t, pkt) \
   template <> \
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE pkt pgather<scalar_t, pkt>( \
       const typename unpacket_traits<pkt>::type* from, Index stride) { \
     return get_base_packet<pkt>::get_pgather(from, stride); \
   }
 // float scalars gathered into a four-lane packet.
 SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4)
 // double scalars gathered into a two-lane packet.
 SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
 #undef SYCL_PGATHER_SPECILIZE
 // Generates a generic pscatter overload whose destination pointer lives in
 // the given SYCL address space; the strided store itself is done by
 // get_base_packet<packet_type>::set_pscatter.
 #define SYCL_PSCATTER(addr_space) \
   template <typename Scalar, typename packet_type> \
   EIGEN_DEVICE_FUNC inline void pscatter( \
       typename cl::sycl::multi_ptr< \
           typename unpacket_traits<packet_type>::type, \
           cl::sycl::access::address_space::addr_space>::pointer_t to, \
       const packet_type& from, Index stride) { \
     get_base_packet<packet_type>::set_pscatter(to, from, stride); \
   }
 // Overload for pointers into global memory.
 SYCL_PSCATTER(global_space)
 // Overload for pointers into local memory.
 SYCL_PSCATTER(local_space)
 // private_space: instantiation left disabled.
 //SYCL_PSCATTER(private_space)
 #undef SYCL_PSCATTER
 // Generates the explicit pscatter specialisation that takes a plain pointer
 // as destination and forwards to get_base_packet<pkt>::set_pscatter.
 #define SYCL_PSCATTER_SPECILIZE(scalar_t, pkt) \
   template <> \
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<scalar_t, pkt>( \
       typename unpacket_traits<pkt>::type* to, const pkt& from, \
       Index stride) { \
     get_base_packet<pkt>::set_pscatter(to, from, stride); \
   }
 // Four-lane float packet scattered to float storage.
 SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4)
 // Two-lane double packet scattered to double storage.
 SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
 #undef SYCL_PSCATTER_SPECILIZE
 // Generates the pmadd specialisation for one packet type; the computation is
 // delegated to cl::sycl::mad(a, b, c).
 #define SYCL_PMAD(pkt) \
   template <> \
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE pkt pmadd( \
       const pkt& a, const pkt& b, const pkt& c) { \
     return cl::sycl::mad(a, b, c); \
   }
 // Single-precision, four-lane packet.
 SYCL_PMAD(cl::sycl::cl_float4)
 // Double-precision, two-lane packet.
 SYCL_PMAD(cl::sycl::cl_double2)
 #undef SYCL_PMAD
 // pfirst for cl_float4: yields the x lane of the packet.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float
 pfirst<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
   const float lane_x = a.x();
   return lane_x;
 }
 // pfirst for cl_double2: yields the x lane of the packet.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double
 pfirst<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
   const double lane_x = a.x();
   return lane_x;
 }
 // predux for cl_float4: sum of the four lanes, accumulated in the order
 // x, y, z, w.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float
 predux<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
   const float lane_x = a.x();
   const float lane_y = a.y();
   const float lane_z = a.z();
   const float lane_w = a.w();
   return ((lane_x + lane_y) + lane_z) + lane_w;
 }
 // predux for cl_double2: sum of the two lanes.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double
 predux<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
   const double lane_x = a.x();
   const double lane_y = a.y();
   return lane_x + lane_y;
 }
 // predux_max for cl_float4: cl::sycl::fmax is applied to the (x, y) and
 // (z, w) pairs, then to the two partial results.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float
 predux_max<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
   const float max_xy = cl::sycl::fmax(a.x(), a.y());
   const float max_zw = cl::sycl::fmax(a.z(), a.w());
   return cl::sycl::fmax(max_xy, max_zw);
 }
 // predux_max for cl_double2: cl::sycl::fmax of the two lanes.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double
 predux_max<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
   const double max_xy = cl::sycl::fmax(a.x(), a.y());
   return max_xy;
 }
 // predux_min for cl_float4: cl::sycl::fmin is applied to the (x, y) and
 // (z, w) pairs, then to the two partial results.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float
 predux_min<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
   const float min_xy = cl::sycl::fmin(a.x(), a.y());
   const float min_zw = cl::sycl::fmin(a.z(), a.w());
   return cl::sycl::fmin(min_xy, min_zw);
 }
 // predux_min for cl_double2: cl::sycl::fmin of the two lanes.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double
 predux_min<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
   const double min_xy = cl::sycl::fmin(a.x(), a.y());
   return min_xy;
 }
 // predux_mul for cl_float4: product of the four lanes, accumulated in the
 // order x, y, z, w.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float
 predux_mul<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
   const float lane_x = a.x();
   const float lane_y = a.y();
   const float lane_z = a.z();
   const float lane_w = a.w();
   return ((lane_x * lane_y) * lane_z) * lane_w;
 }
 // predux_mul for cl_double2: product of the two lanes.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double
 predux_mul<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
   const double lane_x = a.x();
   const double lane_y = a.y();
   return lane_x * lane_y;
 }
 // pabs for cl_float4: applies cl::sycl::fabs to each of the four lanes and
 // packs the results in the same lane order.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
 pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
   return cl::sycl::cl_float4(
       cl::sycl::fabs(a.x()),
       cl::sycl::fabs(a.y()),
       cl::sycl::fabs(a.z()),
       cl::sycl::fabs(a.w()));
 }
 // pabs for cl_double2: applies cl::sycl::fabs to each of the two lanes and
 // packs the results in the same lane order.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2
 pabs<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
   return cl::sycl::cl_double2(
       cl::sycl::fabs(a.x()),
       cl::sycl::fabs(a.y()));
 }
 // In-place transpose of a 4x4 block held as four cl_float4 packets: each
 // off-diagonal lane (row i, lane j) is exchanged with (row j, lane i) through
 // a scalar temporary. Diagonal lanes are left untouched.
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
 ptranspose(PacketBlock<cl::sycl::cl_float4,4>& kernel) {
   cl::sycl::cl_float4& row0 = kernel.packet[0];
   cl::sycl::cl_float4& row1 = kernel.packet[1];
   cl::sycl::cl_float4& row2 = kernel.packet[2];
   cl::sycl::cl_float4& row3 = kernel.packet[3];

   // (0, y) <-> (1, x)
   const float held01 = row0.y();
   row0.y() = row1.x();
   row1.x() = held01;

   // (0, z) <-> (2, x)
   const float held02 = row0.z();
   row0.z() = row2.x();
   row2.x() = held02;

   // (0, w) <-> (3, x)
   const float held03 = row0.w();
   row0.w() = row3.x();
   row3.x() = held03;

   // (1, z) <-> (2, y)
   const float held12 = row1.z();
   row1.z() = row2.y();
   row2.y() = held12;

   // (1, w) <-> (3, y)
   const float held13 = row1.w();
   row1.w() = row3.y();
   row3.y() = held13;

   // (2, w) <-> (3, z)
   const float held23 = row2.w();
   row2.w() = row3.z();
   row3.z() = held23;
 }
 // In-place transpose of a 2x2 block held as two cl_double2 packets: the only
 // off-diagonal pair, (row 0, lane y) and (row 1, lane x), is exchanged
 // through a scalar temporary.
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
 ptranspose(PacketBlock<cl::sycl::cl_double2,2>& kernel) {
   cl::sycl::cl_double2& row0 = kernel.packet[0];
   cl::sycl::cl_double2& row1 = kernel.packet[1];

   const double held01 = row0.y();
   row0.y() = row1.x();
   row1.x() = held01;
 }
 // pblend for cl_float4. Builds a per-lane integer mask from the selector
 // (0 where ifPacket.select[i] is true, -1 otherwise) and hands it to
 // cl::sycl::select together with thenPacket and elsePacket.
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
 pblend(const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket,
        const cl::sycl::cl_float4& thenPacket,
        const cl::sycl::cl_float4& elsePacket) {
   const int mask0 = ifPacket.select[0] ? 0 : -1;
   const int mask1 = ifPacket.select[1] ? 0 : -1;
   const int mask2 = ifPacket.select[2] ? 0 : -1;
   const int mask3 = ifPacket.select[3] ? 0 : -1;
   cl::sycl::cl_int4 condition(mask0, mask1, mask2, mask3);
   return cl::sycl::select(thenPacket, elsePacket, condition);
 }
 template<> inline cl::sycl::cl_double2
 pblend(const Selector<unpacket_traits<cl::sycl::cl_double2>::size>& ifPacket,
  const cl::sycl::cl_double2& thenPacket, const cl::sycl::cl_double2& elsePacket) {
  cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1,
                               ifPacket.select[1] ? 0 : -1);
  return cl::sycl::select(thenPacket, elsePacket, condition);
 }
 } // end namespace internal
 } // end namespace Eigen
 #endif // EIGEN_USE_SYCL
 #endif // EIGEN_PACKET_MATH_SYCL_H
--- a/Show More
+++ b/Show More